In [279]:
import pandas as pd
import numpy as np
import re

import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('punkt')

In [280]:
df = pd.read_csv("data/tweet_emotions.csv")
df.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [281]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet_id   40000 non-null  int64 
 1   sentiment  40000 non-null  object
 2   content    40000 non-null  object
dtypes: int64(1), object(2)
memory usage: 937.6+ KB


In [282]:
df.isnull().sum()

tweet_id     0
sentiment    0
content      0
dtype: int64

## We dont need the tweet_id

In [283]:
df.drop(columns=['tweet_id'],inplace=True)

In [284]:
df.head()

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


## Changing sentiment to numeric labels

In [285]:
df.drop(df[df['sentiment'] == 'empty'].index,inplace=True)

In [286]:
df['sentiment'].value_counts()

sentiment
neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
enthusiasm     759
boredom        179
anger          110
Name: count, dtype: int64

In [287]:
df['sentiment'].unique()


array(['sadness', 'enthusiasm', 'neutral', 'worry', 'surprise', 'love',
       'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger'],
      dtype=object)

In [288]:
sentiment_mapping={
    'sadness':0,
    'worry':1,
    'anger':2,
    'hate':2,
    'neutral':3,
    'boredom':3,
    'enthusiasm':4,
    'surprise':4,
    'fun':4,
    'relief':5,
    'love':6,
    'happiness':7
}

In [289]:
df['label'] = df['sentiment'].map(sentiment_mapping)
# df.rename(columns={"sentiment":"label"},inplace=True)
df.head()

Unnamed: 0,sentiment,content,label
1,sadness,Layin n bed with a headache ughhhh...waitin o...,0
2,sadness,Funeral ceremony...gloomy friday...,0
3,enthusiasm,wants to hang out with friends SOON!,4
4,neutral,@dannycastillo We want to trade with someone w...,3
5,worry,Re-pinging @ghostridah14: why didn't you go to...,1


In [290]:
df['label'].value_counts()


label
3    8817
1    8459
7    5209
0    5165
4    4722
6    3842
5    1526
2    1433
Name: count, dtype: int64

In [291]:
X=df['content']
y=df['label']
print(len(X))
print(len(y))
print(len(X)==len(y))

39173
39173
True


## Data cleaning

In [292]:
def clean_text_1(text):
    #removing @ tags 
    text=re.sub(r'@[a-zA-z0-9]+\s*',' ',str(text))
    
    #removing urls
    text=re.sub(r'http\S+', '', text)
    text=re.sub(r'www\.\S+', '', text)
    
    #replacing multiple whitesapces by a single
    text = re.sub(r'\s+',' ',text)
    
    # remove all single characters(surrounded by whitespace)
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    
    # remove all single characters except i and I (surrounded by whitespace)
    # text = re.sub(r'\s+(?![iI])[a-zA-Z]\s+', ' ', text)
    
    
    # Converting to Lowercase 
    text = text.lower()
    
    # Lemmatization- splits into list of words ['The', 'quick', ....]
    text = text.split()

    lemma = WordNetLemmatizer()
    text = [lemma.lemmatize(word) for word in text]
    text = ' '.join(text)
    
    return text

In [293]:
df['content']=df['content'].apply(clean_text_1)

## Stop words are common enginlish words that dont contain too much information

In [294]:
print(nltk.word_tokenize("a big girl is sg"))
set(nltk.word_tokenize("a big girl is sg"))&stop_words

['a', 'big', 'girl', 'is', 'sg']


{'a', 'is'}

In [295]:
def count_stopwords(text):
    stop_words = set(stopwords.words("english"))
    words = nltk.word_tokenize(text)
    return len(set(words) & stop_words)

def list_stopwords(text):
    stop_words = set(stopwords.words("english"))
    words = nltk.word_tokenize(text)
    return list(set(words) & stop_words)

def remove_stopwords(text):
    # Tokenize the text into words
    words = nltk.word_tokenize(text)
    
    # Get the list of stopwords
    stop_words = set(stopwords.words('english'))
    
    # Remove stopwords from the text
    filtered_words = [word for word in words if word.lower() not in stop_words]
    
    # Recreate the text without stopwords
    filtered_text = ' '.join(filtered_words)
    
    return filtered_text

In [296]:

# Add a new col 'stop_words' that contains the count of stopwords
df['stop_words'] = df['content'].apply(count_stopwords)
value_counts = df['stop_words'].value_counts()
df.head()

Unnamed: 0,sentiment,content,label,stop_words
1,sadness,layin bed with headache ughhhh...waitin on you...,0,3
2,sadness,funeral ceremony...gloomy friday...,0,0
3,enthusiasm,want to hang out with friend soon!,4,3
4,neutral,we want to trade with someone who ha houston t...,3,7
5,worry,re-pinging : why didn't you go to prom? bc my ...,1,5


In [297]:
# Lets examine a row 
df.iloc[3]['content'],df.iloc[3]['stop_words'],list_stopwords(df.iloc[3]['content'])

('we want to trade with someone who ha houston tickets, but no one will.',
 7,
 ['no', 'with', 'will', 'we', 'who', 'to', 'but'])

In [298]:
df['content'] = df['content'].apply(remove_stopwords)
df.head()

Unnamed: 0,sentiment,content,label,stop_words
1,sadness,layin bed headache ughhhh ... waitin call ...,0,3
2,sadness,funeral ceremony ... gloomy friday ...,0,0
3,enthusiasm,want hang friend soon !,4,3
4,neutral,"want trade someone ha houston tickets , one .",3,7
5,worry,re-pinging : n't go prom ? bc bf n't like friend,1,5


## Data Cleaning

In [299]:
def clean_text_2(text):
    #converting special character and numbers to whitespaces
    text = re.sub(r'\W+',' ',text)
    text=re.sub(r'\d+', '', text)
    #replacing multiple whitesapces by a single
    text = re.sub(r'\s+',' ',text)
    
    # remove all single characters(surrounded by whitespace)
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    
    # remove all single characters except i and I (surrounded by whitespace)
    # text = re.sub(r'\s+(?![iI])[a-zA-Z]\s+', ' ', text)
    
    
    # Converting to Lowercase 
    text = text.lower()
    
    # Lemmatization- splits into list of words ['The', 'quick', ....]
    text = text.split()

    lemma = WordNetLemmatizer()
    text = [lemma.lemmatize(word) for word in text]
    text = ' '.join(text)
    
    return text

In [300]:
# # Cleaning 
# cleaned = []

# for i in range(0, len(X)):
    
#     #removing @ tags
#     text=re.sub(r'@[a-zA-z0-9]+\s*',' ',str(X.iloc[i]))
    
#     #removing urls
#     text=re.sub(r'http\S+', '', text)
#     text=re.sub(r'www\.\S+', '', text)
    
#     #converting special character and numbers to whitespaces
#     text = re.sub(r'\W+',' ',text)
    
#     #replacing multiple whitesapces by a single
#     text = re.sub(r'\s+',' ',text)
    
#     # remove all single characters(surrounded by whitespace)
#     text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    
#     # remove all single characters except i and I (surrounded by whitespace)
#     # text = re.sub(r'\s+(?![iI])[a-zA-Z]\s+', ' ', text)
    
    
#     # Converting to Lowercase 
#     text = text.lower()
    
#     # Lemmatization- splits into list of words ['The', 'quick', ....]
#     text = text.split()

#     lemma = WordNetLemmatizer()
#     text = [lemma.lemmatize(word) for word in text]
#     text = ' '.join(text)
    
#     cleaned.append(text)
# print(cleaned[:10])

In [301]:
# # Making sure that cleaned data has the same length as X
# len(cleaned), len(X),len(y)

In [302]:
df['content'] = df['content'].apply(clean_text_2)
df.head()

Unnamed: 0,sentiment,content,label,stop_words
1,sadness,layin bed headache ughhhh waitin call,0,3
2,sadness,funeral ceremony gloomy friday,0,0
3,enthusiasm,want hang friend soon,4,3
4,neutral,want trade someone ha houston ticket one,3,7
5,worry,re pinging t go prom bc bf t like friend,1,5


In [303]:
df['content'] = df['content'].apply(remove_stopwords)
df.head()

Unnamed: 0,sentiment,content,label,stop_words
1,sadness,layin bed headache ughhhh waitin call,0,3
2,sadness,funeral ceremony gloomy friday,0,0
3,enthusiasm,want hang friend soon,4,3
4,neutral,want trade someone ha houston ticket one,3,7
5,worry,pinging go prom bc bf like friend,1,5


In [304]:
# Lets examine a row 
i=12
df.iloc[i]['content'],df.iloc[i]['stop_words']

('watch hill london realise tourture week week late watch itonlinelol', 11)

In [305]:
df.to_csv('data/cleaned.csv')

NameError: name 'tf' is not defined