In [2]:
import pandas as pd
import numpy as np
import re

import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('punkt')

In [3]:
# Load the tweet-emotion CSV (path is relative to the notebook's working
# directory) and preview the first rows.
df = pd.read_csv("data/tweet_emotions.csv")
df.head()

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3


In [4]:
# Show row count, column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16000 entries, 0 to 15999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    16000 non-null  object
 1   label   16000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 250.1+ KB


In [5]:
# Count missing values per column.
df.isnull().sum()

text     0
label    0
dtype: int64

## Data cleaning

In [292]:
def clean_text_1(text):
    """First cleaning pass: strip mentions and URLs, then normalise the text.

    Steps, in order:
      1. Replace ``@handle`` mentions (plus any whitespace right after them)
         with a single space.
      2. Delete ``http...`` and ``www....`` URLs.
      3. Collapse every whitespace run to one space.
      4. Drop single ASCII letters that stand between whitespace.
      5. Lowercase, split on whitespace, lemmatize each token with
         ``WordNetLemmatizer`` (default part of speech) and re-join the
         tokens with single spaces.

    Parameters
    ----------
    text : object
        Raw text; it is coerced with ``str()`` before any processing.

    Returns
    -------
    str
        The cleaned, lowercased, lemmatized text.
    """
    # Explicit A-Z / a-z ranges: an `A-z` range would also match the ASCII
    # punctuation that sits between 'Z' and 'a' ([ \ ] ^ `). Underscore is
    # listed on purpose so a handle such as @some_user is removed whole.
    text = re.sub(r'@[A-Za-z0-9_]+\s*', ' ', str(text))

    # Remove URLs.
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'www\.\S+', '', text)

    # Collapse whitespace runs to a single space.
    text = re.sub(r'\s+', ' ', text)

    # Remove single letters surrounded by whitespace. The trailing whitespace
    # is only looked at (lookahead), not consumed, so it can serve as the
    # leading whitespace of the next match: in "x a b y" both "a" and "b"
    # are removed instead of only every other letter.
    text = re.sub(r'\s+[a-zA-Z](?=\s)', '', text)

    # Lowercase before lemmatizing.
    text = text.lower()

    # Lemmatize token by token; split()/join() also trims and normalises
    # any whitespace left over from the substitutions above.
    lemma = WordNetLemmatizer()
    return ' '.join(lemma.lemmatize(word) for word in text.split())

In [293]:
# First cleaning pass: overwrite the 'text' column in place with the output
# of clean_text_1.
# NOTE(review): this cell's execution count (293) is far out of sequence with
# the neighbouring cells — confirm the notebook reproduces its outputs under
# Restart Kernel -> Run All.
df['text']=df['text'].apply(clean_text_1)

In [20]:
# Number of rows per label value, most frequent first.
df['label'].value_counts()

label
1    5362
0    4666
3    2159
4    1937
2    1304
5     572
Name: count, dtype: int64

## Stop words are common English words that don't carry much information

In [7]:
# English stop-word set, built on first use and then reused, so applying the
# functions below row by row does not rebuild the set for every row.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop-word list as a frozenset (built once)."""
    if "english" not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOP_WORDS_CACHE["english"]


def _distinct_stopwords(text):
    """Return the set of distinct stop words found among the tokens of `text`.

    Tokens are lowercased before the lookup so the match is case-insensitive,
    the same rule remove_stopwords uses.
    """
    tokens = {token.lower() for token in nltk.word_tokenize(text)}
    return tokens & _english_stop_words()


def count_stopwords(text):
    """Count the distinct English stop words in `text`.

    Repeated occurrences of the same stop word are counted once.

    Parameters
    ----------
    text : str
        Text to tokenize with ``nltk.word_tokenize``.

    Returns
    -------
    int
        Number of distinct stop words present.
    """
    return len(_distinct_stopwords(text))


def list_stopwords(text):
    """List the distinct English stop words in `text`.

    Parameters
    ----------
    text : str
        Text to tokenize with ``nltk.word_tokenize``.

    Returns
    -------
    list of str
        Distinct stop words, sorted alphabetically so the result does not
        depend on set iteration order.
    """
    return sorted(_distinct_stopwords(text))


def remove_stopwords(text):
    """Return `text` with English stop words removed.

    The text is tokenized with ``nltk.word_tokenize``; a token is dropped when
    its lowercase form is a stop word. Surviving tokens keep their original
    case and are joined with single spaces.

    Parameters
    ----------
    text : str
        Text to filter.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    stop_words = _english_stop_words()
    kept = [word for word in nltk.word_tokenize(text)
            if word.lower() not in stop_words]
    return ' '.join(kept)

In [8]:

# Add a 'stop_words' column holding, per row, the number of distinct stop
# words in the text (count_stopwords counts each stop word once).
df['stop_words'] = df['text'].apply(count_stopwords)
# Distribution of those counts; stored but not displayed in this cell.
value_counts = df['stop_words'].value_counts()
df.head()

Unnamed: 0,text,label,stop_words
0,i didnt feel humiliated,0,1
1,i can go from feeling so hopeless to so damned...,0,10
2,im grabbing a minute to post i feel greedy wrong,3,3
3,i am ever feeling nostalgic about the fireplac...,2,9
4,i am feeling grouchy,3,2


In [10]:
# Spot-check row 3: its text, its stored stop-word count, and the distinct
# stop words found in that text.
df.iloc[3]['text'],df.iloc[3]['stop_words'],list_stopwords(df.iloc[3]['text'])

('i am ever feeling nostalgic about the fireplace i will know that it is still on the property',
 9,
 ['it', 'am', 'i', 'on', 'will', 'the', 'that', 'about', 'is'])

In [11]:
# Overwrite 'text' in place with its stop words removed. The 'stop_words'
# column is not recomputed here, so it keeps the counts taken earlier.
df['text'] = df['text'].apply(remove_stopwords)
df.head()

Unnamed: 0,text,label,stop_words
0,didnt feel humiliated,0,1
1,go feeling hopeless damned hopeful around some...,0,10
2,im grabbing minute post feel greedy wrong,3,3
3,ever feeling nostalgic fireplace know still pr...,2,9
4,feeling grouchy,3,2


## Data cleaning, second pass: punctuation, digits and lemmatization

In [12]:
def clean_text_2(text):
    """Second cleaning pass: strip punctuation and digits, then normalise.

    Steps, in order:
      1. Replace every run of non-word characters (``\\W``) with a space.
         Underscore counts as a word character and is left in place.
      2. Delete digit runs outright (no space is inserted, so "gr8" becomes
         "gr").
      3. Collapse every whitespace run to one space.
      4. Drop single ASCII letters that stand between whitespace.
      5. Lowercase, split on whitespace, lemmatize each token with
         ``WordNetLemmatizer`` (default part of speech) and re-join the
         tokens with single spaces.

    Parameters
    ----------
    text : str
        Text to clean; unlike clean_text_1 it is not coerced with ``str()``.

    Returns
    -------
    str
        The cleaned, lowercased, lemmatized text.
    """
    # Non-word characters become spaces; digits are removed.
    text = re.sub(r'\W+', ' ', text)
    text = re.sub(r'\d+', '', text)

    # Collapse whitespace runs to a single space.
    text = re.sub(r'\s+', ' ', text)

    # Remove single letters surrounded by whitespace. The trailing whitespace
    # is only looked at (lookahead), not consumed, so it can serve as the
    # leading whitespace of the next match: in "x a b y" both "a" and "b"
    # are removed instead of only every other letter.
    text = re.sub(r'\s+[a-zA-Z](?=\s)', '', text)

    # Lowercase before lemmatizing.
    text = text.lower()

    # Lemmatize token by token; split()/join() also trims and normalises
    # any whitespace left over from the substitutions above.
    lemma = WordNetLemmatizer()
    return ' '.join(lemma.lemmatize(word) for word in text.split())

In [13]:
# # Cleaning 
# cleaned = []

# for i in range(0, len(X)):
    
#     #removing @ tags
#     text=re.sub(r'@[a-zA-z0-9]+\s*',' ',str(X.iloc[i]))
    
#     #removing urls
#     text=re.sub(r'http\S+', '', text)
#     text=re.sub(r'www\.\S+', '', text)
    
#     #converting special character and numbers to whitespaces
#     text = re.sub(r'\W+',' ',text)
    
#     #replacing multiple whitesapces by a single
#     text = re.sub(r'\s+',' ',text)
    
#     # remove all single characters(surrounded by whitespace)
#     text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    
#     # remove all single characters except i and I (surrounded by whitespace)
#     # text = re.sub(r'\s+(?![iI])[a-zA-Z]\s+', ' ', text)
    
    
#     # Converting to Lowercase 
#     text = text.lower()
    
#     # Lemmatization- splits into list of words ['The', 'quick', ....]
#     text = text.split()

#     lemma = WordNetLemmatizer()
#     text = [lemma.lemmatize(word) for word in text]
#     text = ' '.join(text)
    
#     cleaned.append(text)
# print(cleaned[:10])

In [14]:
# # Making sure that cleaned data has the same length as X
# len(cleaned), len(X),len(y)

In [15]:
# Second cleaning pass: overwrite 'text' in place with clean_text_2's output.
df['text'] = df['text'].apply(clean_text_2)
df.head()

Unnamed: 0,text,label,stop_words
0,didnt feel humiliated,0,1
1,go feeling hopeless damned hopeful around some...,0,10
2,im grabbing minute post feel greedy wrong,3,3
3,ever feeling nostalgic fireplace know still pr...,2,9
4,feeling grouchy,3,2


In [16]:
# Run stop-word removal again on the output of clean_text_2, in place.
# NOTE(review): presumably this catches tokens that only became stop words
# after punctuation stripping / lemmatization — confirm it is still needed.
df['text'] = df['text'].apply(remove_stopwords)
df.head()

Unnamed: 0,text,label,stop_words
0,didnt feel humiliated,0,1
1,go feeling hopeless damned hopeful around some...,0,10
2,im grabbing minute post feel greedy wrong,3,3
3,ever feeling nostalgic fireplace know still pr...,2,9
4,feeling grouchy,3,2


In [19]:
# Spot-check one row after cleaning: its text and its stored stop-word count.
# In the cells shown, 'stop_words' is never recomputed after the removal
# passes, so it reflects the text as it was when the column was created.
i=12
df.iloc[i]['text'],df.iloc[i]['stop_words']

('think easiest time year feel dissatisfied', 6)

In [21]:
# Persist the cleaned frame (text, label and the stop_words column).
# NOTE(review): with to_csv's default index=True the row index is written as
# an unnamed first column, which shows up as 'Unnamed: 0' on a plain
# read_csv. Pass index=False here or index_col=0 when reading — confirm
# against whatever consumes this file.
df.to_csv('data/cleaned.csv')

In [25]:
# Show the rows whose label equals 2.
df[df['label']==2]

Unnamed: 0,text,label,stop_words
3,ever feeling nostalgic fireplace know still pr...,2,9
9,feel romantic,2,2
47,let go sad feeling want accepted first home mine,2,10
61,ate could feel gentle tingle throughout almost...,2,7
68,suppose truth need shared havent feeling faith...,2,12
...,...,...,...
15936,im better rest feeling feeling accepted,2,7
15958,feel going sequel would liked closure book ending,2,12
15963,love feel blessed lucky able travel creative m...,2,9
15971,feel would loving warn impending social crisis...,2,9
