In [137]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [138]:
# Read the ';'-separated training file. It has no header row, so the two
# columns are named explicitly.
df = pd.read_csv('../Data/train.txt', sep=';', header=None, names=['Text', 'Emotions'])
df

Unnamed: 0,Text,Emotions
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger
...,...,...
15995,i just had a very brief time in the beanbag an...,sadness
15996,i am now turning and i feel pathetic that i am...,sadness
15997,i feel strong and good overall,joy
15998,i feel like this was such a rude comment and i...,anger


In [139]:
# Count missing values per column.
df.isna().sum()

Text        0
Emotions    0
dtype: int64

In [140]:
# Distinct labels found in the Emotions column.
unique_emotions = df['Emotions'].unique()
unique_emotions

array(['sadness', 'anger', 'love', 'surprise', 'fear', 'joy'],
      dtype=object)

In [141]:
# Encode each emotion label as an integer id, numbered in the order the labels
# appear in `unique_emotions`. Using enumerate avoids a hand-maintained counter
# that would otherwise linger in the notebook namespace.
dictionary = {emotion: label_id for label_id, emotion in enumerate(unique_emotions)}
dictionary

{'sadness': 0, 'anger': 1, 'love': 2, 'surprise': 3, 'fear': 4, 'joy': 5}

In [142]:
# Replace the string labels with their integer ids from `dictionary`.
# Series.map yields NaN for any value that is not a key, so validate before
# overwriting the column: re-running this cell (labels already encoded) or an
# unseen label would otherwise silently turn the column into NaN.
encoded_emotions = df['Emotions'].map(dictionary)
if encoded_emotions.isna().any():
  raise ValueError(
    "Emotions contains values missing from `dictionary`; "
    "reload the data before re-running this cell."
  )
df['Emotions'] = encoded_emotions
df

Unnamed: 0,Text,Emotions
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1
...,...,...
15995,i just had a very brief time in the beanbag an...,0
15996,i am now turning and i feel pathetic that i am...,0
15997,i feel strong and good overall,5
15998,i feel like this was such a rude comment and i...,1


In [143]:
import string

# Built once: every call deletes the same characters, so there is no reason to
# rebuild the translation table for each row.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)

def remove_punc(txt):
  """Return `txt` with every character in `string.punctuation` removed.

  Only the characters listed in `string.punctuation` are stripped; everything
  else (letters, digits, whitespace, other symbols) is kept unchanged.
  """
  return txt.translate(_PUNCT_TABLE)

# Strip punctuation from every row of the Text column.
df['Text'] = df['Text'].map(remove_punc)
df

Unnamed: 0,Text,Emotions
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1
...,...,...
15995,i just had a very brief time in the beanbag an...,0
15996,i am now turning and i feel pathetic that i am...,0
15997,i feel strong and good overall,5
15998,i feel like this was such a rude comment and i...,1


In [144]:
def remove_num(txt):
  """Return `txt` with every digit character removed.

  A character is dropped when `str.isdigit()` is true for it; all other
  characters are kept in their original order.
  """
  # Join the kept characters once instead of growing a string with `+=`
  # inside a loop.
  return ''.join(ch for ch in txt if not ch.isdigit())

# Drop digit characters from every row of the Text column.
df['Text'] = df['Text'].map(remove_num)
df

Unnamed: 0,Text,Emotions
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1
...,...,...
15995,i just had a very brief time in the beanbag an...,0
15996,i am now turning and i feel pathetic that i am...,0
15997,i feel strong and good overall,5
15998,i feel like this was such a rude comment and i...,1


In [145]:
def remove_emojis(txt):
  """Return `txt` with every non-ASCII character removed.

  Despite the name, the filter is not emoji-specific: any character for which
  `str.isascii()` is false (accented letters, non-Latin scripts, typographic
  quotes, emojis, ...) is dropped. ASCII characters keep their order.
  """
  # Join the kept characters once instead of growing a string with `+=`
  # inside a loop.
  return ''.join(ch for ch in txt if ch.isascii())

# Drop non-ASCII characters from every row of the Text column.
df['Text'] = df['Text'].map(remove_emojis)
df

Unnamed: 0,Text,Emotions
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1
...,...,...
15995,i just had a very brief time in the beanbag an...,0
15996,i am now turning and i feel pathetic that i am...,0
15997,i feel strong and good overall,5
15998,i feel like this was such a rude comment and i...,1


In [146]:
# Fetch the NLTK resources this notebook relies on. `stopwords` backs the
# stopwords.words('english') call below; `punkt_tab` is presumably the
# tokenizer data used by word_tokenize — confirm against the installed NLTK version.
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to /home/harsh/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /home/harsh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [147]:
# English stop-word list from NLTK, held in a set for membership tests.
stop_words = set()
stop_words.update(stopwords.words('english'))
print(stop_words)
len(stop_words)

{"haven't", 'while', 'on', 'have', 'down', 'shan', 'her', 'had', "wasn't", 'aren', "they'd", "weren't", 'your', 'having', 'himself', "isn't", "it'd", 'their', "you'd", 'each', "she'd", "we'll", "i've", 'myself', 'or', 'there', 'too', 'herself', 'in', 'to', 'were', 'mustn', 'by', 'whom', 'above', 'against', 'of', "should've", 'theirs', 'this', 'about', 'being', 'itself', 'why', 'then', 'those', 'won', 'any', 'the', 'isn', "we'd", 'who', 'again', "hadn't", 'between', 'didn', 'shouldn', 'o', 'all', 'some', 'yourselves', "mightn't", 'ain', 'same', 'very', "he'll", 'needn', 'and', "doesn't", 'does', 'ours', "shouldn't", 'with', 'haven', 'until', 'but', 'doesn', 'as', 'more', 'no', "won't", "mustn't", 'you', 'from', 'further', 'his', "you've", 'has', 'couldn', 'ourselves', 'do', "he's", 'now', 'be', "i'll", 'm', 'only', "it'll", 'wouldn', 'wasn', 'my', "hasn't", 'off', 'so', 'how', 'here', 'was', "we've", 'will', 'just', 're', 'both', 'them', 'is', 'he', 'where', 'mightn', "i'm", 'once', 'fe

198

In [148]:
def remove_stop(txt):
  """Tokenize `txt` and drop every token found in `stop_words`.

  Tokens come from `word_tokenize`; each one is compared against the
  notebook-level `stop_words` set with an exact (case-sensitive) membership
  test. The surviving tokens are returned joined by single spaces.
  """
  kept_tokens = [token for token in word_tokenize(txt) if token not in stop_words]
  return ' '.join(kept_tokens)

# Remove stop words from every row of the Text column.
# NOTE(review): punctuation was stripped in an earlier cell, so contractions
# such as "didnt" or "im" no longer match the apostrophe-bearing entries in
# `stop_words` (e.g. "haven't") and survive this step.
df['Text'] = df['Text'].map(remove_stop)
df

Unnamed: 0,Text,Emotions
0,didnt feel humiliated,0
1,go feeling hopeless damned hopeful around some...,0
2,im grabbing minute post feel greedy wrong,1
3,ever feeling nostalgic fireplace know still pr...,2
4,feeling grouchy,1
...,...,...
15995,brief time beanbag said anna feel like beaten,0
15996,turning feel pathetic still waiting tables sub...,0
15997,feel strong good overall,5
15998,feel like rude comment im glad,1
