In [35]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 

In [36]:
# Load the training data: ';'-separated, no header row, so column names are supplied here.
df = pd.read_csv(
    'train.txt',
    sep=';',
    header=None,
    names=['text', 'emotion'],
)

In [39]:
# Rows per emotion label; shows how unevenly the classes are represented.
df['emotion'].value_counts()

emotion
joy         5362
sadness     4666
anger       2159
fear        1937
love        1304
surprise     572
Name: count, dtype: int64

In [61]:
# Peek at the first ten texts labelled 'surprise'.
# This compares against the string label, so it must run before the labels are integer-encoded.
surprise_mask = df['emotion'] == 'surprise'
df.loc[surprise_mask, 'text'].head(10).values

array(['ive been taking or milligrams or times recommended amount and ive fallen asleep a lot faster but i also feel like so funny',
       'i have seen heard and read over the past couple of days i am left feeling impressed by more than a few companies',
       'i keep feeling pleasantly surprised at his supportiveness and also his ease in new situations',
       'i am now nearly finished the week detox and i feel amazing',
       'i too feel as if i am a stranger in a strange land and i am raising my son in a place that is not his father s ancestral home',
       'i started feeling funny and then friday i woke up sick as a dog',
       'im certainly not going to sit and tell you whats going on in my personal life but i feel that if you were ever curious about whats going in my life all youd have to do is watch the show',
       'im sorry that there wasnt more humor in this post but im not feeling all that funny',
       'i want to hold this feeling of shocked awe and wonder forever',

In [None]:
# Preview the first five rows of the loaded frame.
df.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [4]:
# Count missing values per column.
df.isnull().sum()

text       0
emotion    0
dtype: int64

In [5]:
# Distinct emotion labels, listed in order of first appearance in the column.
df['emotion'].unique()

array(['sadness', 'anger', 'love', 'surprise', 'fear', 'joy'],
      dtype=object)

In [6]:
# Give each emotion label an integer code in order of first appearance
# (the first label encountered becomes 0, the next 1, and so on). The mapping
# is kept in `emotion_numbers` so it can be reused after training.
# NOTE: run this once per fresh load of `df`. Re-running it after the column
# is already numeric would rebuild `emotion_numbers` from the integer codes
# and lose the label names.
unique_emotions = df['emotion'].unique()
emotion_numbers = {emo: code for code, emo in enumerate(unique_emotions)}

df['emotion'] = df['emotion'].map(emotion_numbers)


In [7]:
# Class counts after encoding; they should match the per-label counts seen before the mapping.
print(df['emotion'].value_counts())


emotion
5    5362
0    4666
1    2159
4    1937
2    1304
3     572
Name: count, dtype: int64


In [8]:
# Show the frame with integer-encoded labels (pandas elides the middle rows).
df

Unnamed: 0,text,emotion
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1
...,...,...
15995,i just had a very brief time in the beanbag an...,0
15996,i am now turning and i feel pathetic that i am...,0
15997,i feel strong and good overall,5
15998,i feel like this was such a rude comment and i...,1


In [9]:
# Lower-case every text with the vectorised string accessor rather than a
# per-row Python lambda; missing values pass through instead of raising.
df['text'] = df['text'].str.lower()

In [10]:
import string

In [11]:
# Translation table that deletes every character in string.punctuation.
# Built once here instead of on every call, because the function is applied row by row.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def remove_punc(txt):
    """Return ``txt`` with all characters in ``string.punctuation`` removed.

    Only ASCII punctuation is stripped; other symbols (for example curly
    quotes) are left untouched.
    """
    return txt.translate(_PUNCT_TABLE)

In [12]:
# Strip punctuation from every text, element by element.
df['text'] = df['text'].map(remove_punc)

In [13]:
def remove_numbers(txt):
    """Return ``txt`` with every digit character removed.

    A character is dropped when ``str.isdigit()`` is true for it, which
    covers non-ASCII digits as well as ``0``-``9``. All other characters,
    including whitespace, are kept in their original order.
    """
    # join() builds the result in one pass instead of repeated string concatenation.
    return ''.join(ch for ch in txt if not ch.isdigit())

# Drop digit characters from every text, then display the cleaned column.
df['text'] = df['text'].map(remove_numbers)
df['text']

0                                  i didnt feel humiliated
1        i can go from feeling so hopeless to so damned...
2         im grabbing a minute to post i feel greedy wrong
3        i am ever feeling nostalgic about the fireplac...
4                                     i am feeling grouchy
                               ...                        
15995    i just had a very brief time in the beanbag an...
15996    i am now turning and i feel pathetic that i am...
15997                       i feel strong and good overall
15998    i feel like this was such a rude comment and i...
15999    i know a lot but i feel so stupid because i ca...
Name: text, Length: 16000, dtype: object

In [14]:
def remove_emojis(txt):
    """Return ``txt`` with every non-ASCII character removed.

    Despite the name, this drops *all* non-ASCII characters (accented
    letters, curly quotes, and so on), not only emoji.
    """
    # join() builds the result in one pass instead of repeated string concatenation.
    return ''.join(ch for ch in txt if ch.isascii())

# Keep only ASCII characters in every text.
df['text'] = df['text'].map(remove_emojis)

In [15]:
import nltk

In [16]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [17]:
# Download the NLTK 'punkt' and 'stopwords' packages (reported as up-to-date when already present).
# NOTE(review): word_tokenize is imported but not called in the visible cells, so 'punkt' may be unnecessary; confirm.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yoush\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yoush\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [18]:
# English stop-word list, held as a set for fast membership tests in the stop-word filter.
stop_words = set(stopwords.words('english'))
len(stop_words)

198

In [19]:
# Inspect one example before stop-word removal. A single .loc[row, column]
# lookup avoids chained indexing and its intermediate row Series.
df.loc[1, 'text']

'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake'

In [20]:
def remove(txt, stopword_set=None):
    """Return ``txt`` with stop words dropped and the rest joined by single spaces.

    Args:
        txt: Input string; it is split on whitespace.
        stopword_set: Optional collection of words to drop. When omitted, the
            notebook-level ``stop_words`` set is used, so existing calls such
            as ``df['text'].apply(remove)`` behave as before.

    Returns:
        The kept words joined with single spaces (an empty string if none remain).
    """
    blocked = stop_words if stopword_set is None else stopword_set
    return ' '.join(word for word in txt.split() if word not in blocked)

In [21]:
# Drop stop words from every text.
df['text'] = df['text'].map(remove)

In [22]:
# Inspect the same example after stop-word removal. A single .loc[row, column]
# lookup avoids chained indexing and its intermediate row Series.
df.loc[1, 'text']

'go feeling hopeless damned hopeful around someone cares awake'

In [23]:
# Preview the first five rows after text cleaning and stop-word removal.
df.head()

Unnamed: 0,text,emotion
0,didnt feel humiliated,0
1,go feeling hopeless damned hopeful around some...,0
2,im grabbing minute post feel greedy wrong,1
3,ever feeling nostalgic fireplace know still pr...,2
4,feeling grouchy,1


In [24]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified although the label counts are uneven;
# consider passing stratify=df['emotion'] (this would change the reported scores).
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['emotion'],
    test_size=0.20,
    random_state=42,
)

In [25]:
# Training texts produced by the split.
X_train

676      refers course though cant help feeling somehow...
12113                im starting feel im suffering fatigue
7077     feel like probably would liked book little bit...
13005                                  really feel awkward
12123    im feeling little grumpy today lame weather te...
                               ...                        
13418    love leave reader feeling confused slightly de...
5390                                         feel delicate
860                          starting feel little stressed
15795             feel stressed tired worn shape neglected
7270         feel someone rude wrongly done something lose
Name: text, Length: 12800, dtype: object

In [26]:
# Held-out test texts produced by the split.
X_test

8756                             ive made week feel beaten
4660                              feel strategy worthwhile
6095                     feel worthless weak say want find
304                                        feel clever nov
8241                      im moved ive feeling kind gloomy
                               ...                        
15578    feel useful pulpit find ironic often question ...
5746             dried bladders ready day im feeling brave
6395                             feel thrilled matter days
7624     woke morning text mr c declaring walking work ...
15245                                            feel dumb
Name: text, Length: 3200, dtype: object

In [27]:
# (rows, columns) of the full data frame.
df.shape

(16000, 2)

In [28]:

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Bag-of-words baseline: learn the vocabulary on the training texts only,
# then reuse it unchanged to encode the test texts.
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

# Multinomial Naive Bayes fitted on the term counts.
nb_model = MultinomialNB().fit(X_train_bow, y_train)

# Accuracy on the held-out test split.
pred_bow = nb_model.predict(X_test_bow)
bow_accuracy = accuracy_score(y_test, pred_bow)
print(bow_accuracy)

0.768125


In [29]:
# Integer-encoded labels of the test split.
y_test

8756     0
4660     5
6095     0
304      5
8241     0
        ..
15578    5
5746     5
6395     5
7624     5
15245    0
Name: emotion, Length: 3200, dtype: int64

In [30]:
# Same Naive Bayes classifier, this time on TF-IDF weights instead of raw counts.
# The vectorizer is fitted on the training texts only and reused for the test texts.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

nb2_model = MultinomialNB().fit(X_train_tfidf, y_train)
y_pred = nb2_model.predict(X_test_tfidf)
tfidf_nb_accuracy = accuracy_score(y_test, y_pred)
print(tfidf_nb_accuracy)

0.6609375


In [31]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the TF-IDF features, with the solver's iteration cap set to 1000.
logistic_model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Accuracy on the held-out test split.
log_pred = logistic_model.predict(X_test_tfidf)
log_accuracy = accuracy_score(y_test, log_pred)
print(log_accuracy)

0.8628125


In [32]:
import joblib

In [33]:
# Persist the trained classifier, the fitted TF-IDF vectorizer, and the
# label-name -> integer-code mapping to files in the working directory.
# NOTE(review): the text-cleaning steps applied before vectorising are not
# saved here, so any consumer of these files has to repeat them on new text.
joblib.dump(logistic_model, 'logistic_model.pkl')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(emotion_numbers, 'label_mapping.pkl')


['label_mapping.pkl']