In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns

In [3]:
# The file has no header row: fields are ';'-separated and the two columns
# are named here.
df = pd.read_csv(
    'train.txt',
    sep=';',
    header=None,
    names=['text', 'emotion'],
)

In [4]:
# Preview the first rows to confirm both columns were parsed.
df.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [5]:
# Count missing values per column before any text processing.
df.isnull().sum()

text       0
emotion    0
dtype: int64

In [6]:
# Encode each emotion label as an integer. Codes are assigned in the order
# the labels are returned by Series.unique().
unique_emotions = df['emotion'].unique()
emotion_numbers = {emo: idx for idx, emo in enumerate(unique_emotions)}

# Replace the label strings in place with their integer codes.
df['emotion'] = df['emotion'].map(emotion_numbers)

In [7]:
# Display the frame to check that 'emotion' now holds integer codes.
df

Unnamed: 0,text,emotion
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1
...,...,...
15995,i just had a very brief time in the beanbag an...,0
15996,i am now turning and i feel pathetic that i am...,0
15997,i feel strong and good overall,5
15998,i feel like this was such a rude comment and i...,1


In [8]:
# Lower-case every text with pandas' vectorized string accessor rather than
# a per-row Python lambda. Missing values, if any, pass through unchanged
# instead of raising.
df['text'] = df['text'].str.lower()

In [9]:
import string

# Translation table that deletes every character in string.punctuation.
# Built once here rather than on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punch(txt):
    """Return ``txt`` with all characters in ``string.punctuation`` removed.

    Only ASCII punctuation is stripped; any other character (including
    non-ASCII punctuation such as curly quotes) is left untouched.
    """
    return txt.translate(_PUNCTUATION_TABLE)

In [10]:
# Strip punctuation from every text entry.
df['text'] = df['text'].apply(remove_punch)

In [11]:
def remove_numbers(txt):
    """Return ``txt`` with every digit character removed.

    A character is dropped when ``str.isdigit()`` is true for it, which
    covers Unicode digits as well as ASCII ``0-9``.
    """
    # join() builds the result in one pass instead of growing a string by
    # repeated concatenation inside a loop.
    return ''.join(ch for ch in txt if not ch.isdigit())

# Drop digit characters from every text entry.
df['text'] = df['text'].apply(remove_numbers)

In [12]:
def remove_emojis(txt):
    """Return ``txt`` with every non-ASCII character removed.

    Despite the name, the filter is ``str.isascii()``, so emojis are dropped
    together with all other non-ASCII characters (accented letters, curly
    quotes, non-Latin scripts).
    """
    # join() builds the result in one pass instead of growing a string by
    # repeated concatenation inside a loop.
    return ''.join(ch for ch in txt if ch.isascii())

# Drop non-ASCII characters (emojis included) from every text entry.
df['text'] = df['text'].apply(remove_emojis)

In [13]:
import nltk

In [14]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [15]:
# Fetch the tokenizer data and stop-word lists used by the cleaning step.
# Newer NLTK releases (3.9+) load the Punkt tokenizer from 'punkt_tab'
# rather than the pickled 'punkt' package, so both are requested to keep
# word_tokenize working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kaust\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kaust\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
# Keep the English stop words in a set; it is used for membership tests
# when tokens are filtered.
stop_words = set(stopwords.words('english'))

In [17]:
# Inspect one sample before stop-word removal. A single .loc lookup with
# (row label, column name) avoids chained indexing.
df.loc[1, 'text']

'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake'

In [18]:
def remove(txt):
    """Tokenize ``txt`` and drop every token found in ``stop_words``.

    The surviving tokens are re-joined with single spaces and returned as
    one string.
    """
    kept_tokens = [token for token in word_tokenize(txt) if token not in stop_words]
    return ' '.join(kept_tokens)

In [19]:
# Remove stop words from every text entry.
df['text'] = df['text'].apply(remove)

In [20]:
# Inspect the same sample after stop-word removal. A single .loc lookup
# with (row label, column name) avoids chained indexing.
df.loc[1, 'text']

'go feeling hopeless damned hopeful around someone cares awake'

In [21]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; random_state fixes the split so
# it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['emotion'],
    test_size=0.20,
    random_state=42,
)

In [22]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [23]:
# Bag-of-words (raw term count) vectorizer with default settings.
bow_vectorizer = CountVectorizer()

In [24]:
# Fit the count vectorizer on the training texts only, then transform the
# test texts with the already-fitted vectorizer.
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

In [25]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [26]:
# Multinomial Naive Bayes classifier for the bag-of-words features.
NB_Model = MultinomialNB()


In [27]:
# Train on the bag-of-words training matrix and the integer emotion codes.
NB_Model.fit(X_train_bow,y_train)

In [28]:
# Predict emotion codes for the held-out bag-of-words features.
y_pred_bow = NB_Model.predict(X_test_bow)

In [29]:
# accuracy_score is documented as (y_true, y_pred): pass the ground truth
# first. The score itself is unchanged, since accuracy is symmetric.
accuracy_score(y_test, y_pred_bow)

0.7678125

In [30]:
# TF-IDF vectorizer with default settings, as an alternative to raw counts.
tf_idf_vectorizer = TfidfVectorizer()

In [31]:
 X_train_tf = tf_idf_vectorizer.fit_transform(X_train)
X_test_tf = tf_idf_vectorizer.transform(X_test)

In [32]:
# Second Naive Bayes classifier, trained on the TF-IDF features so it can
# be compared with the bag-of-words one.
NB2_model = MultinomialNB()

NB2_model.fit(X_train_tf,y_train)

In [33]:
# Predict emotion codes for the held-out TF-IDF features.
y_pred_tf = NB2_model.predict(X_test_tf)

In [34]:
# accuracy_score is documented as (y_true, y_pred): pass the ground truth
# first. The score itself is unchanged, since accuracy is symmetric.
accuracy_score(y_test, y_pred_tf)

0.6609375

In [35]:
from sklearn.linear_model import LogisticRegression

# Logistic regression classifier; the solver may run up to 1000 iterations.
model_LR = LogisticRegression(max_iter=1000)

In [36]:
# Train on the TF-IDF training matrix and the integer emotion codes.
model_LR.fit(X_train_tf,y_train)

In [37]:
# Predict emotion codes for the held-out TF-IDF features.
y_pred_tf2 = model_LR.predict(X_test_tf)

In [38]:
# accuracy_score is documented as (y_true, y_pred): pass the ground truth
# first. The score itself is unchanged, since accuracy is symmetric.
accuracy_score(y_test, y_pred_tf2)

0.8615625

In [39]:
import joblib

# Persist the fitted TF-IDF vectorizer alongside the classifier: model_LR was
# trained on tf_idf_vectorizer's features, so new text cannot be scored from
# 'LogisticRegression.pkl' alone.
# NOTE(review): predictions are integer codes; emotion_numbers is needed to
# turn them back into label names and is not saved here.
joblib.dump(tf_idf_vectorizer, 'TfidfVectorizer.pkl')
joblib.dump(model_LR, 'LogisticRegression.pkl')

['LogisticRegression.pkl']