In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('train.txt',sep = ';',header = None,names = ['text','emotion'])

In [3]:
df.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [4]:
df.isnull().sum()

text       0
emotion    0
dtype: int64

In [155]:
unique_emotions = df['emotion'].unique()
unique_emotions

array([0, 1, 2, 3, 4, 5], dtype=int64)

In [5]:
unique_emotions = df['emotion'].unique()
emotions_numbers = {}
i = 0
for emo in unique_emotions:
    emotions_numbers[emo] = i
    i+=1
df['emotion'] = df['emotion'].map(emotions_numbers)

In [6]:
df

Unnamed: 0,text,emotion
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1
...,...,...
15995,i just had a very brief time in the beanbag an...,0
15996,i am now turning and i feel pathetic that i am...,0
15997,i feel strong and good overall,5
15998,i feel like this was such a rude comment and i...,1


In [7]:
df['text'] = df['text'].apply(lambda x:x.lower())

In [8]:
import string
def remove_punc(txt):
    return txt.translate(str.maketrans('','',string.punctuation))
    

In [9]:
df['text'] = df['text'].apply(remove_punc)

In [10]:
def remove_digits(txt):
    new = ""
    for i in txt:
        if not i.isdigit():
            new = new+i
    return new

df['text'] = df['text'].apply(remove_digits)


In [11]:
def remove_emojies(txt):
    new =""
    for i in txt:
        if i.isascii():
            new +=i
    return new

df['text'] = df['text'].apply(remove_emojies)
    

In [12]:
import nltk

In [13]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [14]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
stop_words = set(stopwords.words('english'))


In [16]:
len(stop_words)

198

In [17]:
df.loc[1]['text']

'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake'

In [18]:
def remove(txt):
    words = txt.split()
    cleaned_text = []
    for i in words:
        if not i in stop_words:
            cleaned_text.append(i)

    return ' '.join(cleaned_text)
        

In [19]:
df['text'] = df['text'].apply(remove)

In [20]:
df.loc[1]['text']

'go feeling hopeless damned hopeful around someone cares awake'

In [61]:
df.head()

Unnamed: 0,text,emotion
0,didnt feel humiliated,0
1,go feeling hopeless damned hopeful around some...,0
2,im grabbing minute post feel greedy wrong,1
3,ever feeling nostalgic fireplace know still pr...,2
4,feeling grouchy,1


In [65]:
from sklearn.model_selection import train_test_split



X_train, X_test, y_train, y_test = train_test_split(df['text'], df['emotion'], test_size=0.20, random_state=42)

In [75]:
X_test.shape

(3200,)

In [79]:
X_train.shape

(12800,)

In [81]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

In [83]:
bow_vectorizer = CountVectorizer()

In [89]:
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.fit_transform(X_test)

In [87]:
X_train_bow

<12800x13361 sparse matrix of type '<class 'numpy.int64'>'
	with 116059 stored elements in Compressed Sparse Row format>

In [91]:
X_test_bow

<3200x6175 sparse matrix of type '<class 'numpy.int64'>'
	with 28719 stored elements in Compressed Sparse Row format>

In [93]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [95]:
nb_model = MultinomialNB()

In [101]:
nb_model.fit(X_train_bow,y_train)

In [105]:

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)


nb_model = MultinomialNB()
nb_model.fit(X_train_bow, y_train)


pred_bow = nb_model.predict(X_test_bow)
print(accuracy_score(y_test, pred_bow))
     

0.768125


In [107]:
pred_bow

array([0, 5, 0, ..., 5, 5, 0], dtype=int64)

In [109]:
y_test

8756     0
4660     5
6095     0
304      5
8241     0
        ..
15578    5
5746     5
6395     5
7624     5
15245    0
Name: emotion, Length: 3200, dtype: int64

In [117]:
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

nb2_model = MultinomialNB()
nb2_model.fit(X_train_tfidf,y_train)

In [121]:
y_pred = nb2_model.predict(X_test_tfidf)

In [123]:
print(accuracy_score(y_test, y_pred))

0.6609375


In [125]:
from sklearn.linear_model import LogisticRegression

In [127]:
logistic_model = LogisticRegression(max_iter=1000)


In [129]:
logistic_model.fit(X_train_tfidf,y_train)

In [131]:
log_pred = logistic_model.predict(X_test_tfidf)

In [133]:
print(accuracy_score(y_test, log_pred))

0.8628125


In [147]:
import pickle

# After training your model and vectorizer
pickle.dump(logistic_model, open("sentiment_model.pkl", "wb"))
pickle.dump(tfidf_vectorizer, open("vectorizer.pkl", "wb"))

In [149]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(df['emotion']) 

In [151]:
pickle.dump(le, open("label_encoder.pkl", "wb"))