In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 

In [2]:
df = pd.read_csv('train.txt', sep = ';', header = None, names = ['text','emotion'])

In [3]:
df.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [4]:
df.shape

(16000, 2)

In [5]:
df.isnull().sum()

text       0
emotion    0
dtype: int64

In [6]:
# Check which unique emotion labels are present in the data

unique_emotions = df['emotion'].unique()
print(unique_emotions)

['sadness' 'anger' 'love' 'surprise' 'fear' 'joy']


In [7]:
# Assign each emotion label an integer id, numbered in the order the labels
# appear in `unique_emotions`.
emotion_numbers = {emo: idx for idx, emo in enumerate(unique_emotions)}

# Encode the labels only while every value in the column is a key of the
# mapping. Without this guard, re-running the cell would look up the
# already-encoded integers in a string-keyed mapping and turn every label
# into NaN.
if df['emotion'].isin(list(emotion_numbers)).all():
    df['emotion'] = df['emotion'].map(emotion_numbers)

In [8]:
df.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1


In [9]:
# Convert every text to lowercase. The vectorized .str accessor replaces a
# per-row Python lambda and leaves missing values untouched instead of raising.

df['text'] = df['text'].str.lower()

In [10]:
# removing punctuations 

import string
def remove_punctuation(txt):
    """Return `txt` with every character listed in `string.punctuation` deleted."""
    # The third argument of str.maketrans names the characters to drop.
    deletion_table = str.maketrans('', '', string.punctuation)
    return txt.translate(deletion_table)
    

In [11]:
df['text'] = df['text'].apply(remove_punctuation)

In [12]:
# Removing digits 

def remove_nums(txt):
    """Return `txt` with every digit character removed.

    A character is dropped when `str.isdigit()` is true for it; all other
    characters are kept in their original order.
    """
    # str.join builds the result in one pass instead of re-concatenating a
    # growing string for every kept character.
    return "".join(ch for ch in txt if not ch.isdigit())
df['text'] = df['text'].apply(remove_nums)

In [13]:
# Removing emojis (this drops every non-ASCII character, not only emojis)

def remove_emojis(txt):
    """Return `txt` with every non-ASCII character removed.

    The filter is `str.isascii()`, so emojis are dropped together with any
    other non-ASCII character (accented letters, non-Latin scripts,
    typographic quotes, ...).
    """
    # str.join builds the result in one pass instead of re-concatenating a
    # growing string for every kept character.
    return "".join(ch for ch in txt if ch.isascii())

df['text'] = df['text'].apply(remove_emojis)    

In [14]:
# Remove stopwords (we usually remove them in a classical machine-learning context, but not necessarily in deep learning)

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [15]:
nltk.download('punkt')           # it tokenizes the words
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\samsung/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\samsung/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
stop_words = set(stopwords.words('english'))
print(stop_words)

{'an', 'very', 'a', "you'd", 'here', 'before', 'off', 'only', 'yourselves', 'them', 'aren', 'how', 'so', 'was', 'below', 'out', "mustn't", 'down', 'just', 'doesn', 'but', 'itself', 'himself', 'll', "they'd", "won't", 'hers', 'then', 'shouldn', "haven't", 'we', 'are', 'what', 'hasn', 'in', 's', 'those', 'up', 'ma', 'theirs', 'between', 'am', 'than', 'with', "wouldn't", 'had', 'myself', 'd', 'will', 'because', 'each', 'that', 'him', 'all', "mightn't", 'such', 'if', 'i', 'of', 'mustn', 'me', 'herself', 'couldn', 'wasn', "weren't", 'won', "shan't", 'during', "you've", 'by', 'nor', 'did', "should've", 'being', "didn't", "she'll", 'both', 'above', 'this', 'where', "he'd", 'my', 'her', "we'll", "wasn't", 't', 'these', 'again', 'ours', 'mightn', 'further', 'weren', 'the', "he'll", 'when', 'isn', 'other', 'should', 'shan', "i've", 'under', "don't", "it's", "isn't", 'as', "we've", 'or', "doesn't", 'm', "aren't", "shouldn't", 'their', 'any', 'needn', 'over', 'which', 'too', 'own', 'at', "hasn't",

In [17]:
# Peek at one row's text before stop-word removal. A single .loc[row, column]
# lookup avoids chained indexing.
df.loc[1, 'text']

'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake'

In [18]:
def remove_sw(txt):
    """Return `txt` without the whitespace-separated tokens found in `stop_words`.

    The surviving tokens keep their order and are re-joined with single spaces.
    """
    kept_tokens = [token for token in txt.split() if token not in stop_words]
    return " ".join(kept_tokens)

In [19]:
df['text'] = df['text'].apply(remove_sw)

In [20]:
# Same row after stop-word removal, read with a single .loc[row, column] lookup
# instead of chained indexing.
df.loc[1, 'text']

'go feeling hopeless damned hopeful around someone cares awake'

# now model training

In [21]:
from sklearn.model_selection import train_test_split
X = df['text']
y = df['emotion']


In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [23]:
print(X_train)

676      refers course though cant help feeling somehow...
12113                im starting feel im suffering fatigue
7077     feel like probably would liked book little bit...
13005                                  really feel awkward
12123    im feeling little grumpy today lame weather te...
                               ...                        
13418    love leave reader feeling confused slightly de...
5390                                         feel delicate
860                          starting feel little stressed
15795             feel stressed tired worn shape neglected
7270         feel someone rude wrongly done something lose
Name: text, Length: 12800, dtype: object


In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


In [25]:
# using CountVectorizer

bow_vectorizer = CountVectorizer()

In [26]:
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

# Insights
In NLP, these three models are the ones we use most often:


1 - NAIVE BAYES


2 - LOGISTIC REGRESSION


3 - SVM (Support Vector Machine)

In [27]:
# First model using Naive Bayes and CountVectorizer


from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [28]:
nb_model = MultinomialNB()

In [29]:
nb_model.fit(X_train_bow,y_train)

In [30]:
pred_nb_bow = nb_model.predict(X_test_bow)

In [31]:
accuracy_nb_model = accuracy_score(y_test,pred_nb_bow) 
accuracy_nb_model

0.768125

In [32]:
# Second model: Naive Bayes using TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()

In [33]:
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf =tfidf_vectorizer.transform(X_test)

In [34]:
nb2_model = MultinomialNB()

In [35]:
nb2_model.fit(X_train_tfidf,y_train)

In [36]:
pred_nb_tidf = nb2_model.predict(X_test_tfidf)

In [37]:
accuracy_nb2_model = accuracy_score(y_test,pred_nb_tidf)
accuracy_nb2_model

0.6609375

In [38]:
# Now let's try LOGISTIC REGRESSION with TfidfVectorizer. Logistic regression is chosen because it works on probabilities, so it might work well in this scenario.

In [39]:
from sklearn.linear_model import LogisticRegression

In [40]:
logistic_model = LogisticRegression(max_iter=1000)

In [41]:
logistic_model.fit(X_train_tfidf,y_train)

In [42]:
y_pred_logistic_model = logistic_model.predict(X_test_tfidf)

In [43]:
accuracy_Logistic_model = accuracy_score(y_test, y_pred_logistic_model)
accuracy_Logistic_model

0.8628125

In [46]:
# saving our trained model as a pickle file
import pickle

# The classifier was fitted on TF-IDF features, so the fitted vectorizer is
# stored with it; without the vectorizer, new text cannot be converted into
# the feature columns the model was trained on. The label mapping is stored
# so that predicted integers can be translated back to emotion names.
model_data = {
    "model": logistic_model,
    "vectorizer": tfidf_vectorizer,
    "label_map": emotion_numbers,
    # Vocabulary terms behind the model's input columns. This key previously
    # held the DataFrame row index, which are row labels, not feature names.
    "features_names": tfidf_vectorizer.get_feature_names_out().tolist(),
}

with open("SentimentAnalysis.pkl", "wb") as f:
    pickle.dump(model_data, f)

In [47]:
# Reload the saved object to check the round trip. The file holds the dict
# written by the save cell, so `loaded_model` is that dict (the classifier is
# stored under its "model" key), not the estimator itself.
# NOTE(review): pickle.load can execute arbitrary code; only load pickle files
# that come from a trusted source.
with open('SentimentAnalysis.pkl', 'rb') as file:
    loaded_model = pickle.load(file)