# Sentiment Analysis Model

### Importing necessary dependencies

In [73]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from string import punctuation
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from string import punctuation

### Loading the data

In [74]:
tweets_df = pd.read_csv("./dataset/train.csv", encoding="latin")

### Exploratory Data Analysis

In [75]:
tweets_df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [76]:
tweets_df.shape

(31962, 3)

In [77]:
tweets_df.describe()

Unnamed: 0,id,label
count,31962.0,31962.0
mean,15981.5,0.070146
std,9226.778988,0.255397
min,1.0,0.0
25%,7991.25,0.0
50%,15981.5,0.0
75%,23971.75,0.0
max,31962.0,1.0


In [78]:
tweets_df.isnull()

Unnamed: 0,id,label,tweet
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
...,...,...,...
31957,False,False,False
31958,False,False,False
31959,False,False,False
31960,False,False,False


In [79]:
(tweets_df['label']== 0).sum()

29720

In [80]:
(tweets_df['label']== 1).sum()

2242

In [81]:
one_sub = tweets_df.loc[tweets_df['label']== 1, :]
sampled = sub.sample(2242)

In [82]:
zero_sub = tweets_df.loc[tweets_df['label']== 0, :]

In [83]:
tweets_df = pd.concat([one_sub, zero_sub], ignore_index = True)

In [84]:
tweets_df.describe()

Unnamed: 0,id,label
count,31962.0,31962.0
mean,15981.5,0.070146
std,9226.778988,0.255397
min,1.0,0.0
25%,7991.25,0.0
50%,15981.5,0.0
75%,23971.75,0.0
max,31962.0,1.0


In [85]:
tweets_df.shape

(31962, 3)

### Data Preprocessing

In [86]:
porterStem = PorterStemmer()

In [87]:
negation_list = ["arent","isnt","not","cant","couldnt", "werent","dont","doesnt",
                "wont","didnt","never","nothing","nowhere","noone","none"
                "hasnt","hadnt","shouldnt","wouldnt","aint"]

In [88]:
def preProcess(tweet):
    tweet = tweet.lower()
    # removing punctuation, usermention, urls and hashtags
    tweet = re.sub('n[^A-Za-z ]t','nt', tweet)
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', '', tweet)
    tweet = re.sub('@[^\s]+', '', tweet)
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    
    tweet = word_tokenize(tweet)
    tweet_list = [];
    negate = False
    
    for word in tweet:
        word = porterStem.stem(word)
        if word in negation_list:
            negate = True
        elif negate is True and word in list(punctuation):
            negate = False     
        if negate and word not in negation_list:
            word = "not_"+word
        else:
            pass
        word = re.sub('[^A-Za-z_ ]+', '', word)   
        if len(word) > 2 and word not in stopwords.words('english'):
            tweet_list.append(word)
    tweet_set = set(tweet_list)
    return " ".join(tweet_set)

In [89]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\khush\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\khush\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [90]:
tweets_df["cleaned_text"] = tweets_df["tweet"].apply(preProcess)

In [91]:
tweets_df.to_csv("./dataset/preprocessed_data.csv")

### Training the classification model

In [92]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.pipeline import Pipeline
import pickle

In [93]:
# Loading the preprocessed tweets
processed_tweets = pd.read_csv("./dataset/preprocessed_data.csv")

In [94]:
processed_tweets.describe()

Unnamed: 0.1,Unnamed: 0,id,label
count,31962.0,31962.0,31962.0
mean,15980.5,15981.5,0.070146
std,9226.778988,9226.778988,0.255397
min,0.0,1.0,0.0
25%,7990.25,7991.25,0.0
50%,15980.5,15981.5,0.0
75%,23970.75,23971.75,0.0
max,31961.0,31962.0,1.0


In [95]:
processed_tweets["cleaned_text"]=processed_tweets["cleaned_text"].astype('U')

In [96]:
processed_tweets.head()

Unnamed: 0.1,Unnamed: 0,id,label,tweet,cleaned_text
0,0,14,1,@user #cnn calls #michigan middle school 'buil...,michigan build tcot call chant cnn middl schoo...
1,1,15,1,no comment! in #australia #opkillingbay #se...,australia thecov opkillingbay comment helpcove...
2,2,18,1,retweet if you agree!,agre retweet
3,3,24,1,@user @user lumpy says i am a . prove it lumpy.,say prove lumpi
4,4,35,1,it's unbelievable that in the 21st century we'...,thi someth like neverump centuri xenophobia un...


In [97]:
# Creating a pipeline to automate the training process
pipeline = Pipeline([
   ( 'bow',CountVectorizer()),
    ('classifier',MultinomialNB()),
])

In [104]:
# Training-testing data split 
from sklearn.model_selection import train_test_split
msg_train,msg_test,label_train,label_test = train_test_split(processed_tweets['cleaned_text'],processed_tweets['label'],test_size=0.2)

In [105]:
print(len(msg_train))
print(len(label_train))
print(len(msg_test))
print(len(label_test))

25569
25569
6393
6393


In [109]:
model_pipeline = pipeline.fit(msg_train, label_train)

In [110]:
pred = pipeline.predict(msg_test)

In [111]:
print(classification_report(pred, label_test))

              precision    recall  f1-score   support

           0       1.00      0.96      0.98      6160
           1       0.48      0.93      0.63       233

    accuracy                           0.96      6393
   macro avg       0.74      0.95      0.80      6393
weighted avg       0.98      0.96      0.97      6393



In [112]:
pickle.dump(model_pipeline,open("model_pipeline.pkl","wb"))