# Sentiment Analysis Model

### Importing necessary dependencies

In [4]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from string import punctuation
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from string import punctuation

### Loading the data

In [6]:
tweets_df = pd.read_csv("./dataset/train.csv", encoding="latin")

### Exploratory Data Analysis

In [7]:
tweets_df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [47]:
tweets_df.shape

(31962, 5)

### Data Preprocessing

In [9]:
porterStem = PorterStemmer()

In [10]:
negation_list = ["arent","isnt","not","cant","couldnt", "werent","dont","doesnt",
                "wont","didnt","never","nothing","nowhere","noone","none"
                "hasnt","hadnt","shouldnt","wouldnt","aint"]

In [16]:
def preProcess(tweet):
    tweet = tweet.lower()
    # removing punctuation, usermention, urls and hashtags
    tweet = re.sub('n[^A-Za-z ]t','nt', tweet)
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', '', tweet)
    tweet = re.sub('@[^\s]+', '', tweet)
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    
    tweet = word_tokenize(tweet)
    tweet_list = [];
    negate = False
    
    for word in tweet:
        word = porterStem.stem(word)
        if word in negation_list:
            negate = True
        elif negate is True and word in list(punctuation):
            negate = False     
        if negate and word not in negation_list:
            word = "not_"+word
        else:
            pass
        word = re.sub('[^A-Za-z_ ]+', '', word)   
        if len(word) > 2 and word not in stopwords.words('english'):
            tweet_list.append(word)
    tweet_set = set(tweet_list)
    return " ".join(tweet_set)

In [21]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\khush\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\khush\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [22]:
tweets_df["cleaned_text"] = tweets_df["tweet"].apply(preProcess)

In [24]:
tweets_df.to_csv("./dataset/preprocessed_data.csv")

### Training the classification model

In [26]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.pipeline import Pipeline
import pickle

In [29]:
# Loading the preprocessed tweets
processed_tweets = pd.read_csv("./dataset/preprocessed_data.csv")

In [30]:
processed_tweets.describe()

Unnamed: 0.1,Unnamed: 0,id,label
count,31962.0,31962.0,31962.0
mean,15980.5,15981.5,0.070146
std,9226.778988,9226.778988,0.255397
min,0.0,1.0,0.0
25%,7990.25,7991.25,0.0
50%,15980.5,15981.5,0.0
75%,23970.75,23971.75,0.0
max,31961.0,31962.0,1.0


In [34]:
processed_tweets["cleaned_text"]=processed_tweets["cleaned_text"].astype('U')

In [35]:
processed_tweets.head()

Unnamed: 0.1,Unnamed: 0,id,label,tweet,cleaned_text
0,0,1,0,@user when a father is dysfunctional and is s...,kid dysfunct drag selfish run father
1,1,2,0,@user @user thanks for #lyft credit i can't us...,not_offer cant lyft not_van not_wheelchair cre...
2,2,3,0,bihday your majesty,majesti bihday
3,3,4,0,#model i love u take with u all the time in ...,time model take love
4,4,5,0,factsguide: society now #motivation,motiv societi factsguid


In [36]:
# Creating a pipeline to automate the training process
pipeline = Pipeline([
   ( 'bow',CountVectorizer()),
    ('classifier',MultinomialNB()),
])

In [41]:
# Training-testing data split 
from sklearn.model_selection import train_test_split
msg_train,msg_test,label_train,label_test = train_test_split(processed_tweets['cleaned_text'],processed_tweets['label'],test_size=0.3)

In [42]:
print(len(msg_train))
print(len(label_train))
print(len(msg_test))
print(len(label_test))

22373
22373
9589
9589


In [43]:
pipeline.fit(msg_train, label_train)

In [44]:
pred = pipeline.predict(msg_test)

In [45]:
print(classification_report(pred, label_test))

              precision    recall  f1-score   support

           0       1.00      0.96      0.98      9237
           1       0.45      0.89      0.60       352

    accuracy                           0.96      9589
   macro avg       0.73      0.92      0.79      9589
weighted avg       0.98      0.96      0.96      9589

