In [4]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
df_list = []
for csv_file in ['amazon_cells_labelled.txt' , 'imdb_labelled.txt' , 'yelp_labelled.txt']:
    temp_df = pd.read_csv(csv_file , sep="\t" , header = 0 , names = ['text' , 'sentiment'])
    df_list.append(temp_df)
df = pd.concat(df_list)

In [6]:
df

Unnamed: 0,text,sentiment
0,"Good case, Excellent value.",1
1,Great for the jawbone.,1
2,Tied to charger for conversations lasting more...,0
3,The mic is great.,1
4,I have to jiggle the plug to get it to line up...,0
...,...,...
994,I think food should have flavor and texture an...,0
995,Appetite instantly gone.,0
996,Overall I was not impressed and would not go b...,0
997,"The whole experience was underwhelming, and I ...",0


In [7]:
#Checking the ditribution of classes to get an idea before implementing any classification task
df['sentiment'].value_counts()

1    1385
0    1360
Name: sentiment, dtype: int64

In [8]:
#Looking at any random 7 texts from the data and the sentiment assigned to them
df[['text' , 'sentiment']].sample(7 , random_state = 5)

Unnamed: 0,text,sentiment
622,"Plus, I seriously do not believe it is worth i...",0
194,"Light weight, I hardly notice it is there.",1
585,This movie is so awesome!,1
608,Editing: The editing of this film was phenomen...,1
178,If you like a loud buzzing to override all you...,0
441,"Anyway, this FS restaurant has a wonderful bre...",1
936,Don't bother coming here.,0


In [9]:
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df , test_size=0.2,random_state=42)

In [10]:
y_train = df_train['sentiment']
y_test = df_test['sentiment']

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer(ngram_range=(1,3), min_df=3, strip_accents='ascii')

x_train = vec.fit_transform(df_train['text'])
x_test = vec.transform(df_test['text'])

In [39]:
from sklearn.metrics import precision_recall_fscore_support 
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB(fit_prior=True)
clf.fit(x_train, y_train)
y_test_pred = clf.predict(x_test)

p, r, f, s = precision_recall_fscore_support(y_test, y_test_pred)

pd1 = pd.DataFrame(
    {
        'Precision': p,
        'Recall': r,
        'F': f,
        'Support': s,
    },
    index=[0,1] 
).round(4)

In [43]:
from sklearn.metrics import precision_recall_fscore_support 
from sklearn.naive_bayes import BernoulliNB

clf = BernoulliNB(fit_prior=True)
clf.fit(x_train, y_train)
y_test_pred = clf.predict(x_test)

p, r, f, s = precision_recall_fscore_support(y_test, y_test_pred)

pd2 = pd.DataFrame(
    {
        'Precision': p,
        'Recall': r,
        'F': f,
        'Support': s,
    },
    index=[0,1] 
).round(4)

In [44]:
from sklearn.metrics import precision_recall_fscore_support 
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(x_train, y_train)
y_test_pred = clf.predict(x_test)

p, r, f, s = precision_recall_fscore_support(y_test, y_test_pred)

pd3 = pd.DataFrame(
    {
        'Precision': p,
        'Recall': r,
        'F': f,
        'Support': s,
    },
    index=[0,1] 
).round(4)

In [49]:
#Comparing the scores for the different models that have been implemented- Multinomial Naive Bayes, Bernoulli Naive
#Bayes and the Logistic Regression. 
print(pd1)
print("\n")
print(pd2)
print("\n")
print(pd3)

   Precision  Recall       F  Support
0     0.8134  0.8044  0.8089      271
1     0.8114  0.8201  0.8157      278


   Precision  Recall       F  Support
0     0.8040  0.7417  0.7716      271
1     0.7659  0.8237  0.7938      278


   Precision  Recall       F  Support
0     0.8175  0.8266  0.8220      271
1     0.8291  0.8201  0.8246      278


In [50]:
#Since I am going to implement Grid Search later, which only takes one parameter, a pipeline wrapper is used to combine
#multiple estimators into one
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

pipe = Pipeline(steps = [('CountVectorizer' , CountVectorizer()) , ('MultinomialNB' , MultinomialNB()) ])

In [54]:
#The Grid Search technique is used to find the optimal hyperparametrs for the estimators. This helps to improve the
#score of our model
param_grid = {'CountVectorizer__ngram_range': [(1,1),(1,2),(1,3)],
             'MultinomialNB__alpha': [0.1,1],
             'MultinomialNB__fit_prior': [True, False],
             }
from sklearn.model_selection import GridSearchCV
search = GridSearchCV(pipe,param_grid,scoring='precision_macro',n_jobs=-1)
search.fit(df_train['text'],y_train)
print(search.best_params_)

{'CountVectorizer__ngram_range': (1, 2), 'MultinomialNB__alpha': 1, 'MultinomialNB__fit_prior': True}


In [58]:
y_test_pred = search.predict(df_test['text'])

p, r, f, s = precision_recall_fscore_support(y_test, y_test_pred, average='macro')

print(f'Macro Precision = {p:.2%} & Recall={r:.2%} & F-score={f:.2%}  ')

Macro Precision = 84.89% & Recall=84.89% & F-score=84.88%  


In [None]:
#Therefore we can see that after implementing Grid Search to optimize our hyperparameters, our various scores have risen
#We also implemented various models, but could not find any significant difference in their scores.
#Finally we can conclude that, we can classify sentences, coming from three different documents, with a precision close
#to 85%.