In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from statistics import mean, stdev
from sklearn.metrics import f1_score

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
#import spacy
#from spacy.lang.en.stop_words import STOP_WORDS

In [None]:
# Mount Google Drive at /content/drive so files under MyDrive can be read
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the review dataset from the mounted Drive folder into a DataFrame.
csv_path = '/content/drive/MyDrive/Colab Notebooks/df_text_nltk.csv'
df_text = pd.read_csv(csv_path)

In [None]:
# Preview the loaded frame; the bare expression is rendered as the cell output.
df_text

Unnamed: 0,Positive,Text,ProcessedText
0,1,I have bought several of the Vitality canned d...,bought sever vital can dog food product found ...
1,0,Product arrived labeled as Jumbo Salted Peanut...,product arriv label jumbo salt peanut peanut a...
2,1,This is a confection that has been around a fe...,confect around centuri light pillowi citrus ge...
3,0,If you are looking for the secret ingredient i...,look secret ingredi robitussin believ found go...
4,1,Great taffy at a great price. There was a wid...,great taffi great price wide assort yummi taff...
...,...,...,...
393928,1,Great for sesame chicken..this is a good if no...,great sesam chicken good not better restur eat...
393929,0,I'm disappointed with the flavor. The chocolat...,disappoint flavor chocol note especi weak milk...
393930,1,"These stars are small, so you can give 10-15 o...",star small give one train session tri train do...
393931,1,These are the BEST treats for training and rew...,best treat train reward dog good groom lower c...


In [None]:
# Count missing values per column (isna is the pandas alias of isnull).
df_text.isna().sum()

Positive         0
Text             0
ProcessedText    3
dtype: int64

In [None]:
# Store the Positive label column as a pandas categorical.
df_text = df_text.assign(Positive=df_text['Positive'].astype("category"))

In [None]:
# Keep only the rows whose ProcessedText is present (drops the null rows).
df_text = df_text.dropna(subset=['ProcessedText'])

In [None]:
# Renumber the rows 0..n-1 after the null rows were removed. drop=True discards
# the old index directly instead of materialising it as an 'index' column and
# dropping that column afterwards. Later cells select rows with integer arrays,
# so the index has to be a clean 0..n-1 range.
df_text = df_text.reset_index(drop=True)

In [None]:
# Confirm row count, dtypes and non-null counts after the cleaning steps above.
df_text.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 393930 entries, 0 to 393929
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype   
---  ------         --------------   -----   
 0   Positive       393930 non-null  category
 1   Text           393930 non-null  object  
 2   ProcessedText  393930 non-null  object  
dtypes: category(1), object(2)
memory usage: 6.4+ MB


In [None]:
# Model input is the pre-processed text; the target is the Positive label.
X, y = df_text['ProcessedText'], df_text['Positive']

### Check if features need to be further cleaned

In [None]:
# Fit TF-IDF on the whole corpus; used here to inspect the resulting vocabulary.
Tfidf_vec = TfidfVectorizer(
    stop_words='english',
    max_df=0.8,
    dtype=np.float32,
)
vectorizedX = Tfidf_vec.fit_transform(X)

# (number of documents, number of vocabulary terms)
vectorizedX.shape

(393930, 73780)

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the supported replacement.
# Only the first 100 terms are shown: the vocabulary has tens of thousands of
# entries and dumping all of them floods the notebook output.
Tfidf_vec.get_feature_names_out()[:100].tolist()



['aa',
 'aaah',
 'aabout',
 'aabsolut',
 'aachen',
 'aack',
 'aacur',
 'aacut',
 'aad',
 'aadd',
 'aadmit',
 'aadp',
 'aadult',
 'aaf',
 'aafco',
 'aafter',
 'aagh',
 'aah',
 'aahh',
 'aahya',
 'aain',
 'aakaufman',
 'aalmost',
 'aaloo',
 'aamazon',
 'aamzon',
 'aana',
 'aand',
 'aani',
 'aanoth',
 'aap',
 'aar',
 'aardvark',
 'aargh',
 'aaron',
 'aaround',
 'aarp',
 'aarrggh',
 'aarrgh',
 'aarthur',
 'aarti',
 'aasanfood',
 'aash',
 'aauc',
 'aaw',
 'ab',
 'aback',
 'abalon',
 'abamectin',
 'abandn',
 'abando',
 'abandon',
 'abaolut',
 'abash',
 'abat',
 'abattoir',
 'abb',
 'abba',
 'abbay',
 'abbazabba',
 'abberlin',
 'abbey',
 'abbi',
 'abbondanza',
 'abbot',
 'abbott',
 'abbrevi',
 'abbypomeroy',
 'abc',
 'abcess',
 'abcstor',
 'abd',
 'abdi',
 'abdomen',
 'abdomin',
 'abduct',
 'abdul',
 'abe',
 'abeja',
 'abel',
 'abenefici',
 'aber',
 'aberdeen',
 'aberfoyl',
 'abernook',
 'aberr',
 'abet',
 'abett',
 'abfab',
 'abhor',
 'abhorr',
 'abi',
 'abid',
 'abiet',
 'abigirl',
 'abil',

## Use ML algorithms to predict the label (negative/positive)

### Naive Bayes

In [None]:
from sklearn.model_selection import StratifiedKFold
# 4-fold stratified splitter, shuffled with a fixed seed so the folds are reproducible.
skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=1)
# Per-fold F1 scores of the Naive Bayes run are appended to this list.
lst_accu_stratified = []

In [None]:
# Multinomial Naive Bayes classifier with scikit-learn's default settings.
from sklearn.naive_bayes import MultinomialNB
Naive = MultinomialNB()

In [None]:
# Start from an empty score list so that re-running this cell does not append
# a second set of fold scores to the previous ones.
lst_accu_stratified = []

for train_index, test_index in skf.split(X, y):
    # skf.split yields positional indices, so select rows with .iloc rather
    # than label-based [] lookup (which only works on a clean 0..n-1 index).
    X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

    # The vocabulary and IDF weights are learned on the training fold only, so
    # nothing from the held-out fold leaks into the features. fit_transform
    # vectorises the training fold in the same pass instead of tokenising twice.
    Tfidf_vect = TfidfVectorizer(stop_words='english', max_df=0.8, dtype=np.float32)
    X_train_Tfidf = Tfidf_vect.fit_transform(X_train_fold)
    X_test_Tfidf = Tfidf_vect.transform(X_test_fold)

    Naive.fit(X_train_Tfidf, y_train_fold)
    y_pred = Naive.predict(X_test_Tfidf)
    lst_accu_stratified.append(f1_score(y_test_fold, y_pred))

In [None]:
# Report the per-fold Naive Bayes F1 scores: the raw list, then the best,
# worst and mean fold as percentages, then the sample standard deviation.
nb_scores = lst_accu_stratified
print('List of possible F1 score:', nb_scores)
for caption, reducer in (
    ('\nMaximum F1 score That can be obtained from this model is:', max),
    ('\nMinimum F1 score:', min),
    ('\nOverall F1 score:', mean),
):
    print(caption, reducer(nb_scores)*100, '%')
print('\nStandard Deviation is:', stdev(nb_scores))

List of possible F1 score: [0.8914590332213512, 0.8911640381307757, 0.8911468261602047, 0.8909237153781592]

Maximum F1 score That can be obtained from this model is: 89.14590332213513 %

Minimum F1 score: 89.09237153781592 %

Overall F1 score: 89.11734032226228 %

Standard Deviation is: 0.00021963800599970014


## SGD Classifier

In [None]:
from sklearn.linear_model import SGDClassifier
# SGD shuffles the training data between epochs; fixing random_state makes the
# fold scores and the learned coefficients reproducible across runs.
sgd = SGDClassifier(random_state=1)

In [None]:
# Per-fold F1 scores for the SGD classifier; reset here so the cell can be re-run.
lst_accu_stratified_sgd = []

for train_index, test_index in skf.split(X, y):
    # skf.split yields positional indices, so select rows with .iloc rather
    # than label-based [] lookup (which only works on a clean 0..n-1 index).
    X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

    # The vocabulary and IDF weights are learned on the training fold only.
    # fit_transform vectorises the training fold in the same pass instead of
    # tokenising it twice.
    Tfidf_vect = TfidfVectorizer(stop_words='english', max_df=0.8, dtype=np.float32)
    X_train_Tfidf = Tfidf_vect.fit_transform(X_train_fold)
    X_test_Tfidf = Tfidf_vect.transform(X_test_fold)

    sgd.fit(X_train_Tfidf, y_train_fold)
    y_pred = sgd.predict(X_test_Tfidf)
    lst_accu_stratified_sgd.append(f1_score(y_test_fold, y_pred))

In [None]:
# Report the per-fold SGD F1 scores: the raw list, then the best, worst and
# mean fold as percentages, then the sample standard deviation.
sgd_scores = lst_accu_stratified_sgd
print('List of possible F1 score:', sgd_scores)
for caption, reducer in (
    ('\nMaximum F1 score That can be obtained from this model is:', max),
    ('\nMinimum F1 score:', min),
    ('\nOverall F1 score:', mean),
):
    print(caption, reducer(sgd_scores)*100, '%')
print('\nStandard Deviation is:', stdev(sgd_scores))

List of possible F1 score: [0.918450211579385, 0.9183829511325337, 0.9182471484052476, 0.917640304642314]

Maximum F1 score That can be obtained from this model is: 91.8450211579385 %

Minimum F1 score: 91.76403046423141 %

Overall F1 score: 91.81801539398701 %

Standard Deviation is: 0.00036967704485381517


In [None]:
# Rank the vocabulary terms by their SGD coefficient, most negative first.
# As the cells are ordered, `sgd` and `Tfidf_vect` are both left over from the
# last cross-validation fold, so coefficient positions match that vocabulary.
importances = sgd.coef_[0]
indices = np.argsort(importances)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# returns an array, which allows direct indexing with `indices`.
features = Tfidf_vect.get_feature_names_out()

imp_df = pd.DataFrame({'Feature': features[indices], 'Importance': importances[indices]})



In [None]:
# imp_df is sorted ascending by coefficient, so the tail holds the 10 most positive words.
print(imp_df.tail(10)) #top 10 most positive words

       Feature  Importance
64373  favorit    1.830844
64374   awesom    1.851652
64375     good    2.134706
64376     amaz    2.154049
64377    excel    2.489259
64378  perfect    2.716438
64379   delici    3.026104
64380     love    3.066141
64381     best    3.344341
64382    great    3.987900


In [None]:
# imp_df is sorted ascending by coefficient, so the head holds the 10 most negative words.
print(imp_df.head(10)) #top 10 most negative words

      Feature  Importance
0  disappoint   -4.488774
1       worst   -3.522368
2          ok   -3.468223
3      return   -3.306934
4          aw   -3.040941
5    unfortun   -2.956783
6     terribl   -2.927215
7     horribl   -2.865595
8        okay   -2.714790
9       bland   -2.639161


## Dimensionality Reduction

In [None]:
from sklearn.model_selection import train_test_split

# Single 75/25 hold-out split with a fixed seed, used for the SVD experiments.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Learn the TF-IDF vocabulary on the training portion only and vectorise it in
# the same pass (fit_transform avoids tokenising the training texts twice);
# the test portion is then mapped onto that vocabulary.
Tfidf_vect_svd = TfidfVectorizer(stop_words='english', max_df=0.8, dtype=np.float32)
X_train_Tfidf_svd = Tfidf_vect_svd.fit_transform(X_train)
X_test_Tfidf_svd = Tfidf_vect_svd.transform(X_test)

In [None]:
from sklearn.decomposition import TruncatedSVD
# n_components = 100, as recommended in sklearn documentation (for LSA).
# random_state is fixed because the default randomized solver otherwise yields
# slightly different components on every run.
svd = TruncatedSVD(n_components=100, random_state=0)

# TruncatedSVD is unsupervised: fit_transform ignores y, so it is not passed.
X_train_svd = svd.fit_transform(X_train_Tfidf_svd)
X_test_svd = svd.transform(X_test_Tfidf_svd)

In [None]:
# Fraction of the total variance captured by each SVD component.
svd.explained_variance_ratio_

array([0.00336754, 0.00920605, 0.00873247, 0.00607381, 0.00520763,
       0.00438224, 0.00394556, 0.00369774, 0.00356199, 0.0035139 ,
       0.00336865, 0.00326978, 0.00320677, 0.00314526, 0.0030553 ,
       0.002951  , 0.00289857, 0.00282251, 0.00265473, 0.0025899 ,
       0.00254021, 0.00252584, 0.00252072, 0.00242683, 0.00237731,
       0.00237117, 0.00230685, 0.00227535, 0.00226593, 0.00219792,
       0.00211904, 0.00210478, 0.00208849, 0.0020585 , 0.00202972,
       0.00198882, 0.00195961, 0.0019215 , 0.00190299, 0.0018793 ,
       0.00187089, 0.00185326, 0.00182584, 0.00180979, 0.00179237,
       0.00175463, 0.00175282, 0.00171879, 0.001703  , 0.00169518,
       0.00167955, 0.00167625, 0.00165775, 0.00163765, 0.00162429,
       0.00161658, 0.00160447, 0.00159014, 0.00158354, 0.00156051,
       0.00154829, 0.00153724, 0.00153277, 0.00151346, 0.00149404,
       0.00147371, 0.00146221, 0.00145614, 0.00144671, 0.00143112,
       0.00142872, 0.0014095 , 0.00140326, 0.0013943 , 0.00137

In [None]:
# Sanity check: one ratio per fitted component.
len(svd.explained_variance_ratio_)

100

In [None]:
# Train SGD on the SVD-reduced training features and score on the hold-out set.
# random_state is fixed so the reported F1 is reproducible across runs.
sgd_sdv = SGDClassifier(random_state=0)
sgd_sdv.fit(X_train_svd, y_train)
y_pred_sgd_sdv = sgd_sdv.predict(X_test_svd)

print("F1 Score: ", f1_score(y_test, y_pred_sgd_sdv))

F1 Score:  0.8938094588098446


In [None]:
# Try a larger number of components (500).
# random_state is fixed because the default randomized solver otherwise yields
# slightly different components on every run.
svd1 = TruncatedSVD(n_components=500, random_state=0)

# TruncatedSVD is unsupervised: fit_transform ignores y, so it is not passed.
X_train_svd1 = svd1.fit_transform(X_train_Tfidf_svd)
X_test_svd1 = svd1.transform(X_test_Tfidf_svd)

In [None]:
# Train SGD on the 500-component SVD features and score on the hold-out set.
# random_state is fixed so the reported F1 is reproducible across runs.
sgd_sdv1 = SGDClassifier(random_state=0)
sgd_sdv1.fit(X_train_svd1, y_train)
y_pred_sgd_sdv1 = sgd_sdv1.predict(X_test_svd1)

print("F1 Score: ", f1_score(y_test, y_pred_sgd_sdv1))

F1 Score:  0.9095921019077342


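We can keep 500 components from SVD instead of all the TF-IDF features (73,780 on the full corpus): the hold-out F1 with 500 components (about 0.91) is close to the cross-validated F1 of SGD on the full TF-IDF features (about 0.92).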


## Bigram

In [None]:
import nltk

In [None]:
# ngram_range must be a (min_n, max_n) tuple; `(2)` is just the integer 2 and
# makes the vectorizer fail as soon as it is fitted. (2, 2) means bigrams only,
# matching the vectorizer built inside the cross-validation loop below.
# NOTE(review): tfidf_n is not referenced by any other visible cell.
tfidf_n = TfidfVectorizer(stop_words='english', max_df=0.8, dtype=np.float32, ngram_range=(2, 2))

In [None]:
# SGD classifier for the bigram features; random_state is fixed so the fold
# scores and learned coefficients are reproducible across runs.
sgd_bi = SGDClassifier(random_state=1)
lst_accu_stratified_bi = []

for train_index, test_index in skf.split(X, y):
    # skf.split yields positional indices, so select rows with .iloc rather
    # than label-based [] lookup (which only works on a clean 0..n-1 index).
    X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

    # Bigram-only TF-IDF, learned on the training fold only. fit_transform
    # vectorises the training fold in the same pass instead of tokenising twice.
    Tfidf_bi = TfidfVectorizer(stop_words='english', max_df=0.8, dtype=np.float32, ngram_range=(2, 2))
    X_train_bi = Tfidf_bi.fit_transform(X_train_fold)
    X_test_bi = Tfidf_bi.transform(X_test_fold)

    sgd_bi.fit(X_train_bi, y_train_fold)
    y_pred_bi = sgd_bi.predict(X_test_bi)
    lst_accu_stratified_bi.append(f1_score(y_test_fold, y_pred_bi))

In [None]:
# Report the per-fold bigram-model F1 scores: the raw list, then the best,
# worst and mean fold as percentages, then the sample standard deviation.
bi_scores = lst_accu_stratified_bi
print('List of possible F1 score:', bi_scores)
for caption, reducer in (
    ('\nMaximum F1 score That can be obtained from this model is:', max),
    ('\nMinimum F1 score:', min),
    ('\nOverall F1 score:', mean),
):
    print(caption, reducer(bi_scores)*100, '%')
print('\nStandard Deviation is:', stdev(bi_scores))

List of possible F1 score: [0.8772421044210669, 0.8771679190834023, 0.8771919800243411, 0.8771677208861157]

Maximum F1 score That can be obtained from this model is: 87.72421044210668 %

Minimum F1 score: 87.71677208861158 %

Overall F1 score: 87.71924311037314 %

Standard Deviation is: 3.501940377826289e-05


In [None]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is the replacement and returns an array that still supports len() and
# integer indexing.
features_bi = Tfidf_bi.get_feature_names_out()
len(features_bi)



2449463

In [None]:
# Rank the bigrams by their SGD coefficient, most negative first. Vectorised
# NumPy indexing replaces Python list comprehensions over ~2.4M features.
importances_bi = sgd_bi.coef_[0]
indices_bi = np.argsort(importances_bi)

# dtype=object keeps the strings as they are, whether features_bi is a list or an array.
imp_df_bi = pd.DataFrame({
    'Feature': np.asarray(features_bi, dtype=object)[indices_bi],
    'Importance': importances_bi[indices_bi],
})

In [None]:
# imp_df_bi is sorted ascending by coefficient, so the tail holds the 10 most positive bigrams.
print(imp_df_bi.tail(10)) #top 10 most positive bigrams

                  Feature  Importance
2449453          far best    0.292919
2449454       realli good    0.299654
2449455         best tast    0.300389
2449456        love stuff    0.310013
2449457        great tast    0.319084
2449458       definit buy    0.335500
2449459     great product    0.344050
2449460        tast great    0.350020
2449461  pleasant surpris    0.428914
2449462    high recommend    0.690828


In [None]:
# imp_df_bi is sorted ascending by coefficient, so the head holds the 10 most negative bigrams.
print(imp_df_bi.head(10)) #top 10 most negative bigrams

              Feature  Importance
0          wast money   -5.883018
1              wo buy   -2.909699
2  disappoint product   -1.438679
3          threw away   -1.405127
4         buyer bewar   -1.340235
5           bad batch   -1.300711
6            wo order   -1.261163
7          throw away   -1.220730
8            tast bad   -1.210144
9             tast ok   -1.192638
