In [4]:
import kaggle
import pandas as pd
import matplotlib.pyplot as plt 
import pyarrow
import fastparquet
import numpy as np
import os
from collections import Counter, defaultdict
import warnings
import seaborn as sns
warnings.filterwarnings("ignore")
from wordcloud import WordCloud 
#kaggle.api.authenticate()
import nltk
from string import punctuation
import textacy.preprocessing as tprep
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS
import spacy
nlp = spacy.load("en_core_web_sm")
from sklearn.metrics import accuracy_score 
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

In [5]:
new_df=pd.read_parquet('prepared_text.parquet.gzip', engine='pyarrow')
new_df.head(5)

Unnamed: 0,Rating,new_reviews,lemmas,adjs_verbs,nouns,noun_phrases,adj_noun_phrases,entities,tokens
0,5,feel lucky found used phone us used hard phone...,,,,,,,"[feel, lucky, found, used, phone, us, used, ha..."
1,4,nice phone nice grade pantach revue clean set ...,,,,,,,"[nice, phone, nice, grade, pantach, revue, cle..."
2,5,pleased,,,,,,,[pleased]
3,4,works good goes slow sometimes good phone love,,,,,,,"[works, good, goes, slow, sometimes, good, pho..."
4,4,great phone replace lost phone thing volume bu...,,,,,,,"[great, phone, replace, lost, phone, thing, vo..."


In [6]:

new_df =new_df[[
    'Rating','new_reviews','tokens'
]]

In [7]:
new_df.info()
print('\n')
print(new_df.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 413840 entries, 0 to 413839
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Rating       413840 non-null  int64 
 1   new_reviews  413840 non-null  object
 2   tokens       413840 non-null  object
dtypes: int64(1), object(2)
memory usage: 9.5+ MB


(413840, 3)


## Preparing Data for a Supervised Learning Approach

In [8]:
# Assigning a new [1,0] target class label based on the product rating
new_df['sentiment'] = 0
new_df.loc[new_df['Rating'] > 3, 'sentiment'] = 1
new_df.loc[new_df['Rating'] < 3, 'sentiment'] = 0

## Train-Test Split

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(new_df['tokens'],
                                                        new_df['sentiment'],
                                                        test_size=0.2,
                                                        random_state=42,
                                                        stratify=new_df['sentiment'])
print ('Size of Training Data ', X_train.shape[0]) 
print ('Size of Test Data ', X_test.shape[0])
print ('Distribution of classes in Training Data :')
print ('Positive Sentiment ', str(sum(Y_train == 1)/ len(Y_train) * 100.0))
print ('Negative Sentiment ', str(sum(Y_train == 0)/ len(Y_train) * 100.0))
print ('Distribution of classes in Testing Data :')
print ('Positive Sentiment ', str(sum(Y_test == 1)/ len(Y_test) * 100.0)) 
print ('Negative Sentiment ', str(sum(Y_test == 0)/ len(Y_test) * 100.0))

Size of Training Data  331072
Size of Test Data  82768
Distribution of classes in Training Data :
Positive Sentiment  68.86659095302532
Negative Sentiment  31.133409046974673
Distribution of classes in Testing Data :
Positive Sentiment  68.86598685482312
Negative Sentiment  31.13401314517688


## Text Vectorization

In [10]:
def identity_tokenizer(text):
  return text


tfidf = TfidfVectorizer(tokenizer=identity_tokenizer,min_df = 10,ngram_range=(1,1),lowercase=False)
X_train_tf = tfidf.fit_transform(X_train)
X_test_tf = tfidf.transform(X_test)

## Training the Machine Learning Model

In [11]:

model_svc = LinearSVC(random_state=42, tol=1e-5)
model_svc.fit(X_train_tf, Y_train)
Y_pred_svc = model_svc.predict(X_test_tf)
print ('Accuracy Score  - ', accuracy_score(Y_test, Y_pred_svc)) 
print ('ROC-AUC Score - ', roc_auc_score(Y_test, Y_pred_svc))

Accuracy Score  -  0.9146650879566982
ROC-AUC Score -  0.8939663352596879


In [33]:
sample_reviews = new_df.sample(10)
sample_reviews_tf = tfidf.transform(sample_reviews['tokens'])
sentiment_predictions_svc = model_svc.predict(sample_reviews_tf)
sentiment_predictions_svc = pd.DataFrame(data = sentiment_predictions_svc,
                                         index=sample_reviews.index,
                                         columns=['sentiment_prediction'])
sample_reviews_svc = pd.concat([sample_reviews, sentiment_predictions_svc], axis=1)
print ('Some sample reviews with their sentiment - ') 
sample_reviews_svc[['new_reviews','sentiment_prediction']]


Some sample reviews with their sentiment - 


Unnamed: 0,new_reviews,sentiment_prediction
70355,excellent,1
62665,excelente producto,1
271831,device sold open group wireless att box unlock...,1
177492,written package dissappointing,0
330452,stopped working 7months,0
108540,loud listening music fine without case sides p...,1
272571,excellent thing like battery last long otherwi...,1
49702,great,1
377420,pleased phonemit bought replace one like thril...,1
78481,great wife liked 38mm apple watch much said or...,1


In [12]:
##Linear Regression Supervised Model

#model_reg = LinearRegression()
#model_reg. fit(X_train_tf, Y_train)
#Y_pred_reg = model_reg.predict(X_test_tf)
#print ('Accuracy Score - ', accuracy_score(Y_test, Y_pred_reg)) 
#print ('ROC-AUC Score - ', roc_auc_score(Y_test, Y_pred_reg))

In [13]:
model_clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
model_clf.fit(X_train_tf, Y_train)
Y_pred_clf= model_clf.predict(X_test_tf)
print ('Accuracy Score - ', accuracy_score(Y_test, Y_pred_clf)) 
print ('ROC-AUC Score - ', roc_auc_score(Y_test, Y_pred_clf))

Accuracy Score -  0.9126594819253818
ROC-AUC Score -  0.8930629852679421


In [34]:
sentiment_predictions_clf = model_clf.predict(sample_reviews_tf)
sentiment_predictions_clf = pd.DataFrame(data = sentiment_predictions_clf,
                                         index=sample_reviews.index,
                                         columns=['sentiment_prediction'])
sample_reviews_clf = pd.concat([sample_reviews, sentiment_predictions_clf], axis=1)
print ('Some sample reviews with their sentiment - ') 
sample_reviews_clf[['new_reviews','sentiment_prediction']]


Some sample reviews with their sentiment - 


Unnamed: 0,new_reviews,sentiment_prediction
70355,excellent,1
62665,excelente producto,1
271831,device sold open group wireless att box unlock...,1
177492,written package dissappointing,0
330452,stopped working 7months,0
108540,loud listening music fine without case sides p...,1
272571,excellent thing like battery last long otherwi...,1
49702,great,1
377420,pleased phonemit bought replace one like thril...,1
78481,great wife liked 38mm apple watch much said or...,1


In [16]:
model_rf= RandomForestClassifier()
model_rf.fit(X_train_tf, Y_train)
Y_pred_rf= model_rf.predict(X_test_tf)
print ('Accuracy Score - ', accuracy_score(Y_test, Y_pred_rf)) 
print ('ROC-AUC Score - ', roc_auc_score(Y_test, Y_pred_rf))

Accuracy Score -  0.957954765126619
ROC-AUC Score -  0.9454788121205813


In [35]:
sentiment_predictions_rf = model_clf.predict(sample_reviews_tf)
sentiment_predictions_rf = pd.DataFrame(data = sentiment_predictions_rf,
                                         index=sample_reviews.index,
                                         columns=['sentiment_prediction'])
sample_reviews_rf = pd.concat([sample_reviews, sentiment_predictions_rf], axis=1)
print ('Some sample reviews with their sentiment - ') 
sample_reviews_rf[['new_reviews','sentiment_prediction']]

Some sample reviews with their sentiment - 


Unnamed: 0,new_reviews,sentiment_prediction
70355,excellent,1
62665,excelente producto,1
271831,device sold open group wireless att box unlock...,1
177492,written package dissappointing,0
330452,stopped working 7months,0
108540,loud listening music fine without case sides p...,1
272571,excellent thing like battery last long otherwi...,1
49702,great,1
377420,pleased phonemit bought replace one like thril...,1
78481,great wife liked 38mm apple watch much said or...,1
