In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from imblearn import over_sampling
from IPython.display import display

In [2]:
# Load the labelled product-review dataset (review text + sentiment flag)
# straight from the public GitHub copy of the CSV.
df_prod_review = pd.read_csv(
    'https://raw.githubusercontent.com/aakashgoel12/blogs/master/input/product_review_sentiment.csv',
    encoding='latin-1',
)

In [3]:
# Save a local copy of the downloaded reviews.
# NOTE(review): this writes to a file literally named 'date' (no extension)
# in the current working directory — confirm the intended file name.
df_prod_review.to_csv('date')

In [4]:
# Preview the first rows of the loaded review data.
df_prod_review.head()

Unnamed: 0,Review,user_sentiment
0,awesome love album good hip hop side current p...,1
1,good good flavor review collect part promotion,1
2,good good flavor,1
3,disappoint read review look buy one couple lub...,0
4,irritation husband buy gel us gel caused irrit...,0


In [5]:
# Show five randomly chosen reviews; the fixed seed keeps the sample stable
# across re-runs.
display(
    df_prod_review.sample(n=5, random_state=2)
)

Unnamed: 0,Review,user_sentiment
7995,awesome product love use clean greasy kitchen ...,1
4220,great vacuum great vacuum doesnt blow dirt job,1
13489,clorox excellent product product excellent cle...,1
25491,great lesson friendship life challenge nephews...,1
6884,clorox wipe work great clean great smell nice ...,1


## Data preprocessing

In [6]:
# Separate the review text (model input) from the sentiment label (target).
x = df_prod_review['Review']
y = df_prod_review['user_sentiment']


In [7]:
# Sanity check: inputs and labels must have the same number of rows.
x.shape,y.shape

((26873,), (26873,))

In [8]:
# Inspect the label column taken from 'user_sentiment'.
y

0        1
1        1
2        1
3        0
4        0
        ..
26868    1
26869    1
26870    1
26871    1
26872    1
Name: user_sentiment, Length: 26873, dtype: int64

In [9]:
# Inspect the text column taken from 'Review'.
x

0        awesome love album good hip hop side current p...
1           good good flavor review collect part promotion
2                                         good good flavor
3        disappoint read review look buy one couple lub...
4        irritation husband buy gel us gel caused irrit...
                               ...                        
26868    softness get conditioner influenster try im lo...
26869    love love receive review purpose influenster l...
26870    hair smooth use first love smell product wash ...
26871    perfect oily hair receive influenster never go...
26872    condition healthy receive product complimentar...
Name: Review, Length: 26873, dtype: object

In [10]:
# Hold out 30% of the reviews for testing; the fixed seed makes the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=42,
)

In [11]:
# Check the sizes of the train and test partitions produced by the split.
x_train.shape,y_train.shape,x_test.shape,y_test.shape

((18811,), (18811,), (8062,), (8062,))

In [12]:
# Count the distinct label values in the target column.
y.nunique()

2

In [13]:
import seaborn as sns

In [15]:
# Balance the training set by randomly oversampling; only the training split
# is resampled, the test split is left untouched.
ros = over_sampling.RandomOverSampler(random_state=0)
# Build the frame with an explicit 'Review' column so this works whether
# x_train is still the pandas Series from train_test_split or already the
# plain list this cell produces. With pd.DataFrame(x_train) a re-run of the
# cell got a column named 0 and failed below with KeyError: 'Review'.
x_train, y_train = ros.fit_resample(pd.DataFrame({'Review': x_train}), pd.Series(y_train))
print("Checking distribution of +ve and -ve review sentiment after oversampling: \n{}".format(y_train.value_counts(normalize=True)))
# Convert the single-column frame back into a list of strings for the vectorizer.
x_train = x_train['Review'].tolist()



Checking distribution of +ve and -ve review sentiment after oversampling: 
user_sentiment
1    0.5
0    0.5
Name: proportion, dtype: float64


## Feature engineering

In [16]:
# TF-IDF over word uni-, bi- and tri-grams, with accents stripped, English
# stop words removed and sublinear term-frequency scaling enabled.
word_vectorizer = TfidfVectorizer(
    strip_accents='unicode',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    stop_words='english',
    sublinear_tf=True,
)

In [17]:
# Fit the vectorizer on the (oversampled) training reviews only, so that no
# test text influences the learned features.
# NOTE(review): y_train is passed here, but TfidfVectorizer.fit presumably
# ignores y — confirm against the scikit-learn documentation.
word_vectorizer.fit(x_train,y_train)

In [18]:
# Vectorize both splits with the vectorizer fitted on the training reviews.
# x_train is already a list; x_test is converted with .tolist() to match.
x_train_transformed = word_vectorizer.transform(x_train)
x_test_transformed = word_vectorizer.transform(x_test.tolist())

In [19]:
def evaluate_model(y_pred, y_actual, labels=None):
    """Print a classification report plus sensitivity and specificity.

    The second label (row/column 1 of the confusion matrix) is treated as the
    positive class and the first label as the negative class.

    Parameters
    ----------
    y_pred : array-like
        Predicted class labels.
    y_actual : array-like
        Ground-truth class labels.
    labels : list, optional
        Label order forwarded to ``classification_report`` and
        ``confusion_matrix``. Pass e.g. ``[0, 1]`` to force a 2x2 matrix even
        when one class is missing from the data. Defaults to ``None``, which
        keeps scikit-learn's default label handling.

    Returns
    -------
    None
        Results are printed, not returned.

    Raises
    ------
    ValueError
        If the confusion matrix is not 2x2 (sensitivity and specificity are
        only defined here for binary problems).
    """
    print(classification_report(y_true=y_actual, y_pred=y_pred, labels=labels))

    cm = confusion_matrix(y_true=y_actual, y_pred=y_pred, labels=labels)
    # Indexing a non-2x2 matrix would either fail with an opaque IndexError
    # (single class) or silently ignore classes (more than two classes).
    if cm.shape != (2, 2):
        raise ValueError(
            "evaluate_model expects a binary problem, got a confusion matrix "
            "of shape {}; pass labels=[negative, positive] if a class is "
            "missing from the data".format(cm.shape)
        )
    TN, FP, FN, TP = cm.ravel()

    def _rate(hits, total):
        # A class with no actual samples has an undefined rate: report nan
        # explicitly instead of dividing by zero.
        return round(hits / float(total), 2) if total else float("nan")

    # Sensitivity: share of actual positives that were predicted positive.
    sensitivity = _rate(TP, FN + TP)
    print("sensitivity: {}".format(sensitivity))
    # Specificity: share of actual negatives that were predicted negative.
    specificity = _rate(TN, TN + FP)
    print("specificity: {}".format(specificity))

In [20]:
# Model training: fit a logistic-regression classifier with default
# hyperparameters on the TF-IDF features of the oversampled training set.
logit = LogisticRegression()
logit.fit(x_train_transformed,y_train)

In [21]:
# Predictions on the training data, used to gauge how closely the model fits it.
y_pred_train= logit.predict(x_train_transformed)
# Predictions on the held-out test data.
y_pred_test = logit.predict(x_test_transformed)

In [22]:
# Report metrics on the (oversampled) training data first ...
print("Evaluation on Train dataset ..")
evaluate_model(y_pred = y_pred_train, y_actual = y_train)
print("Evaluation on Test dataset ..")
# ... then on the held-out test data, which keeps its original class balance.
evaluate_model(y_pred = y_pred_test, y_actual = y_test)

Evaluation on Train dataset ..
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     16693
           1       1.00      0.98      0.99     16693

    accuracy                           0.99     33386
   macro avg       0.99      0.99      0.99     33386
weighted avg       0.99      0.99      0.99     33386

sensitivity: 0.98
specificity: 1.0
Evaluation on Test dataset ..
              precision    recall  f1-score   support

           0       0.50      0.56      0.52       881
           1       0.94      0.93      0.94      7181

    accuracy                           0.89      8062
   macro avg       0.72      0.74      0.73      8062
weighted avg       0.90      0.89      0.89      8062

sensitivity: 0.93
specificity: 0.56


In [23]:
# Load the product-level review file (product name + review text).
# Note that this rebinds df_prod_review, replacing the labelled sentiment
# dataset loaded earlier in the notebook.
df_prod_review = pd.read_csv(
    'https://raw.githubusercontent.com/aakashgoel12/blogs/master/input/product_review.csv',
    encoding='latin-1',
)

In [24]:
# Show five randomly chosen rows of the product review data (fixed seed).
display(df_prod_review.sample(n=5, random_state=42))

Unnamed: 0,prod_name,Review
2501,Hawaiian Punch Berry Limeade Blast Juice,pretty good stuff much sugar kid like
21252,Godzilla 3d Includes Digital Copy Ultraviolet ...,enteraining great interesting version classic ...
23503,Godzilla 3d Includes Digital Copy Ultraviolet ...,best godzilla date like previous godzilla film...
26827,"Storkcraft Tuscany Glider and Ottoman, Beige C...",comfy good put baby sleep calming sister mom n...
18210,Clorox Disinfecting Bathroom Cleaner,product easy use product easy use open use har...


In [38]:
# Persist the trained classifier and the fitted vectorizer.
# pickle.dump returns None, so its result must not be assigned: the previous
# version overwrote word_vectorizer with None (and bound ml to None), breaking
# any later use of the vectorizer. The `with` blocks also make sure the files
# are flushed and closed.
# NOTE(review): the absolute 'E:\\model\\...' paths only exist on the original
# author's machine — consider a configurable output directory.
with open('E:\\model\\logit.pkl', 'wb') as model_file:
    pickle.dump(logit, model_file)
with open('E:\\model\\word_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(word_vectorizer, vectorizer_file)

Earlier rated products by user id:joshua as below


Unnamed: 0,userId,rating,prod_name
0,joshua,5,Pink Friday: Roman Reloaded Re-Up (w/dvd)
17718,joshua,5,Smead174 Recycled Letter Size Manila File Back...
22379,joshua,5,Cheetos Crunchy Flamin' Hot Cheese Flavored Sn...
1541,joshua,3,Dark Shadows (includes Digital Copy) (ultravio...


Earlier rated products by user id:joshua as below


Unnamed: 0,userId,rating,prod_name
0,joshua,5,Pink Friday: Roman Reloaded Re-Up (w/dvd)
17718,joshua,5,Smead174 Recycled Letter Size Manila File Back...
22379,joshua,5,Cheetos Crunchy Flamin' Hot Cheese Flavored Sn...
1541,joshua,3,Dark Shadows (includes Digital Copy) (ultravio...
