In [0]:
# necessary imports
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [2]:
#reading metadata file
metadf = pd.read_csv("metadata", sep='\t',  lineterminator='\n', names = ['user_id','prod_id','rating','label','date'])
metadf

Unnamed: 0,user_id,prod_id,rating,label,date
0,923,0,3.0,-1,2014-12-08
1,924,0,3.0,-1,2013-05-16
2,925,0,4.0,-1,2013-07-01
3,926,0,4.0,-1,2011-07-28
4,927,0,4.0,-1,2010-11-01
...,...,...,...,...,...
359047,161146,349,5.0,1,2014-02-06
359048,116424,349,5.0,1,2014-01-31
359049,161147,349,5.0,1,2014-01-30
359050,97930,349,5.0,1,2014-01-25


In [3]:
#reading reviewContent file and loading it as pandas Dataframe
reviewdf = pd.read_csv("reviewContent", sep='\t',  lineterminator='\n', names = ['user_id','prod_id','date','review'])
reviewdf

Unnamed: 0,user_id,prod_id,date,review
0,923,0,2014-12-08,The food at snack is a selection of popular Gr...
1,924,0,2013-05-16,This little place in Soho is wonderful. I had ...
2,925,0,2013-07-01,ordered lunch for 15 from Snack last Friday. ...
3,926,0,2011-07-28,This is a beautiful quaint little restaurant o...
4,927,0,2010-11-01,Snack is great place for a casual sit down lu...
...,...,...,...,...
258786,33201,668,2011-05-15,Best. Gelato. Outside. Of. Italy. No joke. Thi...
258787,78777,668,2011-05-15,This place has everything to make you Italian....
258788,26478,668,2011-05-15,One of my favorite places to go to in NYC!
258789,129766,668,2011-05-14,I was apprehensive about trying this place but...


In [4]:
# Cleaning null and Nan valuesif any
reviewdf.dropna
metadf.dropna

<bound method DataFrame.dropna of         user_id  prod_id  rating  label        date
0           923        0     3.0     -1  2014-12-08
1           924        0     3.0     -1  2013-05-16
2           925        0     4.0     -1  2013-07-01
3           926        0     4.0     -1  2011-07-28
4           927        0     4.0     -1  2010-11-01
...         ...      ...     ...    ...         ...
359047   161146      349     5.0      1  2014-02-06
359048   116424      349     5.0      1  2014-01-31
359049   161147      349     5.0      1  2014-01-30
359050    97930      349     5.0      1  2014-01-25
359051     5260      349     5.0      1  2014-01-25

[359052 rows x 5 columns]>

In [5]:
# merging both dataframes to make it easy for applying algorithm on common attributes like user_id, prod_id and date
merged = metadf.merge(reviewdf, on=['user_id', 'prod_id','date'])
merged

Unnamed: 0,user_id,prod_id,rating,label,date,review
0,923,0,3.0,-1,2014-12-08,The food at snack is a selection of popular Gr...
1,924,0,3.0,-1,2013-05-16,This little place in Soho is wonderful. I had ...
2,925,0,4.0,-1,2013-07-01,ordered lunch for 15 from Snack last Friday. ...
3,926,0,4.0,-1,2011-07-28,This is a beautiful quaint little restaurant o...
4,927,0,4.0,-1,2010-11-01,Snack is great place for a casual sit down lu...
...,...,...,...,...,...,...
258786,33201,668,5.0,1,2011-05-15,Best. Gelato. Outside. Of. Italy. No joke. Thi...
258787,78777,668,5.0,1,2011-05-15,This place has everything to make you Italian....
258788,26478,668,5.0,1,2011-05-15,One of my favorite places to go to in NYC!
258789,129766,668,5.0,1,2011-05-14,I was apprehensive about trying this place but...


In [6]:
# checking the imbalance in positive and negative labels
np.unique(merged['label'], return_counts=True)

(array([-1,  1]), array([ 26966, 231825]))

In [7]:
# sorting the dataframe for splitting
df = merged[['user_id','prod_id','date','rating','review','label']]
df

Unnamed: 0,user_id,prod_id,date,rating,review,label
0,923,0,2014-12-08,3.0,The food at snack is a selection of popular Gr...,-1
1,924,0,2013-05-16,3.0,This little place in Soho is wonderful. I had ...,-1
2,925,0,2013-07-01,4.0,ordered lunch for 15 from Snack last Friday. ...,-1
3,926,0,2011-07-28,4.0,This is a beautiful quaint little restaurant o...,-1
4,927,0,2010-11-01,4.0,Snack is great place for a casual sit down lu...,-1
...,...,...,...,...,...,...
258786,33201,668,2011-05-15,5.0,Best. Gelato. Outside. Of. Italy. No joke. Thi...,1
258787,78777,668,2011-05-15,5.0,This place has everything to make you Italian....,1
258788,26478,668,2011-05-15,5.0,One of my favorite places to go to in NYC!,1
258789,129766,668,2011-05-14,5.0,I was apprehensive about trying this place but...,1


In [8]:
# assigning reviews to X variable for smooth operation likewise with label
data_x = df['review']
data_y = df['label']
print(data_x.shape, data_y.shape)

(258791,) (258791,)


In [0]:
# splitting the data into test and train samples
X_train, X_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.3)

In [0]:
# converting text reviews to numerical values
cv =  CountVectorizer()

X_traincv = cv.fit_transform(X_train)
X_testcv = cv.transform(X_test)

In [11]:
#declaring and training the naivebayes model
nbayes = MultinomialNB()

nbayes.fit(X_traincv, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [0]:
# getting predictions from the trained model
y_predictions = nbayes.predict(X_testcv)

In [13]:
# example output of labeling done by the model against test set reviews
y_result = list(y_predictions)
yp=["Real" if a==1 else "Fake" for a in y_result]
X_testlist = list(X_test)
output_fm = pd.DataFrame({'Review':X_testlist ,'label':yp})
print(np.unique(output_fm['label'], return_counts=True))
output_fm.head()

(array(['Fake', 'Real'], dtype=object), array([ 5522, 72116]))


Unnamed: 0,Review,label
0,We came here based on a recommendation from a ...,Real
1,"Totally a fan of this new sports bar in the ""H...",Real
2,"In a word - roaches. But wait, there's more! I...",Fake
3,"Nice atmosphere, friendly service, and pretty ...",Real
4,On a non ritzy side street of soho lies this p...,Real


In [14]:
print("Accuracy % :",metrics.accuracy_score(y_test, y_predictions)*100)
print("Precision Score: ", precision_score(y_test, y_predictions, average='micro'))
print("Recall Score: ",recall_score(y_test, y_predictions, average='micro') )
print("F1 Score: ",f1_score(y_test, y_predictions, average='micro') )

Accuracy % : 86.13436719132383
Precision Score:  0.8613436719132384
Recall Score:  0.8613436719132384
F1 Score:  0.8613436719132384
