In [1]:
import numpy as np
import pandas as pd

In [2]:
import nltk
from sklearn.linear_model import LogisticRegression
from nltk.corpus import stopwords

In [3]:
#reading product reviews on electronics category
e_data = pd.read_csv('review_files/electronics/amazon_electronics_reviews.csv')

In [4]:
#reading food products reviews
f_data = pd.read_csv('review_files/food/food_reviews.csv')

In [5]:
#drop null values in both the datasets
e_data.dropna(inplace=True)
f_data.dropna(inplace=True)

In [6]:
f_data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [7]:
#Rating are in the range of 1 to 5, But Reviews with a score or rating of #3 adds no real value to finding the truth about the product
#So, reviews with rating of #3 are removed for the files.
f_data = f_data[f_data['Score']!=3]
e_data = e_data[e_data['Rating']!=3]
print('ok')

ok


In [8]:
f_data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [9]:
#drop all the useless columns
e_data.drop('Product Name',inplace=True,axis=1)
e_data.drop('Brand Name',inplace=True,axis=1)
e_data.drop('Price',inplace=True,axis=1)
e_data.drop('Review Votes',inplace=True,axis=1)

f_data.drop('Id',inplace=True,axis=1)
f_data.drop('ProductId',inplace=True,axis=1)
f_data.drop('UserId',inplace=True,axis=1)
f_data.drop('ProfileName',inplace=True,axis=1)
f_data.drop('HelpfulnessNumerator',inplace=True,axis=1)
f_data.drop('HelpfulnessDenominator',inplace=True,axis=1)
f_data.drop('Time',inplace=True,axis=1)
f_data.drop('Summary',inplace=True,axis=1)

In [10]:
e_data = e_data.sample(frac=1).reset_index(drop=True)
f_data = f_data.sample(frac=1).reset_index(drop=True)

In [11]:
e_data = e_data.drop(e_data.index[200000:])
f_data = f_data.drop(f_data.index[200000:])

In [12]:
#################################################

In [13]:
#create a function which will normalized the review_text
def normalize_words(raw_txt):
    words = raw_txt.lower().split()
    stops = set(stopwords.words("english"))
    meaningful_words = [w for w in words if not w in stops]
    return( " ".join( meaningful_words ))

In [14]:
#applying the normalization function in e_data
e_data['clean_reviews'] = e_data['Reviews'].apply(normalize_words)

In [15]:
#applying the normalization function in f_data
f_data['clean_reviews'] = f_data['Text'].apply(normalize_words)

In [16]:
e_data.head()

Unnamed: 0,Rating,Reviews,clean_reviews
0,5,excellent item,excellent item
1,1,Please do not BUY this phone !!!I bought two o...,please buy phone !!!i bought two one one girl ...
2,2,This phone looked like good deal until I recei...,phone looked like good deal received it. cosme...
3,5,Can't beat this for the price. My son needed a...,can't beat price. son needed new phone want sp...
4,2,The actual item received did not match up to w...,"actual item received match expected, said new ..."


In [17]:
f_data.head()

Unnamed: 0,Score,Text,clean_reviews
0,5,What can I even say about these individually w...,even say individually warped little mouth wate...
1,5,I have to say that I am very pleased with the ...,"say pleased saeco vienna, even though bought s..."
2,5,I couldn't breastfeed so I decided to go with ...,breastfeed decided go next best thing. baby ea...
3,5,"These come out to about 1$ a bar or less, and ...","come 1$ bar less, consider third size 'normal'..."
4,5,"Saw this at a local big-box chain store, so of...","saw local big-box chain store, course look ama..."


In [18]:
#Putting the features columns which we will need in different dataframe and is done for both
e_df = e_data[['clean_reviews','Rating']]
f_df = f_data[['clean_reviews','Score']]

In [19]:
#changing the column names to 'Reviews_text' and 'Rating' in both the dataframe
e_df.columns = ['Reviews_text','Rating']
f_df.columns = ['Reviews_text','Rating']

In [20]:
#combining or stacking the two dataframe together in one dataframe call clean_file
clean_file = f_df.append(e_df)

In [21]:
#now the data we have stack looks like this
clean_file

Unnamed: 0,Reviews_text,Rating
0,even say individually warped little mouth wate...,5
1,"say pleased saeco vienna, even though bought s...",5
2,breastfeed decided go next best thing. baby ea...,5
3,"come 1$ bar less, consider third size 'normal'...",5
4,"saw local big-box chain store, course look ama...",5
5,first tried cookie delta flight. really liked ...,2
6,admit little expensive. sell organic mangoes s...,5
7,one tablespoon elderberry extract plus one dro...,5
8,go raw several others bars taste really good. ...,1
9,"tried wellness webb bars, whitefish sweetpotat...",1


In [22]:
len(clean_file)

400000

In [23]:
clean_data = pd.DataFrame()

In [24]:
#since the data we have right now need to be suffled to prevent learning issues
clean_data = clean_file.sample(frac=1).reset_index(drop=True)

In [25]:
#The clean data is now ready for classification and apply ML model
clean_data.head()

Unnamed: 0,Reviews_text,Rating
0,"opened package, salt look right. compared hima...",1
1,"purchase phone gift christmas, received good c...",1
2,great price nairn's scottish oatcakes. scot ro...,5
3,originally bought sauce pittsburgh vacationing...,5
4,"must admit love jasmine tea (<a href=""http://w...",5


In [48]:
#Create a new column where reviews greater than #3 are given a score of 1 and less than #3 a score of 0
#Of course this is to simplify our understanding and not have to worry about all the review scores as a whole
clean_data['new_rate'] = np.where(clean_data['Rating']>3,1,0) #using the numpy where method

In [49]:
clean_data.head()

Unnamed: 0,Reviews_text,Rating,new_rate
0,"opened package, salt look right. compared hima...",1,0
1,"purchase phone gift christmas, received good c...",1,0
2,great price nairn's scottish oatcakes. scot ro...,5,1
3,originally bought sauce pittsburgh vacationing...,5,1
4,"must admit love jasmine tea (<a href=""http://w...",5,1


In [50]:
#Time to split our data into training sets and test sets using our Reviews_text and Rating column
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(clean_data['Reviews_text'],clean_data['new_rate'],random_state=0)

In [51]:
X_train.shape

(300000,)

In [28]:
X_train[1]

'purchase phone gift christmas, received good condition, plug phone test give . power charge light come phone screen blinks every couple seconds battery level indicator screen showing red charging. change charger posh phone currently using thinking charger fault, outcome. reverted back original charger left plugged several hours. phone charge, try power show posh start screen one second shut down..so thought maybe battery defective sometimes new things defective. ordered replacement battery posh revel, insert battery phone plugged in..i disappointed phone show charge light continue blink off. came conclusion phone bad egg got supplier. decided check return policy phone return it, unfortunately says window return item closed november 26th. spending $65 purchase defective phone, live trinidad caribbean, pay u$100.00 airfreight taxes miami broker here... disappointing know redress purchase.. would never purchase another phone amazon.. advice others, go local stores , ok u pay little extra

In [52]:
#To convert bag of words with count vectorizer is used , which can be0 found in the scikit learn docs
from sklearn.feature_extraction.text import CountVectorizer
count_Vect = CountVectorizer()
count_Vect.fit(X_train)
count_Vect

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [53]:
#Convert the train data to vectorized data
X_train_vec_data = count_Vect.transform(X_train)

In [54]:
X_train_vec_data

<300000x93394 sparse matrix of type '<class 'numpy.int64'>'
	with 8049791 stored elements in Compressed Sparse Row format>

In [55]:
#Finally let's apply Logistic regression on the data and create our model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train_vec_data,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [56]:
model.score(count_Vect.transform(X_test),y_test)

0.93234

In [59]:
#We can try another means to score our model using the AUC(Area under the curve) score
from sklearn.metrics import roc_auc_score
pred = model.predict(count_Vect.transform(X_test))
print('Score: ',roc_auc_score(y_test,pred))

Score:  0.8769614049539456


In [58]:
model.predict(count_Vect.transform(['the product is very good and high quality','Worst product ']))

array([1, 0])