In [None]:
import numpy as np
import pandas as pd

In [None]:
import nltk
from sklearn.linear_model import LogisticRegression
from nltk.corpus import stopwords

In [None]:
#read the electronics product reviews CSV into e_data
#(the path is relative to the directory the notebook is run from)
e_data = pd.read_csv('review_files/electronics/amazon_electronics_reviews.csv')

In [None]:
#read the food product reviews CSV into f_data
#(the path is relative to the directory the notebook is run from)
f_data = pd.read_csv('review_files/food/food_reviews.csv')

In [None]:
#remove every row that contains at least one missing value, in both datasets
e_data = e_data.dropna()
f_data = f_data.dropna()

In [None]:
#preview the first rows of the food reviews
f_data.head()

In [None]:
#Ratings range from 1 to 5. A rating of 3 is neutral and carries no clear
#positive/negative signal, so those reviews are removed from both datasets.
#The filtered result has to be assigned back: boolean indexing returns a new
#DataFrame and leaves the original one untouched.
#.copy() makes each name an independent frame, because later cells modify
#e_data / f_data in place.
f_data = f_data[f_data['Score']!=3].copy()
e_data = e_data[e_data['Rating']!=3].copy()
print('ok')

In [None]:
#preview the first rows of f_data again
f_data.head()

In [None]:
#discard, in place, the columns that are not used in the rest of the notebook
e_unused_cols = ['Product Name','Brand Name','Price','Review Votes']
e_data.drop(columns=e_unused_cols,inplace=True)

f_unused_cols = ['Id','ProductId','UserId','ProfileName',
                 'HelpfulnessNumerator','HelpfulnessDenominator',
                 'Time','Summary']
f_data.drop(columns=f_unused_cols,inplace=True)

In [None]:
#shuffle the rows of each dataset and renumber the index from 0
#random_state pins the shuffle so a re-run of the notebook yields the same
#row order (and therefore the same rows once the data is truncated)
e_data = e_data.sample(frac=1,random_state=0).reset_index(drop=True)
f_data = f_data.sample(frac=1,random_state=0).reset_index(drop=True)

In [None]:
#keep at most the first 200000 rows of each dataset
#.copy() returns an independent frame, as DataFrame.drop did
e_data = e_data.iloc[:200000].copy()
f_data = f_data.iloc[:200000].copy()

In [None]:
#create a function which will normalize the review text
_ENGLISH_STOPS = None  #filled on first use by _english_stopwords()

def _english_stopwords():
    """Return the NLTK English stop-word set, building it only once."""
    global _ENGLISH_STOPS
    if _ENGLISH_STOPS is None:
        _ENGLISH_STOPS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPS

def normalize_words(raw_txt, stops=None):
    """Lower-case a review and remove stop words.

    The text is lower-cased, split on whitespace, and every token found in
    ``stops`` is dropped; the remaining tokens are re-joined with single
    spaces.

    Parameters
    ----------
    raw_txt : str
        The raw review text.
    stops : collection of str, optional
        Tokens to remove (compared after lower-casing the text). When
        omitted, the NLTK English stop-word list is used. That list is
        cached after the first call, because this function is applied once
        per review and rebuilding the set each time is wasted work.

    Returns
    -------
    str
        The remaining words separated by single spaces ("" if none remain).
    """
    if stops is None:
        stops = _english_stopwords()
    return " ".join(w for w in raw_txt.lower().split() if w not in stops)

In [None]:
#run normalize_words over every electronics review and store the result
#in a new 'clean_reviews' column
e_data['clean_reviews'] = e_data['Reviews'].map(normalize_words)

In [None]:
#run normalize_words over every food review and store the result
#in a new 'clean_reviews' column
f_data['clean_reviews'] = f_data['Text'].map(normalize_words)

In [None]:
#preview e_data, now including the clean_reviews column
e_data.head()

In [None]:
#preview f_data, now including the clean_reviews column
f_data.head()

In [None]:
#keep only the cleaned text and the rating of each dataset, each in its own dataframe
e_df = e_data.loc[:,['clean_reviews','Rating']]
f_df = f_data.loc[:,['clean_reviews','Score']]

In [None]:
#give both dataframes the same column names, 'Reviews_text' and 'Rating',
#so that they can be stacked
for frame in (e_df,f_df):
    frame.columns = ['Reviews_text','Rating']

In [None]:
#stack the two dataframes (food first, then electronics) into one dataframe called clean_file
#pd.concat is used because DataFrame.append was removed in pandas 2.0;
#like append, it keeps each frame's own index labels
clean_file = pd.concat([f_df,e_df])

In [None]:
#this is what the stacked data looks like
clean_file

In [None]:
#total number of reviews after stacking both datasets
len(clean_file)

In [None]:
#empty placeholder; clean_data is reassigned from clean_file in the next cell
clean_data = pd.DataFrame()

In [None]:
#the stacked data holds all food reviews followed by all electronics reviews,
#so it is shuffled to prevent learning issues
#random_state makes the shuffle repeatable from one run to the next
clean_data = clean_file.sample(frac=1,random_state=0).reset_index(drop=True)

In [None]:
#preview the shuffled clean_data before the classification label is added
clean_data.head()

In [None]:
#Add a binary label column 'new_rate': 1 when the rating is above 3, 0 otherwise
#This reduces the five rating values to a two-class problem
clean_data['new_rate'] = (clean_data['Rating']>3).astype(int) #boolean mask cast to 0/1

In [None]:
#preview clean_data, now including the new_rate label
clean_data.head()

In [None]:
#Split the review text (features) and the binary new_rate label (target)
#into a training set and a test set; random_state=0 keeps the split repeatable
from sklearn.model_selection import train_test_split
features = clean_data['Reviews_text']
labels = clean_data['new_rate']
X_train,X_test,y_train,y_test = train_test_split(features,labels,random_state=0)

In [None]:
#number of reviews in the training set
X_train.shape

In [None]:
#show one training review
#.iloc selects by position. X_train keeps the row labels of clean_data in
#shuffled order, so the row labelled 1 may have gone to the test set, in
#which case X_train[1] raises KeyError.
X_train.iloc[1]

In [None]:
#Build the bag-of-words representation with scikit-learn's CountVectorizer
#ngram_range=(2,2) means only bigrams (pairs of consecutive words) become features
#(the keyword is ngram_range; the misspelling ngrams_range raises TypeError)
from sklearn.feature_extraction.text import CountVectorizer
count_Vect = CountVectorizer(ngram_range=(2,2))
count_Vect.fit(X_train) #learn the vocabulary from the training text only
count_Vect

In [None]:
#Convert the training text to its vectorized form using the fitted vectorizer
X_train_vec_data = count_Vect.transform(X_train)

In [None]:
#display the vectorized training data
X_train_vec_data

In [None]:
#Train a logistic regression classifier on the vectorized training data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression().fit(X_train_vec_data,y_train) #fit returns the estimator itself
model

In [None]:
#score the logistic regression model on the vectorized test set
model.score(count_Vect.transform(X_test),y_test)

In [None]:
#Score the model a second way, with ROC AUC (area under the ROC curve)
#AUC measures how well the model ranks positives above negatives, so it is
#computed from the predicted probability of the positive class and not from
#the thresholded 0/1 predictions
from sklearn.metrics import roc_auc_score
X_test_vec_data = count_Vect.transform(X_test)
pred = model.predict(X_test_vec_data) #hard 0/1 labels, kept under the original name
pos_proba = model.predict_proba(X_test_vec_data)[:,1] #column 1 = class 1 (labels are 0/1)
print('Score: ',roc_auc_score(y_test,pos_proba))

In [None]:
#try the model on two hand-written example reviews
model.predict(count_Vect.transform(['the product is very good and high quality','Worst product ']))

### Shifting to SGD classifier to reduce the running time

In [None]:
#Train a linear classifier with stochastic gradient descent and time the fit
#The imports come before the timer starts so that only the fit is measured
import time
from sklearn.linear_model import SGDClassifier
start = time.time()
SGD_model = SGDClassifier(max_iter = 1500,tol=0.0001)
#fit on the same vectorized training data the logistic regression model used
SGD_model.fit(X_train_vec_data,y_train)
end = time.time() - start #elapsed seconds (despite the name)
end

In [None]:
#Accuracy of the SGD model on the test set
from sklearn.metrics import accuracy_score
pred = SGD_model.predict(count_Vect.transform(X_test))
sgd_accuracy = accuracy_score(y_test,pred)
print("Accuracy score:",sgd_accuracy)

In [None]:
#Score the SGD model with ROC AUC (area under the ROC curve) and time the evaluation
#roc_auc_score needs a continuous score per sample, so the signed distance
#returned by decision_function is used instead of the thresholded 0/1 predictions
#(SGDClassifier with its default loss does not provide predict_proba)
import time
from sklearn.metrics import roc_auc_score
start = time.time()
scores = SGD_model.decision_function(count_Vect.transform(X_test))
print('Score: ',roc_auc_score(y_test,scores))
duration = time.time() - start
print(duration)