In [112]:
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from bs4 import BeautifulSoup

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\itsab\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [113]:
import pandas as pd
data=pd.read_csv('../dataset/all_kindle_review .csv')

In [114]:
data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,asin,helpful,rating,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0,11539,B0033UV8HI,"[8, 10]",3,"Jace Rankin may be short, but he's nothing to ...","09 2, 2010",A3HHXRELK8BHQG,Ridley,Entertaining But Average,1283385600
1,1,5957,B002HJV4DE,"[1, 1]",5,Great short read. I didn't want to put it dow...,"10 8, 2013",A2RGNZ0TRF578I,Holly Butler,Terrific menage scenes!,1381190400
2,2,9146,B002ZG96I4,"[0, 0]",3,I'll start by saying this is the first of four...,"04 11, 2014",A3S0H2HV6U1I7F,Merissa,Snapdragon Alley,1397174400
3,3,7038,B002QHWOEU,"[1, 3]",3,Aggie is Angela Lansbury who carries pocketboo...,"07 5, 2014",AC4OQW3GZ919J,Cleargrace,very light murder cozy,1404518400
4,4,1776,B001A06VJ8,"[0, 1]",4,I did not expect this type of book to be in li...,"12 31, 2012",A3C9V987IQHOQD,Rjostler,Book,1356912000


In [115]:
##Extracting noly 2 columns , One for Input and other for Output
data=data[['reviewText','rating']]

In [116]:
data.shape

(12000, 2)

In [117]:
##Checking missing value
data.isnull().sum()

reviewText    0
rating        0
dtype: int64

In [118]:
##Cheking unique value for rating columns
data['rating'].unique()

array([3, 5, 4, 2, 1], dtype=int64)

In [119]:
## For preprocessing and Cleaning
data['rating']=data['rating'].apply(lambda x:0 if x<3 else 1)

In [120]:
##To check the output column value
data['rating'].unique

<bound method Series.unique of 0        1
1        1
2        1
3        1
4        1
        ..
11995    1
11996    1
11997    1
11998    0
11999    1
Name: rating, Length: 12000, dtype: int64>

In [121]:
##Checking if the dataset is balanced or not
data['rating'].value_counts()

rating
1    8000
0    4000
Name: count, dtype: int64

In [122]:
## on Input dataset
data['reviewText']=data['reviewText'].str.lower()

In [123]:
##Cleaning
## for special character
data['reviewText']=data['reviewText'].apply(lambda x:re.sub('[^a-z A-Z 0-9]+','',x))

In [124]:
##For stopwords
data['reviewText']=data['reviewText'].apply(lambda x:" ".join([y for y in x.split() if y not in stopwords.words('english')]))

In [125]:
##Removing urls
data['reviewText']=data['reviewText'].apply(lambda x: re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&/~+#-]*[\w.,@?^=%&/~+#-])?','',str(x)))

In [126]:
##Remove html tags
data['reviewText']=data['reviewText'].apply(lambda x: BeautifulSoup(x,'lxml').get_text())

In [127]:
## Removing all extra spaces
data['reviewText']=data['reviewText'].apply(lambda x: " ".join(x.split()))

In [128]:
data.head()

Unnamed: 0,reviewText,rating
0,jace rankin may short hes nothing mess man hau...,1
1,great short read didnt want put read one sitti...,1
2,ill start saying first four books wasnt expect...,1
3,aggie angela lansbury carries pocketbooks inst...,1
4,expect type book library pleased find price right,1


In [129]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\itsab\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [130]:
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

data['reviewText']=data['reviewText'].apply(lambda x:lemmatize_words(x))

In [131]:
##Train - Test Split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(data['reviewText'],data['rating'],test_size=0.20)

In [132]:
## For BOW
from sklearn.feature_extraction.text import CountVectorizer
bow=CountVectorizer()
X_train_bow=bow.fit_transform(X_train).toarray()
X_test_bow=bow.transform(X_test).toarray()


In [133]:
##For TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer()
X_train_tfidf=tfidf.fit_transform(X_train).toarray()
X_test_tfidf=tfidf.transform(X_test).toarray()

In [134]:
from sklearn.ensemble import RandomForestClassifier
nb_model_bow=RandomForestClassifier().fit(X_train_bow,y_train)
nb_model_tfidf=RandomForestClassifier().fit(X_train_tfidf,y_train)

In [135]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

In [136]:
y_pred_bow=nb_model_bow.predict(X_test_bow)
y_pred_tfidf=nb_model_tfidf.predict(X_test_tfidf)

In [137]:
import numpy as np
import gensim


In [138]:
##Avg Woed2Vec
X_train_tokens = [gensim.utils.simple_preprocess(text) for text in X_train]
X_test_tokens = [gensim.utils.simple_preprocess(text) for text in X_test]

w2v_model = gensim.models.Word2Vec(X_train_tokens, vector_size=100, window=5, min_count=2, workers=4)

def avg_word2vec(doc_tokens):
   
    words_in_vocab = [word for word in doc_tokens if word in w2v_model.wv.index_to_key]
    
    if len(words_in_vocab) > 0:
        return np.mean(w2v_model.wv[words_in_vocab], axis=0)
    else:
        return np.zeros(w2v_model.vector_size)

X_train_w2v = np.array([avg_word2vec(tokens) for tokens in X_train_tokens])
X_test_w2v = np.array([avg_word2vec(tokens) for tokens in X_test_tokens])


In [139]:
rf_w2v = RandomForestClassifier()
rf_w2v.fit(X_train_w2v, y_train)
rf_w2v.fit(X_train_w2v, y_train)
y_pred_w2v = rf_w2v.predict(X_test_w2v)

In [140]:
print("BOW accuracy: ",accuracy_score(y_test,y_pred_bow))
print("tfidf accuracy: ",accuracy_score(y_test,y_pred_tfidf))
print("AvgWord2Vec Accuracy:", accuracy_score(y_test, y_pred_w2v))

BOW accuracy:  0.8083333333333333
tfidf accuracy:  0.7966666666666666
AvgWord2Vec Accuracy: 0.75875


In [141]:
print("BOW Matrix: \n",confusion_matrix(y_test,y_pred_bow))
print("tfidf Matrix: \n",confusion_matrix(y_test,y_pred_tfidf))
print("Average Word2Vec:\n",confusion_matrix(y_test, y_pred_w2v))

BOW Matrix: 
 [[ 394  392]
 [  68 1546]]
tfidf Matrix: 
 [[ 367  419]
 [  69 1545]]
Average Word2Vec:
 [[ 435  351]
 [ 228 1386]]


In [142]:
import pickle

pickle.dump(tfidf, open('vectorizer.pkl', 'wb'))
pickle.dump(nb_model_tfidf, open('model.pkl', 'wb'))
