In [1]:
## Read the dataset
import numpy as np
import pandas as pd

data = pd.read_csv('all_kindle_review.csv')
data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,asin,helpful,rating,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0,11539,B0033UV8HI,"[8, 10]",3,"Jace Rankin may be short, but he's nothing to ...","09 2, 2010",A3HHXRELK8BHQG,Ridley,Entertaining But Average,1283385600
1,1,5957,B002HJV4DE,"[1, 1]",5,Great short read. I didn't want to put it dow...,"10 8, 2013",A2RGNZ0TRF578I,Holly Butler,Terrific menage scenes!,1381190400
2,2,9146,B002ZG96I4,"[0, 0]",3,I'll start by saying this is the first of four...,"04 11, 2014",A3S0H2HV6U1I7F,Merissa,Snapdragon Alley,1397174400
3,3,7038,B002QHWOEU,"[1, 3]",3,Aggie is Angela Lansbury who carries pocketboo...,"07 5, 2014",AC4OQW3GZ919J,Cleargrace,very light murder cozy,1404518400
4,4,1776,B001A06VJ8,"[0, 1]",4,I did not expect this type of book to be in li...,"12 31, 2012",A3C9V987IQHOQD,Rjostler,Book,1356912000


In [2]:
data = data[['reviewText','rating']]

In [3]:
data.isnull().sum()

reviewText    0
rating        0
dtype: int64

In [4]:
data['rating'].unique()

array([3, 5, 4, 2, 1])

In [5]:
data['rating'].value_counts()

5    3000
4    3000
3    2000
2    2000
1    2000
Name: rating, dtype: int64

In [6]:
#Preprocessing and Cleaning
data['review'] = data['rating'].apply(lambda x: 0 if x < 3 else 1)
data.head()

Unnamed: 0,reviewText,rating,review
0,"Jace Rankin may be short, but he's nothing to ...",3,1
1,Great short read. I didn't want to put it dow...,5,1
2,I'll start by saying this is the first of four...,3,1
3,Aggie is Angela Lansbury who carries pocketboo...,3,1
4,I did not expect this type of book to be in li...,4,1


In [7]:
data['review'].value_counts()

1    8000
0    4000
Name: review, dtype: int64

In [8]:
#lowercase the text
data['reviewText']=data['reviewText'].str.lower()
data.head()

Unnamed: 0,reviewText,rating,review
0,"jace rankin may be short, but he's nothing to ...",3,1
1,great short read. i didn't want to put it dow...,5,1
2,i'll start by saying this is the first of four...,3,1
3,aggie is angela lansbury who carries pocketboo...,3,1
4,i did not expect this type of book to be in li...,4,1


In [9]:
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kumarbaibhav/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kumarbaibhav/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/kumarbaibhav/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [10]:
from bs4 import BeautifulSoup

In [11]:
# Removing special characters
data['reviewText']=data['reviewText'].apply(lambda x:re.sub('[^a-z A-z 0-9-]+', '',x))
# Remove the stopswords
data['reviewText']=data['reviewText'].apply(lambda x:" ".join([y for y in x.split() if y not in stopwords.words('english')]))
# Remove url 
data['reviewText']=data['reviewText'].apply(lambda x: re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '' , str(x)))
# Remove html tags
data['reviewText']=data['reviewText'].apply(lambda x: BeautifulSoup(x, 'lxml').get_text())
# Remove any additional spaces
data['reviewText']=data['reviewText'].apply(lambda x: " ".join(x.split()))

  data['reviewText']=data['reviewText'].apply(lambda x: BeautifulSoup(x, 'lxml').get_text())


In [12]:
data.head()

Unnamed: 0,reviewText,rating,review
0,jace rankin may short hes nothing mess man hau...,3,1
1,great short read didnt want put read one sitti...,5,1
2,ill start saying first four books wasnt expect...,3,1
3,aggie angela lansbury carries pocketbooks inst...,3,1
4,expect type book library pleased find price right,4,1


In [13]:
#Apply Lemmatization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

#create function for lemmatization
def lemmatize(x):
    
    return " ".join([lemmatizer.lemmatize(word,pos='v') for word in x.split()])

In [14]:
#apply to review text column
data['reviewText'] = data['reviewText'].apply(lambda x: lemmatize(x))
data.head()

Unnamed: 0,reviewText,rating,review
0,jace rankin may short hes nothing mess man hau...,3,1
1,great short read didnt want put read one sit s...,5,1
2,ill start say first four book wasnt expect 34c...,3,1
3,aggie angela lansbury carry pocketbooks instea...,3,1
4,expect type book library please find price right,4,1


In [15]:
#train test split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(data['reviewText'],data['review'],test_size=0.20)

In [16]:
#apply BOW 
from sklearn.feature_extraction.text import CountVectorizer
bow = CountVectorizer()
X_train_bow = bow.fit_transform(X_train).toarray()
X_test_bow = bow.transform(X_test).toarray()

In [17]:
#apply TFIDF 
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train).toarray()
X_test_tfidf = tfidf.transform(X_test).toarray()

In [18]:
#apply naive bayes algorithm
from sklearn.naive_bayes import GaussianNB
nb_model_bow = GaussianNB().fit(X_train_bow,y_train)
nb_model_tfidf = GaussianNB().fit(X_train_tfidf,y_train)

In [19]:
#check performance metrics
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
y_pred_bow = nb_model_bow.predict(X_test_bow)
y_pred_tfidf = nb_model_tfidf.predict(X_test_tfidf)
print(f'Performance metrics for BOW Model-------------------------------')
print(f'{confusion_matrix(y_test,y_pred_bow)}')
print(f'{classification_report(y_test,y_pred_bow)}')
print(f'{accuracy_score(y_test,y_pred_bow)}')
print('\n')
print(f'Performance metrics for TFIDF Model-------------------------------')
print(f'{confusion_matrix(y_test,y_pred_tfidf)}')
print(f'{classification_report(y_test,y_pred_tfidf)}')
print(f'{accuracy_score(y_test,y_pred_tfidf)}')
print('\n')

Performance metrics for BOW Model-------------------------------
[[522 263]
 [744 871]]
              precision    recall  f1-score   support

           0       0.41      0.66      0.51       785
           1       0.77      0.54      0.63      1615

    accuracy                           0.58      2400
   macro avg       0.59      0.60      0.57      2400
weighted avg       0.65      0.58      0.59      2400

0.5804166666666667


Performance metrics for TFIDF Model-------------------------------
[[511 274]
 [736 879]]
              precision    recall  f1-score   support

           0       0.41      0.65      0.50       785
           1       0.76      0.54      0.64      1615

    accuracy                           0.58      2400
   macro avg       0.59      0.60      0.57      2400
weighted avg       0.65      0.58      0.59      2400

0.5791666666666667




In [20]:
#create a list of words for every sentence and append into a list
words = []
for index,row in data.iterrows():
    words.append([i for i in row['reviewText'].split()])

In [21]:
#implement Word2Vec model
import gensim
from gensim.models import Word2Vec, KeyedVectors
import gensim.downloader as api
#wv = api.load('word2vec-google-news-300')


In [26]:
wv.most_similar('badass')

NameError: name 'wv' is not defined

In [27]:
#word2vec model
word2vec_model = Word2Vec(sentences = words)
#apply average word2vec
def avg_word2vec(doc):
    # remove out-of-vocabulary words
    #sent = [word for word in doc if word in model.wv.index_to_key]
    #print(sent)
    return np.mean([word2vec_model.wv[word] for word in doc if word in word2vec_model.wv.index_to_key],axis=0)
                #or [np.zeros(len(model.wv.index_to_key))], axis=0)

In [23]:
#pretrained model
def avg_word2vec(doc):
    # Remove out-of-vocabulary words and compute the mean of the word vectors
    return np.mean([wv[word] for word in doc if word in wv.index_to_key], axis=0)

In [None]:
save_path = 'C:/Users/gaya/Desktop/project/word2vec_model.model'
word2vec_model.save(save_path)

In [28]:
word2vec_model.corpus_count

12000

In [None]:
!pip install tqdm


In [29]:
#words for the 1st sentence
words[0]

['jace',
 'rankin',
 'may',
 'short',
 'hes',
 'nothing',
 'mess',
 'man',
 'haul',
 'saloon',
 'undertaker',
 'know',
 'hes',
 'famous',
 'bounty',
 'hunter',
 'oregon',
 '1890s',
 'shoot',
 'man',
 'saloon',
 'finish',
 'years',
 'long',
 'quest',
 'avenge',
 'sisters',
 'murder',
 'try',
 'figure',
 'next',
 'snotty-nosed',
 'farm',
 'boy',
 'rescue',
 'gang',
 'bully',
 'offer',
 'money',
 'kill',
 'man',
 'force',
 'ranch',
 'reluctantly',
 'agree',
 'bring',
 'man',
 'justice',
 'kill',
 'outright',
 'first',
 'need',
 'tell',
 'sisters',
 'widower',
 'newskyla',
 'kyle',
 'springer',
 'bailey',
 'rid',
 'trail',
 'sleep',
 'grind',
 'past',
 'month',
 'try',
 'find',
 'jace',
 'want',
 'revenge',
 'man',
 'kill',
 'husband',
 'take',
 'ranch',
 'amongst',
 'crimes',
 'shes',
 'keen',
 'detour',
 'jace',
 'want',
 'take',
 'realize',
 'shes',
 'options',
 'hide',
 'behind',
 'boy',
 'persona',
 'best',
 'try',
 'keep',
 'pace',
 'confrontation',
 'along',
 'way',
 'get',
 'shoot'

In [30]:
from tqdm import tqdm
#apply for the entire sentences
import numpy as np
X=[]
for i in tqdm(range(len(words))):
    X.append(avg_word2vec(words[i]))

100%|███████████████████████████████████| 12000/12000 [00:05<00:00, 2022.47it/s]


In [31]:
#check shape of vector embedding for a document
X[0].shape

(100,)

In [32]:
X_new = np.array(X)
X_new.shape

(12000, 100)

In [33]:
X[0].reshape(1,-1).shape

(1, 100)

In [34]:
df = pd.DataFrame(X_new)

In [35]:
#appending all the vectors after using avg word2vec into a dataframe
df = pd.DataFrame()
for i in tqdm(range(len(words))):
    df = pd.concat([pd.DataFrame(X[i].reshape(1, -1)) for i in range(len(X))], ignore_index=True)


  1%|▍                                      | 153/12000 [00:39<51:06,  3.86it/s]


KeyboardInterrupt: 

In [36]:
#dataframe with each sentence represented as a 300 dimensional vector 
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.020624,0.290489,-0.207166,-0.089109,0.014833,-0.474143,0.189333,0.530447,-0.180703,-0.247802,...,0.495438,0.060727,0.007973,0.108346,0.424585,0.117388,0.230643,-0.454489,0.036424,-0.022297
1,0.036068,0.164903,-0.198505,0.190031,-0.106824,-0.548203,0.087713,0.76015,-0.502068,-0.783824,...,0.381369,0.170673,-0.185213,0.276593,0.814066,0.399272,0.251647,-0.380382,-0.173469,-0.159441
2,-0.171994,0.380069,-0.126491,0.216998,-0.147193,-0.456888,0.173166,0.703797,-0.480871,-0.521802,...,0.335733,0.138489,-0.017147,0.224223,0.813675,0.354883,0.284375,-0.369756,-0.17159,-0.127394
3,-0.250549,0.368768,-0.05613,0.201408,-0.159876,-0.356677,-0.119307,0.57925,-0.471635,-0.352775,...,0.244894,0.163364,0.060336,0.11317,0.639825,0.581436,0.089397,-0.186249,-0.067916,-0.206023
4,-0.216182,0.245601,0.183347,0.588376,0.083498,-0.610692,0.323728,0.837952,-0.447431,-0.581264,...,0.282189,0.142825,-0.118187,0.169801,0.740734,0.231928,0.24471,-0.208078,-0.378202,0.01091


In [38]:
#append review column to word2vec dataframe
df['target'] = data['review']
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,target
0,-0.020624,0.290489,-0.207166,-0.089109,0.014833,-0.474143,0.189333,0.530447,-0.180703,-0.247802,...,0.060727,0.007973,0.108346,0.424585,0.117388,0.230643,-0.454489,0.036424,-0.022297,1
1,0.036068,0.164903,-0.198505,0.190031,-0.106824,-0.548203,0.087713,0.76015,-0.502068,-0.783824,...,0.170673,-0.185213,0.276593,0.814066,0.399272,0.251647,-0.380382,-0.173469,-0.159441,1
2,-0.171994,0.380069,-0.126491,0.216998,-0.147193,-0.456888,0.173166,0.703797,-0.480871,-0.521802,...,0.138489,-0.017147,0.224223,0.813675,0.354883,0.284375,-0.369756,-0.17159,-0.127394,1
3,-0.250549,0.368768,-0.05613,0.201408,-0.159876,-0.356677,-0.119307,0.57925,-0.471635,-0.352775,...,0.163364,0.060336,0.11317,0.639825,0.581436,0.089397,-0.186249,-0.067916,-0.206023,1
4,-0.216182,0.245601,0.183347,0.588376,0.083498,-0.610692,0.323728,0.837952,-0.447431,-0.581264,...,0.142825,-0.118187,0.169801,0.740734,0.231928,0.24471,-0.208078,-0.378202,0.01091,1


In [39]:
#check for null values
df.isnull().sum()

0         0
1         0
2         0
3         0
4         0
         ..
96        0
97        0
98        0
99        0
target    0
Length: 101, dtype: int64

In [40]:
#train test split
X = df.drop('target',axis=1)
y = df['target']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [41]:
#model building
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)
#check performance
print(f'Model Performance-----------------------')
print('\n')
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test,y_pred))

Model Performance-----------------------


[[ 435  366]
 [ 212 1387]]
              precision    recall  f1-score   support

           0       0.67      0.54      0.60       801
           1       0.79      0.87      0.83      1599

    accuracy                           0.76      2400
   macro avg       0.73      0.71      0.71      2400
weighted avg       0.75      0.76      0.75      2400

0.7591666666666667


In [42]:
#performance on training data
y_pred_train = rf.predict(X_train)
print(confusion_matrix(y_train,y_pred_train))
print(classification_report(y_train,y_pred_train))
print(accuracy_score(y_train,y_pred_train))

[[3199    0]
 [   0 6401]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3199
           1       1.00      1.00      1.00      6401

    accuracy                           1.00      9600
   macro avg       1.00      1.00      1.00      9600
weighted avg       1.00      1.00      1.00      9600

1.0


In [43]:
#implement boosting algorithms
from sklearn.ensemble import GradientBoostingClassifier,AdaBoostClassifier
gb = GradientBoostingClassifier()
gb.fit(X_train,y_train)
y_pred = gb.predict(X_test)
#check performance
print(f'Model Performance-----------------------')
print('\n')
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test,y_pred))

Model Performance-----------------------


[[ 461  340]
 [ 206 1393]]
              precision    recall  f1-score   support

           0       0.69      0.58      0.63       801
           1       0.80      0.87      0.84      1599

    accuracy                           0.77      2400
   macro avg       0.75      0.72      0.73      2400
weighted avg       0.77      0.77      0.77      2400

0.7725


In [44]:
ad = AdaBoostClassifier()
ad.fit(X_train,y_train)
y_pred = ad.predict(X_test)
#check performance
print(f'Model Performance-----------------------')
print('\n')
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test,y_pred))



Model Performance-----------------------


[[ 446  355]
 [ 230 1369]]
              precision    recall  f1-score   support

           0       0.66      0.56      0.60       801
           1       0.79      0.86      0.82      1599

    accuracy                           0.76      2400
   macro avg       0.73      0.71      0.71      2400
weighted avg       0.75      0.76      0.75      2400

0.75625


In [45]:
from xgboost import XGBClassifier
xgb = AdaBoostClassifier()
xgb.fit(X_train,y_train)
y_pred = xgb.predict(X_test)
#check performance
print(f'Model Performance-----------------------')
print('\n')
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test,y_pred))



Model Performance-----------------------


[[ 446  355]
 [ 230 1369]]
              precision    recall  f1-score   support

           0       0.66      0.56      0.60       801
           1       0.79      0.86      0.82      1599

    accuracy                           0.76      2400
   macro avg       0.73      0.71      0.71      2400
weighted avg       0.75      0.76      0.75      2400

0.75625


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Define the parameter distributions
param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': randint(10, 50),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 5),
}

# Create a Random Forest model
rf = RandomForestClassifier()

# Create RandomizedSearchCV object
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, n_iter=50, cv=5, scoring='recall', n_jobs=-1, random_state=42)

# Fit the random search to the data
random_search.fit(X_train, y_train)

# Get the best parameters and best recall score
best_params = random_search.best_params_
best_recall = random_search.best_score_

print("Best Parameters:", best_params)
print("Best Recall Score:", best_recall)


In [None]:
import joblib

# Assume `rf` is your trained random forest model
joblib_file = 'ad.pkl'
joblib.dump(rf, joblib_file)


In [None]:
import joblib

# Assume `rf` is your trained random forest model
joblib_file = 'rf.pkl'
joblib.dump(rf, joblib_file)


In [None]:
# from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
# from sklearn.metrics import make_scorer, f1_score
# import numpy as np

# # Define the parameter grid
# param_distributions = {
#     'n_estimators': [100, 200, 300, 400, 500],
#     'max_depth': [None, 10, 20, 30, 40, 50],
#     'min_samples_split': [2, 5, 10, 15, 20],
#     'min_samples_leaf': [1, 2, 4, 6, 8],
#     'max_features': ['auto', 'sqrt', 'log2', None]
# }

# # Create a RandomForestClassifier
# rf = RandomForestClassifier()

# # Define stratified k-fold cross-validation
# cv = StratifiedKFold(n_splits=5)

# # Use F1 score as the scoring metric
# f1_scorer = make_scorer(f1_score)

# # Create RandomizedSearchCV object
# random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_distributions,
#                                    scoring=f1_scorer, cv=cv, n_jobs=-1, verbose=2, 
#                                    n_iter=100, random_state=42)

# # Fit the model on the training data
# random_search.fit(X_train, y_train)

# # Print the best parameters and the best F1 score
# print("Best parameters found: ", random_search.best_params_)
# print("Best F1 score: ", random_search.best_score_)

# # Use the best model found by RandomizedSearchCV
# best_rf = random_search.best_estimator_

# # Predict using the best model
# y_pred = best_rf.predict(X_test)

# # Calculate F1 score on the test set
# test_f1_score = f1_score(y_test, y_pred)
# print("Test set F1 score: ", test_f1_score)


In [46]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

#divide data into train and test split
X_train,X_test,y_train,y_test = train_test_split(data['reviewText'],data['review'],test_size=0.20)

## Tokenize the text-creating indexes for words
tokenizer=Tokenizer(num_words=20000)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [47]:
# Adjust based on your data
max_length = 100  
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length)

In [48]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

#apply earlystopping criteria to monitor validation loss
from tensorflow.keras.callbacks import EarlyStopping
# Initialize EarlyStopping callback
early_stopping = EarlyStopping(
    monitor='val_loss',            # Metric to monitor
    patience=3,                    # Number of epochs to wait for improvement
    restore_best_weights=True      # Restore the best weights
)
# Define the model
model = Sequential()
model.add(Embedding(input_dim=20000, output_dim=128, input_length=max_length))
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(64))
model.add(Dense(1, activation='sigmoid'))  # For binary classification

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_split=0.2,callbacks=[early_stopping])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5


In [119]:
# Evaluate model
loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

Test Loss: 0.3983
Test Accuracy: 0.8154


In [120]:
#lets check performance metrics 
y_pred = model.predict(X_test_pad)
y_pred = (y_pred>=0.5).astype(int)
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
print(confusion_matrix(y_test,y_pred))
print('\n')
print(classification_report(y_test,y_pred))

[[ 606  178]
 [ 265 1351]]


              precision    recall  f1-score   support

           0       0.70      0.77      0.73       784
           1       0.88      0.84      0.86      1616

    accuracy                           0.82      2400
   macro avg       0.79      0.80      0.80      2400
weighted avg       0.82      0.82      0.82      2400



In [49]:
# Make predictions
new_reviews = ["I had an amazing experience at McDonald's today! The staff was super friendly, and my order was ready in less than five minutes. The Big Mac I ordered was fresh and tasted delicious, and the fries were hot and crispy, just the way I like them. The restaurant was clean, and the atmosphere was really pleasant. It was a quick and satisfying meal. I’ll definitely be coming back more often!"]
new_reviews_seq = tokenizer.texts_to_sequences(new_reviews)
new_reviews_pad = pad_sequences(new_reviews_seq, maxlen=max_length)
predictions = model.predict(new_reviews_pad)
print(predictions)

[[0.8615594]]
