In [32]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 
import nltk 
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split 
import re 

In [2]:
reviews_df = pd.read_csv('Book_reviews.csv')
reviews_df.head(3)

Unnamed: 0.1,Unnamed: 0,rating,reviewText,summary
0,0,5,This book was the very first bookmobile book I...,50 + years ago...
1,1,1,"When I read the description for this book, I c...",Boring! Boring! Boring!
2,2,5,I just had to edit this review. This book is a...,Wiggleliscious/new toy ready/!!


## Data Preprocessing Part

In [3]:
reviews_df = reviews_df.iloc[:,[1,2]]
reviews_df.head(3)

Unnamed: 0,rating,reviewText
0,5,This book was the very first bookmobile book I...
1,1,"When I read the description for this book, I c..."
2,5,I just had to edit this review. This book is a...


In [4]:
reviews_df.rating.value_counts()

5    3000
4    3000
1    2000
3    2000
2    2000
Name: rating, dtype: int64

In [5]:
reviews_df.loc[(reviews_df['rating'] == 1) | (reviews_df['rating'] == 2), 'rating'] = 'Bad'
reviews_df.loc[reviews_df['rating'] == 3, 'rating'] = 'Average'
reviews_df.loc[(reviews_df['rating'] == 4) | (reviews_df['rating'] == 5), 'rating'] = 'Good'

In [6]:
reviews_df['rating'].value_counts()

Good       6000
Bad        4000
Average    2000
Name: rating, dtype: int64

In [7]:
reviews_df

Unnamed: 0,rating,reviewText
0,Good,This book was the very first bookmobile book I...
1,Bad,"When I read the description for this book, I c..."
2,Good,I just had to edit this review. This book is a...
3,Good,I don't normally buy 'mystery' novels because ...
4,Good,"This isn't the kind of book I normally read, a..."
...,...,...
11995,Bad,Had to read certain passages twice--typos. Wi...
11996,Average,Not what i expected. yet a very interesting bo...
11997,Good,Dragon Knights is a world where Knights ride d...
11998,Good,"Since this story is very short, it's hard to s..."


In [8]:
# def filtering_text(text):
filtered_text=[]
stemming=PorterStemmer()
for i in range(len(reviews_df)):
    sentence_preprocessing = re.sub('[^a-zA-z]',' ',reviews_df.reviewText[i])
    sentence_preprocessing = sentence_preprocessing.lower()
    sentence_preprocessing = sentence_preprocessing.split()

    sec_processed=[]
    for word in sentence_preprocessing:
        if word not in stopwords.words('english'): # to remove puntuations 
            stem_word=stemming.stem(word)
            sec_processed.append(stem_word)


    sentence_preprocessing = ' '.join(sec_processed)    
    filtered_text.append(sentence_preprocessing)

#    filtered_text_2.append(sentence_preprocessing)

In [9]:
#filtered_text_2[100]

In [12]:
filtered_text[0]

'book first bookmobil book bought school book club love stori bet dollar donut love memori serv bought book th grade would look forward reliv memori'

In [13]:
reviews_df['CLeaned_Reviewes'] = filtered_text

In [14]:
reviews_df.iloc[-1,1]

'from 1922 an amazing collection of info on symbols from cultures around the world and zodiac signs and gems.  while there are some ocr errors, there are also a very comprehensive linked table of contents and index.'

In [52]:
reviews_df['rating']

0           Good
1            Bad
2           Good
3           Good
4           Good
          ...   
11995        Bad
11996    Average
11997       Good
11998       Good
11999       Good
Name: rating, Length: 12000, dtype: object

In [15]:
x_train,x_test,y_train,y_test = train_test_split(reviews_df['CLeaned_Reviewes'], reviews_df['rating'],test_size=0.2,random_state=10,stratify=reviews_df['rating'])

In [16]:
pd.DataFrame(y_train).value_counts()

rating 
Good       4800
Bad        3200
Average    1600
dtype: int64

In [17]:
vectorizer= TfidfVectorizer(input = 'english')
x_train_transformed = vectorizer.fit_transform(x_train)
x_test_transformed = vectorizer.transform(x_test)

## Modeling Part

In [18]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression 

model = RandomForestClassifier(n_estimators=300)
model.fit(x_train_transformed,y_train)


model2 = LogisticRegression(max_iter = 250)
model2.fit(x_train_transformed,y_train)

LogisticRegression(max_iter=250)

In [19]:
y_pred = model.predict(x_test_transformed)
y_pred2 = model2.predict(x_test_transformed)

In [20]:
from sklearn.metrics import accuracy_score, confusion_matrix  

In [66]:
print(accuracy_score(y_test,y_pred))
confusion_matrix(y_test,y_pred)

0.7020833333333333


array([[   5,  117,  278],
       [   1,  565,  234],
       [   0,   85, 1115]], dtype=int64)

In [38]:
accuracy_score(y_test, y_pred2)
print(confusion_matrix(y_test,y_pred2))

[[  71  129  200]
 [  33  625  142]
 [  31   89 1080]]


In [39]:
#Saving the transformaion
import joblib
joblib.dump(vectorizer, 'TFDF_Transformation')

['TFDF_Transformation']

In [36]:
#SAving the model 
# joblib.dump(model2,'logistic_regression_model')
# joblib.dump(model,'RandomForest_model')

['RandomForest_model']

In [24]:
a = "This book is very bad"
a = re.sub('[^a-zA-z]',' ',a)
print((a))
b = vectorizer.transform([a]) 

This book is very bad


In [25]:
model2.predict(b)

array(['Bad'], dtype=object)

In [26]:
model.predict(b)

array(['Bad'], dtype=object)

## Code for the project part 

In [85]:
import joblib 
model = joblib.load('logistic_regression_model')
TFIDF_vectorizer = joblib.load('TFDF_Transformation')

def prediction(text):
    text = re.sub('[^a-zA-z]',' ',text)
    text = text.lower()
    text = text.split()

    new_text=[]
    for word in text:
        if word not in stopwords.words('english'): # to remove puntuations 
            stem_word=stemming.stem(word)
            new_text.append(stem_word)

    new_text = ' '.join(new_text) 
    
    transformed_text = TFIDF_vectorizer.transform([new_text])
    prediction = model.predict(transformed_text)
    
    
    return prediction[0]

In [86]:
prediction(text)

'Average'