#  Business Review Rating - predict the star rating


## Load dataset

In [137]:
import numpy as np
import pandas as pd

In [138]:
data = pd.read_csv('data.csv')

In [139]:
data.head()

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,1,0
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0


In [140]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
business_id    10000 non-null object
date           10000 non-null object
review_id      10000 non-null object
stars          10000 non-null int64
text           10000 non-null object
type           10000 non-null object
user_id        10000 non-null object
cool           10000 non-null int64
useful         10000 non-null int64
funny          10000 non-null int64
dtypes: int64(4), object(6)
memory usage: 781.3+ KB


## New DataFrame for 5-star and 1-star reviews

In [141]:
df = data[(data.stars == 5) | (data.stars == 1)]

In [142]:
df.head()

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0
6,zp713qNhx8d9KCJJnrw1xA,2010-02-12,riFQ3vxNpP4rWLk_CSri2A,5,Drop what you're doing and drive here. After I...,review,wFweIWhv2fREZV_dYkz_1g,7,7,4


In [143]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4086 entries, 0 to 9999
Data columns (total 10 columns):
business_id    4086 non-null object
date           4086 non-null object
review_id      4086 non-null object
stars          4086 non-null int64
text           4086 non-null object
type           4086 non-null object
user_id        4086 non-null object
cool           4086 non-null int64
useful         4086 non-null int64
funny          4086 non-null int64
dtypes: int64(4), object(6)
memory usage: 351.1+ KB


## Slice the Feature and target and split into train and test

In [144]:
df1 = df.iloc[:,3:5]

In [145]:
df1.head()

Unnamed: 0,stars,text
0,5,My wife took me here on my birthday for breakf...
1,5,I have no idea why some people give bad review...
3,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!..."
4,5,General Manager Scott Petello is a good egg!!!...
6,5,Drop what you're doing and drive here. After I...


In [146]:
X = df1.text
y = df1.stars

In [147]:
y.head()

0    5
1    5
3    5
4    5
6    5
Name: stars, dtype: int64

### The feature is a pandas Series

In [148]:
type(X)

pandas.core.series.Series

In [149]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [150]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

## CountVectorizer and create document-term matrices from X_train and X_test

In [151]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

In [152]:
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

## Multinomial Naive Bayes classifier to predict the reviews for testing set

In [153]:
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()

In [154]:
classifier.fit(X_train_dtm, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [155]:
pred_rating = classifier.predict(X_test_dtm)

In [156]:
from sklearn import metrics
from sklearn.metrics import classification_report,accuracy_score

### Accuracy and confusion matrix

In [157]:
print(metrics.classification_report(y_test, pred_rating))
print('Accuracy of the classifier on test set: {:.2f}'.format(accuracy_score(y_test, pred_rating)))

              precision    recall  f1-score   support

           1       0.91      0.65      0.76       237
           5       0.92      0.98      0.95       989

   micro avg       0.92      0.92      0.92      1226
   macro avg       0.92      0.82      0.85      1226
weighted avg       0.92      0.92      0.91      1226

Accuracy of the classifier on test set: 0.92
