In [1]:
import pandas as pd
import re
import os
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from collections import Counter
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import tree, metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

# Model 1 (Naive Bayes with Count vectorizer)

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')



In [3]:

X_train = pd.read_csv('train_clean.csv')
y_train = train['rating']

X_test = pd.read_csv('test_clean.csv')
y_test = test['rating']

In [4]:
y_train.shape

(1944,)

In [5]:
cv = CountVectorizer(max_df=0.95, min_df=2)

In [6]:
X_train1 = cv.fit_transform(X_train['review'].values.astype('U'))
X_test1  = cv.transform(X_test['review'].values.astype('U'))

In [7]:
model = MultinomialNB()
model.fit(X_train1, y_train)

MultinomialNB()

In [8]:
y_pred = model.predict(X_test1)

In [9]:
print("Bayes model accuracy score: ", accuracy_score(y_test, y_pred))

Bayes model accuracy score:  0.7529976019184652


### Now we will use stemmed data to see the model prediction

In [10]:
X_train_st = pd.read_csv('stem_train.csv')
X_test_st = pd.read_csv('stem_test.csv')

In [11]:
cv2 = CountVectorizer(max_df=0.95, min_df=2)

In [12]:
X_train2 = cv2.fit_transform(X_train_st['review'].values.astype('U'))
X_test2  = cv2.transform(X_test_st['review'].values.astype('U'))

In [13]:
model_st = MultinomialNB()
model_st.fit(X_train2, y_train)

MultinomialNB()

In [14]:
y_pred_st = model_st.predict(X_test2)
y_test.shape
y_pred_st.shape

(834,)

In [15]:
print("Bayes model accuracy score: ", accuracy_score(y_test, y_pred_st))

Bayes model accuracy score:  0.7254196642685852


# Model 2 (Naive Bayes with TF-IDF)

In [16]:
tf_idf = TfidfVectorizer(max_df=0.95, min_df=2)

In [17]:
vect_train = tf_idf.fit_transform(X_train['review'].values.astype('U'))
vect_test = tf_idf.transform(X_test['review'].values.astype('U'))

In [18]:
model2 = MultinomialNB()
model2.fit(vect_train, y_train)

MultinomialNB()

In [19]:
y_pred2 = model2.predict(vect_test)

In [20]:
print("Bayes model accuracy score: ", accuracy_score(y_test, y_pred2))

Bayes model accuracy score:  0.6306954436450839


#### Using cross validation

In [21]:
cv_score = []

In [22]:
scores = cross_val_score(model2,vect_train,y_train,cv=5,scoring='accuracy')
cv_score.append(scores)

In [23]:
print(cv_score)

[array([0.62210797, 0.63753213, 0.65809769, 0.66066838, 0.63659794])]


### Using Stemmed Data

In [24]:
tf_idf2 = TfidfVectorizer(max_df=0.95, min_df=2)

vect_train_st = tf_idf2.fit_transform(X_train_st['review'].values.astype('U'))
vect_test_st = tf_idf2.transform(X_test_st['review'].values.astype('U'))

In [25]:
model2_st = MultinomialNB()
model2_st.fit(vect_train_st, y_train)

MultinomialNB()

In [26]:
y_pred2_st = model2_st.predict(vect_test_st)

In [27]:
print("Bayes model accuracy score: ", accuracy_score(y_test, y_pred2_st))

Bayes model accuracy score:  0.6354916067146283


# Model 3 (Decision Trees)

### With count vectorizer and data where only stopped words are removed

## Entropy Model

In [28]:

entr_model = tree.DecisionTreeClassifier(criterion="entropy", random_state = 12)


entr_model.fit(X_train1, y_train)

 
y_pred =  entr_model.predict(X_test1)


entr_model

DecisionTreeClassifier(criterion='entropy', random_state=12)

In [29]:
print("Accuracy:", metrics.accuracy_score(y_test,y_pred))
print("Balanced accuracy:", metrics.balanced_accuracy_score(y_test,y_pred))




Accuracy: 0.8836930455635491
Balanced accuracy: 0.8236072414502609


### Using stemmed data

In [30]:
entr_model2 = tree.DecisionTreeClassifier(criterion="entropy", random_state = 12)

entr_model2.fit(X_train2, y_train)

y_pred_st =  entr_model2.predict(X_test2)

In [31]:
print("Accuracy:", metrics.accuracy_score(y_test,y_pred_st))
print("Balanced accuracy:", metrics.balanced_accuracy_score(y_test,y_pred_st))

Accuracy: 0.8812949640287769
Balanced accuracy: 0.8329400355003627


## Gini Impurity Model

In [32]:

gini_model = tree.DecisionTreeClassifier(criterion="gini", random_state = 12)


gini_model.fit(X_train1, y_train)

 
y_pred =  gini_model.predict(X_test1)


gini_model

DecisionTreeClassifier(random_state=12)

In [33]:
print("Accuracy:", metrics.accuracy_score(y_test,y_pred))
print("Balanced accuracy:", metrics.balanced_accuracy_score(y_test,y_pred))

Accuracy: 0.8860911270983214
Balanced accuracy: 0.8359671462265739


# Random Forrests

In [34]:
rfmodel = RandomForestClassifier()

In [35]:
rfmodel.fit(X_train1, y_train)

RandomForestClassifier()

In [36]:
y_pred = rfmodel.predict(X_test1)

In [37]:
print("Accuracy:", metrics.accuracy_score(y_test,y_pred))
print("Balanced accuracy:", metrics.balanced_accuracy_score(y_test,y_pred))

Accuracy: 0.8884892086330936
Balanced accuracy: 0.7972220663242486


### Now we predict on the data where we have removed common words from the ratings

In [38]:
X_train_stop = pd.read_csv('train_stop.csv')
X_test_stop = pd.read_csv('test_stop.csv')

In [39]:
cv3 = CountVectorizer(max_df=0.95, min_df=2)

In [40]:
X_train3 = cv3.fit_transform(X_train_stop['review'].values.astype('U'))
X_test3  = cv3.transform(X_test_stop['review'].values.astype('U'))

In [41]:
rfmodel2 = RandomForestClassifier()

In [42]:
rfmodel2.fit(X_train3, y_train)

RandomForestClassifier()

In [43]:
y_pred2 = rfmodel2.predict(X_test3)

In [44]:
print("Accuracy:", metrics.accuracy_score(y_test,y_pred2))
print("Balanced accuracy:", metrics.balanced_accuracy_score(y_test,y_pred2))

Accuracy: 0.8788968824940048
Balanced accuracy: 0.794777648761386
