In [5]:
import pandas as pd
import re
import os
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from collections import Counter
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import tree, metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectFromModel

# Model 1 (Naive Bayes with Count vectorizer)

In [6]:
train = pd.read_csv('../Data/Test-Train/train.csv')
test = pd.read_csv('../Data/Test-Train/test.csv')



In [7]:

X_train = pd.read_csv('../Data/Test-Train(Stopped Words)/train_clean.csv')
y_train = train['rating']

X_test = pd.read_csv('../Data/Test-Train(Stopped Words)/test_clean.csv')
y_test = test['rating']

In [91]:
X_train.shape

(1944, 2)

In [9]:
cv = CountVectorizer(max_df=0.95, min_df=2)

In [118]:
from scipy.sparse import csr_matrix


In [109]:
X_train1mat = cv.fit_transform(X_train['review'].values.astype('U'))
X_test1mat  = cv.transform(X_test['review'].values.astype('U'))

In [124]:
#X_train1mat.to_frame
X_train1 = pd.DataFrame(X_train1mat.todense())
X_test1 = pd.DataFrame(X_test1mat.todense())

In [126]:
X_train1.columns = cv.get_feature_names()
X_test1.columns = cv.get_feature_names()


In [11]:
model = MultinomialNB()
model.fit(X_train1, y_train)

MultinomialNB()

In [12]:
y_pred = model.predict(X_test1)

In [13]:
print("Bayes model accuracy score: ", accuracy_score(y_test, y_pred))

Bayes model accuracy score:  0.7529976019184652


In [235]:
X_train1.head(2)

Unnamed: 0,aa,aamras,abck,able,abroad,absolutely,absorbed,accept,access,accompaniment,...,yo,young,youthful,youtubber,youtube,yr,yummy,zero,zip,zones
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Now we will use stemmed data to see the model prediction

In [128]:
X_train_st = pd.read_csv('../Data/Test-Train(Stemmed Words)/stem_train.csv')
X_test_st = pd.read_csv('../Data/Test-Train(Stemmed Words)/stem_test.csv')

In [129]:
cv2 = CountVectorizer(max_df=0.95, min_df=2)

In [130]:
X_train2 = cv2.fit_transform(X_train_st['review'].values.astype('U'))
X_test2  = cv2.transform(X_test_st['review'].values.astype('U'))

In [131]:
model_st = MultinomialNB()
model_st.fit(X_train2, y_train)

MultinomialNB()

In [132]:
y_pred_st = model_st.predict(X_test2)
y_test.shape
y_pred_st.shape

(834,)

In [133]:
print("Bayes model accuracy score: ", accuracy_score(y_test, y_pred_st))

Bayes model accuracy score:  0.7254196642685852


# Model 2 (Naive Bayes with TF-IDF)

In [134]:
tf_idf = TfidfVectorizer(max_df=0.95, min_df=2)

In [135]:
vect_train = tf_idf.fit_transform(X_train['review'].values.astype('U'))
vect_test = tf_idf.transform(X_test['review'].values.astype('U'))

In [136]:
model2 = MultinomialNB()
model2.fit(vect_train, y_train)

MultinomialNB()

In [137]:
y_pred2 = model2.predict(vect_test)

In [138]:
print("Bayes model accuracy score: ", accuracy_score(y_test, y_pred2))

Bayes model accuracy score:  0.6306954436450839


#### Using cross validation

In [139]:
cv_score = []

In [140]:
scores = cross_val_score(model2,vect_train,y_train,cv=5,scoring='accuracy')
cv_score.append(scores)

In [236]:
print(np.mean(cv_score))

0.6430008215620279


### Using Stemmed Data

In [142]:
tf_idf2 = TfidfVectorizer(max_df=0.95, min_df=2)

vect_train_st = tf_idf2.fit_transform(X_train_st['review'].values.astype('U'))
vect_test_st = tf_idf2.transform(X_test_st['review'].values.astype('U'))

In [143]:
model2_st = MultinomialNB()
model2_st.fit(vect_train_st, y_train)

MultinomialNB()

In [144]:
y_pred2_st = model2_st.predict(vect_test_st)

In [145]:
print("Bayes model accuracy score: ", accuracy_score(y_test, y_pred2_st))

Bayes model accuracy score:  0.6354916067146283


# Model 3 (Decision Trees)

### With count vectorizer and data where only stopped words are removed

## Entropy Model

In [146]:

entr_model = tree.DecisionTreeClassifier(criterion="entropy", random_state = 12)


entr_model.fit(X_train1, y_train)

 
y_pred =  entr_model.predict(X_test1)


entr_model

DecisionTreeClassifier(criterion='entropy', random_state=12)

In [147]:
print("Accuracy:", metrics.accuracy_score(y_test,y_pred))
print("Balanced accuracy:", metrics.balanced_accuracy_score(y_test,y_pred))




Accuracy: 0.8836930455635491
Balanced accuracy: 0.8236072414502609


### Using stemmed data

In [148]:
entr_model2 = tree.DecisionTreeClassifier(criterion="entropy", random_state = 12)

entr_model2.fit(X_train2, y_train)

y_pred_st =  entr_model2.predict(X_test2)

In [149]:
print("Accuracy:", metrics.accuracy_score(y_test,y_pred_st))
print("Balanced accuracy:", metrics.balanced_accuracy_score(y_test,y_pred_st))

Accuracy: 0.8812949640287769
Balanced accuracy: 0.8329400355003627


## Gini Impurity Model

In [150]:

gini_model = tree.DecisionTreeClassifier(criterion="gini", random_state = 12)


gini_model.fit(X_train1, y_train)

 
y_pred =  gini_model.predict(X_test1)


gini_model

DecisionTreeClassifier(random_state=12)

In [151]:
print("Accuracy:", metrics.accuracy_score(y_test,y_pred))
print("Balanced accuracy:", metrics.balanced_accuracy_score(y_test,y_pred))

Accuracy: 0.8860911270983214
Balanced accuracy: 0.8359671462265739


# Random Forrests

In [152]:
rfmodel = RandomForestClassifier()

In [153]:
rfmodel.fit(X_train1, y_train)

RandomForestClassifier()

In [154]:
y_pred = rfmodel.predict(X_test1)

In [155]:
print("Accuracy:", metrics.accuracy_score(y_test,y_pred))
print("Balanced accuracy:", metrics.balanced_accuracy_score(y_test,y_pred))

Accuracy: 0.8908872901678657
Balanced accuracy: 0.8019405373333697


### Now we predict on the data where we have removed common words from the ratings

In [156]:
X_train_stop = pd.read_csv('../Data/Test-Train(Common Words)/train_stop.csv')
X_test_stop = pd.read_csv('../Data/Test-Train(Common Words)/test_stop.csv')

In [157]:
cv3 = CountVectorizer(max_df=0.95, min_df=2)

In [158]:
X_train3 = cv3.fit_transform(X_train_stop['review'].values.astype('U'))
X_test3  = cv3.transform(X_test_stop['review'].values.astype('U'))

In [159]:
rfmodel2 = RandomForestClassifier()

In [160]:
rfmodel2.fit(X_train3, y_train)

RandomForestClassifier()

In [161]:
y_pred2 = rfmodel2.predict(X_test3)

In [162]:
print("Accuracy:", metrics.accuracy_score(y_test,y_pred2))
print("Balanced accuracy:", metrics.balanced_accuracy_score(y_test,y_pred2))

Accuracy: 0.8860911270983214
Balanced accuracy: 0.797523644184727


In [163]:
X_train3.shape

(1944, 3401)

In [164]:
type(X_train1)

pandas.core.frame.DataFrame

# Feature Importance

In [165]:
rf = RandomForestClassifier(n_estimators = 90,
                           n_jobs = -1,
                           oob_score = True,
                           bootstrap = True,
                           random_state = 42)

In [166]:
rf.fit(X_train1, y_train)

RandomForestClassifier(n_estimators=90, n_jobs=-1, oob_score=True,
                       random_state=42)

In [167]:
print('R^2 Training Score: {:.2f} \nOOB Score: {:.2f} \nR^2 Validation Score: {:.2f}'.format(rf.score(X_train1, y_train), 
                                                                                             rf.oob_score_,
                                                                                             rf.score(X_test1, y_test)))

R^2 Training Score: 0.98 
OOB Score: 0.90 
R^2 Validation Score: 0.90


In [168]:
X_train1.shape

(1944, 3424)

In [169]:
X_test1.shape

(834, 3424)

# Dimensionality Reduction

In [170]:
from sklearn.decomposition import TruncatedSVD, SparsePCA

In [171]:
svd = TruncatedSVD(
  n_components=1000,
  n_iter=10,
  random_state=42
  )

X_tran = svd.fit_transform(X_train1)
X_tran.shape

X_tes = svd.transform(X_test1)
X_tes.shape

(834, 1000)

In [172]:
spca = SparsePCA(
  n_components=100,
  random_state=0
  )

## first reduced by SVD, then PCA.
X_tran1 = spca.fit_transform(X_tran)

X_tran1.shape

X_tes1 = spca.transform(X_tes)
X_tes1.shape

(834, 100)

### Modelling

In [173]:
rfmodel.fit(X_tran1, y_train)

RandomForestClassifier()

In [174]:
y_pred22 = rfmodel.predict(X_tes1)

In [175]:
X_tran1.shape

(1944, 100)

In [176]:
print("Accuracy:", metrics.accuracy_score(y_test,y_pred22))
print("Balanced accuracy:", metrics.balanced_accuracy_score(y_test,y_pred22))

print('R^2 Training Score: {:.2f}  \nR^2 Validation Score: {:.2f}'.format(rfmodel.score(X_tran1, y_train), 
                                                                                               rfmodel.score(X_tes1, y_test)))

Accuracy: 0.8908872901678657
Balanced accuracy: 0.7952588802317558
R^2 Training Score: 0.98  
R^2 Validation Score: 0.89


In [177]:
rf.fit(X_tran1,y_train)

RandomForestClassifier(n_estimators=90, n_jobs=-1, oob_score=True,
                       random_state=42)

In [178]:
print('R^2 Training Score: {:.2f} \nOOB Score: {:.2f} \nR^2 Validation Score: {:.2f}'.format(rf.score(X_tran1, y_train), 
                                                                                             rf.oob_score_,
                                                                                             rf.score(X_tes1, y_test)))

R^2 Training Score: 0.98 
OOB Score: 0.89 
R^2 Validation Score: 0.89


In [179]:
ypred23 = rf.predict(X_tes1)

In [180]:
print("Accuracy:", metrics.accuracy_score(y_test,ypred23))

Accuracy: 0.8860911270983214


In [181]:
rfmodel.feature_importances_

array([0.00997038, 0.0097137 , 0.013505  , 0.00810418, 0.01243502,
       0.01056626, 0.01594434, 0.01060571, 0.01162146, 0.01336263,
       0.01777732, 0.01099917, 0.01129626, 0.00857197, 0.00904856,
       0.00834599, 0.01393438, 0.00965259, 0.01559797, 0.00843689,
       0.01097924, 0.02502369, 0.0090787 , 0.00826345, 0.01422886,
       0.00922644, 0.00857841, 0.0118356 , 0.00812864, 0.00970515,
       0.00831612, 0.00968527, 0.01080852, 0.01199424, 0.00863786,
       0.00979761, 0.01259497, 0.00890002, 0.00812259, 0.01205331,
       0.01313665, 0.00959354, 0.00910414, 0.00783414, 0.01047679,
       0.00946371, 0.00899411, 0.01037644, 0.00960171, 0.00962663,
       0.00907953, 0.00844112, 0.00880764, 0.00907463, 0.00930107,
       0.00974398, 0.00810481, 0.01040433, 0.00710946, 0.00997325,
       0.00679276, 0.00889155, 0.00679695, 0.00808067, 0.00821398,
       0.0082596 , 0.01046472, 0.00823473, 0.00919304, 0.00979968,
       0.00803574, 0.01198832, 0.00960017, 0.00914658, 0.00816

# Feature Importance and dimensionality reduction

In [182]:
rf_new = RandomForestClassifier(n_estimators = 90,
                           n_jobs = -1,
                           oob_score = True,
                           bootstrap = True,
                           random_state = 42)

In [183]:
rf_new.fit(X_train1, y_train)

RandomForestClassifier(n_estimators=90, n_jobs=-1, oob_score=True,
                       random_state=42)

In [184]:
imp = rf_new.feature_importances_

In [185]:
print('Feature Importances')

Feature Importances


In [187]:
indices = np.argsort(imp)[::-1]
#for f in range(X_train1.shape[1]):
#    print("%d. feature %d (%f)" % (f + 1, indices[f], imp[indices[f]]))


In [188]:
#help(SelectFromModel)

In [237]:
X_train1.shape

(1944, 3424)

### Selection of Features

In [239]:
sfm = SelectFromModel(rf_new,threshold=0.001)

In [240]:
sfm.fit(X_train1,y_train)

SelectFromModel(estimator=RandomForestClassifier(n_estimators=90, n_jobs=-1,
                                                 oob_score=True,
                                                 random_state=42),
                threshold=0.001)

In [241]:
X_imp_train.shape

(1944, 216)

In [228]:
X_imp_train = sfm.transform(X_train1)
X_imp_test = sfm.transform(X_test1)

### Training a new random forrest model with important feature dataset

In [229]:
rf_imp = RandomForestClassifier(n_estimators = 90,
                           n_jobs = -1,
                           oob_score = True,
                           bootstrap = True,
                           random_state = 42)

In [230]:
rf_imp.fit(X_imp_train,y_train)

RandomForestClassifier(n_estimators=90, n_jobs=-1, oob_score=True,
                       random_state=42)

In [231]:
ypred = rf_imp.predict(X_imp_test)

In [232]:
accuracy_score(y_test,ypred)

0.8920863309352518

In [209]:
print(type(X_imp_test))

<class 'numpy.ndarray'>
