In [72]:
import pandas as pd
import re
import os
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from collections import Counter
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import tree, metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectFromModel

# Model 1 (Naive Bayes with Count vectorizer)

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')



In [3]:

X_train = pd.read_csv('train_clean.csv')
y_train = train['rating']

X_test = pd.read_csv('test_clean.csv')
y_test = test['rating']

In [4]:
y_train.shape

(1944,)

In [5]:
cv = CountVectorizer(max_df=0.95, min_df=2)

In [6]:
X_train1 = cv.fit_transform(X_train['review'].values.astype('U'))
X_test1  = cv.transform(X_test['review'].values.astype('U'))

In [7]:
model = MultinomialNB()
model.fit(X_train1, y_train)

MultinomialNB()

In [8]:
y_pred = model.predict(X_test1)

In [9]:
print("Bayes model accuracy score: ", accuracy_score(y_test, y_pred))

Bayes model accuracy score:  0.7529976019184652


### Now we will use stemmed data to see the model prediction

In [10]:
X_train_st = pd.read_csv('stem_train.csv')
X_test_st = pd.read_csv('stem_test.csv')

In [11]:
cv2 = CountVectorizer(max_df=0.95, min_df=2)

In [12]:
X_train2 = cv2.fit_transform(X_train_st['review'].values.astype('U'))
X_test2  = cv2.transform(X_test_st['review'].values.astype('U'))

In [13]:
model_st = MultinomialNB()
model_st.fit(X_train2, y_train)

MultinomialNB()

In [14]:
y_pred_st = model_st.predict(X_test2)
y_test.shape
y_pred_st.shape

(834,)

In [15]:
print("Bayes model accuracy score: ", accuracy_score(y_test, y_pred_st))

Bayes model accuracy score:  0.7254196642685852


# Model 2 (Naive Bayes with TF-IDF)

In [16]:
tf_idf = TfidfVectorizer(max_df=0.95, min_df=2)

In [17]:
vect_train = tf_idf.fit_transform(X_train['review'].values.astype('U'))
vect_test = tf_idf.transform(X_test['review'].values.astype('U'))

In [18]:
model2 = MultinomialNB()
model2.fit(vect_train, y_train)

MultinomialNB()

In [19]:
y_pred2 = model2.predict(vect_test)

In [20]:
print("Bayes model accuracy score: ", accuracy_score(y_test, y_pred2))

Bayes model accuracy score:  0.6306954436450839


#### Using cross validation

In [21]:
cv_score = []

In [22]:
scores = cross_val_score(model2,vect_train,y_train,cv=5,scoring='accuracy')
cv_score.append(scores)

In [23]:
print(cv_score)

[array([0.62210797, 0.63753213, 0.65809769, 0.66066838, 0.63659794])]


### Using Stemmed Data

In [24]:
tf_idf2 = TfidfVectorizer(max_df=0.95, min_df=2)

vect_train_st = tf_idf2.fit_transform(X_train_st['review'].values.astype('U'))
vect_test_st = tf_idf2.transform(X_test_st['review'].values.astype('U'))

In [25]:
model2_st = MultinomialNB()
model2_st.fit(vect_train_st, y_train)

MultinomialNB()

In [26]:
y_pred2_st = model2_st.predict(vect_test_st)

In [27]:
print("Bayes model accuracy score: ", accuracy_score(y_test, y_pred2_st))

Bayes model accuracy score:  0.6354916067146283


# Model 3 (Decision Trees)

### With count vectorizer and data where only stopped words are removed

## Entropy Model

In [28]:

entr_model = tree.DecisionTreeClassifier(criterion="entropy", random_state = 12)


entr_model.fit(X_train1, y_train)

 
y_pred =  entr_model.predict(X_test1)


entr_model

DecisionTreeClassifier(criterion='entropy', random_state=12)

In [29]:
print("Accuracy:", metrics.accuracy_score(y_test,y_pred))
print("Balanced accuracy:", metrics.balanced_accuracy_score(y_test,y_pred))




Accuracy: 0.8836930455635491
Balanced accuracy: 0.8236072414502609


### Using stemmed data

In [30]:
entr_model2 = tree.DecisionTreeClassifier(criterion="entropy", random_state = 12)

entr_model2.fit(X_train2, y_train)

y_pred_st =  entr_model2.predict(X_test2)

In [31]:
print("Accuracy:", metrics.accuracy_score(y_test,y_pred_st))
print("Balanced accuracy:", metrics.balanced_accuracy_score(y_test,y_pred_st))

Accuracy: 0.8812949640287769
Balanced accuracy: 0.8329400355003627


## Gini Impurity Model

In [32]:

gini_model = tree.DecisionTreeClassifier(criterion="gini", random_state = 12)


gini_model.fit(X_train1, y_train)

 
y_pred =  gini_model.predict(X_test1)


gini_model

DecisionTreeClassifier(random_state=12)

In [33]:
print("Accuracy:", metrics.accuracy_score(y_test,y_pred))
print("Balanced accuracy:", metrics.balanced_accuracy_score(y_test,y_pred))

Accuracy: 0.8860911270983214
Balanced accuracy: 0.8359671462265739


# Random Forrests

In [34]:
rfmodel = RandomForestClassifier()

In [35]:
rfmodel.fit(X_train1, y_train)

RandomForestClassifier()

In [36]:
y_pred = rfmodel.predict(X_test1)

In [37]:
print("Accuracy:", metrics.accuracy_score(y_test,y_pred))
print("Balanced accuracy:", metrics.balanced_accuracy_score(y_test,y_pred))

Accuracy: 0.8908872901678657
Balanced accuracy: 0.8077384982491314


### Now we predict on the data where we have removed common words from the ratings

In [38]:
X_train_stop = pd.read_csv('train_stop.csv')
X_test_stop = pd.read_csv('test_stop.csv')

In [39]:
cv3 = CountVectorizer(max_df=0.95, min_df=2)

In [40]:
X_train3 = cv3.fit_transform(X_train_stop['review'].values.astype('U'))
X_test3  = cv3.transform(X_test_stop['review'].values.astype('U'))

In [41]:
rfmodel2 = RandomForestClassifier()

In [42]:
rfmodel2.fit(X_train3, y_train)

RandomForestClassifier()

In [43]:
y_pred2 = rfmodel2.predict(X_test3)

In [44]:
print("Accuracy:", metrics.accuracy_score(y_test,y_pred2))
print("Balanced accuracy:", metrics.balanced_accuracy_score(y_test,y_pred2))

Accuracy: 0.8884892086330936
Balanced accuracy: 0.8020226163827999


In [45]:
X_train3.shape

(1944, 3401)

In [46]:
type(X_train1)

scipy.sparse.csr.csr_matrix

# Feature Importance

In [47]:
rf = RandomForestClassifier(n_estimators = 90,
                           n_jobs = -1,
                           oob_score = True,
                           bootstrap = True,
                           random_state = 42)

In [48]:
rf.fit(X_train1, y_train)

RandomForestClassifier(n_estimators=90, n_jobs=-1, oob_score=True,
                       random_state=42)

In [49]:
print('R^2 Training Score: {:.2f} \nOOB Score: {:.2f} \nR^2 Validation Score: {:.2f}'.format(rf.score(X_train1, y_train), 
                                                                                             rf.oob_score_,
                                                                                             rf.score(X_test1, y_test)))

R^2 Training Score: 0.98 
OOB Score: 0.90 
R^2 Validation Score: 0.90


In [50]:
X_train1.shape

(1944, 3424)

In [51]:
X_test1.shape

(834, 3424)

# Dimensionality Reduction

In [52]:
from sklearn.decomposition import TruncatedSVD, SparsePCA

In [53]:
svd = TruncatedSVD(
  n_components=1000,
  n_iter=10,
  random_state=42
  )

X_tran = svd.fit_transform(X_train1)
X_tran.shape

X_tes = svd.transform(X_test1)
X_tes.shape

(834, 1000)

In [54]:
spca = SparsePCA(
  n_components=100,
  random_state=0
  )

## first reduced by SVD, then PCA.
X_tran1 = spca.fit_transform(X_tran)

X_tran1.shape

X_tes1 = spca.transform(X_tes)
X_tes1.shape

(834, 100)

### Modelling

In [55]:
rfmodel.fit(X_tran1, y_train)

RandomForestClassifier()

In [56]:
y_pred22 = rfmodel.predict(X_tes1)

In [57]:
X_tran1.shape

(1944, 100)

In [58]:
print("Accuracy:", metrics.accuracy_score(y_test,y_pred22))
print("Balanced accuracy:", metrics.balanced_accuracy_score(y_test,y_pred22))

print('R^2 Training Score: {:.2f}  \nR^2 Validation Score: {:.2f}'.format(rfmodel.score(X_tran1, y_train), 
                                                                                               rfmodel.score(X_tes1, y_test)))

Accuracy: 0.8884892086330936
Balanced accuracy: 0.7960256192135978
R^2 Training Score: 0.98  
R^2 Validation Score: 0.89


In [59]:
rf.fit(X_tran1,y_train)

RandomForestClassifier(n_estimators=90, n_jobs=-1, oob_score=True,
                       random_state=42)

In [60]:
print('R^2 Training Score: {:.2f} \nOOB Score: {:.2f} \nR^2 Validation Score: {:.2f}'.format(rf.score(X_tran1, y_train), 
                                                                                             rf.oob_score_,
                                                                                             rf.score(X_tes1, y_test)))

R^2 Training Score: 0.98 
OOB Score: 0.89 
R^2 Validation Score: 0.90


In [61]:
ypred23 = rf.predict(X_tes1)

In [62]:
print("Accuracy:", metrics.accuracy_score(y_test,ypred23))

Accuracy: 0.89568345323741


In [63]:
rfmodel.feature_importances_

array([0.01031083, 0.00958837, 0.01344841, 0.00960382, 0.01276908,
       0.00914062, 0.01827445, 0.01061327, 0.01082383, 0.01115736,
       0.01961748, 0.01293818, 0.00994927, 0.01007094, 0.01140091,
       0.00927881, 0.01272804, 0.00837437, 0.01409377, 0.00865978,
       0.00988255, 0.02354814, 0.00872129, 0.00790482, 0.01364958,
       0.01038906, 0.00856761, 0.0093749 , 0.00849627, 0.00859203,
       0.00769773, 0.00964357, 0.01192117, 0.01331596, 0.00962847,
       0.00951091, 0.01186849, 0.00803994, 0.00869492, 0.01202252,
       0.01667612, 0.00849628, 0.00877569, 0.00869453, 0.00879189,
       0.01014495, 0.00881964, 0.00894517, 0.00896983, 0.00957124,
       0.00923552, 0.00760661, 0.00770914, 0.00981698, 0.00837546,
       0.00836974, 0.00874924, 0.00971303, 0.00829945, 0.00896071,
       0.00732456, 0.00804662, 0.00758449, 0.00874537, 0.01036644,
       0.00863107, 0.00920317, 0.00924602, 0.01007277, 0.00958985,
       0.00962521, 0.01268859, 0.00863016, 0.00775541, 0.00886

# Feature Importance and dimensionality reduction

In [65]:
rf_new = RandomForestClassifier(n_estimators = 90,
                           n_jobs = -1,
                           oob_score = True,
                           bootstrap = True,
                           random_state = 42)

In [66]:
rf_new.fit(X_train1, y_train)

RandomForestClassifier(n_estimators=90, n_jobs=-1, oob_score=True,
                       random_state=42)

In [67]:
imp = rf_new.feature_importances_

In [68]:
print('Feature Importances')

Feature Importances


In [69]:
indices = np.argsort(imp)[::-1]
for f in range(X_train1.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], imp[indices[f]]))


1. feature 1269 (0.013804)
2. feature 2324 (0.013429)
3. feature 255 (0.010309)
4. feature 2436 (0.009044)
5. feature 2046 (0.007698)
6. feature 3310 (0.007639)
7. feature 2528 (0.007514)
8. feature 203 (0.007308)
9. feature 1981 (0.006990)
10. feature 901 (0.006075)
11. feature 380 (0.005924)
12. feature 1591 (0.005904)
13. feature 183 (0.005801)
14. feature 1739 (0.005554)
15. feature 830 (0.005509)
16. feature 343 (0.005365)
17. feature 1053 (0.004953)
18. feature 2061 (0.004851)
19. feature 3010 (0.004679)
20. feature 699 (0.004551)
21. feature 2768 (0.004487)
22. feature 1676 (0.004462)
23. feature 1677 (0.004336)
24. feature 2047 (0.004277)
25. feature 3102 (0.004272)
26. feature 903 (0.004040)
27. feature 1915 (0.003997)
28. feature 2107 (0.003952)
29. feature 3386 (0.003775)
30. feature 2431 (0.003762)
31. feature 1292 (0.003759)
32. feature 758 (0.003689)
33. feature 3017 (0.003687)
34. feature 2381 (0.003552)
35. feature 2622 (0.003544)
36. feature 90 (0.003435)
37. feature 1

1312. feature 2247 (0.000137)
1313. feature 71 (0.000137)
1314. feature 1075 (0.000136)
1315. feature 2299 (0.000136)
1316. feature 2194 (0.000136)
1317. feature 3299 (0.000136)
1318. feature 2825 (0.000136)
1319. feature 1358 (0.000136)
1320. feature 972 (0.000136)
1321. feature 1526 (0.000135)
1322. feature 1361 (0.000135)
1323. feature 2039 (0.000135)
1324. feature 43 (0.000135)
1325. feature 856 (0.000135)
1326. feature 2485 (0.000134)
1327. feature 1580 (0.000134)
1328. feature 2706 (0.000134)
1329. feature 3183 (0.000134)
1330. feature 174 (0.000134)
1331. feature 1025 (0.000133)
1332. feature 2156 (0.000133)
1333. feature 2089 (0.000133)
1334. feature 2638 (0.000133)
1335. feature 1360 (0.000133)
1336. feature 1334 (0.000132)
1337. feature 3178 (0.000132)
1338. feature 2736 (0.000132)
1339. feature 2960 (0.000132)
1340. feature 2373 (0.000132)
1341. feature 1015 (0.000132)
1342. feature 2437 (0.000132)
1343. feature 3292 (0.000132)
1344. feature 2185 (0.000131)
1345. feature 213

2323. feature 2864 (0.000029)
2324. feature 788 (0.000029)
2325. feature 2248 (0.000029)
2326. feature 3088 (0.000029)
2327. feature 2517 (0.000029)
2328. feature 2807 (0.000029)
2329. feature 566 (0.000029)
2330. feature 30 (0.000029)
2331. feature 3003 (0.000029)
2332. feature 29 (0.000028)
2333. feature 422 (0.000028)
2334. feature 2542 (0.000028)
2335. feature 872 (0.000028)
2336. feature 2455 (0.000028)
2337. feature 1551 (0.000028)
2338. feature 2361 (0.000028)
2339. feature 1320 (0.000028)
2340. feature 787 (0.000028)
2341. feature 223 (0.000028)
2342. feature 628 (0.000028)
2343. feature 548 (0.000028)
2344. feature 3160 (0.000028)
2345. feature 1561 (0.000028)
2346. feature 2212 (0.000028)
2347. feature 2932 (0.000028)
2348. feature 1585 (0.000028)
2349. feature 659 (0.000028)
2350. feature 3189 (0.000028)
2351. feature 2543 (0.000028)
2352. feature 937 (0.000028)
2353. feature 301 (0.000028)
2354. feature 1154 (0.000028)
2355. feature 2732 (0.000027)
2356. feature 0 (0.000027

In [75]:
help(SelectFromModel)

Help on class SelectFromModel in module sklearn.feature_selection._from_model:

class SelectFromModel(sklearn.base.MetaEstimatorMixin, sklearn.feature_selection._base.SelectorMixin, sklearn.base.BaseEstimator)
 |  SelectFromModel(estimator, *, threshold=None, prefit=False, norm_order=1, max_features=None)
 |  
 |  Meta-transformer for selecting features based on importance weights.
 |  
 |  .. versionadded:: 0.17
 |  
 |  Parameters
 |  ----------
 |  estimator : object
 |      The base estimator from which the transformer is built.
 |      This can be both a fitted (if ``prefit`` is set to True)
 |      or a non-fitted estimator. The estimator must have either a
 |      ``feature_importances_`` or ``coef_`` attribute after fitting.
 |  
 |  threshold : string, float, optional default None
 |      The threshold value to use for feature selection. Features whose
 |      importance is greater or equal are kept while the others are
 |      discarded. If "median" (resp. "mean"), then the `

### Selection of Features

In [73]:
sfm = SelectFromModel(rf_new,threshold=0.0010)

In [74]:
sfm.fit(X_train1,y_train)

SelectFromModel(estimator=RandomForestClassifier(n_estimators=90, n_jobs=-1,
                                                 oob_score=True,
                                                 random_state=42),
                threshold=0.001)

In [79]:
X_imp_train = sfm.transform(X_train1)
X_imp_test = sfm.transform(X_test1)

### Training a new random forrest model with important feature dataset

In [81]:
rf_imp = RandomForestClassifier(n_estimators = 90,
                           n_jobs = -1,
                           oob_score = True,
                           bootstrap = True,
                           random_state = 42)

In [82]:
rf_imp.fit(X_imp_train,y_train)

RandomForestClassifier(n_estimators=90, n_jobs=-1, oob_score=True,
                       random_state=42)

In [83]:
ypred = rf_imp.predict(X_imp_test)

In [84]:
accuracy_score(y_test,ypred)

0.8920863309352518