In [2]:
import pandas as pd
import re
import os
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from collections import Counter
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import tree, metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

# Model 1 (Naive Bayes with Count vectorizer)

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')



In [4]:

X_train = pd.read_csv('train_clean.csv')
y_train = train['rating']

X_test = pd.read_csv('test_clean.csv')
y_test = test['rating']

In [5]:
y_train.shape

(1944,)

In [6]:
cv = CountVectorizer(max_df=0.95, min_df=2)

In [7]:
X_train1 = cv.fit_transform(X_train['review'].values.astype('U'))
X_test1  = cv.transform(X_test['review'].values.astype('U'))

In [8]:
model = MultinomialNB()
model.fit(X_train1, y_train)

MultinomialNB()

In [9]:
y_pred = model.predict(X_test1)

In [10]:
print("Bayes model accuracy score: ", accuracy_score(y_test, y_pred))

Bayes model accuracy score:  0.7529976019184652


### Now we will use stemmed data to see the model prediction

In [11]:
X_train_st = pd.read_csv('stem_train.csv')
X_test_st = pd.read_csv('stem_test.csv')

In [12]:
cv2 = CountVectorizer(max_df=0.95, min_df=2)

In [13]:
X_train2 = cv2.fit_transform(X_train_st['review'].values.astype('U'))
X_test2  = cv2.transform(X_test_st['review'].values.astype('U'))

In [14]:
model_st = MultinomialNB()
model_st.fit(X_train2, y_train)

MultinomialNB()

In [15]:
y_pred_st = model_st.predict(X_test2)
y_test.shape
y_pred_st.shape

(834,)

In [16]:
print("Bayes model accuracy score: ", accuracy_score(y_test, y_pred_st))

Bayes model accuracy score:  0.7254196642685852


# Model 2 (Naive Bayes with TF-IDF)

In [17]:
tf_idf = TfidfVectorizer(max_df=0.95, min_df=2)

In [18]:
vect_train = tf_idf.fit_transform(X_train['review'].values.astype('U'))
vect_test = tf_idf.transform(X_test['review'].values.astype('U'))

In [19]:
model2 = MultinomialNB()
model2.fit(vect_train, y_train)

MultinomialNB()

In [20]:
y_pred2 = model2.predict(vect_test)

In [21]:
print("Bayes model accuracy score: ", accuracy_score(y_test, y_pred2))

Bayes model accuracy score:  0.6306954436450839


#### Using cross validation

In [22]:
cv_score = []

In [23]:
scores = cross_val_score(model2,vect_train,y_train,cv=5,scoring='accuracy')
cv_score.append(scores)

In [24]:
print(cv_score)

[array([0.62210797, 0.63753213, 0.65809769, 0.66066838, 0.63659794])]


### Using Stemmed Data

In [25]:
tf_idf2 = TfidfVectorizer(max_df=0.95, min_df=2)

vect_train_st = tf_idf2.fit_transform(X_train_st['review'].values.astype('U'))
vect_test_st = tf_idf2.transform(X_test_st['review'].values.astype('U'))

In [26]:
model2_st = MultinomialNB()
model2_st.fit(vect_train_st, y_train)

MultinomialNB()

In [27]:
y_pred2_st = model2_st.predict(vect_test_st)

In [28]:
print("Bayes model accuracy score: ", accuracy_score(y_test, y_pred2_st))

Bayes model accuracy score:  0.6354916067146283


# Model 3 (Decision Trees)

### With count vectorizer and data where only stopped words are removed

## Entropy Model

In [29]:

entr_model = tree.DecisionTreeClassifier(criterion="entropy", random_state = 12)


entr_model.fit(X_train1, y_train)

 
y_pred =  entr_model.predict(X_test1)


entr_model

DecisionTreeClassifier(criterion='entropy', random_state=12)

In [30]:
print("Accuracy:", metrics.accuracy_score(y_test,y_pred))
print("Balanced accuracy:", metrics.balanced_accuracy_score(y_test,y_pred))




Accuracy: 0.8836930455635491
Balanced accuracy: 0.8236072414502609


### Using stemmed data

In [31]:
entr_model2 = tree.DecisionTreeClassifier(criterion="entropy", random_state = 12)

entr_model2.fit(X_train2, y_train)

y_pred_st =  entr_model2.predict(X_test2)

In [32]:
print("Accuracy:", metrics.accuracy_score(y_test,y_pred_st))
print("Balanced accuracy:", metrics.balanced_accuracy_score(y_test,y_pred_st))

Accuracy: 0.8812949640287769
Balanced accuracy: 0.8329400355003627


## Gini Impurity Model

In [33]:

gini_model = tree.DecisionTreeClassifier(criterion="gini", random_state = 12)


gini_model.fit(X_train1, y_train)

 
y_pred =  gini_model.predict(X_test1)


gini_model

DecisionTreeClassifier(random_state=12)

In [34]:
print("Accuracy:", metrics.accuracy_score(y_test,y_pred))
print("Balanced accuracy:", metrics.balanced_accuracy_score(y_test,y_pred))

Accuracy: 0.8860911270983214
Balanced accuracy: 0.8359671462265739


# Random Forrests

In [35]:
rfmodel = RandomForestClassifier()

In [36]:
rfmodel.fit(X_train1, y_train)

RandomForestClassifier()

In [37]:
y_pred = rfmodel.predict(X_test1)

In [38]:
print("Accuracy:", metrics.accuracy_score(y_test,y_pred))
print("Balanced accuracy:", metrics.balanced_accuracy_score(y_test,y_pred))

Accuracy: 0.8884892086330936
Balanced accuracy: 0.7972220663242486


### Now we predict on the data where we have removed common words from the ratings

In [39]:
X_train_stop = pd.read_csv('train_stop.csv')
X_test_stop = pd.read_csv('test_stop.csv')

In [40]:
cv3 = CountVectorizer(max_df=0.95, min_df=2)

In [41]:
X_train3 = cv3.fit_transform(X_train_stop['review'].values.astype('U'))
X_test3  = cv3.transform(X_test_stop['review'].values.astype('U'))

In [42]:
rfmodel2 = RandomForestClassifier()

In [43]:
rfmodel2.fit(X_train3, y_train)

RandomForestClassifier()

In [44]:
y_pred2 = rfmodel2.predict(X_test3)

In [45]:
print("Accuracy:", metrics.accuracy_score(y_test,y_pred2))
print("Balanced accuracy:", metrics.balanced_accuracy_score(y_test,y_pred2))

Accuracy: 0.8860911270983214
Balanced accuracy: 0.8049570498332356


In [46]:
X_train3.shape

(1944, 3401)

In [49]:
type(X_train1)

scipy.sparse.csr.csr_matrix

# Feature Importance

In [51]:
rf = RandomForestClassifier(n_estimators = 100,
                           n_jobs = -1,
                           oob_score = True,
                           bootstrap = True,
                           random_state = 42)

In [52]:
rf.fit(X_train1, y_train)

RandomForestClassifier(n_jobs=-1, oob_score=True, random_state=42)

In [55]:
print('R^2 Training Score: {:.2f} \nOOB Score: {:.2f} \nR^2 Validation Score: {:.2f}'.format(rf.score(X_train1, y_train), 
                                                                                             rf.oob_score_,
                                                                                             rf.score(X_test1, y_test)))

R^2 Training Score: 0.98 
OOB Score: 0.90 
R^2 Validation Score: 0.89


In [59]:
conda install -c conda-forge rfpimp 

Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: C:\Users\Harsh\Anaconda3

  added / updated specs:
    - rfpimp


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    rfpimp-1.3.2               |             py_0          12 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          12 KB

The following NEW packages will be INSTALLED:

  rfpimp             conda-forge/noarch::rfpimp-1.3.2-py_0

The following packages will be UPDATED:

  certifi            pkgs/main::certifi-2020.12.5-py37haa9~ --> conda-forge::certifi-2020.12.5-py37h03978a9_1

The following packages will be SUPERSEDED by a higher-priority channel:

  ca-certificates    pkgs/main::ca-certificates-2021.1.19-~ --> conda-forge::ca-certificates-2020.12

In [62]:
from rfpimp import permutation_importances
from sklearn.metrics import r2_score

In [67]:


def r2(rf, X_train, y_train):
    return r2_score(y_train, rf.predict(X_train))

perm_imp_rfpimp = permutation_importances(rf, getnnz(X_train1), y_train, r2)

NameError: name 'getnnz' is not defined

In [68]:
help(CountVectorizer)

Help on class CountVectorizer in module sklearn.feature_extraction.text:

class CountVectorizer(_VectorizerMixin, sklearn.base.BaseEstimator)
 |  CountVectorizer(*, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, stop_words=None, token_pattern='(?u)\\b\\w\\w+\\b', ngram_range=(1, 1), analyzer='word', max_df=1.0, min_df=1, max_features=None, vocabulary=None, binary=False, dtype=<class 'numpy.int64'>)
 |  
 |  Convert a collection of text documents to a matrix of token counts
 |  
 |  This implementation produces a sparse representation of the counts using
 |  scipy.sparse.csr_matrix.
 |  
 |  If you do not provide an a-priori dictionary and you do not use an analyzer
 |  that does some kind of feature selection then the number of features will
 |  be equal to the vocabulary size found by analyzing the data.
 |  
 |  Read more in the :ref:`User Guide <text_feature_extraction>`.
 |  
 |  Parameters
 |  ------