Importing libraries

In [32]:
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
# train_test_split is taken from sklearn.model_selection: the old
# sklearn.cross_validation module was deprecated in 0.18 and removed in 0.20.
from sklearn.model_selection import cross_val_score, train_test_split

Loading the dataset

In [33]:
# Read the tab-separated reviews file; `names` supplies the column labels,
# so the first line of the file is treated as data rather than a header.
dataset = pd.read_csv(
    'reviews.txt',
    sep='\t',
    names=['review', 'output'],
)

In [34]:
# Preview the first rows to check that both columns were parsed as expected.
dataset.head()

Unnamed: 0,review,output
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


Stopword - a very common word (such as "the", "is" or "and") that is automatically omitted from a computer-generated concordance or index because it carries little meaning on its own.

Downloading the stopwords package, which is available in the NLTK suite of libraries

In [35]:
# Fetch the NLTK stopwords corpus; the call returns True and only reports
# "already up-to-date" when the package is present locally.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\him\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Stemmers remove morphological affixes from words, leaving only the word stem

- Using Porter Stemmer for stemming on 'review' column
- Removing stopwords from 'review' column
- Appending cleaned texts to corpus which is a list


In [36]:
# Build the cleaned corpus: one lower-cased, stemmed, stopword-free string
# per review, in the same order as the rows of `dataset`.
corpus = []
ps = PorterStemmer()

# Build the stopword set once. Re-creating it for every word (as a
# per-word `set(stopwords.words('english'))` would) reloads the list
# thousands of times for no benefit.
english_stopwords = set(stopwords.words('english'))

# Iterate over every review rather than a hard-coded range(0, 1000), so the
# corpus always has exactly as many entries as the dataset has rows and stays
# aligned with the label column.
for raw_review in dataset['review']:
    # Replace everything except ASCII letters with a space, then tokenize.
    review = re.sub('[^a-zA-Z]', ' ', raw_review)
    review = review.lower().split()
    # Drop stopwords and reduce the remaining words to their Porter stems.
    review = [ps.stem(word) for word in review if word not in english_stopwords]
    corpus.append(' '.join(review))

- Creating bag of words model
- Reducing sparsity using max_features argument

In [38]:
# Bag-of-words vectorizer; max_features limits the vocabulary to at most
# 1500 terms, which keeps the resulting matrix less sparse.
cv = CountVectorizer(max_features = 1500)

Creating the independent-variable matrix (X) and the dependent-variable vector (y)

In [39]:
# Learn the vocabulary from the cleaned corpus and expand the sparse
# term-count matrix into a dense array of features.
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()

# The second column of the dataset ('output') holds the target labels.
y = dataset.iloc[:, 1].values

Splitting the dataset into Training and Test sets

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

Fitting Random Forest to the Training set

In [41]:
# Train a random forest of 230 trees using the entropy split criterion.
# random_state is pinned so the fitted model is reproducible.
classifier = RandomForestClassifier(
    n_estimators=230,
    criterion='entropy',
    random_state=0,
)
classifier.fit(X_train, y_train)

# Alternative classifiers for the same training data (kept disabled).
#
# Naive Bayes:
# from sklearn.naive_bayes import GaussianNB
# classifier = GaussianNB()
# classifier.fit(X_train, y_train)
#
# Decision tree:
# from sklearn.tree import DecisionTreeClassifier
# classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
# classifier.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=230, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

Predicting the Test set results

In [42]:
# Predict a label for every row of the held-out test matrix.
y_pred = classifier.predict(X_test)

Making the Confusion Matrix

In [43]:
# Cross-tabulate true test labels against predictions; per scikit-learn's
# convention rows are the true classes and columns the predicted classes.
cm = confusion_matrix(y_test, y_pred)

In [44]:
# Display the confusion matrix computed from the test-set predictions.
cm

array([[87,  9],
       [24, 80]], dtype=int64)

Evaluating model using k-Fold Cross Validation

In [45]:
# Score the classifier with 10-fold cross-validation on the training split,
# then average the per-fold scores into a single figure.
accuracies = cross_val_score(
    estimator=classifier,
    X=X_train,
    y=y_train,
    cv=10,
)
mean = np.mean(accuracies)

In [46]:
# Display the mean of the cross-validation scores.
mean

0.7941317002656666

***