In [1]:
import pandas as pd

# reading the training data
docs = pd.read_csv('movie_review_train.csv')
docs.head()

Unnamed: 0,class,text
0,Pos,a common complaint amongst film critics is ...
1,Pos,whew this film oozes energy the kind of b...
2,Pos,steven spielberg s amistad which is bas...
3,Pos,he has spent his entire life in an awful litt...
4,Pos,being that it is a foreign language film with...


In [2]:
# number of SMSes / documents
len(docs)

1600

In [5]:
# mapping labels to 0 and 1
docs['label'] = docs["class"].map({'Pos':1, 'Neg':0})

In [6]:
docs.head()

Unnamed: 0,class,text,label
0,Pos,a common complaint amongst film critics is ...,1
1,Pos,whew this film oozes energy the kind of b...,1
2,Pos,steven spielberg s amistad which is bas...,1
3,Pos,he has spent his entire life in an awful litt...,1
4,Pos,being that it is a foreign language film with...,1


In [7]:
# we can now drop the column 'Class'
docs = docs.drop('class', axis=1)
docs.head()

Unnamed: 0,text,label
0,a common complaint amongst film critics is ...,1
1,whew this film oozes energy the kind of b...,1
2,steven spielberg s amistad which is bas...,1
3,he has spent his entire life in an awful litt...,1
4,being that it is a foreign language film with...,1


In [9]:
# convert to X and y
X = docs.text
y = docs.label
print(X.shape)
print(y.shape)

(1600,)
(1600,)


In [10]:
# vectorizing the sentences; removing stop words
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(stop_words='english')

In [11]:
vect.fit(X)

In [12]:
# printing the vocabulary
vect.vocabulary_

{'common': 6284,
 'complaint': 6360,
 'film': 11832,
 'critics': 7378,
 'aren': 1810,
 'literate': 18603,
 'scripts': 27918,
 'available': 2319,
 'quiz': 25250,
 'gives': 13262,
 'signs': 28809,
 'hope': 15074,
 'art': 1928,
 'writing': 35521,
 'isn': 16771,
 'dead': 7904,
 'hollywood': 14963,
 'need': 21281,
 'look': 18770,
 'independent': 15988,
 'films': 11851,
 'thoughtful': 32060,
 'content': 6771,
 'paul': 23045,
 'attanasio': 2173,
 'script': 27912,
 'takes': 31486,
 'tepid': 31839,
 'thriller': 32090,
 'scandals': 27629,
 'late': 18017,
 '50s': 313,
 'delivers': 8215,
 'telling': 31765,
 'parable': 22815,
 'emptiness': 10340,
 'post': 24178,
 'war': 34707,
 'american': 1334,
 'dream': 9554,
 'golden': 13449,
 'bubble': 4285,
 'surrounds': 31156,
 'protects': 24849,
 'tv': 33056,
 'networks': 21378,
 'sponsors': 29948,
 'riddled': 26743,
 'symbols': 31370,
 '58': 328,
 'chrysler': 5614,
 'radio': 25314,
 'announcement': 1534,
 'sputnik': 30053,
 'heavy': 14532,
 'handed': 14185,

In [13]:
# vocab size
len(vect.vocabulary_.keys())

35858

In [14]:
vect = CountVectorizer(stop_words='english', min_df=.03, max_df=.8)

In [15]:
vect.fit(X)

In [16]:
# vocab size
len(vect.vocabulary_.keys())

1643

In [17]:
docs_test=pd.read_csv('movie_review_test.csv')

In [19]:
# mapping labels to 0 and 1
docs_test['label'] = docs_test["class"].map({'Pos':1, 'Neg':0})

# we can now drop the column 'Class'
docs_test = docs_test.drop('class', axis=1)

In [20]:
# convert to X and y
X_test = docs_test.text
y_test = docs_test.label
print(X_test.shape)
print(y_test.shape)

(400,)
(400,)


In [21]:
X_train_transformed = vect.transform(X)
X_test_transformed = vect.transform(X_test)

In [26]:
print(X_test_transformed.count_nonzero())

51663


In [28]:
# training the NB model and making predictions
from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()

# fit
bnb.fit(X_train_transformed,y)

# predict class
y_pred_class = bnb.predict(X_test_transformed)

# predict probabilities
y_pred_proba = bnb.predict_proba(X_test_transformed)


In [29]:
# printing the overall accuracy
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.79

In [30]:
# confusion matrix
metrics.confusion_matrix(y_test, y_pred_class)
# help(metrics.confusion_matrix)

array([[177,  23],
       [ 61, 139]], dtype=int64)

In [31]:
# training the NB model and making predictions
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()

# fit
mnb.fit(X_train_transformed,y)

# predict class
y_pred_class = mnb.predict(X_test_transformed)

# predict probabilities
y_pred_proba = mnb.predict_proba(X_test_transformed)


In [32]:
metrics.accuracy_score(y_test, y_pred_class)

0.8275

In [33]:
# confusion matrix
metrics.confusion_matrix(y_test, y_pred_class)
# help(metrics.confusion_matrix)

array([[172,  28],
       [ 41, 159]], dtype=int64)