In [1]:
import numpy as np 
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import *
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

import re
import time

In [2]:
def read_csv(prefix=''):
    train_data = pd.read_csv('../../data/{}train_data.csv'.format(prefix), encoding='ISO-8859-1', keep_default_na=False)
    dev_data = pd.read_csv('../../data/{}dev_data.csv'.format(prefix), encoding='ISO-8859-1', keep_default_na=False)
    test_data = pd.read_csv('../../data/{}test_data.csv'.format(prefix), encoding='ISO-8859-1', keep_default_na=False)
    return train_data, dev_data, test_data

train_data, dev_data, test_data = read_csv('fully_cleansed_')

In [3]:
train_data.head(10)

Unnamed: 0,text,sentiment
0,can someon fix twitter pleas smell of aquat ma...,0
1,just walk up stop stori escal at peachtre cent...,0
2,ye there proof two lovebird go to enjoy,4
3,im so sorri youtub video accident delet danc v...,0
4,want to go home now come back on thursday,0
5,mmmm shamrat arriv,4
6,go round aunit in pool x,4
7,cheer buddi will good see last night bit of ba...,4
8,sorri to hear will come kick ass in day if wan...,0
9,think time to retir saab,0


In [4]:
test_data.head(10)

Unnamed: 0,text,sentiment
0,sure thing should start with ustream amp blip ...,4
1,thank for video for alo girl tonight enjoy ver...,4
2,so sad didnt camera while burn,0
3,not nice night to out on tile drive safe,0
4,look for anyon gd to help with logo design pro...,4
5,weve got holiday rain,0
6,workout wick hard alway limp when finish,4
7,damn u den just threw away,0
8,ye isnt interest own mom made day about can te...,4
9,cant wait to get your best ohh def drove n hal...,0


In [5]:
print('Positive rows in train data: {}'.format(train_data[ train_data['sentiment'] == 4]['sentiment'].size))
print('Negative rows in train data: {}'.format(train_data[ train_data['sentiment'] == 0]['sentiment'].size))

Positive rows in train data: 512240
Negative rows in train data: 511760


# Creating Baseline NB model

In [6]:
X_train, Y_train = train_data['text'].values, train_data['sentiment'].values
X_dev, Y_dev = dev_data['text'].values, dev_data['sentiment'].values
X_test, Y_test = test_data['text'].values, test_data['sentiment'].values
print(X_train.shape, Y_train.shape)
print(X_dev.shape, Y_dev.shape)
print(X_test.shape, Y_test.shape)

(1024000,) (1024000,)
(256000,) (256000,)
(320000,) (320000,)


In [7]:
# transform text data using Tfidf vectorizer
tfidf = TfidfVectorizer(strip_accents='ascii', ngram_range=(1,2), min_df=2, 
                        use_idf=False, sublinear_tf=True)
tfidf_train = tfidf.fit_transform(X_train)
tfidf_test = tfidf.transform(X_test)
train_tfidf_names = tfidf.get_feature_names()
print("Size of the vocabulary is", tfidf_train.shape[1])
print(tfidf_train.shape, Y_train.shape)

Size of the vocabulary is 675688
(1024000, 675688) (1024000,)


In [8]:
# remove 0-weight terms using logistic regression
logreg = LogisticRegression(penalty='l1', tol=0.01, C=1)
logreg.fit(tfidf_train, Y_train)
nonzero_feature_index = np.array(np.nonzero(logreg.coef_[0])[0])
features = [train_tfidf_names[int(w)] for w in nonzero_feature_index]

In [9]:
tfidf = TfidfVectorizer(strip_accents='ascii', ngram_range=(1,2), min_df=2, 
                        use_idf=False, sublinear_tf=True, max_features=len(features))
tfidf_train = tfidf.fit_transform(X_train)
tfidf_test = tfidf.transform(X_test)
train_tfidf_names = tfidf.get_feature_names()
print("Size of the vocabulary is", tfidf_train.shape[1])
print(tfidf_train.shape, Y_train.shape)

Size of the vocabulary is 18932
(1024000, 18932) (1024000,)


In [16]:
bnb = BernoulliNB(alpha=1)

start = time.time()
bnb.fit(tfidf_train, Y_train)
end = time.time()
print("Elapsed time:", "{:.2f}".format(end - start), "s")

predicted = bnb.predict(tfidf_test)
print (classification_report(predicted, Y_test))

Elapsed time: 0.38 s
             precision    recall  f1-score   support

          0       0.76      0.79      0.77    152701
          4       0.80      0.77      0.78    167299

avg / total       0.78      0.78      0.78    320000



In [17]:
mnb = MultinomialNB(alpha=1)

start = time.time()
mnb.fit(tfidf_train, Y_train)
end = time.time()
print("Elapsed time:", "{:.2f}".format(end - start), "s")

predicted = mnb.predict(tfidf_test)
print (classification_report(predicted, Y_test))

Elapsed time: 0.27 s
             precision    recall  f1-score   support

          0       0.79      0.78      0.79    162671
          4       0.78      0.79      0.78    157329

avg / total       0.79      0.79      0.79    320000



In [18]:
log_prob = mnb.feature_log_prob_
prob = np.exp(log_prob)
sorted_prob = np.copy(prob)
sorted_prob.sort(axis=1)
feature_names = tfidf.get_feature_names()
# Save 20 features in a list
positive_index = []
negative_index = []
positive_feature_list = []
negative_feature_list = []
for i in range(len(prob[1])):
    if prob[1][i] in sorted_prob[1][-11:-1]:
        positive_index.append(i)
for ind in positive_index:
    positive_feature_list.append(feature_names[ind])
    
for i in range(len(prob[0])):
    if prob[0][i] in sorted_prob[0][-11:-1]:
        negative_index.append(i)
for ind in negative_index:
    negative_feature_list.append(feature_names[ind])
    
print("Top 10 positive words:", positive_feature_list)
print("Top 10 negative words:", negative_feature_list)

Top 10 positive words: ['for', 'good', 'im', 'in', 'just', 'love', 'of', 'on', 'thank', 'with']
Top 10 negative words: ['but', 'for', 'go', 'im', 'in', 'just', 'not', 'of', 'on', 'so']
