In [1]:
import numpy as np 
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, CuDNNLSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical

from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import *
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

import re

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Only keeping the necessary columns.

In [2]:
train_data = pd.read_csv('../data/fully_cleansed_train_data.csv'.format(prefix), encoding='ISO-8859-1', keep_default_na=False)
test_data = pd.read_csv('../data/fully_cleansed_test_data.csv'.format(prefix), encoding='ISO-8859-1', keep_default_na=False)

In [5]:
train_data.head(10)

Unnamed: 0,text,sentiment
551940,"@DonnieWahlberg Haha! Would love to, but sadl...",0
1085039,@taytaymonay hey girrl glad to have you on her...,4
1170478,@steph_davies It's only 9:30pm. I'll wait for...,4
1181071,@successforall Couldn't agree with you more!,4
430836,@jamesheart24 I am great.. 'Revising' for my l...,0
1496476,@LLPH a wife away from her husband?,4
36314,hoping to go to Nyc or L.A soon i miss it there,0
461189,anyone know of an app or program that helps me...,0
1445099,"@raquelaberakiki Hahaha, paid or not, I'm sure...",4
9629,So sleepy. Friday night was ruined by a stupid...,0


In [6]:
train_data['sentiment'].unique()

array([0, 4])

In [26]:
test_data = test_data[test_data['sentiment'].isin([0, 4])]
test_data['sentiment'].unique()

array([4, 0])

In [27]:
test_data.head(10)

Unnamed: 0,text,sentiment
0,@stellargirl I loooooooovvvvvveee my Kindle2. ...,4
1,Reading my kindle2... Love it... Lee childs i...,4
2,"Ok, first assesment of the #kindle2 ...it fuck...",4
3,@kenburbary You'll love your Kindle2. I've had...,4
4,@mikefish Fair enough. But i have the Kindle2...,4
5,@richardebaker no. it is too big. I'm quite ha...,4
6,Fuck this economy. I hate aig and their non lo...,0
7,Jquery is my new best friend.,4
8,Loves twitter,4
9,how can you not love Obama? he makes jokes abo...,4


In [29]:
print('Positive rows in train data: {}'.format(train_data[ train_data['sentiment'] == 4]['sentiment'].size))
print('Negative rows in train data: {}'.format(train_data[ train_data['sentiment'] == 0]['sentiment'].size))

Positive rows in train data: 800000
Negative rows in train data: 800000


# Creating Baseline NB model



In [30]:
X_train, Y_train = train_data['text'].values, train_data['sentiment'].values
X_test, Y_test = test_data['text'].values, test_data['sentiment'].values
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(1600000,) (1600000,)
(359,) (359,)


In [31]:
print ('positive train data: ', len(np.where(Y_train==4)[0]), 
       ', negative train data: ', len(np.where(Y_train==0)[0]))
print ('positive test data: ', len(np.where(Y_test==4)[0]), 
       ', negative test data: ', len(np.where(Y_test==0)[0]))

positive train data:  800000 , negative train data:  800000
positive test data:  182 , negative test data:  177


In [32]:
# transform text data using Tfidf vectorizer
max_features = 10000
tfidf = TfidfVectorizer(strip_accents='ascii', ngram_range=(1,1), min_df=2, 
                        stop_words='english', use_idf=False, sublinear_tf=True, max_features=max_features)
tfidf_train = tfidf.fit_transform(X_train)
tfidf_test = tfidf.transform(X_test)
train_tfidf_names = tfidf.get_feature_names()
print("Size of the vocabulary is", tfidf_train.shape[1])
print(tfidf_train.shape, Y_train.shape)

Size of the vocabulary is 10000
(1600000, 10000) (1600000,)


In [33]:
# remove 0-weight terms using logistic regression
logreg = LogisticRegression(penalty='l1', tol=0.01, C=10)
logreg.fit(tfidf_train, Y_train)
nonzero_feature_index = np.array(np.nonzero(logreg.coef_[0])[0])
features = [train_tfidf_names[int(w)] for w in nonzero_feature_index]

In [34]:
tfidf = TfidfVectorizer(use_idf=False, sublinear_tf=True, vocabulary=list(set(features)))
tfidf_train = tfidf.fit_transform(X_train)
tfidf_test = tfidf.transform(X_test)
print("Size of the vocabulary is", tfidf_train.shape[1])
print(tfidf_train.shape, Y_train.shape)

Size of the vocabulary is 9773
(1600000, 9773) (1600000,)


In [35]:
bnb = BernoulliNB(alpha=0.01)
bnb.fit(tfidf_train, Y_train)
predicted = bnb.predict(tfidf_test)
print (classification_report(predicted, Y_test))

             precision    recall  f1-score   support

          0       0.82      0.81      0.81       179
          4       0.81      0.82      0.82       180

avg / total       0.82      0.82      0.82       359



In [36]:
log_prob = bnb.feature_log_prob_
prob = np.exp(log_prob)
sorted_prob = np.copy(prob)
sorted_prob.sort(axis=1)
feature_names = tfidf.get_feature_names()
# Save 20 features in a list
positive_index = []
negative_index = []
positive_feature_list = []
negative_feature_list = []
for i in range(len(prob[1])):
    if prob[1][i] in sorted_prob[1][-11:-1]:
        positive_index.append(i)
for ind in positive_index:
    positive_feature_list.append(feature_names[ind])
    
for i in range(len(prob[0])):
    if prob[0][i] in sorted_prob[0][-11:-1]:
        negative_index.append(i)
for ind in negative_index:
    negative_feature_list.append(feature_names[ind])
    
print("Top 10 positive words:", positive_feature_list)
print("Top 10 negative words:", negative_feature_list)

Top 10 positive words: ['good', 'thanks', 'today', 'love', 'like', 'going', 'lol', 'com', 'day', 'http']
Top 10 negative words: ['want', 'work', 'today', 'really', 'like', 'going', 'got', 'miss', 'don', 'day']
