In [1]:
import numpy as np 
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, CuDNNLSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical

from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import *
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

import re

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Only keeping the necessary columns.

In [2]:
train_data = pd.read_csv('../data/sentiment140_train.zip', encoding='ISO-8859-1', header=None, names=['sentiment','id','timestamp','type','user','text'])
test_data = pd.read_csv('../data/sentiment140_test.zip', encoding='ISO-8859-1', header=None, names=['sentiment','id','timestamp','type','user','text'])

# The original training data are sorted by sentiment value. Shuffle the training data for randomness
train_data = train_data[['text','sentiment']].sample(frac=1)
test_data = test_data[['text','sentiment']]

In [3]:
train_data.dtypes

text         object
sentiment     int64
dtype: object

In [4]:
train_data.head(10)

Unnamed: 0,text,sentiment
1362915,please this account is inactive i would like t...,4
1405689,I do miss her! at t'time u didn't catch my jok...,4
1349269,"friday already??? rajskub, you are right, life...",4
101823,No! Please don't rain! Sunshine I already miss...,0
1228317,@popstarmagazine http://twitpic.com/6djtf - @s...,4
1273018,Just finished late night with @jimmyfallon... ...,4
634879,"@jojo_the_brat sorry, my phone is being SUPER ...",0
487446,i was too late to the zoo and didnt get to see...,0
556869,@judithkeane,0
195237,Good night twitter ;). Aww... My baby is sick...,0


In [5]:
train_data['sentiment'].unique()

array([4, 0])

In [6]:
test_data.head(10)

Unnamed: 0,text,sentiment
0,@stellargirl I loooooooovvvvvveee my Kindle2. ...,4
1,Reading my kindle2... Love it... Lee childs i...,4
2,"Ok, first assesment of the #kindle2 ...it fuck...",4
3,@kenburbary You'll love your Kindle2. I've had...,4
4,@mikefish Fair enough. But i have the Kindle2...,4
5,@richardebaker no. it is too big. I'm quite ha...,4
6,Fuck this economy. I hate aig and their non lo...,0
7,Jquery is my new best friend.,4
8,Loves twitter,4
9,how can you not love Obama? he makes jokes abo...,4


In [7]:
test_data['sentiment'].unique()

array([4, 0, 2])

In [8]:
print('Positive rows: {}'.format(train_data[ train_data['sentiment'] == 4]['sentiment'].size))
print('Negative rows: {}'.format(train_data[ train_data['sentiment'] == 0]['sentiment'].size))

Positive rows: 800000
Negative rows: 800000


# Creating Baseline NB model



In [9]:
X_train, Y_train = train_data['text'].values[len(train_data['sentiment'])//2:], train_data['sentiment'].values[len(train_data['sentiment'])//2:]
X_dev, Y_dev = train_data['text'].values[:len(train_data['sentiment'])//2], train_data['sentiment'].values[:len(train_data['sentiment'])//2]
print(X_train.shape, Y_train.shape)
print(X_dev.shape, Y_dev.shape)

(800000,) (800000,)
(800000,) (800000,)


In [10]:
print ('positive train data: ', len(np.where(Y_train==4)[0]), 
       ', negative train data: ', len(np.where(Y_train==0)[0]))
print ('positive dev data: ', len(np.where(Y_dev==4)[0]), 
       ', negative dev data: ', len(np.where(Y_dev==0)[0]))

positive train data:  400295 , negative train data:  399705
positive dev data:  399705 , negative dev data:  400295


In [11]:
# transform text data using Tfidf vectorizer
max_features = 10000
tfidf = TfidfVectorizer(strip_accents='ascii', ngram_range=(1,1), min_df=2, 
                        stop_words='english', use_idf=False, sublinear_tf=True, max_features=max_features)
tfidf_train = tfidf.fit_transform(X_train)
tfidf_dev = tfidf.transform(X_dev)
train_tfidf_names = tfidf.get_feature_names()
print("Size of the vocabulary is", tfidf_train.shape[1])
print(tfidf_train.shape, Y_train.shape)

Size of the vocabulary is 10000
(800000, 10000) (800000,)


In [12]:
# remove 0-weight terms using logistic regression
logreg = LogisticRegression(penalty='l1', tol=0.01, C=10)
logreg.fit(tfidf_train, Y_train)
nonzero_feature_index = np.array(np.nonzero(logreg.coef_[0])[0])
features = [train_tfidf_names[int(w)] for w in nonzero_feature_index]

In [13]:
tfidf = TfidfVectorizer(use_idf=False, sublinear_tf=True, vocabulary=list(set(features)))
tfidf_train = tfidf.fit_transform(X_train)
tfidf_dev = tfidf.transform(X_dev)
print("Size of the vocabulary is", tfidf_train.shape[1])
print(tfidf_train.shape, Y_train.shape)

Size of the vocabulary is 9634
(800000, 9634) (800000,)


In [14]:
bnb = BernoulliNB(alpha=0.01)
bnb.fit(tfidf_train, Y_train)
predicted = bnb.predict(tfidf_dev)
print (classification_report(predicted, Y_dev))

             precision    recall  f1-score   support

          0       0.75      0.77      0.76    393498
          4       0.77      0.76      0.76    406502

avg / total       0.76      0.76      0.76    800000



In [15]:
log_prob = bnb.feature_log_prob_
print(log_prob.shape)
print(log_prob)

(2, 9634)
[[ -7.47790286 -10.18976546  -9.85348359 ...  -9.20935268 -10.18976546
   -6.17463764]
 [ -7.01105145  -9.49842644  -9.64147602 ...  -8.60936069  -9.7640282
   -6.39565396]]


In [16]:
prob = np.exp(log_prob)
prob

array([[5.65441985e-04, 3.75526932e-05, 5.25637631e-05, ...,
        1.00098818e-04, 3.75526932e-05, 2.08156005e-03],
       [9.01859833e-04, 7.49697061e-05, 6.49770762e-05, ...,
        1.82390478e-04, 5.74826037e-05, 1.66879418e-03]])

In [17]:
sorted_prob = np.copy(prob)
sorted_prob.sort(axis=1)
print(sorted_prob)

[[2.50184499e-08 2.50184499e-08 2.50184499e-08 ... 4.91887993e-02
  5.48479726e-02 7.78099059e-02]
 [2.49815748e-08 2.49815748e-08 2.49815748e-08 ... 5.78823339e-02
  7.42802396e-02 7.64311532e-02]]


In [22]:
log_prob = bnb.feature_log_prob_
prob = np.exp(log_prob)
sorted_prob = np.copy(prob)
sorted_prob.sort(axis=1)
feature_names = tfidf.get_feature_names()
# Save 20 features in a list
positive_index = []
negative_index = []
positive_feature_list = []
negative_feature_list = []
for i in range(len(prob[1])):
    if prob[1][i] in sorted_prob[1][-11:-1]:
        positive_index.append(i)
for ind in positive_index:
    positive_feature_list.append(feature_names[ind])
    
for i in range(len(prob[0])):
    if prob[0][i] in sorted_prob[0][-11:-1]:
        negative_index.append(i)
for ind in negative_index:
    negative_feature_list.append(feature_names[ind])
    
print("Top 10 positive words:", positive_feature_list)
print("Top 10 negative words:", negative_feature_list)

Top 10 positive words: ['today', 'day', 'love', 'http', 'like', 'lol', 'good', 'com', 'thanks', 'going']
Top 10 negative words: ['really', 'today', 'day', 'don', 'like', 'work', 'miss', 'got', 'going', 'want']
