In [None]:
import numpy as np 
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import *
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

import re

Only keeping the necessary columns.

In [None]:
def read_csv(prefix=''):
    """Load the train, dev and test CSV splits that share a filename prefix.

    Each split is read from ``../data/<prefix><split>_data.csv`` using the
    ISO-8859-1 encoding, with pandas' default NaN markers switched off
    (``keep_default_na=False``).

    Args:
        prefix: String inserted in front of every split's filename.

    Returns:
        A ``(train, dev, test)`` tuple of DataFrames.
    """
    path_templates = (
        '../data/{}train_data.csv',
        '../data/{}dev_data.csv',
        '../data/{}test_data.csv',
    )
    return tuple(
        pd.read_csv(template.format(prefix), encoding='ISO-8859-1', keep_default_na=False)
        for template in path_templates
    )

# Load the three splits whose filenames carry the 'fully_cleansed_' prefix.
train_data, dev_data, test_data = read_csv('fully_cleansed_')

In [None]:
# Preview the first 10 rows of the training split.
train_data.head(10)

In [None]:
# Preview the first 10 rows of the test split.
test_data.head(10)

In [None]:
# Class balance of the training split: sentiment 4 is reported as positive, 0 as negative.
# Summing the boolean mask counts the matching rows without building a filtered frame.
print('Positive rows in train data: {}'.format(int((train_data['sentiment'] == 4).sum())))
print('Negative rows in train data: {}'.format(int((train_data['sentiment'] == 0).sum())))

# Creating Baseline NB model

In [None]:
# Pull the raw text (X_*) and sentiment label (Y_*) columns out of each split as NumPy arrays.
X_train = train_data['text'].values
Y_train = train_data['sentiment'].values

X_dev = dev_data['text'].values
Y_dev = dev_data['sentiment'].values

X_test = test_data['text'].values
Y_test = test_data['sentiment'].values

# Sanity check: within each split the text and label arrays should have the same length.
print(X_train.shape, Y_train.shape)
print(X_dev.shape, Y_dev.shape)
print(X_test.shape, Y_test.shape)

In [None]:
# Vectorise the text into unigram + bigram features with sub-linear term-frequency scaling.
# NOTE: use_idf=False switches the IDF re-weighting off, so despite the class name these
# are term-frequency features rather than true TF-IDF weights.
tfidf = TfidfVectorizer(strip_accents='ascii', ngram_range=(1,2), min_df=2,
                        use_idf=False, sublinear_tf=True)
# The vocabulary is learnt from the training split only; the test split is merely projected.
tfidf_train = tfidf.fit_transform(X_train)
tfidf_test = tfidf.transform(X_test)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2; prefer the
# replacement when it exists so the cell runs on both old and new releases.
if hasattr(tfidf, 'get_feature_names_out'):
    train_tfidf_names = list(tfidf.get_feature_names_out())
else:
    train_tfidf_names = tfidf.get_feature_names()
print("Size of the vocabulary is", tfidf_train.shape[1])
print(tfidf_train.shape, Y_train.shape)

In [None]:
# Feature selection: fit an L1-penalised logistic regression and keep only the terms whose
# coefficient is non-zero (the L1 penalty drives uninformative weights to exactly 0).
# The solver is pinned to 'liblinear': it was the default before scikit-learn 0.22, whereas
# the current default ('lbfgs') does not support penalty='l1' and raises a ValueError.
logreg = LogisticRegression(penalty='l1', solver='liblinear', tol=0.01, C=1)
logreg.fit(tfidf_train, Y_train)
# coef_[0] is the single weight vector of the fitted model; collect the positions of its
# non-zero entries and map them back to the corresponding n-gram strings.
nonzero_feature_index = np.nonzero(logreg.coef_[0])[0]
features = [train_tfidf_names[int(idx)] for idx in nonzero_feature_index]

In [None]:
# Re-vectorise using exactly the n-grams that survived the L1 feature selection.
# The previous `max_features=len(features)` only matched the *number* of selected terms:
# max_features keeps the most frequent terms in the corpus, which is a different set from
# the terms with non-zero logistic-regression weight. Passing them as a fixed vocabulary
# restricts the feature space to the selected n-grams themselves.
# min_df is dropped because document-frequency pruning is not applied to a fixed vocabulary.
tfidf = TfidfVectorizer(strip_accents='ascii', ngram_range=(1,2),
                        use_idf=False, sublinear_tf=True, vocabulary=features)
tfidf_train = tfidf.fit_transform(X_train)
tfidf_test = tfidf.transform(X_test)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2; prefer the
# replacement when it exists so the cell runs on both old and new releases.
if hasattr(tfidf, 'get_feature_names_out'):
    train_tfidf_names = list(tfidf.get_feature_names_out())
else:
    train_tfidf_names = tfidf.get_feature_names()
print("Size of the vocabulary is", tfidf_train.shape[1])
print(tfidf_train.shape, Y_train.shape)

In [None]:
# Baseline 1: Bernoulli Naive Bayes with add-one (alpha=1) smoothing, scored on the test split.
bnb = BernoulliNB(alpha=1)
bnb.fit(tfidf_train, Y_train)
predicted = bnb.predict(tfidf_test)
# classification_report takes (y_true, y_pred); passing the predictions first swaps the
# reported precision and recall for every class.
print(classification_report(Y_test, predicted))

In [None]:
# Baseline 2: Multinomial Naive Bayes with add-one (alpha=1) smoothing, scored on the test split.
mnb = MultinomialNB(alpha=1)
mnb.fit(tfidf_train, Y_train)
predicted = mnb.predict(tfidf_test)
# classification_report takes (y_true, y_pred); passing the predictions first swaps the
# reported precision and recall for every class.
print(classification_report(Y_test, predicted))

In [None]:
# List the 10 features with the highest per-class probability under the Multinomial NB model.
log_prob = mnb.feature_log_prob_
prob = np.exp(log_prob)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2; prefer the
# replacement when it exists so the cell runs on both old and new releases.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = list(tfidf.get_feature_names_out())
else:
    feature_names = tfidf.get_feature_names()

top_n = 10
# Rows of feature_log_prob_ follow mnb.classes_. This assumes the labels are 0 (negative)
# and 4 (positive), so row 0 is the negative class and row 1 the positive one -- confirm
# against mnb.classes_ if the label encoding changes.
# argsort is ascending, so the last `top_n` positions are the most probable features;
# reversing lists them from most to least probable. The earlier slice [-11:-1] skipped the
# single most probable feature, and matching by float value could pull in extra ties.
negative_index = np.argsort(prob[0])[-top_n:][::-1]
positive_index = np.argsort(prob[1])[-top_n:][::-1]

# Save the top features of each class in a list.
positive_feature_list = [feature_names[ind] for ind in positive_index]
negative_feature_list = [feature_names[ind] for ind in negative_index]

print("Top 10 positive words:", positive_feature_list)
print("Top 10 negative words:", negative_feature_list)