Reading XML review data for sentiment analysis on the Amazon Electronics dataset. Source: http://www.cs.jhu.edu/~mdredze/datasets/sentiment/index2.html

In [1]:
import pandas as pd
import numpy as np
import re
import nltk

To avoid an SSL error when downloading NLTK data, run the following code first.

In [2]:
import nltk
import ssl

# Workaround for SSL failures during nltk.download: when this Python build
# exposes ssl._create_unverified_context, install it as the factory for
# default HTTPS contexts.
# NOTE(review): this switches off certificate verification for default HTTPS
# contexts created afterwards in this process -- confirm that is acceptable.
if hasattr(ssl, '_create_unverified_context'):
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/induda/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup
# Fetch the 'punkt' and 'wordnet' NLTK packages; presumably these back the
# word_tokenize and WordNetLemmatizer imports above -- TODO confirm.
nltk.download('punkt')
nltk.download('wordnet')
# NOTE(review): TfidfVectorizer is imported but not used in the visible cells.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Single lemmatizer instance, used by clean_data for every token.
lemmatize_obj = WordNetLemmatizer()


[nltk_data] Downloading package punkt to /home/induda/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/induda/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# Read the labelled review files and parse them with BeautifulSoup.
# Each file is opened in a `with` block so its handle is closed as soon as the
# contents have been read, instead of being left open for the whole session.
with open('/home/induda/positive.review') as positive_file:
    positive_reviews = BeautifulSoup(positive_file.read())
with open('/home/induda/negative.review') as negative_file:
    negative_reviews = BeautifulSoup(negative_file.read())

# Keep only the <review_text> elements of each parsed document.
positive_reviews = positive_reviews.findAll('review_text')
negative_reviews = negative_reviews.findAll('review_text')

In [5]:
# Check what findAll returned (a bs4 ResultSet, per the cell output).
type(positive_reviews)

bs4.element.ResultSet

In [6]:
def clean_data(data):
    """Normalise review elements into space-joined strings of lemmas.

    Each review is lower-cased, every character that is not an ASCII letter
    is replaced by a space, the text is tokenised with ``word_tokenize``,
    English stopwords and one-character tokens are dropped, and every
    remaining token is passed through ``lemmatize_obj.lemmatize``.

    Parameters
    ----------
    data : iterable
        Objects exposing a ``.text`` string attribute (in this notebook, the
        ``review_text`` elements returned by ``findAll``).

    Returns
    -------
    list of str
        One cleaned string per input review, in input order. A review with
        no surviving tokens yields an empty string.
    """
    # Build the stopword set once per call. Building it inside the
    # comprehension re-created the list and the set for every single token.
    english_stopwords = set(stopwords.words("english"))

    lst = []
    for review in data:
        sentence = review.text.lower()
        sentence = re.sub('[^a-zA-Z]', ' ', sentence)
        tokens = word_tokenize(sentence)
        # Stopwords are filtered on the raw token, before lemmatisation.
        lemmas = [lemmatize_obj.lemmatize(word) for word in tokens
                  if word not in english_stopwords and len(word) > 1]
        lst.append(' '.join(lemmas))
    return lst
    
# Clean both labelled sets.
corpus = clean_data(positive_reviews)
corpus_neg = clean_data(negative_reviews)

# Concatenate positive and negative reviews (positives first).
review_list = corpus + corpus_neg

# Build the vocabulary for CountVectorizer: every distinct token gets the next
# free index, in order of first appearance.
vocab_dict = {}
for review in review_list:
    for token in review.split():
        # Test membership on the dict itself (a hash lookup). Going through
        # .keys() builds a list on Python 2 and scans it for every token,
        # which makes the whole loop quadratic in the vocabulary size.
        if token not in vocab_dict:
            vocab_dict[token] = len(vocab_dict)
     

In [7]:
import pandas as pd
from sklearn.utils import shuffle

# Label convention used by this notebook: 0.0 = positive, 1.0 = negative.
df_pos = pd.DataFrame({'reviews': corpus, 'label': np.zeros(len(corpus))},
                      columns=['reviews', 'label'])
df_neg = pd.DataFrame({'reviews': corpus_neg, 'label': np.ones(len(corpus_neg))},
                      columns=['reviews', 'label'])

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to stack frames. ignore_index=True gives every row a unique index label
# instead of repeating 0..n-1 for each half.
final_df = pd.concat([df_pos, df_neg], ignore_index=True)

# Shuffle with a fixed seed so the row order -- and therefore the seeded
# train/test splits computed from it later -- is reproducible across runs.
final_df = shuffle(final_df, random_state=0)

In [8]:
# Count term frequencies with a CountVectorizer restricted to the vocabulary
# built earlier, so column j of X corresponds to the term with index j.
cv = CountVectorizer(vocabulary=vocab_dict)

# Labels come from the last column, review text from the first.
y = final_df.iloc[:, -1].values
cv_input = final_df.iloc[:, 0]

term_counts = cv.fit_transform(cv_input)
X = term_counts.toarray()

In [9]:
# Number of entries in the vectorizer's vocabulary, i.e. the feature count of X.
len(cv.vocabulary_)

9407

In [10]:
# sklearn.cross_validation was removed in scikit-learn 0.20; use
# sklearn.model_selection and fall back only on releases that predate it.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix, f1_score

# Hold out 30% of the rows for evaluation; the seed fixes the split for a given X, y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

ad = AdaBoostClassifier()
ad.fit(X_train, y_train)
y_pred = ad.predict(X_test)

# Single-argument print(...) with %-formatting emits the same text on
# Python 2 and Python 3 (the print statement is a syntax error on Python 3).
print("adaboost score is  %s" % (ad.score(X_test, y_test),))
print("confusion_matrix %s" % (confusion_matrix(y_test, y_pred),))
# NOTE: with f1_score's defaults the scored class is label 1, which in this
# notebook marks the negative reviews.
print("F1_Score  %s" % (f1_score(y_test, y_pred),))


adaboost score is  0.753333333333
confusion_matrix [[216 100]
 [ 48 236]]
F1_Score  0.761290322581


In [11]:
# sklearn.cross_validation was removed in scikit-learn 0.20; use
# sklearn.model_selection and fall back only on releases that predate it.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
# f1_score is imported here as well so this cell does not depend on an
# earlier cell having been run for the name to exist.
from sklearn.metrics import confusion_matrix, f1_score

# Hold out 30% of the rows for evaluation; the seed fixes the split for a given X, y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)

# Single-argument print(...) with %-formatting emits the same text on
# Python 2 and Python 3 (the print statement is a syntax error on Python 3).
print("LogisticRegression score is  %s" % (log_reg.score(X_test, y_test),))
print("confusion_matrix %s" % (confusion_matrix(y_test, y_pred),))
print("F1_Score  %s" % (f1_score(y_test, y_pred),))

LogisticRegression score is  0.815
confusion_matrix [[261  55]
 [ 56 228]]
F1_Score  0.804232804233


In [12]:
# Inspect the coefficient array learned by the logistic regression.
log_reg.coef_

array([[ 0.69647407,  0.17985374,  0.75088725, ...,  0.08990477,
         0.08990477,  0.01097713]])

In [13]:
# sklearn.cross_validation was removed in scikit-learn 0.20; use
# sklearn.model_selection and fall back only on releases that predate it.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.naive_bayes import MultinomialNB

# Hold out 30% of the rows for evaluation; the seed fixes the split for a given X, y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

nb = MultinomialNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

# Single-argument print(...) with %-formatting emits the same text on
# Python 2 and Python 3 (the print statement is a syntax error on Python 3).
print("MultinomialNB score is  %s" % (nb.score(X_test, y_test),))
print("confusion_matrix %s" % (confusion_matrix(y_test, y_pred),))
print("F1_Score  %s" % (f1_score(y_test, y_pred),))

MultinomialNB score is  0.765
confusion_matrix [[235  81]
 [ 60 224]]
F1_Score  0.760611205433


In [14]:
# Coefficient at index 0 of the first (only displayed) row of coef_.
log_reg.coef_[0][0]

0.69647407466797651

In [15]:
# Print every vocabulary term next to its logistic-regression coefficient.
# vocab_dict maps term -> index, and the same index selects the coefficient.
term_weights = log_reg.coef_[0]  # looked up once instead of on every iteration
for k, v in vocab_dict.items():
    # Single-argument print(...) gives the same "term weight" line on
    # Python 2 and Python 3.
    print("%s %s" % (k, term_weights[v]))

raining -0.045401383794
conspiratively 0.0101982669112
yellow -0.169540200903
four 0.234162550248
gag 4.83602733682e-05
circuitry 0.0993138924364
hanging -0.0322960304322
centimeter 0.0013710684623
marching 0.0
shure -0.0810854825766
looking 0.100307260955
accupower 0.0
eligible -0.0490308098444
electricity -0.0772891875463
scold -0.034340844365
unanswered -8.29960752121e-06
superficially 1.05893589657e-05
xtc 0.0
crossbar 0.000939994105176
sputter -0.00527760294994
lord -0.0424707683677
swivel -0.105052847308
shielding 0.0204682245079
regional 0.00109681206314
dell -0.108001211427
hdtv -0.215831702854
replaces 0.0
foul 0.100904164399
malfunctioned -0.00235155131096
reinstalled 0.300047406027
bringing 0.139725933915
internally 0.0
reformatted 0.045497629542
customizable 0.0
persisted 0.015286109517
succession 0.0
straight -0.0612393362334
tired 0.0726735332093
preface 0.0
existance 0.0
elegant -0.0528609833206
second 0.110829206657
scraped 0.0
dangled 0.0
cooking 0.0
designing 0.000335

In [16]:
# Read the unlabeled review file and parse it with BeautifulSoup. The file is
# opened in a `with` block so the handle is closed once its contents are read.
with open('/home/induda/unlabeled.review') as unlabeled_file:
    unknown_reviews = BeautifulSoup(unlabeled_file.read())
unknown_reviews = unknown_reviews.findAll('review_text')

# Apply the same cleaning as for the labelled reviews, then vectorize with the
# already-configured CountVectorizer (transform only, no refit).
corpus_reviews = clean_data(unknown_reviews)
reviews_data = cv.transform(corpus_reviews).toarray()


In [17]:
# Predict labels for the unlabeled reviews with the fitted logistic regression.
# NOTE: this rebinds y_pred, replacing the test-set predictions from the earlier cells.
y_pred = log_reg.predict(reviews_data)

In [18]:
# Display the predicted labels (training labels were 0.0 = positive, 1.0 = negative).
y_pred

array([ 1.,  0.,  1., ...,  0.,  0.,  0.])