In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy
import sklearn
import seaborn as sns
from sklearn.naive_bayes import BernoulliNB
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# Data
This dataset consists of 1000 reviews from Amazon that are assigned as being positive or negative. We're going to extract features from this dataset to use in a Naive Bayes model and evaluate the performance on other binary reviews such as from IMDB and Yelp.

In [2]:
data_path=('file:///C:/Users/JChaotogo/Desktop/New%20folder/sentiment%20labelled%20sentences/amazon_cells_labelled.txt')
df=pd.read_csv(data_path, delimiter='\t', header=None)
df.columns=['sentence','positive'] 
df

Unnamed: 0,sentence,positive
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1
5,I have to jiggle the plug to get it to line up...,0
6,If you have several dozen or several hundred c...,0
7,If you are Razr owner...you must have this!,1
8,"Needless to say, I wasted my money.",0
9,What a waste of money and time!.,0


# Data Cleaning
The data consists of many special characters and numbers that make it difficult to use the words as features. In addition, stripping special characters helps us normalize some words such as don't and dont. In addition, we will use stopwords to remove words that are generally neutral from our bag of words featureset.

In [3]:
#Here I'm going to clean the data to make it uniform
#First, we want to lower-case all words
df['sentence']=df['sentence'].str.lower()
df.replace(to_replace='\d', regex=True,value='')
#Then we want to replace special characters
specialcharacters=['!', '?', '.', '$', '#', '-', '(', ')', '*', '&', '_', 
                   '+', '=', '"', '<', '>', ':', ';', '~', '`', '@', ',','/', '[',']']
for character in specialcharacters:
    df['sentence']=df.sentence.str.replace(character, ' ')

df['sentence']=df.sentence.str.replace("'", '')
df['sentence']=df.sentence.str.strip()

In [4]:
dfpos=df[df.positive==1]
dfneg=df[df.positive==0]
#dfpos['sentence'] = dfpos.apply(lambda row: nltk.word_tokenize(row['sentence']), axis=1)
#dfneg['sentence'] = dfneg.apply(lambda row: nltk.word_tokenize(row['sentence']), axis=1)

dfpos

Unnamed: 0,sentence,positive
1,good case excellent value,1
2,great for the jawbone,1
4,the mic is great,1
7,if you are razr owner you must have this,1
10,and the sound quality is great,1
11,he was very impressed when going from the orig...,1
13,very good quality though,1
15,highly recommend for any one who has a blue to...,1
17,so far so good,1
18,works great,1


In [5]:
array1 = list(set(df[df.positive==0]['sentence'].str.cat(sep=' ').split()))
array2 = list(set(df[df.positive==1]['sentence'].str.cat(sep=' ').split()))
negativelist = np.setdiff1d(array1, array2, assume_unique=True).tolist()
negativelist

['improvement',
 'son',
 'runs',
 'fond',
 'abhor',
 'cingulair',
 'somewhere',
 'supposedly',
 'wired',
 'longer',
 'mother',
 'kitchen',
 'counterfeit',
 'party',
 'numbers',
 'might',
 'soyo',
 'toilet',
 'instance',
 'fliptop',
 'once',
 'advise',
 'except',
 'felt',
 'regretted',
 'external',
 'chinese',
 'bed',
 'crawl',
 'reverse',
 'irda',
 'buyit',
 'tied',
 'startac',
 'connecting',
 'colored',
 'totally',
 'website',
 'saying',
 'defect',
 'button',
 'backlight',
 'along',
 'megapixels',
 'disapoinment',
 'accept',
 'happened',
 'compared',
 'receiving',
 'jack',
 'ready',
 'study',
 'infuriating',
 'accessory',
 'linksys',
 'trust',
 'online',
 'flawed',
 'pause',
 'basically',
 'units',
 'discarded',
 'starter',
 'bougth',
 'tape',
 'please',
 'calendar',
 'flimsy',
 'frustration',
 'seperated',
 'difficult',
 'fee',
 'hinge',
 'mode',
 'sucked',
 'dna',
 'deaf',
 'ad',
 'describe',
 'died',
 'replacementr',
 'fun',
 'mention',
 'garbage',
 'fairly',
 'four',
 'defeats',
 

In [6]:
#Create list of words 'negonly' to represent words that are in the negative list but not in the positive list. Using intuition, 
#We also added a few words that would normally be representative of a negative review.
concentratedwords = df.sentence.str.cat(sep=' ')
words = word_tokenize(concentratedwords)

all_words =[]
stop_words = set(stopwords.words("English"))
for w in words: 
    if w not in stop_words:
        all_words.append(w)
all_words = nltk.FreqDist(all_words)
len(all_words)

neg_words= []
concentratednegwords = dfneg.sentence.str.cat(sep=' ')
negativewords = word_tokenize(concentratednegwords)
for w in negativewords: 
    if w not in stop_words:
        neg_words.append(w)

len(nltk.FreqDist(neg_words))
neg_words = nltk.FreqDist(neg_words)
neg_wordsdf = pd.DataFrame.from_dict(neg_words, orient='index', columns=['words'])
neg_wordsdf.words.sort_values(ascending=False)

pos_words=[]
concentratedposwords = dfpos.sentence.str.cat(sep=' ')
positivewords = word_tokenize(concentratedposwords)
for w in positivewords:
    if w not in stop_words:
        pos_words.append(w)
#Compare the list of positive words to the list of negative words and return a list of words found only in the negative word list
negonly = []
for word in neg_words:
    if word not in pos_words:
        negonly.append(word)
keywords = ['bad', 'disgusting', 'terrible', 'not good', 'no good', 'trash', 'garbage', 'horrible', 'waste']
for key in keywords:
    negonly.append(key)
negonly

['unless',
 'converter',
 'tied',
 'conversations',
 'lasting',
 '45',
 'major',
 'jiggle',
 'dozen',
 'hundred',
 'imagine',
 'fun',
 'needless',
 'wasted',
 'waste',
 'seperated',
 'mere',
 'ft',
 'excessive',
 'garbled',
 'odd',
 'advise',
 'fooled',
 'clicks',
 'wonder',
 'mechanism',
 'website',
 'followed',
 'directions',
 'commercials',
 'misleading',
 'mother',
 'didnt',
 'instructions',
 'pull',
 'earphone',
 'breakage',
 'unacceptible',
 'unusable',
 'moving',
 'freeway',
 'speed',
 'contract',
 'hate',
 'mins',
 'short',
 'poor',
 'worthless',
 'garbage',
 'mind',
 'gon',
 'na',
 'arguing',
 'returned',
 'disappointed',
 'bad',
 'essentially',
 'forget',
 'microsofts',
 'tech',
 'support',
 'particular',
 'angle',
 'party',
 'clearly',
 'drawback',
 'player',
 'cover',
 'pause',
 'skip',
 'songs',
 'lock',
 'week',
 'later',
 'activated',
 'suddenly',
 'died',
 'bmw',
 'fairly',
 'quiet',
 'hearing',
 'person',
 'saying',
 'd807',
 'wrongly',
 'longer',
 'runs',
 'broke',
 '

In [24]:
#Use the Bernoulli classifier to fit the prediction to the imdb review list.
df2=df
for key in negativelist:
    df2[str(key)]=df2.sentence.str.contains(' '+str(key)+' ', case=False)
data = df2[negativelist]
target = df2['positive']

bnb = BernoulliNB()
bnb.fit(data, target)
y_pred = bnb.predict(data)

print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()))

Number of mislabeled points out of a total 1000 points : 309


In [23]:
df2=df
for key in negonly:
    df2[str(key)]=df2.sentence.str.contains(' '+str(key)+' ', case=False)
data = df2[negonly]
target = df2['positive']

bnb = BernoulliNB()
bnb.fit(data, target)
y_pred = bnb.predict(data)

print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()))

Number of mislabeled points out of a total 1000 points : 307


In [9]:
confusion_matrix(target, y_pred)

array([[193, 307],
       [  0, 500]], dtype=int64)

# Yelp Dataset
Now we will try the classifier on the Yelp data sets to see how effective it is at detecting negative sentiment.

In [10]:
data_path=('file:///C:/Users/JChaotogo/Desktop/New%20folder/sentiment%20labelled%20sentences/yelp_labelled.txt')
yelp = pd.read_csv(data_path, delimiter='\t', header=None)
yelp.columns=['reviews','positive'] 


In [11]:
yelp.head()
yelp.reviews.describe()

count                                  1000
unique                                  996
top       I would not recommend this place.
freq                                      2
Name: reviews, dtype: object

In [12]:
yelp['reviews'] = yelp.reviews.str.lower()
yelp['reviews'] = yelp.reviews.str.replace("'", '')
for character in specialcharacters:
    yelp['reviews'] = yelp.reviews.str.replace(character, ' ')
yelp.head()

Unnamed: 0,reviews,positive
0,wow loved this place,1
1,crust is not good,0
2,not tasty and the texture was just nasty,0
3,stopped by during the late may bank holiday of...,1
4,the selection on the menu was great and so wer...,1


In [13]:
for key in negonly:
    yelp[str(key)]=yelp.reviews.str.contains(' '+str(key)+' ', case=False)

yelptarget = yelp['positive']
data = yelp[negonly]
bnb.fit(data, yelptarget)
y_pred2 = bnb.predict(data)

print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (yelptarget != y_pred2).sum()))

Number of mislabeled points out of a total 1000 points : 284


In [14]:
confusion_matrix(yelptarget, y_pred2)

array([[230, 270],
       [ 14, 486]], dtype=int64)

In [15]:
data_path=('file:///C:/Users/JChaotogo/Desktop/New%20folder/sentiment%20labelled%20sentences/imdb_labelled.txt')
imdb = pd.read_csv(data_path, delimiter='\t', header=None)
imdb.columns=['reviews','positive'] 
imdb.head()

Unnamed: 0,reviews,positive
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [16]:
#Cleaning imdb reviews
imdb['reviews'] = imdb.reviews.str.lower()
imdb['reviews'] = imdb.reviews.str.replace("'", '')
for character in specialcharacters:
    imdb['reviews'] = imdb.reviews.str.replace(character, ' ')
len(imdb.reviews)

748

In [17]:
#Using our negonly list from the amazon reviews, we use fit the data to the imdb target.
for key in negonly:
    imdb[str(key)]=imdb.reviews.str.contains(' '+str(key)+' ', case=False)
    imdb[str(key)]=imdb.reviews.str.contains(str(key)+' ', case=False)

imdbtarget = imdb['positive']
imdbdata = imdb[negonly]
bnb.fit(imdbdata, imdbtarget)
y_pred3 = bnb.predict(imdbdata)

print("Number of mislabeled points out of a total {} points : {}".format(
    imdbdata.shape[0],
    (imdbtarget != y_pred3).sum()))
print(confusion_matrix(imdbtarget, y_pred3))

Number of mislabeled points out of a total 748 points : 205
[[176 186]
 [ 19 367]]


In [18]:
print('negative reviews', imdb[imdb.positive==0].count().head(1))
print('positive reviews', imdb[imdb.positive==1].count().head(1))
#There appears to be no class imbalance, but this classifier seems to be better at identifying negative reviews over positive.

negative reviews reviews    362
dtype: int64
positive reviews reviews    386
dtype: int64


We had 15 type 2 errors where reviews were identified as positive when they were actually negative.
We also incorrectly classified 214 positive reviews as negative. Type 1 errors.
Sensitivity of this model is 371/386 whereas our specificity is much lower at 148/362. This makes sense because the features in our classifier are the words that were not present in the positive sentiment reviews.

Because we are already pretty good at identifying negative sentiment, we can improve the performance of the classifier by removing features that are more generalized such as numbers or neutral words.

In [19]:
nulist=[]
for item in negonly:
    if item.isalpha()==True:
        nulist.append(item)
len(nulist)
neutral = ['website', 'freeway', 'mins', 'ft', 'in', 'conversations', 
           'party', 'songs', 'microsofts', 'person', 'music', 'buyer'
           'takes', 'minute', 'mail', 'check', 'tell', 'felt', 'experience', 'brand']
for word in nulist:
    if word in neutral:
        nulist.remove(word)

len(nulist)

697

In [20]:
#classifier with features in nulist
#First we test it on the imdb reviews.

for key in nulist:
    imdb[str(key)]=imdb.reviews.str.contains(' '+str(key)+' ', case=False)
    imdb[str(key)]=imdb.reviews.str.contains(str(key)+' ', case=False)

imdbtarget = imdb['positive']
imdbdata = imdb[nulist]
bnb.fit(imdbdata, imdbtarget)
y_pred4 = bnb.predict(imdbdata)

print("Number of mislabeled points out of a total {} points : {}".format(
    imdbdata.shape[0],
    (imdbtarget != y_pred4).sum()))
print(confusion_matrix(imdbtarget, y_pred4))
#This classifier has more type 2 errors than the original.

Number of mislabeled points out of a total 748 points : 206
[[175 187]
 [ 19 367]]


In [21]:
#Nouns often do not have significant impact on sentiment. For this classifier, we will use NLTK to remove the nouns from the
#negative word list to observe the effect.
is_noun = lambda pos: pos[:2] == 'NN'
nounsremoved = [word for (word, pos) in nltk.pos_tag(negonly) if is_noun(pos)==False]
len(nounsremoved)

for key in nounsremoved:
    imdb[str(key)]=imdb.reviews.str.contains(' '+str(key)+' ', case=False)
    imdb[str(key)]=imdb.reviews.str.contains(str(key)+' ', case=False)

imdbtarget = imdb['positive']
imdbdata = imdb[nounsremoved]
bnb.fit(imdbdata, imdbtarget)
y_pred5 = bnb.predict(imdbdata)

print("Number of mislabeled points out of a total {} points : {}".format(
    imdbdata.shape[0],
    (imdbtarget != y_pred5).sum()))
print(confusion_matrix(imdbtarget, y_pred5))

#This classifier had significantly more type 1 errors, but less type 2 errors. Removing some of the neutral verbs and adjectives
#may decrease the number of type 1 errors.

Number of mislabeled points out of a total 748 points : 218
[[159 203]
 [ 15 371]]


In [22]:
#Here we're going to test for overfitting using train/test subsets.

X_train, X_test, y_train, y_test = train_test_split(imdbdata, imdbtarget, test_size=0.2, random_state=20)
print('With 20% Holdout: ' + str(bnb.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(bnb.fit(imdbdata, imdbtarget).score(imdbdata, imdbtarget)))

#our model seems to have some overfitting problems due to the method I used to gather features.

With 20% Holdout: 0.64
Testing on Sample: 0.7085561497326203


# Conclusion
The accuracy of the model tends to hold at around 64-71% which is only marginally better than randomly guessing. The model seems to overfit quite a bit which is probably due to the method we used to obtain the features. Choosing only words that are not in the positive list as features creates overfitting towards the negative reviews. More advanced classifiers can be implemented to combat this. The model's performance drops below 50% when trying to correctly identify positive reviews. We can probably increase the performance by comparing the performance to other classifiers in the future.