In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy
import sklearn
import seaborn as sns
from sklearn.naive_bayes import BernoulliNB
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [3]:
data_path=('file:///C:/Users/JChaotogo/Desktop/New%20folder/sentiment%20labelled%20sentences/amazon_cells_labelled.txt')
df=pd.read_csv(data_path, delimiter='\t', header=None)
df.columns=['sentence','positive'] 
df

Unnamed: 0,sentence,positive
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1
5,I have to jiggle the plug to get it to line up...,0
6,If you have several dozen or several hundred c...,0
7,If you are Razr owner...you must have this!,1
8,"Needless to say, I wasted my money.",0
9,What a waste of money and time!.,0


In [4]:
#Here I'm going to clean the data to make it uniform
#First, we want to lower-case all words
df['sentence']=df['sentence'].str.lower()
#Then we want to replace special characters
specialcharacters=['!', '?', '.', '$', '#', '-', '(', ')', '*', '&', '_', 
                   '+', '=', '"', '<', '>', ':', ';', '~', '`', '@', ',','/', '[',']']
for character in specialcharacters:
    df['sentence']=df.sentence.str.replace(character, ' ')

df['sentence']=df.sentence.str.replace("'", '')
df['sentence']=df.sentence.str.strip()

In [5]:
dfpos=df[df.positive==1]
dfneg=df[df.positive==0]
#dfpos['sentence'] = dfpos.apply(lambda row: nltk.word_tokenize(row['sentence']), axis=1)
#dfneg['sentence'] = dfneg.apply(lambda row: nltk.word_tokenize(row['sentence']), axis=1)

dfpos

Unnamed: 0,sentence,positive
1,good case excellent value,1
2,great for the jawbone,1
4,the mic is great,1
7,if you are razr owner you must have this,1
10,and the sound quality is great,1
11,he was very impressed when going from the orig...,1
13,very good quality though,1
15,highly recommend for any one who has a blue to...,1
17,so far so good,1
18,works great,1


In [6]:
concentratedwords = df.sentence.str.cat(sep=' ')
words = word_tokenize(concentratedwords)

all_words =[]
stop_words = set(stopwords.words("English"))
for w in words: 
    if w not in stop_words:
        all_words.append(w)
all_words = nltk.FreqDist(all_words)
len(all_words)

neg_words= []
concentratednegwords = dfneg.sentence.str.cat(sep=' ')
negativewords = word_tokenize(concentratednegwords)
for w in negativewords: 
    if w not in stop_words:
        neg_words.append(w)

len(nltk.FreqDist(neg_words))
neg_words = nltk.FreqDist(neg_words)
neg_wordsdf = pd.DataFrame.from_dict(neg_words, orient='index', columns=['words'])
neg_wordsdf.words.sort_values(ascending=False)

pos_words=[]
concentratedposwords = dfpos.sentence.str.cat(sep=' ')
positivewords = word_tokenize(concentratedposwords)
for w in positivewords:
    if w not in stop_words:
        pos_words.append(w)
#Compare the list of positive words to the list of negative words and return a list of words found only in the negative word list
negonly = []
for word in neg_words:
    if word not in pos_words:
        negonly.append(word)
(negonly)

['unless',
 'converter',
 'tied',
 'conversations',
 'lasting',
 '45',
 'major',
 'jiggle',
 'dozen',
 'hundred',
 'imagine',
 'fun',
 'needless',
 'wasted',
 'waste',
 'seperated',
 'mere',
 'ft',
 'excessive',
 'garbled',
 'odd',
 'advise',
 'fooled',
 'clicks',
 'wonder',
 'mechanism',
 'website',
 'followed',
 'directions',
 'commercials',
 'misleading',
 'mother',
 'didnt',
 'instructions',
 'pull',
 'earphone',
 'breakage',
 'unacceptible',
 'unusable',
 'moving',
 'freeway',
 'speed',
 'contract',
 'hate',
 'mins',
 'short',
 'poor',
 'worthless',
 'garbage',
 'mind',
 'gon',
 'na',
 'arguing',
 'returned',
 'disappointed',
 'bad',
 'essentially',
 'forget',
 'microsofts',
 'tech',
 'support',
 'particular',
 'angle',
 'party',
 'clearly',
 'drawback',
 'player',
 'cover',
 'pause',
 'skip',
 'songs',
 'lock',
 'week',
 'later',
 'activated',
 'suddenly',
 'died',
 'bmw',
 'fairly',
 'quiet',
 'hearing',
 'person',
 'saying',
 'd807',
 'wrongly',
 'longer',
 'runs',
 'broke',
 '

In [7]:
df2=df
for key in negonly:
    df2[str(key)]=df2.sentence.str.contains(' '+str(key)+' ', case=False)
data = df2[negonly]
target = df2['positive']

bnb = BernoulliNB()
bnb.fit(data, target)
y_pred = bnb.predict(data)

print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()))

Number of mislabeled points out of a total 1000 points : 298


#Attempt at creating a more accurate model. I wanted to find features based on each word and the corresponding sentiment.
#The features would then be used to train different classifiers to predict whether a review is +/-.
#find_features doesnt seem to work. Returns all False values.

positivetuple = [tuple(entry) for entry in dfpos.values]
negativetuple = [tuple(entry) for entry in dfneg.values]
word_features = list(all_words.keys())
word_features

documents = []
documents.append(positivetuple)
documents.append(negativetuple)
documents

def find_features(document):
    words = document
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features
print(find_features(documents[:10]))

for review, sentiment in documents:
    featuresets = (find_features(review), sentiment)
