In [1]:
# Sentiment Analysis

import nltk
import numpy as np
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from bs4 import BeautifulSoup
import lxml

In [2]:
wordnet_lemmatizer = WordNetLemmatizer() # this does stemming dogs to dog etc.

In [3]:
stopwords = set(w.rstrip() for w in open('stopwords.txt')) # get rid of words we don't need like 'and' 'the' etc.

In [4]:
positive_reviews = BeautifulSoup(open('sorted_data_acl/electronics/positive.review'), "lxml")

In [5]:
positive_reviews

<html><body><review>
<unique_id>
B00006HYUB:everyone_should_own_one:d._john_"looser"
</unique_id>
<asin>
B00006HYUB
</asin>
<product_name>
APC Back-UPS ES 500 Backup Battery and Surge Protector: Electronics
</product_name>
<product_type>
electronics
</product_type>
<helpful>
3 of 3
</helpful>
<rating>
5.0
</rating>
<title>
Everyone should own one
</title>
<date>
July 31, 2006
</date>
<reviewer>
D. John "Looser"
</reviewer>
<reviewer_location>
PA
</reviewer_location>
<review_text>
I purchased this unit due to frequent blackouts in my area and 2 power supplies going bad.  It will run my cable modem, router, PC, and LCD monitor for 5 minutes.  This is more than enough time to save work and shut down.   Equally important, I know that my electronics are receiving clean power.

I feel that this investment is minor compared to the loss of valuable data or the failure of equipment due to a power spike or an irregular power supply.

As always, Amazon had it to me in &lt;2 business days
</review

In [6]:
positive_reviews = positive_reviews.findAll('review_text')

In [7]:
negative_reviews = BeautifulSoup(open('sorted_data_acl/electronics/negative.review'), "lxml")

In [8]:
negative_reviews

<html><body><review>
<unique_id>
B00005UKBG:bad:j._brodeur_"disgusted_consumer"
</unique_id>
<asin>
B00005UKBG
</asin>
<product_name>
Atlantic 1316 CD Storage Case (110-Capacity, Wave): Electronics
</product_name>
<product_type>
electronics
</product_type>
<helpful>
15 of 16
</helpful>
<rating>
2.0
</rating>
<title>
bad
</title>
<date>
May 4, 2005
</date>
<reviewer>
J. Brodeur "disgusted consumer"
</reviewer>
<reviewer_location>
</reviewer_location>
<review_text>
cons
tips extremely easy on carpet and if you have a lot of cds stacked at the top

poorly designed, it is a vertical cd rack that doesnt have individual slots for cds, so if you want a cd from the bottom of a stack you have basically pull the whole stack to get to it

putting it together was a pain, the one i bought i had to break a piece of metal just to fit it in its guide holes.

again..poorly designed... doesnt even fit cds that well, there are gaps, and the cd casses are loose fitting

pros
..........
i guess it can hold

In [9]:
negative_reviews = negative_reviews.findAll('review_text')

In [10]:
np.random.shuffle(positive_reviews)

In [11]:
positive_reviews = positive_reviews[:len(negative_reviews)]

In [12]:
def my_tokenizer(s):
    s = s.lower()
    tokens = nltk.tokenize.word_tokenize(s)
    tokens = [t for t in tokens if len(t) > 2]
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens]
    tokens = [t for t in tokens if t not in stopwords]
    return tokens


word_index_map = {}
current_index = 0
positive_tokenized = []
negative_tokenized = []

In [13]:
for review in positive_reviews:
    tokens = my_tokenizer(review.text)
    positive_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1

for review in negative_reviews:
    tokens = my_tokenizer(review.text)
    negative_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1

In [14]:
def tokens_to_vector(tokens, label):
    x = np.zeros(len(word_index_map) + 1)
    for t in tokens:
        i = word_index_map[t]
        x[i] += 1
    x = x / x.sum()
    x[-1] = label
    return x

N = len(positive_tokenized) + len(negative_tokenized)

data = np.zeros((N, len(word_index_map) + 1))

i = 0

for tokens in positive_tokenized:
    xy = tokens_to_vector(tokens, 1)
    data[i,:] = xy
    i += 1
        
for tokens in negative_tokenized:
    xy = tokens_to_vector(tokens, 0)
    data[i,:] = xy
    i += 1
        
np.random.shuffle(data)

X = data[:,:-1]
Y = data[:,-1]

# last 100 rows will be test
Xtrain = X[:-100,]
Ytrain = Y[:-100,]
Xtest = X[-100:,]
Ytest = Y[-100:,]


model = LogisticRegression()
model.fit(Xtrain, Ytrain)

print("Classification rate: ", model.score(Xtest, Ytest))

Classification rate:  0.64


In [15]:
# take a look at the weight of each word
threshold = 0.5
for word, index in word_index_map.items():
    weight = model.coef_[0][index]
    if weight < threshold or weight < threshold:
        print(word, weight)

this -0.317997074536
storage 0.0670858717914
device 0.118834119232
store 0.22884417663
document 0.115092865705
travel 0.345044781885
purse 0.0326027610577
lanyard 0.0373713037791
help -0.268284094333
track -0.0632696812436
drive 0.175964883984
worth 0.381958520691
paid 0.24876805805
soundwise 0.00284743899175
klipsch -0.0184542280827
promedia 0.0145170486739
ultra 0.0756567705994
5.1s 0.00284743899175
supposed 0.0399167408379
bass 0.315640659107
powerful 0.0100616559655
overpowering 0.0246127700701
adjusted 0.0174460210748
crystal 0.106277031792
decent 0.185525536353
card -0.419715241017
setting 0.0209985627603
tweaked 0.00284743899175
perfection 0.00609300422185
setup 0.0495668910417
wa -1.603132513
breeze 0.0508155388795
instruction -0.18287331019
provided 0.0640371652657
note 0.0632936044152
wire 0.0981625586651
thicker 0.0138540186453
purchased -0.324931535914
picky 0.018761911498
themselves -0.0354890857237
n't -2.13385489936
trouble 0.0124774626866
reaching 0.00284743899175
assum

performs 0.150781055019
zippy 0.00287825277266
regular 0.108184591985
windowing 0.00287825277266
task 0.0130032682597
eclipse 0.0356838702305
photoshop 0.0244476700559
etc. 0.0123236625918
editing 0.0311839830807
stuff 0.179262621935
slight -0.00210760441078
warm 0.00257671613975
cast 0.00479274964236
nvidia 0.0710866291031
panel 0.00224688566698
adjust -0.0439731054716
manually 0.00903943001284
fear -0.0221944544359
course -0.107492927027
dead -0.184374990561
pixel 0.0792654174771
grace 0.00287825277266
lucky 0.0138442501508
duck 0.00287825277266
third -0.0212730437001
gotten 0.120301412725
brightness 0.0516771038454
uniform 0.000758523093851
subtle 0.0343021311669
drop -0.0710698032438
lower -0.064790325465
left 0.0634411246866
edge -0.107029331086
apparent -0.0241577850608
unless -0.246097430564
actively -0.0222990945494
programming -0.00713665656862
browsing -0.00299208110028
ala 0.00287825277266
3840 0.00287825277266
2400 0.00287825277266
strip 0.0304048090096
crazy -0.02564463206

business -0.0663638666641
margin 0.00674686338658
selling -0.134585422957
costing 0.014621980767
panasonic 0.0624145632988
profit 0.015882214185
fixed -0.0835316273903
percentage -0.00496735183896
1280 0.015007963798
somone 0.00385201195498
target -0.0505115872269
understand -0.0781786420062
nozzle 0.00385201195498
vey 0.00385201195498
mat 0.00385201195498
glossy 0.124982626789
proper -0.0457799360391
cue 0.00385201195498
proof 0.0114318347204
artist -0.00175876941729
graphic 0.00950536112845
designer -0.00282222868999
blessing 0.00571127172871
cost -0.237274384506
400 -0.0259614699993
parchase 0.00385201195498
compare 0.102439846772
profesional 0.00385201195498
workhorse 0.00385201195498
2000 0.000827890498391
cut -0.134601158644
minus 0.0297909571284
generous 0.0101414561296
amout 0.0062104935285
realestate 0.00839923704367
acomadate 0.00385201195498
1/2 0.0282370009606
payolla 0.00385201195498
cartriges 0.027766629856
pricey -0.00656956865352
dose -0.0395202098699
expences 0.0038520

thats 0.0010515596668
isnt 0.00386487662442
saying -0.211482463116
mone 0.0509790863506
significantly -0.0426206395523
drawing 0.0138110766126
adobe 0.0310365908815
illustrator 0.0131930607588
macromedia 0.0131930607588
brick 0.0362965524392
school 0.0760291759993
surely 0.0116550206964
storing 0.0878301268488
palm -0.132601153435
pilot -0.0129460663971
transferring -0.00594097204002
kept -0.0230226739115
telling -0.0318624494994
sanso 0.0
outweighed 0.0
returning -0.539318087051
hardly -0.0294549790781
pressesd 0.0
feedback -0.0651648230539
3am 0.0
tuffwrap 0.0
accent 0.00264159308091
xtrememac- 0.0
shack -0.0279395545726
tough -0.0715789688364
rounded 0.0113020729371
lining 0.0
protection -0.0532787034636
tunebase 0.0401242739447
lay 0.026361724676
flip -0.00120451334222
reach -0.0358148044687
bleed 0.0
happened -0.17239461485
nasty -0.0113964440922
hiss -0.0846851752113
fingertip 0.0
gut -0.029354484771
dealing -0.0491140696377
frustration -0.111434392785
greeting 0.0149922114304
al

whit 0.00801155109173
primary -0.0185854298044
extending 0.00801155109173
existing -0.0211347185793
bridge 0.0438938925559
trackman 0.0324095636493
corded 0.147457125913
sends/recieves 0.0111635729978
10d 0.034097879439
30d 0.0681957588781
s40 0.034097879439
1600 0.0283458732304
jpgs 0.034097879439
receives 0.0578483448976
proved -0.00629417785354
medialife 0.00375370991092
realplayer 0.00375370991092
divx -0.00152768132667
windvd 0.00375370991092
powerdvd 0.00375370991092
worship 0.00375370991092
hence -0.00800149219321
firefox -0.00440792569638
biggies 0.00375370991092
setpoint 0.0394104271501
assign -0.0199239369499
keystroke -0.0108114718076
alt+right 0.00375370991092
alt+left 0.00375370991092
a-browsin 0.00375370991092
duracell 0.0106532449893
admittedly 0.00375370991092
dorm-dweller 0.00375370991092
lie -0.0129747450259
cheap-plasticky 0.00375370991092
odd 0.0384324259051
ergonomic 0.00491514584698
troublesome -0.00349569319302
dvd/tv 0.0151599043897
blasting 0.00975647229285
lea

suffered -0.00109761470885
dried 0.0112381911629
clogging 0.0251030731532
truth -0.020892354632
technically 0.0359717501898
proficient 0.0140643442743
scored 0.0140643442743
gross 0.0140643442743
ignorance 0.0140643442743
astonishing 0.0140643442743
framed 0.0140643442743
techno-challenged 0.0140643442743
accidentaly 0.0448991844871
sat 0.0358847566739
struggle -0.00437146029141
sleakest 0.0105184643915
best-looking 0.0105184643915
apple-white 0.0105184643915
drone 0.0105184643915
focus 0.0116089227782
footprint -0.00562497104288
unimposing 0.0105184643915
updating -0.0189326595859
par 0.0209179868714
160gb 0.0085098620025
149 0.0153111691951
actuality 0.0085098620025
moving -0.0975594214861
incident -0.0109130790185
100g 0.0085098620025
fart 0.0085098620025
wal-mart -0.0307623026828
114 0.0085098620025
seagate 0.017677213136
300gb 0.0085098620025
altitude 0.00207898096171
shattering 0.0
30g 0.0
temperature -0.0220942543061
domain 0.0
spin -0.0620151963705
moisture 0.0
condensation 0.0

journalist 0.0132129801883
tote 0.0425940844144
microcassette 0.0132129801883
wherever 0.0445396110994
microphone/recording 0.0132129801883
interview 0.00830871028184
transcribe 0.0132129801883
.avi 0.0132129801883
rangemax -0.00895154733771
wpn311 0.0125959234764
automated -0.0161607587805
completion 0.00419864115881
utilization 0.00839728231762
executable 0.00419864115881
named -8.38045718093e-05
wlancfg5.exe 0.00839728231762
routine 0.00419864115881
csrss.exe 0.00419864115881
net.exe 0.00419864115881
net1.exe 0.00419864115881
emailed -0.0302849935174
manager -0.0132015567306
wlancfg5 0.00419864115881
re-booted 0.00419864115881
de-install 0.00419864115881
re-install -0.00539448304588
mimo 0.0142427295359
zero -0.0167809515128
configurator 0.00419864115881
wzc 0.00839728231762
microsoft.com 0.00419864115881
871122 0.00419864115881
wizard 0.0101350609353
ok. -0.0789560927036
execution 0.0146453913698
unwatchable -0.000669070001902
robust 0.0265356432577
unchoppy 0.0146453913698
speaker

giveaway 0.0152675749893
amazingly -0.00151462543429
businessweek 0.0181921208803
september -0.00882996582677
reviewed -0.0098010300586
rank 0.00606404029345
nav 0.0089641179558
excerpt 0.00606404029345
article 0.00113740918569
streetpilot 0.00606404029345
2720 0.0213071887838
1,000 0.00733001085293
approach -0.0211506546764
preferred 0.00725084219917
5-inch 0.00123845813826
encounter 0.0254541546725
reroute 0.00606404029345
importantly -0.0194911015394
freeing 0.00791124833945
inobtrusive 0.00791124833945
530 0.00791124833945
tryed 0.0123788952611
workday 0.0237402479627
soreness 0.01414904304
sits -0.00305916899908
lying 0.0184030568386
fadeout 0.01414904304
formed 0.0186852754503
dpi 0.0186852754503
raise 0.0351288204513
ibm 0.00209199371503
staring 0.00340054983551
microphone -0.196968665774
vocal 0.0138567096544
admitt 0.0269115939889
humming -0.034681062073
hissing 0.0251095123259
idle -0.0023470215528
ohtherwise 0.0269115939889
team -0.00230939959557
permanently -0.0035703169977

asio 0.00508148010657
audigy 0.0152444403197
esi 0.00508148010657
juli 0.00508148010657
m-audio 0.00508148010657
theme 0.00508148010657
beacuse 0.00508148010657
thier -0.0348852429251
warrnty 0.00508148010657
could't 0.00508148010657
surprize 0.00251936596022
4121s 0.0
admire 0.0
faithful 0.0
satsfied 0.0
referbished 0.00478546902596
hd841 0.00239273451298
642 0.00239273451298
reveiw 0.00108758654028
850 0.00239273451298
ups -0.0882296059185
upconvert 0.0143173715779
480p 0.00478546902596
honest -0.02483027786
720p 0.00239273451298
explaining 0.00239273451298
index 0.00239273451298
offbrand 0.00239273451298
perfomace 0.00239273451298
discription 0.00239273451298
suprise 0.00239273451298
mp4 0.00239273451298
instread 0.00239273451298
flack 0.00239273451298
chane 0.00239273451298
technosavy 0.00239273451298
ray 0.00239273451298
1080p 0.00239273451298
re 0.00239273451298
20-30 -0.00533439350411
duet 0.00371204112567
july -0.0175090087868
36.95 0.00371204112567
dealmac 0.00371204112567
2-p

hts800 0.0116795816126
high-end 0.0258785291854
pre-amp 0.0116795816126
high-pitched 0.0116795816126
acoustically 0.0116795816126
isolated 0.00304547939919
tcx800 0.0218570505549
deregistering 0.0218570505549
deregisters 0.0218570505549
microphoto 0.0109339312351
attachable 0.00584017679958
play/pause/ff/rew 0.00584017679958
comforable 0.00772326035196
improvment 0.00772326035196
skill 0.00772326035196
cheesy -0.0190169729556
gui 0.0023711489517
homemade 0.00174353759253
spends 0.0029786621982
w/answering 0.0160378335536
locating -0.0034599876775
misplacing 0.0160378335536
spoiled 0.0160378335536
abilty 0.0160378335536
t5320 0.0216529733572
t5400 0.0216529733572
t5420 0.0108264866786
t5600 0.0216529733572
t5620 0.0108264866786
t5700 0.0216529733572
t5720 0.0108264866786
t5800 0.0216529733572
t5820 0.0108264866786
2-way 0.0108264866786
53615 0.0108264866786
t4800 0.0108264866786
t4900 0.0108264866786
t5000 0.0108264866786
t5500 0.0108264866786
t5900 0.0108264866786
t6500 0.0108264866786

2.1.1 -0.0135385238033
.blue -0.0135385238033
bcuz -0.0135385238033
onoff -0.0135385238033
.simply -0.0135385238033
1.1. -0.0141848412322
comke -0.0141848412322
adapte -0.0141848412322
backplane -0.00308697084863
solving -0.0502020267755
disappearing -0.0280685485799
deduced -0.00308697084863
self-powered -0.00308697084863
messing -0.00941125612933
soundsticks -0.0230788752
faq -0.0717628783864
w2k/dell -0.0269404649689
senior -0.0124936612748
intelligble -0.0033527070157
resort -0.0148468381453
dwl-g650 -0.00670541403139
utterly -0.0359384460853
well- -0.00976416581423
online- -0.00976416581423
181 -0.00624098446093
honored -0.0107554671439
rma -0.0495883337311
m230 -0.0187229533828
c150 -0.0187229533828
rave -0.0500595701951
unsuccessful -0.0198794731938
e-reader -0.0198794731938
40mb -0.0151720893658
formatted -0.0151720893658
purport -0.0190896563613
sometime -0.0063242852807
problematic -0.0225589673081
scared -0.0254051108453
frisbee -0.0063242852807
template -0.0278071299718
sho

6010 -0.0184111851185
evertime -0.0184111851185
initializing -0.0184111851185
working.i -0.0184111851185
embarrassed -0.0503031952157
sp2 -0.0739451998975
-signal 0.0
.except -0.0106986390502
c'mon 0.0
non-direction 0.0
deducted 0.0
-told 0.0
firewall -0.0579562259359
deduct 0.0
-stupid 0.0
hexidecimal 0.0
alphabet 0.0
g-z 0.0
a-z 0.0
-slow 0.0
impatient 0.0
hibernate 0.0
disables 0.0
alongside -0.00948423539571
wrt54gs -0.0335285024982
flaky -0.0303344436297
rectify -0.00948423539571
dropout -0.0401188961328
ilove -0.0178330098573
abanded -0.0178330098573
brudle -0.0178330098573
becuse -0.0178330098573
wast -0.0178330098573
money.mine -0.0178330098573
ginger -0.0178330098573
silver/grey -0.0178330098573
newborn -0.0178330098573
kittin -0.0178330098573
o.k. -0.0149089052563
refurbs -0.0149089052563
bandwidth -0.110066468247
poor-to-marginal -0.0320518373661
wallmart -0.0170129699163
disguised -0.00567098997209
roof -0.0141545915228
cabling -0.0197571307174
uhf -0.00567098997209
vhf -0.

disposable -0.0137678197988
commodity -0.00549366520941
replacement/repair -0.00549366520941
dell-pod -0.00549366520941
apple/ipod -0.00549366520941
household -0.0184096322453
nov. -0.0143978877114
informs -0.00719894385572
29th -0.00719894385572
anticipated/promised -0.00719894385572
discouraging -0.00719894385572
800-201-7575. -0.00719894385572
metiocracy -0.00719894385572
independently -0.014338970884
o/s -0.014338970884
echo-cancellation -0.014338970884
name.. -0.0296579310174
philps -0.0296579310174
re-arrange -0.00721760341656
insistent -0.00721760341656
refurbish -0.00721760341656
thereof -0.00721760341656
jumpdrives -0.0136476178616
jumpdrive -0.0136476178616
keychain -0.0136476178616
multiswitch -0.023877022059
154.00 -0.0119385110295
crapped -0.0119385110295
wor -0.078592299595
caramel 0.0
veiled 0.0
indistinct 0.0
abundant 0.0
stax 0.0
superclean 0.0
clinical 0.0
mdr-w20g 0.0
auditioning 0.0
-small -0.00992108421232
-recording -0.00992108421232
-hold -0.00992108421232
-bugs 

'far -0.0175005756198
reseting -0.0175005756198
lowercase -0.01218793994
s510 -0.01218793994
subtract -0.01218793994
sheilding -0.0131399574425
12.0 -0.0355654077163
14.0 -0.0355654077163
10.0 -0.0355654077163
deceived -0.0249843151785
30.00 -0.0249843151785
existent -0.0249843151785
clix -0.0196142759005
ooze -0.0420280668952
print-outs -0.0146796075027
two+ -0.0146796075027
system/circuit -0.0146796075027
heavier-duty -0.0146796075027
4750 -0.0146796075027
smoothed -0.0102604506691
fanned -0.0102604506691
champ -0.0102604506691
laying -0.0117635781977
6682. -0.0140537428851
isse -0.0140537428851
w/xp -0.0411225750009
apps -0.0454173448722
dye 0.0
deterioration 0.0
stained 0.0
'correcting -0.0151226575878
undisclosed -0.0151226575878
heartattack -0.0217201668058
meter.. -0.0300540229269
80-year-old -0.0100194641363
cope -0.021359813697
typewriter -0.02889546149
overlapping -0.0100194641363
esoteric -0.0100194641363
re-program -0.0100194641363
40-year-old -0.0100194641363
e-bay -0.0100

mstasad -0.00885653321747
upset.. -0.00885653321747
0~1 -0.0284448564883
writen 0.0
ultras 0.0
smae 0.0
1800mah 0.0
reconized 0.0
landfill 0.0
differnt 0.0
sometiems 0.0
harddrives 0.0
ht27546 -0.0103208970544
outputwhich -0.0103208970544
caption -0.0103208970544
italk -0.035936338585
wowed -0.035936338585
partly 0.0
handlebar 0.0
pedestrian 0.0
helpline 0.0
mintues -0.0124267759632
88.1 -0.0124267759632
88.3 -0.0124267759632
88.5 -0.0124267759632
88.7 -0.0124267759632
.but -0.0156372296054
piercing -0.0156372296054
tolerable -0.0156372296054
recalculates -0.0136963942795
roundabout -0.0136963942795
checkout -0.0106393135119
mor -0.0106393135119
whaz 0.0
dat 0.0
5.0gb -0.0247116044939
forgive -0.0247116044939
best..unfortunately -0.00413288660262
wysiwyg..disappointing -0.00413288660262
miniature -0.00413288660262
positively -0.00413288660262
thorough -0.00413288660262
thorougly -0.00413288660262
buying..so -0.00413288660262
lopsided -0.00413288660262
reviews.. -0.00413288660262
beawar