In [1]:
from __future__ import print_function, division
from future.utils import iteritems
from builtins import range

In [2]:
import nltk
import numpy as np
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from bs4 import BeautifulSoup

In [3]:
wordnet_lemmatizer = WordNetLemmatizer()

In [4]:
stopwords = set(w.rstrip() for w in open('stopwords.txt'))

In [5]:
positive_reviews = BeautifulSoup(open('positive.review.txt').read())
positive_reviews = positive_reviews.findAll('review_text')

In [7]:
negative_reviews = BeautifulSoup(open('negative.review.txt').read())
negative_reviews = negative_reviews.findAll('review_text')

In [8]:
np.random.shuffle(positive_reviews)
positive_reviews = positive_reviews[:len(negative_reviews)]

In [9]:
def my_tokenizer(s):
    s = s.lower() # downcase
    tokens = nltk.tokenize.word_tokenize(s) # split string into words (tokens)
    tokens = [t for t in tokens if len(t) > 2] # remove short words, they're probably not useful
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens] # put words into base form
    tokens = [t for t in tokens if t not in stopwords] # remove stopwords
    return tokens

In [10]:
word_index_map = {}
current_index = 0
positive_tokenized = []
negative_tokenized = []
orig_reviews = []

In [11]:
for review in positive_reviews:
    orig_reviews.append(review.text)
    tokens = my_tokenizer(review.text)
    positive_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1

In [12]:
for review in negative_reviews:
    orig_reviews.append(review.text)
    tokens = my_tokenizer(review.text)
    negative_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1

In [13]:
print("len(word_index_map):", len(word_index_map))


len(word_index_map): 11081


In [14]:
def tokens_to_vector(tokens, label):
    x = np.zeros(len(word_index_map) + 1) # last element is for the label
    for t in tokens:
        i = word_index_map[t]
        x[i] += 1
    x = x / x.sum() # normalize it before setting label
    x[-1] = label
    return x

In [15]:
N = len(positive_tokenized) + len(negative_tokenized)

In [16]:
data = np.zeros((N, len(word_index_map) + 1))
i = 0
for tokens in positive_tokenized:
    xy = tokens_to_vector(tokens, 1)
    data[i,:] = xy
    i += 1

In [17]:
for tokens in negative_tokenized:
    xy = tokens_to_vector(tokens, 0)
    data[i,:] = xy
    i += 1

In [18]:
np.random.shuffle(data)


In [19]:
X = data[:,:-1]
Y = data[:, -1]

In [20]:
Xtrain = X[:-100,]
Ytrain = Y[:-100,]
Xtest = X[-100:,]
Ytest = Y[-100:,]

In [21]:
model = LogisticRegression()
model.fit(Xtrain, Ytrain)
print("Train accuracy:", model.score(Xtrain, Ytrain))
print("Test accuracy:", model.score(Xtest, Ytest))



Train accuracy: 0.783157894737
Test accuracy: 0.82


In [22]:
model = AdaBoostClassifier()
model.fit(Xtrain,Ytrain)
print("Train accuracy:", model.score(Xtrain, Ytrain))
print("Test accuracy:", model.score(Xtest, Ytest))

Train accuracy: 0.816315789474
Test accuracy: 0.73


In [23]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(Xtrain,Ytrain)
print("Train accuracy:", model.score(Xtrain, Ytrain))
print("Test accuracy:", model.score(Xtest, Ytest))

Train accuracy: 0.872631578947
Test accuracy: 0.76


In [24]:
threshold = 0.5
for word, index in iteritems(word_index_map):
    weight = model.coef_[0][index]
    if weight > threshold or weight < -threshold:
        print(word, weight)

bought -7.44780709744
this -5.8466453485
phone -7.42463999745
additional -9.08228891423
handset -8.83269219211
eleven -9.38183045324
month -8.10882176383
ago -8.76901944662
start -8.99079807012
wa -6.6715132321
promptly -9.23097466931
replaced -9.07667798681
returned -9.19487884944
service -8.71078615968
system -8.04916145562
fine -8.02551997964
2-story -9.38183045324
2,300 -9.37210716409
ft. -9.37324192109
home -8.23086003281
notice -9.2336085145
reception -8.69878943057
carrying -9.21859963151
closed -9.34034411943
garage -9.36656058997
base -8.7137255193
station -8.73460060767
upstairs -9.35265496411
opposite -9.31332014226
house -8.91637229859
test -9.27927490101
street -9.16129764293
100 -8.77604514729
yard -9.34201547797
dialtone -9.38183045324
located -9.37614237147
bedroom -9.32938454956
facing -9.36025373802
feature -8.23910491446
including -9.0581075348
paging -9.38183045324
call -8.7234262421
transfer -8.83748152076
speakerphone -9.12457021498
able -8.56835023045
110v -9.381

accessory -9.09692744502
attenuator -9.39120720322
soft -9.28221062355
travel -8.68563606354
earwax -9.38511451918
loop -9.33303738655
silvery -9.39120720322
metal -9.35331473845
storage -9.1389671995
winder -9.39120720322
help -9.00147935916
head -8.92472335587
tangling -9.39120720322
satisfactory -9.32485181338
everyday -9.07453860881
listen -8.90344174051
solely -9.32790167319
bass-dominant -9.39120720322
truth -9.36776650062
bass-heavy -9.37522897763
specifically -9.32350807635
bass-y -9.39120720322
well-designed -9.34486270316
fun -9.12985927629
reading -9.03989542279
online -9.11520700218
vs. -9.34316375516
portability -9.30514521063
radio/boom -9.37154518258
tree -9.33548889302
antenna -8.8216878299
foot -9.19411338425
door -9.15072886887
mind -9.15816439026
issue -8.72367974127
satellite -9.2096727771
placed -9.10695264578
kitchen -9.20181416179
hear -8.73483939683
floor -9.20013981792
wish -8.71578331064
random -9.12362591149
instead -8.78949785363
received -8.82679313812
adve

regularly -9.2809900935
gym -9.25468984241
slot -9.13730345642
attaching -9.31480165148
outside -9.0728775323
extension -9.10483119457
supposed -8.95751210149
whatsoever -9.12240112548
excelent -9.15774749369
seller -9.09785918954
thanks -9.05524675174
trackman -9.32892439979
corded -9.10373641182
office -8.78936722768
leave -8.99518316505
sends/recieves -9.37097812842
trackball -9.17838754206
comfortably -9.1973916567
minus -9.30630412629
website -9.16222354434
altec -9.2718964955
lansing -9.29784333082
linked -9.36522271684
seemingly -9.35703103643
unrelated -9.36522271684
runner -9.36522271684
followed -9.32210522262
instruction -9.17479754206
default -9.23206555936
safe -9.26738554085
comment -9.08816993142
morning -9.35733535201
faster -9.08557033894
sweet -9.29358735014
frustrated -9.38663681135
trying -9.05918422675
tell -9.1252433793
piece -8.98112685197
junk -9.32688729961
pulled-in -9.38663681135
relatively -9.27630566642
constantly -9.27649003475
twisted -9.38663681135
angle

greeting -9.36522271684
appearance -9.27142442352
sleek -9.30660857838
modern -9.305934237
accessible -9.37344559021
causing -9.34390759504
respect -9.38502534414
explain -9.34623136492
obvious -9.27511531648
include -9.36745132635
accent -9.3698189687
generated -9.37330854047
learned -9.30329143882
volt -9.3203897134
indicate -9.37173566603
sequence -9.35535513636
regarding -9.28242221288
answering -9.06513571518
intention -9.36876997506
expanding -9.38610701001
numerous -9.34729843067
bonus -9.2542485041
directory -9.26571264586
transfering -9.36523963002
entry -9.3209534301
name -9.0339016568
occupy -9.38610701001
page -8.96675522571
ring -9.07109525934
information -9.17409500814
incoming -9.332509824
displayed -9.37424705302
clarify -9.37451479624
literature -9.38610701001
camers -9.32608280851
multi -9.3071524148
notepad -9.32608280851
complete -9.26149986946
satisfaction -9.31462581554
ton -9.12877150045
memorex -8.97958803863
shelf -9.33164223909
lasting -9.29704386238
switched 

ps2 -9.28261647021
progam -9.36430402133
emergenies -9.36430402133
hung -9.36430402133
slowly -9.36254422172
unadulterated -9.38249689779
windows/dos -9.38249689779
availability -9.37641972227
scarce -9.38249689779
forum -9.27164015048
else -9.11279983394
closely -9.36403627305
majority -9.37237206976
pink -9.24480414653
tint -9.38249689779
re-engineered -9.38249689779
aluminum -9.36284601402
compliment -9.36599815574
powerbook -9.26153845713
bank -9.26521325777
ebay -9.34842595732
mint -9.38249689779
condition -9.07011889866
auction -9.38249689779
recieved -9.27920533484
wow -9.08164346341
cancellation -9.37798124663
interchangeability -9.37798124663
fusion -9.37798124663
provides -9.1112223955
firm -9.3459126133
ware -9.37487297268
layer -9.30290011126
fewer -9.37487297268
compatability -9.37487297268
momentarily -9.37487297268
torn -9.3634095181
opt -9.37487297268
itrip -9.22433644172
tunecastii -9.38684518086
icarplay -9.35244161373
tune -9.15241262827
transmitter -9.05136987276
in

discontinued -9.34358394757
r200/r300/r320 -9.37881515912
printable -9.36139858754
developed -9.3257638801
carpel -9.35062391742
movable -9.35062391742
zipper -9.38265316
a-carry -9.38265316
along- -9.38265316
loaded -9.36521452306
200v -9.38265316
zipped -9.38265316
.75 -9.38265316
greatly -9.34377188992
decide -9.34936605105
traveller -9.35870803582
doesn -9.35870803582
fofill -9.35870803582
hoe -9.35870803582
logi3 -9.35870803582
ippod -9.35870803582
conference -9.32081673452
opposed -9.30284982809
occasionnally -9.37309677328
cat -9.30780730215
walk -9.31400785691
wit -9.37309677328
paw -9.37309677328
cure -9.34143072631
reboots -9.37309677328
operates -9.37097812842
attach -9.29653716182
deserves -9.34508252114
batch -9.36730552433
workstation -9.37602748502
consideration -9.36564400515
audio-enhanced -9.37602748502
assisted -9.37602748502
training -9.37602748502
clever -9.37159779893
aggravating -9.37602748502
woe -9.37602748502
unpowered -9.37602748502
unamplified -9.37602748502

desheveled -9.38460438013
ocasionally -9.38460438013
sporadic -9.38460438013
reproduce -9.37132025833
crashed -9.38460438013
downfall -9.38460438013
8.99 -9.36522271684
total -9.27397135227
ripoff -9.36522271684
eektech -9.36522271684
shuffle -9.29293010819
engraved -9.32096770784
79.00 -9.32096770784
fool -9.35146951367
uninstall -9.35665398602
google -9.3707381956
brilliant -9.35215063528
calendar -9.33800641576
realestate -9.37804717946
dual-link -9.38594319643
geforce -9.32424962743
7800 -9.38594319643
quadro -9.38594319643
7800gt -9.36792469093
nvidia -9.26118800398
6800ultra -9.38594319643
ddl -9.37689336091
agp -9.36585862655
powermac -9.3240122387
6800gt -9.38594319643
6600 -9.38594319643
quadrofx -9.31900001872
4500 -9.35233704703
platform -9.38594319643
3400 -9.38594319643
3450 -9.38594319643
4400 -9.38594319643
7800gtx -9.37689336091
fry -9.34993162546
diego -9.38233665422
199.99 -9.38233665422
precision -9.38233665422
polk -9.35156508418
r30 -9.38233665422
klh -9.3823366542

t5820 -9.3740222708
2-way -9.3740222708
53615 -9.3740222708
t4800 -9.3740222708
t4900 -9.3740222708
t5000 -9.3740222708
t5500 -9.3740222708
t5900 -9.3740222708
t6500 -9.3740222708
fv500 -9.3740222708
walked -9.36608814312
projector -9.35425368547
playe -9.35425368547
.customer -9.37565759414
.more -9.37565759414
complaint..very -9.37565759414
g.p.s -9.37565759414
.also -9.37565759414
marine -9.37565759414
.one -9.37565759414
.do -9.37565759414
breast -9.37565759414
float..not -9.37565759414
waterproof..nuff -9.37565759414
ip811-2 -9.35251606558
vonage -9.27975841445
checked -9.36412643692
lately -9.36002741651
duh -9.37602748502
imported -9.33445105818
proccess -9.33445105818
hotel -9.26086353084
etrex -9.33445105818
wierd -9.37260282414
good.. -9.37260282414
.they -9.37260282414
recognize -9.29219043465
detail.. -9.33791726615
seattle -9.35175912938
salt -9.36840743291
lake -9.35397941893
boston -9.35175912938
4.99/each -9.36840743291
911 -9.36840743291
subtstanial -9.34100845872
subs

medially -9.38752847436
pinna -9.38003780263
superoposterior -9.38003780263
straightens -9.38752847436
doctor -9.38752847436
maneuver -9.38752847436
examines -9.38752847436
otoscope -9.38752847436
insertion -9.35956647468
sealed -9.38407631726
relieve -9.38752847436
eustachian -9.38752847436
tube -9.31983897326
swallow -9.38752847436
climbing -9.38752847436
skyscraper -9.38752847436
elevator -9.38752847436
anatomy -9.38752847436
guaranteed -9.38051425781
platinum -9.33053715886
anticipated -9.29976550019
likethe -9.37602748502
amps.. -9.37602748502
shoulld -9.37602748502
plug.. -9.37602748502
restraint -9.37602748502
wedge -9.37602748502
backin -9.37602748502
tie -9.37602748502
phoenomenal -9.31503297232
mute/unmute -9.34855566436
melody -9.34855566436
scare -9.35585496684
opossum -9.35585496684
porch -9.35585496684
prong -9.35585496684
scoot -9.35585496684
rehoboth -9.38233665422
sirius -9.38233665422
classic -9.38233665422
introduced -9.37831478636
nascar -9.2963806863
2007. -9.38233

comcast -9.39507567999
16:9 -9.3740222708
improves -9.39507567999
enjoyment -9.39507567999
mannual -9.39507567999
blew -9.35870803582
anyome -9.39507567999
trusted -9.36608814312
nuvi -9.33791726615
aggresive -9.36608814312
360 -9.36608814312
lazer -9.37768393728
tendonitis -9.36058950392
flare -9.37768393728
chronic -9.37768393728
6000. -9.36058950392
450. -9.37768393728
v450 -9.37768393728
sputter -9.39008813848
whirr -9.39008813848
peace -9.39008813848
width -9.37955840025
comparitively -9.39008813848
wrap -9.39008813848
seam -9.39008813848
artist-album-song -9.39008813848
stale -9.39008813848
art -9.3388441054
hit-or-miss -9.38512534914
colorful -9.39008813848
sunlight -9.39008813848
wedging -9.39008813848
underneath -9.39008813848
upward -9.39008813848
wmp11 -9.38512534914
theoretical -9.39008813848
bit-rate -9.39008813848
wmas -9.39008813848
on-the-go -9.39008813848
creation -9.39008813848
brick-and-mortar -9.39008813848
statement -9.3916451449
913ns -9.3916451449
712n -9.3916451

extreme -9.38800851277
head-unclogging -9.38800851277
mtink -9.38800851277
'blow -9.38800851277
reservoir -9.38800851277
sponge-sandpaper-like -9.38800851277
overflow -9.38800851277
smear -9.38800851277
windex -9.38800851277
towel -9.38800851277
'trait -9.38800851277
b-sized -9.38800851277
cid -9.35585496684
interface/menu -9.3752730527
non-intuitive -9.3752730527
redial -9.35585496684
see/get -9.3752730527
recomendation -9.37208616177
listend -9.37208616177
raido -9.37208616177
enables -9.35870803582
emap -9.35870803582
12v -9.35870803582
3-outlet -9.35870803582
protocol -9.35251606558
rudder -9.35998436018
pedal -9.35998436018
yoke -9.35998436018
x-plane -9.32608280851
fedex -9.35998436018
verbati -9.39507567999
rewind -9.33445105818
unbelievably -9.36522271684
giveaway -9.36522271684
tcx805 -9.37097812842
trivial -9.37097812842
automagically -9.37097812842
dialing -9.37097812842
upright -9.37097812842
finishing -9.3873536339
grill -9.3873536339
guard -9.3873536339
autoroute -9.36522

humax -9.39507567999
sir-t451 -9.39507567999
reliability/quality -9.39507567999
declined -9.39507567999
mainstream -9.39507567999
works.. -9.39507567999
mr. -9.39507567999
loftin.. -9.39507567999
ordered.. -9.39507567999
ka-put -9.39507567999
intelligible -9.39507567999
noooo -9.39507567999
metropolitan -9.39507567999
span -9.39507567999
massage -9.39507567999
voicepulse -9.39507567999
indicating -9.39507567999
workaround -9.39507567999
mouth -9.39507567999
resend -9.39507567999
c550 -9.39507567999
untrue -9.39507567999
stabilized -9.39507567999
antiroll -9.39507567999
crossbar -9.39507567999
vague -9.39507567999
18hz -9.39507567999
1990 -9.39507567999
mx400 -9.39507567999
reputable -9.39507567999
disconects -9.39507567999
samething -9.39507567999
helper -9.39507567999
apearently -9.39507567999
menus/programming -9.39507567999
dispute -9.39507567999
hassel -9.39507567999
cancel -9.39507567999
ohio -9.39507567999
michigan -9.39507567999
tbd -9.39507567999
murphy -9.39507567999
law -9.39

f12 -9.39507567999
thirdly -9.39507567999
logitechs -9.39507567999
cursor -9.39507567999
osx -9.39507567999
gasp -9.39507567999
horrendous -9.39507567999
clatter -9.39507567999
nonexistant -9.39507567999
shallow -9.39507567999
texture -9.39507567999
asdf -9.39507567999
jkl -9.39507567999
unforgiveable -9.39507567999
transforms -9.39507567999
peck -9.39507567999
typer -9.39507567999
..if -9.39507567999
..day -9.39507567999
nwb -9.39507567999
remodeling -9.39507567999
reverse -9.39507567999
adhere -9.39507567999
chunk -9.39507567999
140.00 -9.39507567999
woot.com -9.39507567999
transmission -9.39507567999
configurable -9.39507567999
~15 -9.39507567999
terribly -9.39507567999
cracked -9.39507567999
dlink -9.39507567999
hah -9.39507567999
uninitiated -9.39507567999
.ftp -9.39507567999
dwl-2100ap -9.39507567999
gibberish -9.39507567999
dope -9.39507567999
blah -9.39507567999
assessment -9.39507567999
pre-sales -9.39507567999
post-sales -9.39507567999
documentation/install -9.39507567999
whe

ever-present -9.39507567999
favorable -9.39507567999
flexibe -9.39507567999
st. -9.39507567999
louis -9.39507567999
intereference -9.39507567999
junky -9.39507567999
mdr-ex81lp -9.39507567999
30-40 -9.39507567999
er-6 -9.39507567999
vacuum -9.39507567999
environmental -9.39507567999
..choose -9.39507567999
attenuated -9.39507567999
mdr-ex51 -9.39507567999
325 -9.39507567999
message-print -9.39507567999
jam-i -9.39507567999
encouraged -9.39507567999
new'model- -9.39507567999
supposted -9.39507567999
ocz -9.39507567999
a-data -9.39507567999
unreliable.. -9.39507567999
msh-128 -9.39507567999
20th -9.39507567999
p100 -9.39507567999
c:13:01 -9.39507567999
travled -9.39507567999
recoginize -9.39507567999
re-insert -9.39507567999
oneday -9.39507567999
wiped -9.39507567999
300m -9.39507567999
recived -9.39507567999
swindled -9.39507567999
formating -9.39507567999
deletion -9.39507567999
remedy -9.39507567999
disceted -9.39507567999
suffering -9.39507567999
better.. -9.39507567999
bulgaria -9.3

linkage -9.39507567999
mac-happy -9.39507567999
babysit -9.39507567999
inconsistently -9.39507567999
graps -9.39507567999
disclosed -9.39507567999
10-page -9.39507567999
expedient -9.39507567999
omitted -9.39507567999
+90 -9.39507567999
foul -9.39507567999
swing -9.39507567999
dwl-g700ap -9.39507567999
appalling -9.39507567999
ip6000d -9.39507567999
anothe -9.39507567999
chemical -9.39507567999
squishy -9.39507567999
slipper -9.39507567999
brookstone -9.39507567999
theese -9.39507567999
fvs338 -9.39507567999
fvs318 -9.39507567999
1.2mbps -9.39507567999
infuriated -9.39507567999
comprehension -9.39507567999
deptartment -9.39507567999
netgears -9.39507567999
wich -9.39507567999
sued -9.39507567999
venting -9.39507567999
prosafe -9.39507567999
30mbs -9.39507567999
fsv318v3 -9.39507567999
25mbs -9.39507567999
6.5mbs -9.39507567999
keyword -9.39507567999
checkbox -9.39507567999
5mbs -9.39507567999
keywords -9.39507567999
price-points -9.39507567999
waterproof -9.39507567999
mising -9.395075

In [25]:
# check misclassified examples
preds = model.predict(X)
P = model.predict_proba(X)[:,1] # p(y = 1 | x)

# since there are many, just print the "most" wrong samples
minP_whenYis1 = 1
maxP_whenYis0 = 0
wrong_positive_review = None
wrong_negative_review = None
wrong_positive_prediction = None
wrong_negative_prediction = None
for i in range(N):
    p = P[i]
    y = Y[i]
    if y == 1 and p < 0.5:
        if p < minP_whenYis1:
            wrong_positive_review = orig_reviews[i]
            wrong_positive_prediction = preds[i]
            minP_whenYis1 = p
    elif y == 0 and p > 0.5:
        if p > maxP_whenYis0:
            wrong_negative_review = orig_reviews[i]
            wrong_negative_prediction = preds[i]
            maxP_whenYis0 = p

print("Most wrong positive review (prob = %s, pred = %s):" % (minP_whenYis1, wrong_positive_prediction))
print(wrong_positive_review)
print("Most wrong negative review (prob = %s, pred = %s):" % (maxP_whenYis0, wrong_negative_prediction))
print(wrong_negative_review)

Most wrong positive review (prob = 0.445956758749, pred = 0.0):

3 more of the useless flimsy holders you get in the IPod box!

Most wrong negative review (prob = 0.560982993853, pred = 1.0):

If you have a Palm, read this before buying this device:

<<<Although our SD GPS card should work with Palm OS without a driver as its NMEA-0183 compliant, currently we are not aware of any 3rd-party software that supports our SD GPS card. Based on this I regrettably have to say that at this moment it does not work with Palm OS device. We are currently in the process of developing a Palm OS driver that will allow the device to be used independent of 3rd-party software support. When this becomes available it will be posted on our website.>>>

I bought this for my father for Christmas. It cost a lot of money. We all got really excited about it. It doesn't work. :

