In [60]:
#Import/download prerequisites
import pandas as pd
import nltk
from nltk import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
import numpy as np
nltk.download('stopwords')
from nltk.corpus import stopwords
from bs4 import BeautifulSoup as bs
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
nltk.download('punkt')
nltk.download('wordnet')
from future.utils import iteritems

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jgran\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jgran\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jgran\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [34]:
#Set up Wordnet
wordnet_lemmatizer = WordNetLemmatizer()

#Set up list of stop words
stop_words = set(stopwords.words('english'))

#Set file paths
positive_file = "Resources/positive.review"
negative_file = "Resources/negative.review"

In [26]:
#Use BeautifulSoup to extract review text
positive_reviews = bs(open(positive_file).read())
positive_review_text = positive_reviews.find_all('review_text')

negative_reviews = bs(open(negative_file).read())
negative_review_text = negative_reviews.find_all('review_text')



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [76]:
negative_review_text[0]

<review_text>
I have bought and returned three of these units now. Each one has been defective, and finally I just gave up on returning the system. The DVD player constantly gives "Bad Disc" errors and skips if there is even the slightest smudge on a disc. The sound quality is very nice for the price, but since the player doesn't work, it's essentially useless. This is a complete rip-off at any price point
</review_text>

In [36]:
#Generate list of words that will be used in the analysis (pre-processing the data)
def find_relevant(word):
    
    #Set all words to lowercase
    word = word.lower()
    
    #"Tokenize" the words, splitting the string into separate words aka tokens
    tokens = nltk.tokenize.word_tokenize(word)
    
    #Remove all short words as they most likely are not relevant
    tokens_refined = []
    
    for t in tokens:
        if len(t) > 2:
            tokens_refined.append(t)
    
    #"Lemmatize" words, aka put them in their most basic form
    tokens_lemmatized = []
    
    for t in tokens_refined:
        tokens_lemmatized.append(wordnet_lemmatizer.lemmatize(t))
    
    #Remove stopwords, words that are considered irrelevant. Nltk provides a list of stopwords
    tokens_filtered = [] 
  
    for t in tokens_lemmatized: 
        if t not in stop_words: 
            tokens_filtered.append(t)
            
    return tokens_filtered

In [37]:
#Nltk can't do hashing, so we have to get a little creative
hashed = {}
i = 0
positive_tokens = []
negative_tokens = []

for r in positive_review_text:
    tokens = find_relevant(r.text)
    positive_tokens.append(tokens)
    for token in tokens:
        if token not in hashed:
            hashed[token] = i
            i = i+1
            
for r in negative_review_text:
    tokens = find_relevant(r.text)
    negative_tokens.append(tokens)
    for token in tokens:
        if token not in hashed:
            hashed[token] = i
            i = i+1

In [50]:
#set up input matrices for vectorization
def vectorization(tokens, label):
    x = np.zeros(len(hashed) +1)
    for t in tokens:
        i = hashed[t]
        x[i] = x[i] + 1
    x = x/x.sum()
    x[-1] = label
    return x
    

In [51]:
#Vectorize!
N = len(positive_tokens) + len(negative_tokens)
data = np.zeros((N, len(hashed) + 1))
i = 0
for tokens in positive_tokens:
    xy = vectorization(tokens, 1)
    data[i,:] = xy
    i = i+1

for tokens in negative_tokens:
    xy = vectorization(tokens, 0)
    data[i,:] = xy
    i = i+1

In [55]:
#Split train and test data
X = data[:,:-1]
y = data[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [56]:
#apply the model
model = LogisticRegression()
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [57]:
#Test the model
model.score(X_test, y_test)

0.7066666666666667

In [66]:
threshold = .5
weights = {}
for word, index in iteritems(hashed):
    weight = model.coef_[0][index]
    if weight > threshold or weight < threshold:
        weights.update({word:weight})
        print(word, weight)

received -0.1722391442814504
kingston -0.01310628778049762
256mb 0.05286307142638323
card 0.011208809339569085
advertised -0.12141370037128649
unit -0.3556725286022609
came 0.09588143524083753
mail 0.013058280248570177
exactly 0.5109802983197683
day -0.37030030158388233
ordered 0.09474406299478683
worked -0.23150151364059834
perfectly 0.6166443435987153
satisfied 0.21833170009408148
work -0.3480097480263599
well 0.8170218472565928
especially 0.06084639406463101
anyone 0.06743219874874536
still -0.40601230725344634
ha 0.5782761430415098
old 0.1195707809291597
console 0.16000659903098086
system 0.16481637900557314
use 0.8451662743889743
ne 0.017930451960297595
snes 0.017930451960297595
huge 0.06359242809577582
adapter 0.0986821431228399
plugged 0.17623313257976136
save -0.07197420712179003
ton 0.05492823252801075
space 0.2918641195026346
power 0.09595197333322043
strip 0.0705078309602956
design 0.0015140216184527894
little 0.6397608441967971
clumsy -0.032136055259723316
though 0.02563147

yes -0.050037545639650294
circuit 0.013405005608386077
'protected 0.011045467533582732
could -0.2773610512373737
happier 0.025583084182831353
equipment 0.13072534342334347
hav'nt 0.03493150531884223
major 0.034800312224393115
fluctuation 0.03630668559474704
handled 0.08162540649879832
delivery 0.24066092134837383
cooperative 0.017357189370661424
placed 0.002775779353931657
hooked 0.042061744918568865
checked -0.010493922782932038
offered 0.08679558088872648
empty 0.026630028944896356
carton -0.011477380471741912
saved 0.08076514486485654
250 0.012804764505147118
rather -0.1302995319112173
local 0.028934169428135984
retailer 0.024150581892149397
budget 0.03186974386351458
theater 0.1113824526737694
200 0.12276778422456126
ht-ddw700 0.0024402861190867863
tiny -0.007053574411876963
actually -0.0043593470169299075
cube -0.03715078115810618
present -0.07544255156647407
clarity 0.15697211081938267
cheap -0.1843082560754567
heavier 0.0024402861190867863
underpowered 0.0024402861190867863
dial

woofer 0.011184179897689508
pc -0.014934230520312076
router -0.2557723298394361
netgear -0.06958245706931038
improved 0.02543632482952999
non-ipod 0.0039944504014646845
terrific 0.004348818439949516
lanyard 0.021410405968712503
commuting 0.0039944504014646845
plane 0.040652812675985375
alkaline -0.007710916482967961
familiar 0.00875881469348178
enjoyed 0.018426486058421895
built 0.06556802620617928
trouble -0.05875109127524384
commuter 0.0039944504014646845
128 -0.049010977500282785
bitrate 0.009510316082913368
transferring 0.0376405298779983
lost -0.1723505703008708
foam 0.0869295942941076
padding 0.04768364145554472
bother 0.01925807793700849
hint 0.011864351506066885
location -0.11145429844350566
stored 0.02287883514261461
combat 0.0039944504014646845
accidentally 0.024273360326753753
wash -0.004996241942147849
cycle -0.0521753875253635
laundry 0.0
sending -0.1476542562265757
beause 0.0
opening -0.00574490542987275
breaking -0.01083262735165365
emory 0.01251685295768392
board 0.0325

loooong 0.015158641688028612
printout 0.015158641688028612
flexable 0.033185176850064606
keypad 0.05012808712022119
ax100u 0.0030912476557525314
oft 0.0030912476557525314
ldc 0.0030912476557525314
'screen 0.0030912476557525314
virtually 0.010941894443398884
non-existant 0.012426902415508914
panny 0.0030912476557525314
smoothening 0.0030912476557525314
achieve 0.0030912476557525314
truely 0.0030912476557525314
par 0.010672549767492163
dlp -0.013714535531965132
systesm 0.0030912476557525314
headache -0.005495900375220894
rainbow 0.0030912476557525314
wavy 0.006182495311505063
curl -0.0031580823411866956
flow -0.037033262633923196
screeen 0.0030912476557525314
4-5 0.0030912476557525314
disappeared -0.023282613142653102
secondly 0.00022000749471118132
flickering 0.0030912476557525314
shimmer 0.012660248885514478
teh 0.0030912476557525314
overscan 0.0030912476557525314
crop 0.0005169492483822088
flicker 0.0030912476557525314
aware -0.06454193214421315
lugging 0.0030912476557525314
presentat

reviewed 0.007586054186669952
performing 0.013991784557891646
installs -0.0006308963785864787
marvel 0.01714974431860352
grabbed 0.01714974431860352
sequence 0.01714974431860352
sonar 0.01714974431860352
latency 0.01714974431860352
wifi 0.05563562358406274
finder 0.01116136794362593
chrysalis 0.008023123253446637
development 0.002612399812978902
keyless 0.008023123253446637
directional -0.0010339782602932438
instantly 0.0021931371996383544
decrease 0.008023123253446637
film 0.017882696307473456
sorry 0.008405652918619128
peel -0.002906703533658083
respond -0.11642188230661055
accessible -0.009184382491039369
removal -0.0066595954973440495
necessary -0.0039703087106315385
lining -0.000175317588194254
closing 0.004202801126021622
unfolds 0.004202801126021622
axim 0.03999294878349266
x50v 0.029049820489627117
seagate -0.0065139270266217205
lavalier 0.02025260137001692
informational 0.01012630068500846
interviewing 0.01012630068500846
management -0.013204134762608311
amplified 0.0070302689

shuffling 0.005509260203479788
6800 0.005509260203479788
fired -0.0035074588512872813
5-7 0.005509260203479788
celsius 0.017187110270230222
bore.. 0.005509260203479788
chaos 0.005509260203479788
subside 0.005509260203479788
cooling 0.030807813736166218
copy/paste 0.0
printable 0.012212834357021267
nicer 0.025939641546956674
r300 0.012212834357021267
advertized 0.01731471882627674
enabled 0.026679208646689163
ericsson 0.020062736810603797
realignment 0.020062736810603797
cartridges.. 0.020062736810603797
ownership 0.01607699381897652
intel/pro 0.0065060541478085205
hawking 0.019518162443425562
haul 0.0065060541478085205
goodbye 0.0065060541478085205
isp -0.0016122594263695258
metrofi 0.0065060541478085205
completes 0.01183329282538669
city-wide 0.0065060541478085205
booster 0.0065060541478085205
prof 0.0065060541478085205
durable..after 0.0065060541478085205
albeit 0.0029828082786963537
off-beam 0.0065060541478085205
hwu8dd 0.0
rvers 0.0
accessed 0.0
campground 0.0
solved 0.016397218741

sheet -0.001453379419573146
postcard 0.008839186218312282
perforation 0.0021682559218607327
photo-only 0.0021682559218607327
backgounds 0.0021682559218607327
titles/text 0.0021682559218607327
paper/film 0.0021682559218607327
humorous 0.0021682559218607327
caption 0.008839186218312282
funny-bad 0.0021682559218607327
pass -0.00227384175118186
layer -0.007319726057306611
obstructed 0.0021682559218607327
deposited 0.0021682559218607327
1-year 0.0021682559218607327
s350 0.026726555868976345
brawny 0.003340819483622043
grabbing 0.003340819483622043
hard-to-get 0.003340819483622043
championing 0.003340819483622043
eliminating 0.003340819483622043
drift 0.004874308469053278
tin -0.005386747980610491
presets 0.05505563982791686
read-out 0.003340819483622043
overload 0.010022458450866129
angeles 0.003340819483622043
1150 0.003340819483622043
superradio 0.003340819483622043
ferrite 0.003340819483622043
era 0.003340819483622043
g4000a 0.003340819483622043
1103 0.003340819483622043
cousin 0.0033408

trivial -0.002732336725299606
redundancy 0.004465539871084991
macpro/powermacs 0.004465539871084991
managed -0.005253081040092221
osx.4 0.004465539871084991
manager 0.004465539871084991
utilized 0.004465539871084991
app 0.004465539871084991
log -0.026090320778316977
superduper 0.004465539871084991
pcuniverse 0.006632076183704435
generally -0.05373178988205955
truecrypt 0.006632076183704435
freeware 0.006632076183704435
flashdrive 0.006632076183704435
corsair 0.003925652364902002
grew 0.0003456721032258254
perfec 0.04867644174231748
muchly 0.0
pre-tried 0.0
rechargedable 0.0
countdown 0.013837027877229015
stove 0.013837027877229015
spreadsheet 0.018389909458377728
definite 0.009669872502213125
linking 0.0076551645723694104
doc 0.009669872502213125
lable 0.027736150097371105
56-year-old 0.0
scrabble -0.03280251072457821
opportunity -0.012950563676160346
enthusiastic 0.0
opponent 0.0
approve 0.0
inserted -0.013157048347475515
read/written 0.0
arrange 0.0074387028076282734
workarounds 0.00

fewer 0.0
principal -0.05940067674943899
reinstalling -0.010523106878112521
assistance -0.04294364658902248
contacting -0.023930059856856093
wristpad 0.0
comeunglued 0.0
mousing -0.004660439956132795
dificult 0.0
mousepad 0.0
skintight -0.013473645778965315
delux -0.013473645778965315
holster -0.013473645778965315
sugest -0.013473645778965315
pooling 0.0
carrier 0.0
reexamine 0.0
talked -0.021717429119583513
rep -0.0779030248419204
unintelligible 0.0
reminded -0.017187681023788183
jim -0.017187681023788183
carey -0.017187681023788183
blur -0.017187681023788183
webcam -0.017187681023788183
sucker -0.035050456535166585
latch -0.02139696177575489
pry -0.01559459577406414
pll -0.028441493753432637
dx/loc -0.028441493753432637
quality.. -0.028441493753432637
wardrobe 0.0
sysinternal -0.009001818400365279
hwi -0.018003636800730558
dpc -0.018003636800730558
deferred -0.009001818400365279
-40 -0.009001818400365279
usb-based -0.009001818400365279
802.11/b -0.009001818400365279
/b/g -0.009001818

*.inf 0.0
iogear-supplied 0.0
executable 0.0
execute 0.0
satisfactorily 0.0
non-iogear 0.0
swapping 0.0
sympathy 0.0
sacd -0.10465145555395726
mdr-if3000 -0.008981479656426363
elf -0.008981479656426363
mortal -0.008981479656426363
superlative -0.008981479656426363
casually -0.008981479656426363
mdr-if8000 -0.008981479656426363
20ft -0.032948481042058955
eachother -0.032948481042058955
overnight -0.022878286389410495
anymore.so -0.027463032247644698
2655 -0.027463032247644698
forest -0.027463032247644698
blvd -0.027463032247644698
jax -0.027463032247644698
fl.3224 -0.027463032247644698
seam -0.03282960848350029
non-skid -0.03282960848350029
underside -0.03282960848350029
peculiar -0.03282960848350029
distinctly -0.03282960848350029
ferriete 0.0
bead 0.0
useable 0.0
pvr -0.0144356101552995
friggin -0.0028871220310599005
reminder -0.0028871220310599005
appt. -0.0028871220310599005
.again -0.0028871220310599005
dainty -0.0028871220310599005
real-live 0.0
approval 0.0
incident 0.0
1:15 0.0


10-20 0.0
overpowers 0.0
grounded 0.0
lengthening 0.0
transmission 0.0
get-go 0.0
forthrightly 0.0
stereophile -0.006616606365113733
broadcasting -0.006616606365113733
pot -0.013233212730227466
standpoint -0.006616606365113733
dislodges -0.006616606365113733
potentiometer -0.006616606365113733
el -0.05953651181809921
duracell 0.0
eveready 0.0
below-average 0.0
horrid 0.0
overdrive -0.009921657544284241
hariest -0.009921657544284241
shreding -0.009921657544284241
deminishes -0.009921657544284241
85th 0.0
92nd 0.0
declined 0.0
waiving 0.0
trashcan 0.0
*sigh* 0.0
outdate 0.0
hideous -0.016257823972650257
intuos2 -0.04877347191795077
uglier -0.016257823972650257
tossing -0.027204651662026756
4x6.5 -0.027081986415828624
full-sized -0.000949835546048522
decidedly -0.000949835546048522
deal-maker -0.000949835546048522
alt-key -0.000949835546048522
symbol -0.000949835546048522
full-size -0.0028495066381455663
revamping -0.000949835546048522
miniaturized -0.000949835546048522
*both* -0.00094983

In [67]:
weights

{'received': -0.1722391442814504,
 'kingston': -0.01310628778049762,
 '256mb': 0.05286307142638323,
 'card': 0.011208809339569085,
 'advertised': -0.12141370037128649,
 'unit': -0.3556725286022609,
 'came': 0.09588143524083753,
 'mail': 0.013058280248570177,
 'exactly': 0.5109802983197683,
 'day': -0.37030030158388233,
 'ordered': 0.09474406299478683,
 'worked': -0.23150151364059834,
 'perfectly': 0.6166443435987153,
 'satisfied': 0.21833170009408148,
 'work': -0.3480097480263599,
 'well': 0.8170218472565928,
 'especially': 0.06084639406463101,
 'anyone': 0.06743219874874536,
 'still': -0.40601230725344634,
 'ha': 0.5782761430415098,
 'old': 0.1195707809291597,
 'console': 0.16000659903098086,
 'system': 0.16481637900557314,
 'use': 0.8451662743889743,
 'ne': 0.017930451960297595,
 'snes': 0.017930451960297595,
 'huge': 0.06359242809577582,
 'adapter': 0.0986821431228399,
 'plugged': 0.17623313257976136,
 'save': -0.07197420712179003,
 'ton': 0.05492823252801075,
 'space': 0.2918641195

In [72]:
df = pd.DataFrame.from_dict(weights, orient='index')
df.head()

Unnamed: 0,0
received,-0.172239
kingston,-0.013106
256mb,0.052863
card,0.011209
advertised,-0.121414


In [75]:
df.to_csv('weights_df')