1616
1717import nltk
1818import numpy as np
19+ from sklearn .utils import shuffle
1920
2021from nltk .stem import WordNetLemmatizer
2122from sklearn .linear_model import LogisticRegression
@@ -76,8 +77,10 @@ def my_tokenizer(s):
current_index = 0
positive_tokenized = []
negative_tokenized = []
orig_reviews = []


def _index_reviews(reviews, tokenized, next_index):
    """Tokenize each review, keep its raw text, and grow the shared vocabulary.

    For every review: appends the raw text to ``orig_reviews``, appends the
    token list to ``tokenized``, and assigns each previously unseen token the
    next consecutive index in ``word_index_map`` (starting at ``next_index``).

    Returns the next unused index so a later call can continue the numbering.
    """
    for review in reviews:
        orig_reviews.append(review.text)
        tokens = my_tokenizer(review.text)
        tokenized.append(tokens)
        for token in tokens:
            if token not in word_index_map:
                word_index_map[token] = next_index
                next_index += 1
    return next_index


# Positives first, then negatives, so orig_reviews rows line up with the
# order the data matrix is built in below.
current_index = _index_reviews(positive_reviews, positive_tokenized, current_index)
current_index = _index_reviews(negative_reviews, negative_tokenized, current_index)

print("len(word_index_map):", len(word_index_map))
96101
97102# now let's create our input matrices
98103def tokens_to_vector (tokens , label ):
@@ -120,7 +125,7 @@ def tokens_to_vector(tokens, label):
120125
# Shuffle the rows and the raw texts in lockstep so indices stay aligned
# (re-run to try a different train/test split), then separate the matrix
# into features and the label column (last column).
orig_reviews, data = shuffle(orig_reviews, data)
X, Y = data[:, :-1], data[:, -1]
@@ -133,7 +138,8 @@ def tokens_to_vector(tokens, label):
133138
# Fit a logistic-regression classifier on the training split.
model = LogisticRegression()
model.fit(Xtrain, Ytrain)

# Report accuracy on both splits; a large train/test gap suggests overfitting.
print("Train accuracy:", model.score(Xtrain, Ytrain))
print("Test accuracy:", model.score(Xtest, Ytest))
137143
138144
139145# let's look at the weights for each word
@@ -143,3 +149,30 @@ def tokens_to_vector(tokens, label):
143149 weight = model .coef_ [0 ][index ]
144150 if weight > threshold or weight < - threshold :
145151 print (word , weight )
152+
153+
# Examine misclassified examples: rather than dumping every mistake, keep
# only the single most confident error in each direction.
P = model.predict_proba(X)[:, 1]  # p(y = 1 | x)

lowest_p_for_pos = 1    # smallest p(y=1|x) seen on a truly-positive review
highest_p_for_neg = 0   # largest p(y=1|x) seen on a truly-negative review
wrong_positive_review = None
wrong_negative_review = None

for i in range(N):
    p, y = P[i], Y[i]
    # A positive review pushed below 0.5: candidate for "most wrong positive".
    if y == 1 and p < 0.5 and p < lowest_p_for_pos:
        lowest_p_for_pos = p
        wrong_positive_review = orig_reviews[i]
    # A negative review pushed above 0.5: candidate for "most wrong negative".
    elif y == 0 and p > 0.5 and p > highest_p_for_neg:
        highest_p_for_neg = p
        wrong_negative_review = orig_reviews[i]

print("Most wrong positive review (prob = %s):" % lowest_p_for_pos)
print(wrong_positive_review)
print("Most wrong negative review (prob = %s):" % highest_p_for_neg)
print(wrong_negative_review)
178+
0 commit comments