Skip to content

Commit a019afc

Browse files
committed
show misclassified
1 parent c3cb03a commit a019afc

File tree

1 file changed

+35
-2
lines changed

1 file changed

+35
-2
lines changed

nlp_class/sentiment.py

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616

1717
import nltk
1818
import numpy as np
19+
from sklearn.utils import shuffle
1920

2021
from nltk.stem import WordNetLemmatizer
2122
from sklearn.linear_model import LogisticRegression
@@ -76,8 +77,10 @@ def my_tokenizer(s):
7677
# Accumulators for the tokenization pass: a running vocabulary index plus
# per-class token lists, and the raw review text kept alongside so that
# misclassified examples can be shown verbatim later.
current_index = 0
positive_tokenized, negative_tokenized, orig_reviews = [], [], []
7981

8082
for review in positive_reviews:
83+
orig_reviews.append(review.text)
8184
tokens = my_tokenizer(review.text)
8285
positive_tokenized.append(tokens)
8386
for token in tokens:
@@ -86,13 +89,15 @@ def my_tokenizer(s):
8689
current_index += 1
8790

8891
# Tokenize every negative review, remembering the raw text (for the
# misclassification report) and growing the shared vocabulary index as
# previously-unseen tokens appear.
for review in negative_reviews:
    text = review.text
    orig_reviews.append(text)
    tokens = my_tokenizer(text)
    negative_tokenized.append(tokens)
    for token in tokens:
        if token in word_index_map:
            continue
        word_index_map[token] = current_index
        current_index += 1

print("len(word_index_map):", len(word_index_map))
96101

97102
# now let's create our input matrices
98103
def tokens_to_vector(tokens, label):
@@ -120,7 +125,7 @@ def tokens_to_vector(tokens, label):
120125

121126
# Shuffle the review texts and the data matrix *together* so each row of
# `data` stays aligned with its original text, then split the matrix into
# features and labels. Re-running gives a different split each time.
orig_reviews, data = shuffle(orig_reviews, data)

X = data[:, :-1]  # every column but the last: word-frequency features
Y = data[:, -1]   # final column: sentiment label
@@ -133,7 +138,8 @@ def tokens_to_vector(tokens, label):
133138

134139
# Fit a logistic-regression classifier on the training split and report
# accuracy on both splits, so over/under-fitting is visible at a glance.
model = LogisticRegression()
model.fit(Xtrain, Ytrain)

train_acc = model.score(Xtrain, Ytrain)
test_acc = model.score(Xtest, Ytest)
print("Train accuracy:", train_acc)
print("Test accuracy:", test_acc)
137143

138144

139145
# let's look at the weights for each word
@@ -143,3 +149,30 @@ def tokens_to_vector(tokens, label):
143149
weight = model.coef_[0][index]
144150
if weight > threshold or weight < -threshold:
145151
print(word, weight)
152+
153+
154+
# check misclassified examples
P = model.predict_proba(X)[:,1] # p(y = 1 | x)

# There are usually many mistakes, so track only the single most
# confidently wrong example in each direction rather than printing all.
minP_whenYis1 = 1
maxP_whenYis0 = 0
wrong_positive_review = None
wrong_negative_review = None
for i in range(N):
    p, y = P[i], Y[i]
    if y == 1:
        # A true positive predicted below 0.5 is misclassified; keep the
        # one with the lowest predicted probability.
        if p < 0.5 and p < minP_whenYis1:
            wrong_positive_review = orig_reviews[i]
            minP_whenYis1 = p
    elif y == 0:
        # A true negative predicted above 0.5 is misclassified; keep the
        # one with the highest predicted probability.
        if p > 0.5 and p > maxP_whenYis0:
            wrong_negative_review = orig_reviews[i]
            maxP_whenYis0 = p

print("Most wrong positive review (prob = %s):" % minP_whenYis1)
print(wrong_positive_review)
print("Most wrong negative review (prob = %s):" % maxP_whenYis0)
print(wrong_negative_review)
178+

0 commit comments

Comments
 (0)