In [21]:
import nltk
import numpy as np
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from bs4 import BeautifulSoup
from future.utils import iteritems
from sklearn.utils import shuffle

In [22]:
# Lemmatizer shared by the tokenizer defined further down.
wordnet_lemmatizer = WordNetLemmatizer()

# Load the stopword list, one entry per line with trailing whitespace removed.
# The context manager closes the file handle as soon as the set is built
# instead of leaving it open until garbage collection.
with open('stopwords.txt', 'r') as stopwords_file:
    stopwords = {line.rstrip() for line in stopwords_file}

In [23]:
def _read_review_soup(path):
    """Parse the file at `path` with BeautifulSoup and return the parsed tree.

    The file is opened in a context manager so the handle is closed once its
    contents have been read.
    """
    with open(path) as review_file:
        # Name the parser explicitly: without it bs4 picks whichever parser
        # happens to be installed and emits a warning, so results could
        # differ between machines. 'html.parser' ships with Python.
        return BeautifulSoup(review_file.read(), 'html.parser')


positive_review = _read_review_soup('sorted_data_acl/electronics/positive.review')
negative_review = _read_review_soup('sorted_data_acl/electronics/negative.review')

In [24]:
# Keep only the <review_text> elements of each parsed document.
# find_all is the current spelling of the legacy findAll alias.
# NOTE: this rebinds both names from the parsed tree to the result list, so
# the cell cannot be re-run without re-running the loading cell first.
positive_review = positive_review.find_all('review_text')
negative_review = negative_review.find_all('review_text')

In [25]:
print(type(positive_review))
print(len(positive_review))
print(len(negative_review))
# np.random.shuffle works in place on these result lists.
np.random.shuffle(positive_review)
np.random.shuffle(negative_review)
# Balance the classes in both directions: whichever class is larger is cut
# down to the size of the smaller one (the lists were shuffled above, so the
# kept subset is random). With equal counts this only reorders the lists.
n_per_class = min(len(positive_review), len(negative_review))
positive_review = positive_review[:n_per_class]
negative_review = negative_review[:n_per_class]

<class 'bs4.element.ResultSet'>
1000
1000


In [26]:
def my_tokenizer(s):
    """Convert a raw text string into a list of normalised tokens.

    The text is lower-cased and split with NLTK's word tokenizer; tokens of
    length two or less are dropped, the rest are lemmatized with the shared
    WordNet lemmatizer, and lemmas found in `stopwords` are removed.
    """
    words = nltk.tokenize.word_tokenize(s.lower())
    # Length filtering happens before lemmatization, stopword filtering after.
    lemmas = (wordnet_lemmatizer.lemmatize(word) for word in words if len(word) > 2)
    return [lemma for lemma in lemmas if lemma not in stopwords]

In [27]:
# Maps each distinct token to a column index, assigned in order of first appearance.
word_index_map = {}
# Tokenized reviews are cached in these lists so tokenization runs only once.
positive_tokenized = []
negative_tokenized = []
# Raw review texts: all positive reviews first, then all negative ones.
orig_reviews = []

# Collect vocabulary from the positive reviews, then pick up any words that
# only occur in the negative reviews.
for reviews, tokenized in (
    (positive_review, positive_tokenized),
    (negative_review, negative_tokenized),
):
    for review in reviews:
        text = review.text
        orig_reviews.append(text)
        tokens = my_tokenizer(text)
        tokenized.append(tokens)
        for token in tokens:
            # An unseen token takes the next free index, which equals the
            # current vocabulary size; known tokens keep their index.
            word_index_map.setdefault(token, len(word_index_map))

# Next unused index (equal to the vocabulary size).
current_index = len(word_index_map)



In [28]:
def tokens_to_vector(tokens, label, vocab=None):
    """Build a normalised bag-of-words row with the label in the last slot.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one document; every token must be a key of the vocabulary.
    label : number
        Class label stored in the final element of the returned vector.
    vocab : dict, optional
        Mapping token -> column index. Defaults to the module-level
        `word_index_map`.

    Returns
    -------
    numpy.ndarray
        Vector of length ``len(vocab) + 1``: token counts divided by the total
        token count, followed by `label`. A document with no tokens yields
        all-zero features instead of NaN.

    Raises
    ------
    KeyError
        If a token is not present in the vocabulary.
    """
    if vocab is None:
        vocab = word_index_map
    x = np.zeros(len(vocab) + 1)  # the extra last column holds the label
    for t in tokens:
        x[vocab[t]] += 1
    total = x.sum()
    # Skip the normalisation for empty documents: dividing by zero would
    # fill the whole row with NaN and break the downstream model fit.
    if total > 0:
        x = x / total
    x[-1] = label
    return x
        
    
    

In [29]:
N = len(positive_review) + len(negative_review)
# One row per review: word-frequency features followed by the label column.
data = np.zeros((N, len(word_index_map) + 1))

# Positive rows (label 1) come first, then negative rows (label 0), which is
# the same order in which orig_reviews was filled.
labelled_tokens = (
    [(tokens, 1) for tokens in positive_tokenized]
    + [(tokens, 0) for tokens in negative_tokenized]
)
for row, (tokens, label) in enumerate(labelled_tokens):
    data[row, :] = tokens_to_vector(tokens, label)

# Shuffle texts and rows together so orig_reviews[k] still matches data[k];
# shuffling `data` alone would lose that alignment.
orig_reviews, data = shuffle(orig_reviews, data)

X = data[:, :-1]
Y = data[:, -1]

# last 100 rows will be test
n_test = 100
Xtrain, Xtest = X[:-n_test], X[-n_test:]
Ytrain, Ytest = Y[:-n_test], Y[-n_test:]


In [30]:
# Fit a logistic regression on the training split and report mean accuracy
# on both splits.
model = LogisticRegression()
model.fit(Xtrain, Ytrain)

train_accuracy = model.score(Xtrain, Ytrain)
print("Train accuracy:", train_accuracy)
test_accuracy = model.score(Xtest, Ytest)
print("Test accuracy:", test_accuracy)



Train accuracy: 0.7852631578947369
Test accuracy: 0.75


In [31]:
# Print every vocabulary word whose learned coefficient is larger in
# magnitude than the threshold, in either direction.
threshold = 0.5
coefficients = model.coef_[0]
for word, index in word_index_map.items():
    weight = coefficients[index]
    if abs(weight) > threshold:
        print(word, weight)

price 2.7552918290468273
wa -1.5667967682203252
fast 0.9305590928229047
item -0.9418085319151253
money -1.138143767606628
ha 0.6676331547464043
comfortable 0.5694958844128362
easy 1.771200891760291
card -0.5505449487049685
buy -0.9129885277851556
quality 1.4829739986817791
recommend 0.5963505125178439
memory 0.9637830593575111
then -1.2162491883888558
lot 0.7565027499048885
you 0.9888484520612553
sound 1.010517235684941
returning -0.5209778528144329
company -0.5404055204154519
n't -2.0217714150490056
've 0.7969123390666663
time -0.7407011187515509
pretty 0.7564580654881009
doe -1.3324976742434105
hour -0.5932567512111824
picture 0.5622845418949092
try -0.6563475219102972
junk -0.534347225175127
perfect 0.9928018773074604
love 1.1640525916441975
speaker 0.8863951491992111
little 0.9426842510904787
home 0.5321234397715789
video 0.547251541684964
cable 0.7020148599246453
customer -0.7075438347920198
bit 0.6253490226970532
using 0.5978614851236363
value 0.5700949516971046
bad -0.8188725325

In [32]:
# check misclassified examples
# Predictions over the full data set (train and test rows alike).
preds = model.predict(X)
class_probabilities = model.predict_proba(X)
# Column 1 of predict_proba is taken as p(y = 1 | x).
P = class_probabilities[:, 1]

In [33]:
# since there are many, just print the "most" wrong samples
# since there are many, just print the "most" wrong samples
# Defaults are reported when a class has no misclassified review.
minP_whenYis1 = 1
maxP_whenYis0 = 0
wrong_positive_review = wrong_negative_review = None
wrong_positive_prediction = wrong_negative_prediction = None

# Positive reviews scored below 0.5; keep the one with the lowest probability
# (argmin returns the first occurrence on ties).
missed_positives = np.flatnonzero((Y == 1) & (P < 0.5))
if missed_positives.size:
    worst = int(missed_positives[np.argmin(P[missed_positives])])
    wrong_positive_review = orig_reviews[worst]
    wrong_positive_prediction = preds[worst]
    minP_whenYis1 = P[worst]

# Negative reviews scored above 0.5; keep the one with the highest probability
# (argmax returns the first occurrence on ties).
missed_negatives = np.flatnonzero((Y == 0) & (P > 0.5))
if missed_negatives.size:
    worst = int(missed_negatives[np.argmax(P[missed_negatives])])
    wrong_negative_review = orig_reviews[worst]
    wrong_negative_prediction = preds[worst]
    maxP_whenYis0 = P[worst]

print("Most wrong positive review (prob = %s, pred = %s):" % (minP_whenYis1, wrong_positive_prediction))
print(wrong_positive_review)
print("Most wrong negative review (prob = %s, pred = %s):" % (maxP_whenYis0, wrong_negative_prediction))
print(wrong_negative_review)

Most wrong positive review (prob = 0.3529949837724324, pred = 0.0):

A device like this either works or it doesn't.  This one happens to work

Most wrong negative review (prob = 0.6029577639904368, pred = 1.0):

The Voice recorder meets all my expectations and more
Easy to use, easy to transfer great results

