In [72]:
import nltk
import numpy as np
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from bs4 import BeautifulSoup
from future.utils import iteritems
from sklearn.utils import shuffle

In [73]:
wordnet_lemmatizer = WordNetLemmatizer()
stopwords = set([w.rstrip() for w in open('stopwords.txt','r')])

In [74]:
positive_review = BeautifulSoup(open('sorted_data_acl/electronics/positive.review').read())
negative_review = BeautifulSoup(open('sorted_data_acl/electronics/negative.review').read())

In [75]:
positive_review = positive_review.findAll('review_text')
negative_review = negative_review.findAll('review_text')

In [76]:
print(type(positive_review))
print(len(positive_review))
print(len(negative_review))
#Numpy 尽然可以对这个对象进行操作
np.random.shuffle(positive_review)
#这里positive和negative是一样的数量，如果遇到不平衡的数据记得要让其平衡
positive_review = positive_review[:len(negative_review)]

<class 'bs4.element.ResultSet'>
1000
1000


In [77]:
def my_tokenizer(s):
    s= s.lower()
    tokens = nltk.tokenize.word_tokenize(s)
    tokens = [s for s in tokens if len(s)>2]
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens]
    tokens = [t for t in tokens if t not in stopwords]
    return tokens

In [78]:
word_index_map ={}
current_index = 0
#为了避免重复做tokenize的计算，将token储存到一个列表里
positive_tokenized = []
negative_tokenized = []
orig_reviews = []


# 从postivie_review中收集词汇
for review in positive_review:
    orig_reviews.append(review.text)
    tokens = my_tokenizer(review.text)
    positive_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index +=1

# 从negative_review中看看还有没有新的词汇            
for review in negative_review:
    orig_reviews.append(review.text)
    tokens = my_tokenizer(review.text)
    negative_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index +=1



In [79]:
def tokens_to_vector(tokens,label):
    x = np.zeros(len(word_index_map)+1) #+1是给label留了一个column
    for t in tokens:
        i = word_index_map[t]
        x[i] +=1
    x = x/x.sum()
    x[-1] = label
    return x
        
    
    

In [80]:
N = len(positive_review)+len(negative_review)
data = np.zeros((N,len(word_index_map)+1))

i = 0
for token in positive_tokenized:
    xy = tokens_to_vector(token,1)
    data[i,:]=xy
    i+=1

for token in negative_tokenized:
    xy = tokens_to_vector(token,0)
    data[i,:]=xy
    i+=1
orig_reviews, data = shuffle(orig_reviews, data)
# np.random.shuffle(data)
X = data[:,:-1]
Y = data[:,-1]
# last 100 rows will be test
Xtrain = X[:-100,]
Ytrain = Y[:-100,]
Xtest = X[-100:,]
Ytest = Y[-100:,]


In [81]:
model = LogisticRegression()
model.fit(Xtrain, Ytrain)
print("Train accuracy:", model.score(Xtrain, Ytrain))
print("Test accuracy:", model.score(Xtest, Ytest))



Train accuracy: 0.7778947368421053
Test accuracy: 0.7


In [82]:
threshold = 0.5
for word, index in iteritems(word_index_map):
    weight = model.coef_[0][index]
    if weight > threshold or weight < -threshold:
        print(word, weight)

wa -1.6079729883072436
little 0.9985095949384788
unit -0.6326406624014576
easy 1.6788120712624794
price 2.787781834531714
you 0.988370599040214
n't -2.1706587649623503
doe -1.2584637787147264
pretty 0.7990337063487584
speaker 0.975021578594107
sound 1.0607556572690178
excellent 1.3364495464316088
ha 0.7071380740735853
've 0.7372887550046962
perfect 0.9113495406111854
time -0.7223822320587461
highly 0.9496587712837982
recommend 0.6314202134674785
then -1.0137314693291324
returned -0.8345610724079219
hour -0.5550369943982171
lot 0.7442232435617449
memory 0.9578603642774107
home 0.6181950917223367
video 0.5158314510745707
using 0.6117792859580113
paper 0.5282658736616545
support -0.8401463459541889
buy -0.6638912400879797
expected 0.5802297373285455
bad -0.7938641031723082
look 0.5455444552443207
try -0.6723387685600332
space 0.5546310200763749
love 1.1952957642722026
month -0.744126726701251
cable 0.6380364134299532
picture 0.5488048544631335
item -1.193542795765836
fast 0.91951494015926

In [83]:
# check misclassified examples
preds = model.predict(X)
P =model.predict_proba(X)[:,1] # p(y = 1 | x)

In [84]:
# since there are many, just print the "most" wrong samples
minP_whenYis1 = 1
maxP_whenYis0 = 0
wrong_positive_review = None
wrong_negative_review = None
wrong_positive_prediction = None
wrong_negative_prediction = None

for i in range(N):
    p = P[i]
    y = Y[i]
    if y == 1 and p < 0.5:
        if p < minP_whenYis1:
            wrong_positive_review = orig_reviews[i]
            wrong_positive_prediction = preds[i]
            minP_whenYis1 = p
    elif y == 0 and p > 0.5:
        if p > maxP_whenYis0:
            wrong_negative_review = orig_reviews[i]
            wrong_negative_prediction = preds[i]
            maxP_whenYis0 = p

print("Most wrong positive review (prob = %s, pred = %s):" % (minP_whenYis1, wrong_positive_prediction))
print(wrong_positive_review)
print("Most wrong negative review (prob = %s, pred = %s):" % (maxP_whenYis0, wrong_negative_prediction))
print(wrong_negative_review)

Most wrong positive review (prob = 0.347095271461556, pred = 0.0):

A device like this either works or it doesn't.  This one happens to work

Most wrong negative review (prob = 0.5984894108598078, pred = 1.0):

The Voice recorder meets all my expectations and more
Easy to use, easy to transfer great results



### Extension

#### 1.  Try different Classifier
Here I tried Naive Bayes and AdaBoost which I havved been implemented in the spam detector

In [85]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(Xtrain,Ytrain)
print("Train accuracy:", model.score(Xtrain, Ytrain))
print("Test accuracy:", model.score(Xtest, Ytest))

Train accuracy: 0.8726315789473684
Test accuracy: 0.77


In [86]:
from sklearn.ensemble import AdaBoostClassifier
model =AdaBoostClassifier()
model.fit(Xtrain,Ytrain)
print("Train accuracy:", model.score(Xtrain, Ytrain))
print("Test accuracy:", model.score(Xtest, Ytest))

Train accuracy: 0.8178947368421052
Test accuracy: 0.76


#### 2. Use different feature
Here, I change the TF(term-frequency) into tf-idf

In [87]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


In [98]:
tokenized_text_positive = []
tokenized_text_negative = []

#如果直接用原来的tokenized处理，再用tfidf，效果会比较差，因为其把水词什么的去掉了。但是在这个基础上用count会比较好，比百分比要好
# for i in positive_tokenized:
#     tokenized_text_positive.append(' '.join(i))

# for i in negative_tokenized:
#     tokenized_text_negative.append(' '.join(i))

for review in positive_review:
    tokenized_text_positive.append(review.text)
for review in negative_review:
    tokenized_text_negative.append(review.text)
    

Y = np.zeros((len(positive_tokenized)+len(negative_tokenized),1))
Y[:len(positive_tokenized)]=1
Y[len(negative_tokenized):]=0    

Shuffle arrays or sparse matrices in a consistent way

https://scikit-learn.org/stable/modules/generated/sklearn.utils.shuffle.html

In [99]:
from sklearn.utils import shuffle
Count_Vectorizer = TfidfVectorizer(decode_error = 'ignore')
data = tokenized_text_positive+tokenized_text_negative
X = Count_Vectorizer.fit_transform(data)
X,Y =shuffle(X,Y)
Xtrain = X[:-100,]
Xtest = X[-100:,]
Ytrain = Y[:-100,]
Ytest =Y[-100:,]

In [100]:
model = LogisticRegression()
model.fit(Xtrain, Ytrain)
print("Train accuracy:", model.score(Xtrain, Ytrain))
print("Test accuracy:", model.score(Xtest, Ytest))

Train accuracy: 0.9447368421052632
Test accuracy: 0.8


  y = column_or_1d(y, warn=True)


In [101]:
model = MultinomialNB()
model.fit(Xtrain, Ytrain)
print("Train accuracy:", model.score(Xtrain, Ytrain))
print("Test accuracy:", model.score(Xtest, Ytest))

Train accuracy: 0.9447368421052632
Test accuracy: 0.78


In [102]:
model = AdaBoostClassifier()
model.fit(Xtrain, Ytrain)
print("Train accuracy:", model.score(Xtrain, Ytrain))
print("Test accuracy:", model.score(Xtest, Ytest))

  y = column_or_1d(y, warn=True)


Train accuracy: 0.8468421052631578
Test accuracy: 0.78


#### Try regression
从xml里面抽取rating和text，然后用regression试试看

regression 的 预测能力堪忧啊

应该是特征不行吧，用词袋模型会造成过拟合？ 太稀疏了矩阵。

In [121]:
positive_review = BeautifulSoup(open('sorted_data_acl/electronics/positive.review').read())
negative_review = BeautifulSoup(open('sorted_data_acl/electronics/negative.review').read())
positive_rating = positive_review.findAll('rating')
negative_rating = negative_review.findAll('rating')
# positive_text = positive_review.findAll('review_text')
# negative_text = negative_review.findAll('review_text')

#构建rating的值
Y = np.zeros((len(positive_tokenized)+len(negative_tokenized),1))
i = 0
for rating in positive_rating:
    Y[i] = float(rating.text)
    i+=1
for rating in negative_rating:
    Y[i] = float(rating.text)
    i+=1

In [122]:
Count_Vectorizer = CountVectorizer(decode_error = 'ignore')
data = tokenized_text_positive+tokenized_text_negative
X = Count_Vectorizer.fit_transform(data)
X,Y =shuffle(X,Y)
Xtrain = X[:-100,]
Xtest = X[-100:,]
Ytrain = Y[:-100,]
Ytest =Y[-100:,]

In [128]:

model = LogisticRegression()
model.fit(Xtrain, Ytrain)
print("Train accuracy:", model.score(Xtrain, Ytrain))
print("Test accuracy:", model.score(Xtest, Ytest))

  y = column_or_1d(y, warn=True)


Train accuracy: 0.9947368421052631
Test accuracy: 0.6


In [136]:
np.unique(Y)

array([1., 2., 4., 5.])

In [139]:
Y

array([[5.],
       [4.],
       [4.],
       ...,
       [5.],
       [2.],
       [5.]])

###  逻辑回归多分类
只需要输入带有分类标号的column就行

class 的顺序按照 model.clasees 的顺序

In [138]:
model.classes_

array([1., 2., 4., 5.])

In [132]:
model.predict_proba(X)

array([[1.27289238e-01, 2.81981473e-02, 1.96836200e-02, 8.24828995e-01],
       [2.00090985e-02, 4.82625838e-03, 7.72405170e-01, 2.02759474e-01],
       [2.69972517e-04, 5.37364660e-02, 9.45175943e-01, 8.17618628e-04],
       ...,
       [1.79058017e-02, 1.13439081e-07, 8.83999718e-01, 9.80943666e-02],
       [1.04367200e-01, 2.17076942e-01, 6.08335409e-01, 7.02204486e-02],
       [1.96534583e-03, 1.91153822e-01, 2.15775832e-01, 5.91105000e-01]])

In [123]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(Xtrain, Ytrain)
print("Train accuracy:", model.score(Xtrain, Ytrain))
print("Test accuracy:", model.score(Xtest, Ytest))

Train accuracy: 0.9996504713485501
Test accuracy: -0.9337260575564089


In [124]:
from sklearn.ensemble import AdaBoostRegressor
model =AdaBoostRegressor()
model.fit(Xtrain, Ytrain)
print("Train accuracy:", model.score(Xtrain, Ytrain))
print("Test accuracy:", model.score(Xtest, Ytest))

  y = column_or_1d(y, warn=True)


Train accuracy: 0.10082941916038546
Test accuracy: 0.08042623704295027


In [127]:
from sklearn.naive_bayes import GaussianNB
model =GaussianNB()
model.fit(Xtrain.toarray(), Ytrain)
print("Train accuracy:", model.score(Xtrain.toarray(), Ytrain))
print("Test accuracy:", model.score(Xtest.toarray(), Ytest))

  y = column_or_1d(y, warn=True)


Train accuracy: 0.9105263157894737
Test accuracy: 0.44
