# amazon 评论的情感识别
使用 Amazon 的评论数据进行情感识别，目的是测试算法的识别准确度。

因为我们当前的训练数据不靠谱，所以拿 Amazon 的评论数据来做算法效果验证。

In [1]:
"""
指定文件数据路径
"""
# CSV of Amazon reviews; loaded with pandas in the next cell.
train_data_path = "../datasets/amazon.csv"
# Text file of stop words; it is split into one entry per line when loaded.
stop_words_path = "../datasets/stop_words.txt"
# Destination of the pickled classifier. The save cell further down writes the
# MLP model here even though the name says "bayes".
# NOTE(review): "native" in the file name looks like a typo for "naive"; it is
# left as-is because it is a runtime path.
bayes_model_path = "../models/amazon_native_bayes.pkl"

先处理数据，分词，停用词等

In [2]:
"""
"""
import pandas as pd
# Load the review CSV into a DataFrame.
data = pd.read_csv(train_data_path)

# Preview the first ten rows; the recorded output shows the `reviewText` and
# `Positive` columns.
data.head(10)

Unnamed: 0,reviewText,Positive
0,This is a one of the best apps acording to a b...,1
1,This is a pretty good version of the game for ...,1
2,this is a really cool game. there are a bunch ...,1
3,"This is a silly game and can be frustrating, b...",1
4,This is a terrific game on any pad. Hrs of fun...,1
5,This is a very entertaining game! You don't h...,1
6,this is awesome and you don't need wi ti to pl...,1
7,this is awesome I bet no one even reads the re...,1
8,This is basicly the free version but with ads....,1
9,this is by far the best free app that is avail...,1


In [3]:
from sklearn.model_selection import train_test_split
# Features are the raw review text; the label is the `Positive` column.
X = data["reviewText"]
y = data["Positive"]
# Split the dataset: 20% is held out for testing, with a fixed random seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Peek at the first few training reviews.
X_train.head()


7013     This app is 99.99%  accurate 24/7!!!!!!!!!!!!!...
7766     I downloaded it as the free app of the day.  I...
5910     Nice app to have. I can jote notes to myself a...
7020     enjoy reading my horoscope everyday using this...
16952    I use this app on a daily basis. It's easy to ...
Name: reviewText, dtype: object

In [4]:
# Peek at the labels of the first few training rows.
y_train.head()

7013     1
7766     1
5910     1
7020     1
16952    1
Name: Positive, dtype: int64

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

def get_custom_stopwords(stop_words_file):
    """Read a stop-word list from a text file holding one word per line.

    Args:
        stop_words_file: Path of the stop-word file, read as UTF-8.

    Returns:
        A list of stop words in file order, with surrounding whitespace
        removed and blank lines skipped.
    """
    # Explicit encoding: the platform default is not UTF-8 everywhere, and a
    # stop-word list may contain non-ASCII entries.
    with open(stop_words_file, encoding="utf-8") as f:
        # Stripping removes trailing spaces (and any stray "\r") that would
        # otherwise keep a stop word from ever matching a token.
        stripped = (line.strip() for line in f)
        # Blank lines (including the one after a trailing newline) are dropped.
        return [word for word in stripped if word]

# Load the stop words
stopwords = get_custom_stopwords(stop_words_path)

# print(stopwords)
# Bag-of-words (term count) vectorizer.
#   max_df=0.8     ignore terms that occur in more than 80% of the documents
#   min_df=2       ignore terms that occur in fewer than 2 documents
#   token_pattern  a token is two or more word characters whose first
#                  character is not a digit
#   stop_words     the custom list loaded above
vect = CountVectorizer(max_df = 0.8, 
                       min_df = 2, 
                       token_pattern=u'(?u)\\b[^\\d\\W]\\w+\\b', 
                       stop_words=frozenset(stopwords))
# TF-IDF vectorization; gave worse results
# vect = TfidfVectorizer()

print("done")

done


In [6]:
# Fit the vectorizer on the training reviews and show the document-term
# matrix as a DataFrame (one column per vocabulary term) for inspection.
doc_term_matrix = vect.fit_transform(X_train)
# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(), so use whichever the installed version provides.
if hasattr(vect, "get_feature_names_out"):
    feature_names = vect.get_feature_names_out()
else:
    feature_names = vect.get_feature_names()
# NOTE: .toarray() turns the whole sparse matrix into a dense array.
test = pd.DataFrame(doc_term_matrix.toarray(), columns=feature_names)
test.head()

Unnamed: 0,__,___,____,_____,a500,aa,aaa,aaaa,aac,aarp,...,zip,zodiac,zombie,zombies,zone,zones,zoo,zoom,zooming,zzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


使用Gensim生成的词向量处理

In [7]:
import gensim
from gensim.models import word2vec
from gensim.test.utils import datapath
from gensim import utils

# Dimensionality of the word vectors: passed to Word2Vec below and used to
# size the zero vector in sentence2vec.
size = 768
class MyCorpus(object):
    """Re-iterable corpus over the `reviewText` column of the global `data`.

    Each iteration pass yields one token list per review, produced by
    gensim's `utils.simple_preprocess`.
    """

    def __iter__(self):
        reviews = data["reviewText"]
        for review in reviews:
            tokens = utils.simple_preprocess(review)
            yield tokens

sentences = MyCorpus()
# Train word vectors on every review. Per the gensim Word2Vec parameters:
# sg=1 selects skip-gram, hs=1 enables hierarchical softmax, negative=3 draws
# three negative samples, sample=0.001 is the down-sampling threshold for
# frequent words, min_count=1 keeps every word, window=5 is the context
# window and workers=4 is the number of worker threads.
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4 renamed it to
# `vector_size` — confirm against the installed version.
model = gensim.models.Word2Vec(sentences,sg=1,size=size,
window=5,min_count=1,
negative=3,sample=0.001,
hs=1,workers=4)
print("done")

done


根据词向量生成句向量

In [8]:
import numpy as np
def sentence2vec(content):
    """Turn a piece of text into a vector by summing its word vectors.

    The text is tokenized with ``utils.simple_preprocess``, the same
    tokenizer used to build the Word2Vec training corpus, so tokens line up
    with the model vocabulary. Splitting on single spaces kept capital
    letters and attached punctuation, so such words never matched. Tokens
    the model does not know are skipped.

    Args:
        content: The raw text to embed.

    Returns:
        A numpy array of length ``size``; all zeros when no token is known.
    """
    sentence_vec = np.zeros(size)
    for token in utils.simple_preprocess(content):
        if token in model.wv:
            sentence_vec += model.wv[token]
    return sentence_vec

用贝叶斯模型来处理

In [16]:
# Learn the vocabulary from the training reviews only, then encode the test
# reviews with that same vocabulary.
X_train_vect = vect.fit_transform(X_train)
X_test_vect = vect.transform(X_test)
# The bare string below (Chinese for "generate sentence vectors with gensim")
# labels the commented-out alternative that follows; the recorded cell output
# is this string echoed back.
"""
使用gensim生成句向量
"""
# X_train_vect = [[n for n in sentence2vec(i)] for i in X_train]
# X_test_vect = [[n for n in sentence2vec(i)] for i in X_test]

'\n使用gensim生成句向量\n'

In [30]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes trained on the vectorized training reviews.
nb = MultinomialNB()
nb.fit(X_train_vect, y_train)
# Despite its name, train_score holds the score on the held-out test split.
train_score = nb.score(X_test_vect, y_test)
print(train_score)

0.9025


神经网络来处理

In [29]:
from sklearn.neural_network import MLPClassifier

# max_iter=2 limits training to at most two iterations.
# NOTE(review): presumably chosen to keep the run short — confirm.
mlp = MLPClassifier(max_iter=2)
mlp.fit(X_train_vect, y_train)
# Despite its name, train_score holds the score on the held-out test split.
train_score = mlp.score(X_test_vect, y_test)
print(train_score)

0.91175


Save the model to disk, then reload it from disk.

In [12]:
import pickle

# Persist the trained MLP. The path variable is named for the naive Bayes
# model, but the MLP is what gets written; to save the naive Bayes model
# instead, dump `nb` here.
# The `with` blocks make sure both file handles are closed (the write is
# flushed before the file is read back).
with open(bayes_model_path, "wb") as model_file:
    pickle.dump(mlp, model_file)

# Reload the model from disk. Only unpickle files you created yourself:
# pickle.load can execute arbitrary code from an untrusted file.
with open(bayes_model_path, "rb") as model_file:
    model_2 = pickle.load(model_file)
# Score the reloaded model on the training features.
print(model_2.score(X_train_vect, y_train))

0.8745


找一条具体内容测试一下，具体输出是 0，说明预测还是比较准确的。（注意：下方记录的输出是 `ValueError: dimension mismatch`，与此描述不符，需重新运行确认。）

In [13]:
# A single review to classify by hand.
new_text = "Loved this alarm clock till it started forceclosing everyday. It works one time, then the next time you try to use it, it tells you to download it again from them Amazon app store.  Uninstall and reinstall and it works once and FC again. Vibrant tooted "

# The review is wrapped in a one-element list before being passed to transform().
res = vect.transform([new_text])
# NOTE(review): the recorded output of this cell is "ValueError: dimension
# mismatch"; presumably model_2 was fitted on features of a different width
# than `vect` produces here — rerun the cells in order to confirm.
print(model_2.predict(res))

ValueError: dimension mismatch

将结果合并到原来的数据中

In [14]:
# Encode every review (training and test rows alike) with the fitted vectorizer.
X_vec = vect.transform(X)
# Predict with the naive Bayes model and store the predictions as a new column.
nb_result = nb.predict(X_vec)
data['nb_result'] = nb_result
data.head()

Unnamed: 0,reviewText,Positive,nb_result
0,This is a one of the best apps acording to a b...,1,1
1,This is a pretty good version of the game for ...,1,1
2,this is a really cool game. there are a bunch ...,1,1
3,"This is a silly game and can be frustrating, b...",1,1
4,This is a terrific game on any pad. Hrs of fun...,1,1
