# amazon 评论的情感识别
使用 Amazon 的评论数据进行情感识别，目的是测试算法的识别准确度。

因为我们当前的训练数据不靠谱，所以拿 Amazon 的评论数据来做算法效果验证。

In [None]:
"""
Paths to the dataset, the stop-word list and the saved model.
"""
# CSV with the Amazon reviews; loaded with pandas.read_csv further down.
train_data_path = "../../datasets/amazon.csv"
# Text file with the stop words handed to the vectorizer.
stop_words_path = "../../datasets/stop_words.txt"
# Destination of the pickled classifier.
# NOTE: the save cell currently writes the MLP model to this path, not the naive Bayes one.
bayes_model_path = "../models/amazon_native_bayes.pkl"

先处理数据，分词，停用词等

In [None]:
"""
"""
import pandas as pd
# Load the review dataset from train_data_path into a DataFrame.
data = pd.read_csv(train_data_path)

# Preview the first 10 rows.
data.head(10)

In [None]:
from sklearn.model_selection import train_test_split
# Features are the raw review texts; the target is the "Positive" column.
X = data["reviewText"]
y = data["Positive"]
# Split the dataset: 20% is held out for testing, and random_state=1 makes
# the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Preview the first training reviews.
X_train.head()


In [None]:
# Preview the first training labels.
y_train.head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

def get_custom_stopwords(stop_words_file):
    """Load a stop-word list from a text file with one word per line.

    Args:
        stop_words_file: Path to a UTF-8 encoded text file (a leading BOM is
            tolerated).

    Returns:
        list[str]: The stop words in file order, with surrounding whitespace
        removed and blank lines skipped.
    """
    # Decode explicitly so the result does not depend on the platform's
    # default encoding; "utf-8-sig" also drops a BOM that would otherwise be
    # glued to the first word.
    with open(stop_words_file, encoding="utf-8-sig") as f:
        # Strip each line and drop blanks: the file's trailing newline would
        # otherwise contribute an empty string to the list.
        return [word for word in (line.strip() for line in f) if word]

# Load the stop words.
stopwords = get_custom_stopwords(stop_words_path)

# print(stopwords)
# Term-count (bag-of-words) vectorizer:
#   max_df=0.8    -> ignore terms that occur in more than 80% of the documents
#   min_df=2      -> ignore terms that occur in fewer than 2 documents
#   token_pattern -> tokens of at least two word characters whose first
#                    character is not a digit
#   stop_words    -> the custom list loaded above
vect = CountVectorizer(max_df = 0.8, 
                       min_df = 2, 
                       token_pattern=u'(?u)\\b[^\\d\\W]\\w+\\b', 
                       stop_words=frozenset(stopwords))
# TF-IDF vectorization gave worse results, so it stays disabled.
# vect = TfidfVectorizer()

print("done")

In [None]:
# Fit the vectorizer on the training reviews and show the resulting
# document-term matrix as a DataFrame (one column per vocabulary term).
train_counts = vect.fit_transform(X_train)
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old name on releases that predate it.
if hasattr(vect, "get_feature_names_out"):
    feature_names = vect.get_feature_names_out()
else:
    feature_names = vect.get_feature_names()
test = pd.DataFrame(train_counts.toarray(), columns=feature_names)
test.head()

使用Gensim生成的词向量处理

In [None]:
import gensim
from gensim.models import word2vec
from gensim.test.utils import datapath
from gensim import utils

# Dimensionality of the word vectors; sentence2vec() reads this global too.
size = 50
class MyCorpus(object):
    """Re-iterable corpus that yields one tokenised review at a time."""
    def __iter__(self):
        # Reads the global `data` frame; every review is lower-cased and
        # tokenised by gensim's simple_preprocess.
        for line in data["reviewText"]:
            yield utils.simple_preprocess(line)

sentences = MyCorpus()
# Train skip-gram (sg=1) word vectors with a context window of 5, keeping
# every word (min_count=1); hierarchical softmax (hs=1) and 3 negative
# samples are both enabled.
# NOTE(review): gensim 4 renamed the `size` argument to `vector_size` --
# confirm which gensim version this notebook targets.
model = gensim.models.Word2Vec(sentences,sg=1,size=size,
window=5,min_count=1,
negative=3,sample=0.001,
hs=1,workers=4)
print("done")

根据词向量生成句向量

In [None]:
import numpy as np
def sentence2vec(content):
    """Build a sentence vector by summing the word vectors of its tokens.

    Relies on the module-level Word2Vec ``model``, the vector width ``size``
    and gensim's ``utils`` module.

    Args:
        content: Raw sentence / review text.

    Returns:
        numpy.ndarray of shape (size,): element-wise sum of the vectors of
        every in-vocabulary token; all zeros when no token is known.
    """
    sentence_vector = np.zeros(size)
    # Tokenise exactly like the training corpus (simple_preprocess lower-cases
    # and strips punctuation). Splitting on " " produced tokens such as
    # "Loved" or "clock." that could never match the trained vocabulary.
    for token in utils.simple_preprocess(content):
        if token in model.wv:
            sentence_vector += model.wv[token]
    return sentence_vector

用贝叶斯模型来处理

In [None]:
# Bag-of-words features: learn the vocabulary from the training split only,
# then reuse it unchanged for the test split.
X_train_vect = vect.fit_transform(X_train)
X_test_vect = vect.transform(X_test)

# Alternative: use the gensim-based sentence vectors instead.
# (Kept as comments rather than a bare string literal, which a notebook
# would echo as the cell's output.)
# X_train_vect = [[n for n in sentence2vec(i)] for i in X_train]
# X_test_vect = [[n for n in sentence2vec(i)] for i in X_test]

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes on the count features; fit() returns the estimator
# itself, so construction and training are chained.
nb = MultinomialNB().fit(X_train_vect, y_train)
# Despite its name, train_score holds the accuracy on the held-out test split.
train_score = nb.score(X_test_vect, y_test)
print(train_score)

神经网络来处理

In [None]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron capped at two training iterations (max_iter=2);
# fit() returns the estimator itself.
mlp = MLPClassifier(max_iter=2).fit(X_train_vect, y_train)
# Despite its name, train_score holds the accuracy on the held-out test split.
train_score = mlp.score(X_test_vect, y_test)
print(train_score)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings; fit() returns the estimator
# itself.
lr = LogisticRegression().fit(X_train_vect, y_train)
# Despite its name, train_score holds the accuracy on the held-out test split.
train_score = lr.score(X_test_vect, y_test)
print(train_score)

Save the model to disk, then reload it from disk.

In [None]:
import pickle

# Persist the trained MLP classifier. The context manager guarantees the file
# is flushed and closed before it is read back.
with open(bayes_model_path, "wb") as model_file:
    pickle.dump(mlp, model_file)
# To persist the naive Bayes model instead, dump `nb` in place of `mlp`.

# Reload the classifier from disk and check that it still scores the training
# data.
# NOTE: pickle.load can execute arbitrary code -- only load files that this
# notebook itself produced.
with open(bayes_model_path, "rb") as model_file:
    model_2 = pickle.load(model_file)
print(model_2.score(X_train_vect, y_train))

找一条具体内容测试一下，输出是 0，说明预测还是比较准确的。

In [None]:
# A single unseen review used as a smoke test for the reloaded model.
new_text = "Loved this alarm clock till it started forceclosing everyday. It works one time, then the next time you try to use it, it tells you to download it again from them Amazon app store.  Uninstall and reinstall and it works once and FC again. Vibrant tooted "

# transform() expects an iterable of documents, hence the one-element list.
res = vect.transform([new_text])
print(model_2.predict(res))

将结果合并到原来的数据中

In [None]:
# Vectorise every review (training and test rows alike) with the fitted
# vocabulary.
X_vec = vect.transform(X)
# Predict with the naive Bayes model and store the predictions as a new
# column of the original frame.
nb_result = nb.predict(X_vec)
data['nb_result'] = nb_result
data.head()

# fastText classification

1. 使用格式转换工具将普通的 csv 训练数据文件转换为供 fasttext 使用的训练数据

In [None]:
import pandas as pd
import os
import fasttext
import random

# Same dataset that train_data_path points at. "...." is not a
# parent-directory reference, so the path is spelled with "../..".
amazon_source = "../../datasets/amazon.csv"

def trans_file(old_file, split_index=17000):
    """Convert a review CSV into fastText training and validation files.

    Each row becomes one line of the form ``__label__<Positive> <reviewText>``.
    Rows at positions ``0..split_index`` go to ``<old_file>.train``; the
    remaining rows go to ``<old_file>.valid``.

    Args:
        old_file: Path to a CSV file with ``Positive`` and ``reviewText``
            columns.
        split_index: Position of the last row written to the training file.
            The default keeps the original hard-coded split.

    Returns:
        tuple[str, str]: Paths of the training file and the validation file.
    """
    train_path = old_file + ".train"
    valid_path = old_file + ".valid"
    frame = pd.read_csv(old_file)

    with open(train_path, "w", encoding="utf-8") as train_out, \
            open(valid_path, "w", encoding="utf-8") as valid_out:
        rows = zip(frame["Positive"], frame["reviewText"])
        for position, (label, text) in enumerate(rows):
            # A missing review is read back as NaN (a float); emit an empty
            # text instead of failing on str + float.
            text = "" if pd.isna(text) else str(text)
            # fastText treats every line as one example, so line breaks
            # inside a review are flattened to spaces.
            flat_text = " ".join(text.splitlines())
            line = "__label__" + str(label) + " " + flat_text + "\n"
            target = valid_out if position > split_index else train_out
            target.write(line)
    return train_path, valid_path

# Write the fastText-format files next to the source CSV and keep their paths
# (training file first, validation file second).
amazon_train, amazon_test = trans_file(amazon_source)

In [None]:
# Train a supervised fastText classifier on the converted training file:
# 10 epochs, learning rate 1.0, word bigrams, 100-dimensional vectors and
# hierarchical-softmax loss.
# NOTE(review): this rebinds the global `model`, which sentence2vec() reads as
# the Word2Vec model -- re-run the Word2Vec cell before calling sentence2vec().
model = fasttext.train_supervised(input=amazon_train,epoch=10,lr=1.0,wordNgrams=2,dim=100,loss='hs')
# Evaluate on the validation file.
model.test(amazon_test)

In [None]:
# Classify one hand-picked review; k=1 asks for the single most likely label.
model.predict("got this kindle fire for Christmas. trying to download free angry birds and it is not working. started to down load over 2 hours ago. an it is only at 2 percent still. piece of crap:(",k=1)