## 情感分类

停用词

加载原始文本，只考虑'1', '2', '3', '4', '-'五类，'x'不确定的暂时不考虑。

In [1]:
import os
import json
import torch
import glob
from collections import Counter

import numpy as np
from tqdm import tqdm_notebook as tqdm

# import jieba
from thulac import thulac
thu = thulac(seg_only=True)

def load_stopword():
    """
    加载停用词集合
    """
    return set(json.load(open('data/stopword-zh.json')))

stop_word = load_stopword()


Model loaded succeed


In [2]:
def load_label_sentence():
    """
    加载原始文本
    """
    sentences = []
    labels = []
    in_dir = 'data/labelled_split/'
    for in_name in glob.glob(in_dir + '*.txt'):
        for i, line in enumerate(open(in_name)):
            if line.strip() == '': continue
            label = line.split('\t')[0]
            s= line.split('\t')[1]
            # 1234：四种情绪，-：没有情绪，x：不确定
            if label in ['1', '2', '3', '4', '-']:
                if label == '-' or label == 'x':
                    labels.append('0')
                else:
                    labels.append(label)
            sentences.append(s)

    return labels, sentences


labels, sentences = load_label_sentence()

# 信息增益 and one-hot

~来计算特征词

one-hot表示法

In [23]:
def Info_gain_of_term(v_ci, v_ci_t, v_ci_non_t, pr_t):
    """
    计算信息增益，需要每类的概率，句子出现t是Ci类的概率，不出现t是Ci的概率，存在t的概率
    """
    def info_entropy(p):
        if p == 0:
            return 0
        else:
            return -p * np.log(p)
    
    gain = 0
    for i in range(len(v_ci)):
        gain = gain + (info_entropy(v_ci[i]) - pr_t * info_entropy(v_ci_t[i]) - (1 - pr_t) * info_entropy(v_ci_non_t[i]))
    return gain


def get_word_freq():
    """
    统计高频词汇
    """
    stopwords = load_stopword()
    words_freq = {}
    words_ci = {} # 出现某个词，是某类的概率，此问题有五类
    class_num = 5
    labels_num = [0] * class_num
    labels, sentences = load_label_sentence()
    
    for y, s in zip(labels, sentences):
        
        # 统计每个类别的数量
        labels_num[int(y)] += 1
        # 分词
        for w in thu.cut(s):
            w = w[0]
            # 停用词等过滤
            if w == '' or w in stopwords or w.isdigit():
                continue
            elif w in words_freq:
                words_freq[w] += 1
                words_ci[w][int(y)] += 1
            else:
                words_freq[w] = 1
                words_ci[w] = [0] * class_num
                words_ci[w][int(y)] += 1
    
    # 数量转概率
    num2pro = lambda nums: [num / sum(nums) for num in nums]
    
    # 每类上的概率
    v_ci = num2pro(labels_num)
    
    word_gain = {}
    for w in words_ci.keys():
        word_ci = words_ci[w]
        
        v_ci_t = num2pro(word_ci) # 句子出现t是Ci类的概率
        
        non_word_ci = [labels_num[i] - word_ci[i] for i in range(class_num)] # 不是t时候的各类数量
        v_ci_non_t = num2pro(non_word_ci) # 句子不出现t是Ci的概率
        
        pr_t = words_freq[w] / sum(labels_num) # 存在t的概率
        
        Gt = Info_gain_of_term(v_ci, v_ci_t, v_ci_non_t, pr_t)
        
        word_gain[w] = Gt
        

    word_gain = sorted(word_gain.items(), key=lambda d: d[1], reverse=True) # 根据信息增益排序
    with open('data/word_gain_freq.txt', 'w') as f:
        for w, gain in word_gain:
            if words_freq[w] >= 5:
                print(w, gain, words_freq[w], sep='\t', file=f)


def word2vec_one_hot():

    def load_word_list(first=5000):
        word_list = []
        for i, line in enumerate(open('data/word_gain_freq.txt')):
            if i >= first:
                break
            try:
                w, gain, freq = line.strip().split('\t')
            except ValueError:
                print('读取词向量出错：行 {}'.format(i))
            word_list.append(w)
        print('词向量大小', len(word_list))
        return word_list

    word_list = load_word_list()
    
    i = 0
    with open('data/train/one_hot-180906.txt', 'w') as f:
        for y, s in zip(labels, sentences):
            i += 1
            if not i % 1000:
                print('line ->', i)
            vec = np.zeros(len(word_list))
            for w in thu.cut(s):
                w = w[0]
                # print(w)
                try:
                    _i = word_list.index(w)
                    vec[_i] = 1
                except ValueError:
                    pass

            f.write(y + '\t' + ','.join(['{:.1f}'.format(num) for num in list(vec)]) + '\n')
    print('总行数：', i)
    
    
# one-hot 
get_word_freq() # 词分析
word2vec_one_hot()


读取词向量出错：行 237
读取词向量出错：行 3301
读取词向量出错：行 3302
词向量大小 3460
line -> 1000
line -> 2000
line -> 3000
line -> 4000
line -> 5000
line -> 6000
line -> 7000
line -> 8000
line -> 9000
line -> 10000
line -> 11000
总行数： 11917


# 创建训练数据（word vector）

引入ACL2018词向量（财经方面）

因为该文件是按出现次数排序，那么考虑“掐头去尾”

停用词要不要去？也是要考虑的，停用词有时候也起到作用。

In [7]:
def load_word_vec():
    """
    加载ACL2018词向量，只加载信息增益筛选过后的词
    """
    word_vec = {}
    print('加载词向量中 ...')
    for i, line in enumerate(open('data/sgns.financial.word')):
        if i <= 10:
            continue
        if i > 200000:
            break
        words = line.strip().split(' ')
        word = words[0]
        word_vec[word] = np.array([float(num) for num in words[1:]])
#         except UnicodeDecodeError:
#             print("编码问题，行 {}".format(i))
    print('加载词完成！一共 {}个词'.format(len(word_vec)))
    return word_vec


word_vec = load_word_vec()
cnt = Counter(labels)
print(cnt)  
            

加载词向量中 ...
加载词完成！一共 199937个词
Counter({'0': 3380, '3': 2613, '2': 2421, '1': 2368, '4': 1135})


In [13]:
i = 0

# 建立训练文件
with open('data/train/wv-180906.txt', 'w') as f:
    for y, s in zip(labels, sentences):
        i += 1
        if not i % 1000:
            print('行 -> {}'.format(i))
        count = 0
        vec = np.zeros(300)
        
        for w in thu.cut(s): # 对分词结果进行处理
            w = w[0]
#             if w in stop_word:
#                 continue
            if w in word_vec:
                vec += word_vec[w]
                count += 1
        
#         for w in jieba.cut(s): # 对分词结果进行处理
#             if w in stop_word:
#                 continue
#             if w in word_vec:
#                 vec += word_vec[w]
#                 count += 1

        if count != 0:
            vec = vec / count 
            
#         if count > 0:
        f.write(y + '\t' + ','.join(['{:.6f}'.format(num) for num in list(vec)]) + '\n')

行 -> 1000
行 -> 2000
行 -> 3000
行 -> 4000
行 -> 5000
行 -> 6000
行 -> 7000
行 -> 8000
行 -> 9000
行 -> 10000
行 -> 11000


## ~ ⬆️准备训练数据 ⬇️开始训练

机器学习算法包括：KNN、LR、随机森林、决策树、GBDT、SVM

In [14]:
from sklearn.neighbors import KNeighborsClassifier  
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.linear_model import LogisticRegression  
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC


from sklearn.externals import joblib

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

# Multinomial Naive Bayes Classifier  
def naive_bayes_classifier(train_x, train_y):  
    from sklearn.naive_bayes import MultinomialNB  
    model = MultinomialNB(alpha=0.01)  
    model.fit(train_x, train_y)  
    return model  
  
  
# KNN Classifier  
def knn_classifier(train_x, train_y):  
    from sklearn.neighbors import KNeighborsClassifier  
    model = KNeighborsClassifier()  
    model.fit(train_x, train_y)  
    return model  
  
  
# Logistic Regression Classifier  
def logistic_regression_classifier(train_x, train_y):  
    from sklearn.linear_model import LogisticRegression  
    model = LogisticRegression(penalty='l2')  
    model.fit(train_x, train_y)  
    return model  
  
  
# Random Forest Classifier  
def random_forest_classifier(train_x, train_y):  
    from sklearn.ensemble import RandomForestClassifier  
    model = RandomForestClassifier(n_estimators=8)  
    model.fit(train_x, train_y)  
    return model  
  
  
# Decision Tree Classifier  
def decision_tree_classifier(train_x, train_y):  
    from sklearn import tree  
    model = tree.DecisionTreeClassifier()  
    model.fit(train_x, train_y)  
    return model  
  
  
# GBDT(Gradient Boosting Decision Tree) Classifier  
def gradient_boosting_classifier(train_x, train_y):  
    from sklearn.ensemble import GradientBoostingClassifier  
    model = GradientBoostingClassifier(n_estimators=200)  
    model.fit(train_x, train_y)  
    return model  
  
  
# SVM Classifier  
def svm_classifier(train_x, train_y):  
    from sklearn.svm import SVC  
    model = SVC(kernel='rbf', probability=True)  
    model.fit(train_x, train_y)  
    return model  
  
# SVM Classifier using cross validation  
def svm_cross_validation(train_x, train_y):  
    from sklearn.grid_search import GridSearchCV  
    from sklearn.svm import SVC  
    model = SVC(kernel='rbf', probability=True)  
    param_grid = {'C': [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000], 'gamma': [0.001, 0.0001]}  
    grid_search = GridSearchCV(model, param_grid, n_jobs = 1, verbose=1)  
    grid_search.fit(train_x, train_y)  
    best_parameters = grid_search.best_estimator_.get_params()  
    for para, val in list(best_parameters.items()):  
        print(para, val)  
    model = SVC(kernel='rbf', C=best_parameters['C'], gamma=best_parameters['gamma'], probability=True)  
    model.fit(train_x, train_y)  



In [25]:

def load_train_data(in_name, num=5):
    """
    加载训练数据
    """
    X = []
    y = []
    for line in open(in_name):
        label, vec = line.strip().split('\t')
        # 高兴
        if num == 3:
            if label == '2':
                label = '1'
            # 没有情绪
            elif label == '0':
                label = '0'
            # 负面
            else:
                label = '-1'
        elif num == 4:
            if label == '0':
                continue
            
        x = np.array([float(v) for v in vec.split(',')])
        y.append(label)
        X.append(x)
    X = np.array(X)
    y = np.array(y)
    return X, y


def stack_X_y(X1, y1, X2, y2, out_name=0):
    if len(y1) != len(y2):
        print('两列表长度不同，不同合并。')
        return -1
    _len = len(X1)
    X = []
    for i in range(_len):
        xi= np.hstack([X1[i], X2[i]])
        X.append(xi)
    X = np.array(X)
    y = np.array(y1)
    
    if out_name != 0:
        with open(out_name, 'w') as f:
            for xi, yi in zip(X, y):
                f.write(yi + '\t' + ','.join(['{:.6f}'.format(num) for num in list(xi)]) + '\n')
    print('合并数据完成。')
    

def train():
    """
    调参
    """
    # 合并数据
#     X1, y1 = load_train_data('data/train/one_hot-180906.txt')
#     print(X1.shape, y1.shape)
#     X2, y2 = load_train_data('data/train/wv-180906.txt')
#     print(X2.shape, y2.shape)
#     stack_X_y(X1, y1, X2, y2, out_name='data/train/wv_onehot-180906')
    
    
    X, y = load_train_data('data/train/wv_onehot-180906', num=4)
    print(X.shape, y.shape)

    # 划分数据集
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=41)

        
    # 初始化分类器
    test_classifiers = ['KNN', 'LR', 'RF', 'DT', 'GBDT', 'SVM']  
    classifiers = {'NB':naive_bayes_classifier,   
                  'KNN':knn_classifier,  
                   'LR':logistic_regression_classifier,  
                   'RF':random_forest_classifier,
                   'DT':decision_tree_classifier,  
                  'SVM':svm_classifier,  
                'SVMCV':svm_cross_validation,  
                 'GBDT':gradient_boosting_classifier  
    } 
    
    for classifier in test_classifiers:  
        print('******************* {} ********************'.format(classifier))
        clf = classifiers[classifier](X_train, y_train)

        # CV
        print('accuracy of CV:', cross_val_score(clf, X, y, cv=5).mean())

#         # 执行训练
#         clf.fit(X_train, y_train)

        # 模型评估
        y_pred = []
        for i in range(len(X_test)):
            y_hat = clf.predict(X_test[i].reshape(1, -1))
            y_pred.append(y_hat[0])
        print(classification_report(y_test, y_pred))


def train_model():
    X, y = load_train_data('data/train/train_data_ACL-20180712.txt')
    clf = LogisticRegression(penalty='l2')
    print(X.shape, y.shape)
    clf.fit(X, y)
    # 保存模型
    joblib.dump(clf, "emo-LR-v1.model")

train()
# train_model()


(8537, 3760) (8537,)
******************* KNN ********************
accuracy of CV: 0.3487140781109662
             precision    recall  f1-score   support

          1       0.31      0.59      0.41       687
          2       0.50      0.20      0.28       722
          3       0.35      0.39      0.37       785
          4       0.26      0.09      0.14       368

avg / total       0.37      0.35      0.32      2562

******************* LR ********************
accuracy of CV: 0.523837659695187
             precision    recall  f1-score   support

          1       0.56      0.57      0.56       687
          2       0.61      0.67      0.64       722
          3       0.51      0.56      0.54       785
          4       0.47      0.28      0.35       368

avg / total       0.55      0.55      0.55      2562

******************* RF ********************
accuracy of CV: 0.3674484859633869
             precision    recall  f1-score   support

          1       0.35      0.47      0.40    

  'precision', 'predicted', average, warn_for)


耐心等待训练中 ...

------------- ～ 华丽丽的分割线 ～ ---------------

## LSTM（注意已经切到lstm.ipynb）

### 为lstm做准备，训练数据

句子不必对齐，不是生成模型，并不需要表示EOF；

发现这样做中间文件很不现实，数据太多，不如直接放在内存里面去训练

lstm实际就是造了一个句子向量；

In [27]:
# label转tensor
def y_tensor(y):
    _y = torch.zeros(5)
    _y[y] = 1
    return _y

i = 0
# 载入训练数据
y_x_data = []

for y, s in zip(labels, sentences): # 遍历每句话
    vectors = []
    i += 1
    if not i % 1000:
        print('载入训练数据 行 -> {}'.format(i))

    count = 0
    for w in thu.cut(s): # 对分词结果进行处理
        w = w[0]
        if w in word_vec:
            vec = word_vec[w]
            count += 1
            vectors.append(vec)
    vectors = torch.Tensor(vectors)

    # 全部0向量表示EOF
    # sentense_vec.append(np.zeros(300))

    # 至少命中3个词
    if count >= 3:
#         f.write(y + '\t' + '@@'.join([json.dumps(list(w)) for w in sentense_vec]))
        y = y_tensor(int(y))
        y_x_data.append([y, vectors])
            

载入训练数据 行 -> 1000
载入训练数据 行 -> 2000
载入训练数据 行 -> 3000
载入训练数据 行 -> 4000
载入训练数据 行 -> 5000
载入训练数据 行 -> 6000
载入训练数据 行 -> 7000
载入训练数据 行 -> 8000
载入训练数据 行 -> 9000
载入训练数据 行 -> 10000
载入训练数据 行 -> 11000


In [28]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size

        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

# 300维数据，隐藏层128维，分为5类
rnn = RNN(300, 128, 5)
input = torch.zeros(1, 300)
hidden = torch.zeros(1, 128)
rnn(input, hidden)

(tensor([[-1.6127, -1.6621, -1.5745, -1.5710, -1.6299]], grad_fn=<LogSoftmaxBackward>),
 tensor([[-0.0237,  0.0144,  0.0101, -0.0088, -0.0007,  0.0175,  0.0379,  0.0345,
          -0.0389,  0.0123, -0.0257, -0.0080, -0.0305, -0.0178, -0.0293, -0.0476,
          -0.0243, -0.0090,  0.0219, -0.0192, -0.0305, -0.0271, -0.0038, -0.0323,
           0.0291, -0.0130,  0.0411, -0.0145,  0.0430,  0.0208, -0.0319,  0.0242,
           0.0273, -0.0152, -0.0150, -0.0295,  0.0467,  0.0187, -0.0170,  0.0183,
           0.0384,  0.0158, -0.0310,  0.0062, -0.0129,  0.0275, -0.0174,  0.0054,
           0.0407, -0.0123,  0.0386, -0.0369, -0.0332, -0.0350,  0.0227,  0.0442,
          -0.0218,  0.0453, -0.0318, -0.0187,  0.0264,  0.0366, -0.0327,  0.0299,
          -0.0110,  0.0255,  0.0438,  0.0253, -0.0205,  0.0453,  0.0004,  0.0190,
          -0.0215,  0.0110,  0.0202, -0.0355, -0.0030, -0.0357,  0.0482,  0.0243,
          -0.0347, -0.0423,  0.0283,  0.0108,  0.0146,  0.0362,  0.0073,  0.0251,
          