## 情感分类

停用词

加载原始文本，只考虑'1', '2', '3', '4', '-'五类，'x'不确定的暂时不考虑。

In [18]:
import os
import json
import torch
from collections import Counter
from tqdm import tqdm_notebook as tqdm

import numpy as np
# import jieba
from thulac import thulac
thu = thulac(user_dict='data/emo-words.txt', seg_only=True)

from myclf import *

Model loaded succeed


In [27]:
def load_stopword():
    """
    加载停用词集合
    """
    return set(json.load(open('data/stopword-zh.json')))

# stop_word = load_stopword()

def load_label_sentence():
    """
    加载原始文本
    """
    sentences = []
    labels = []
    in_dir = 'data/labelled'

    for in_name in os.listdir(in_dir):
        _in = os.path.join(in_dir, in_name)
        # print(_in)
        for i, line in enumerate(open(_in)):
            if line.strip() == '':
                continue
            label = line.split('\t')[0]
            s= line.split('\t')[1]
            # 1234：四种情绪，-：没有情绪，x：不确定
            if label in ['1', '2', '3', '4', '-']:
                if label == '-':
                    labels.append('0')
                else:
                    labels.append(label)
            sentences.append(s)

    return labels, sentences

labels, sentences = load_label_sentence()

信息增益来计算特征词

## one-hot表示法

In [20]:
def get_word_freq():
    """
    统计高频词汇
    """
    stopwords = load_stopword()
    words_freq = {}
    words_ci = {} # 出现某个词，是某类的概率，此问题有五类
    class_num = 5
    labels_num = [0] * class_num
    labels, sentences = load_label_sentence()
    
    for y, s in zip(labels, sentences):
        
        # 统计每个类别的数量
        labels_num[int(y)] += 1
        # 分词
        for w in thu.cut(s):
            w = w[0]
            # 停用词等过滤
            if w == '' or w in stopwords or w.isdigit():
                continue
            elif w in words_freq:
                words_freq[w] += 1
                words_ci[w][int(y)] += 1
            else:
                words_freq[w] = 1
                words_ci[w] = [0] * class_num
                words_ci[w][int(y)] += 1
    
    # 数量转概率
    num2pro = lambda nums: [num / sum(nums) for num in nums]
    
    # 每类上的概率
    v_ci = num2pro(labels_num)
    
    word_gain = {}
    for w in words_ci.keys():
        word_ci = words_ci[w]
        
        v_ci_t = num2pro(word_ci) # 句子出现t是Ci类的概率
        
        non_word_ci = [labels_num[i] - word_ci[i] for i in range(class_num)] # 不是t时候的各类数量
        v_ci_non_t = num2pro(non_word_ci) # 句子不出现t是Ci的概率
        
        pr_t = words_freq[w] / sum(labels_num) # 存在t的概率
        
        Gt = Info_gain_of_term(v_ci, v_ci_t, v_ci_non_t, pr_t)
        
        word_gain[w] = Gt
        

    word_gain = sorted(word_gain.items(), key=lambda d: d[1], reverse=True) 
    with open('data/word_gain_freq.txt', 'w') as f:
        for w, gain in word_gain:
            if words_freq[w] >= 5:
                print(w, gain, words_freq[w], sep='\t', file=f)
            

            
def Info_gain_of_term(v_ci, v_ci_t, v_ci_non_t, pr_t):
    """
    计算信息增益，需要每类的概率，句子出现t是Ci类的概率，不出现t是Ci的概率，存在t的概率
    """
    def info_entropy(p):
        if p == 0:
            return 0
        else:
            return -p * np.log(p)
    
    gain = 0
    for i in range(len(v_ci)):
        gain = gain + (info_entropy(v_ci[i]) - pr_t * info_entropy(v_ci_t[i]) - (1 - pr_t) * info_entropy(v_ci_non_t[i]))
    return gain


In [21]:
def make_features_onehot(features_file_name):

    def load_word_list(first=2400):
        word_list = []
        for i, line in enumerate(open('data/word_gain_freq.txt')):
            if i >= first:
                break
            try:
                w, gain, freq = line.strip().split('\t')
            except ValueError:
                print('读取词向量出错：行 {}'.format(i))
            word_list.append(w)
        print('词向量大小', len(word_list))
        return word_list

    word_list = load_word_list()

    print('---- 我的词表 ----')
    i = 0
    with open(features_file_name, 'w') as f:
        for y, s in zip(labels, sentences):
            i += 1
            if not i % 1000:
                print('行 ->', i)
            vec = np.zeros(len(word_list))
            for w in thu.cut(s):
                w = w[0]
                # print(w)
                try:
                    _i = word_list.index(w)
                    vec[_i] = 1
                except ValueError:
                    pass

            f.write(y + '\t' + ','.join(['{:.1f}'.format(num) for num in list(vec)]) + '\n')
    print('总行数：', i)
    

引入ACL2018词向量（财经方面）

因为该文件是按出现次数排序，那么考虑“掐头去尾”

停用词要不要去？也是要考虑的，停用词有时候也起到作用。

In [22]:
def load_word_vec():
    """
    加载ACL2018词向量
    """
    word_vec = {}
    print('加载词向量中 ...')
    for i, line in enumerate(open('data/sgns.financial.word')):
        if i <= 10:
            continue
        if i > 150000:
            break
        words = line.strip().split(' ')
        word = words[0]
        word_vec[word] = np.array([float(num) for num in words[1:]])
#         except UnicodeDecodeError:
#             print("编码问题，行 {}".format(i))
    print('加载词完成！一共 {}个词'.format(len(word_vec)))
    return word_vec


def make_features_ACLwv(features_file_name):
    word_vec = load_word_vec()
    i = 0
    # 建立训练文件：ACL的wv
    print('---- ACL wv ----')
    with open(features_file_name, 'w') as f:
        for y, s in zip(labels, sentences):
            i += 1
            if not i % 1000:
                print('行 -> {}'.format(i))
            count = 0
            vec = np.zeros(300)

            for w in thu.cut(s): # 对分词结果进行处理
                w = w[0]
    #             if w in stop_word:
    #                 continue
                if w in word_vec:
                    vec += word_vec[w]
                    count += 1

            if count != 0:
                vec = vec / count

    #         if count > 0:
            f.write(y + '\t' + ','.join(['{:.6f}'.format(num) for num in list(vec)]) + '\n')
    print('总行数：', i)

In [23]:
from gensim.models import Word2Vec
mywv_model = Word2Vec.load("model/guba_word2vec.model")

def make_features_mywv(features_file_name):
    i = 0
    # 建立训练文件: 我的wv
    print('---- 我的wv ----')
    with open(features_file_name, 'w') as f:
        for y, s in zip(labels, sentences):
            i += 1
            if not i % 1000:
                print('行 -> {}'.format(i))
            count = 0
            vec = np.zeros(300)

            for w in thu.cut(s): # 对分词结果进行处理
                w = w[0]
                if w in mywv_model.wv:
                    vec += mywv_model.wv[w]
                    count += 1

            if count != 0:
                vec = vec / count

    #         if count > 0:
            f.write(y + '\t' + ','.join(['{:.6f}'.format(num) for num in list(vec)]) + '\n')
    print('总行数：', i)


## ~ ⬆️准备训练数据 ⬇️开始训练

机器学习算法包括：KNN、LR、随机森林、决策树、GBDT、SVM

In [29]:
get_word_freq() # 词分析
make_features_onehot('data/train/onehot.txt')
make_features_ACLwv('data/train/ACLwv.txt')
make_features_mywv('data/train/mywv.txt')

读取词向量出错：行 453
读取词向量出错：行 1248
读取词向量出错：行 1249
词向量大小 2400
---- 我的词表 ----
行 -> 1000
行 -> 2000
行 -> 3000
行 -> 4000
行 -> 5000
行 -> 6000
行 -> 7000
行 -> 8000
行 -> 9000
行 -> 10000
行 -> 11000
总行数： 11968
加载词向量中 ...
加载词完成！一共 149960个词
---- ACL wv ----
行 -> 1000
行 -> 2000
行 -> 3000
行 -> 4000
行 -> 5000
行 -> 6000
行 -> 7000
行 -> 8000
行 -> 9000
行 -> 10000
行 -> 11000
总行数： 11968
---- 我的wv ----
行 -> 1000
行 -> 2000
行 -> 3000
行 -> 4000
行 -> 5000
行 -> 6000
行 -> 7000
行 -> 8000
行 -> 9000
行 -> 10000
行 -> 11000
总行数： 11968


In [34]:
def load_train_data(in_name, num=-1):
    """
    加载训练数据
    """
    X = []
    y = []
    for line in open(in_name):

            
            
        label, vec = line.strip().split('\t')
        
        if num == 0: # 是否有情绪
            if label == '0':
                label = 0
            else:
                label = 1
        
        if num == 2: # 正负情绪
            if label == '2':
                label = 1
            elif label == '0':
                continue
            else:
                label = 0
                
        if num == 4: # 四种情绪
            if label == '0':
                continue
                
        x = np.array([float(v) for v in vec.split(',')])
        y.append(label)
        X.append(x)
    X = np.array(X)
    y = np.array(y)
    return X, y

In [None]:
def stack_X_y(X1, y1, X2, y2, out_name=0):
    print(X1.shape, y1.shape, X2.shape, y2.shape)
    if len(y1) != len(y2):
        print('两列表长度不同，不同合并。')
        return -1
    _len = len(X1)
    X = []
    for i in range(_len):
        xi= np.hstack([X1[i], X2[i]])
        X.append(xi)
    X = np.array(X)
    y = np.array(y1)

    if out_name != 0:
        with open(out_name, 'w') as f:
            for xi, yi in zip(X, y):
                f.write(yi + '\t' + ','.join(['{:.6f}'.format(num) for num in list(xi)]) + '\n')
    print('合并数据完成。')
    return X, y


def train():
    """
    调参
    """
    # 合并数据
#     X1, y1 = load_train_data('data/train/onehot.txt')
#     X2, y2 = load_train_data('data/train/ACLwv.txt')
#     X1, y1 = stack_X_y(X1, y1, X2, y2)
#     X3, y3 = load_train_data('data/train/mywv.txt')
#     X, y = stack_X_y(X1, y1, X3, y3, out_name='data/train/all-180912.txt')
    
    X, y = load_train_data('data/train/all-180912.txt', num=2)
    print(X.shape, y.shape)

    # 划分数据集
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=23)

    # 初始化分类器
    test_classifiers = ['LR', 'GBDT']
    classifiers = {
        'NB':naive_bayes_classifier,
        'KNN':knn_classifier,
        'LR':logistic_regression_classifier,
        'RF':random_forest_classifier,
        'DT':decision_tree_classifier,
        'SVM':svm_classifier,
        'SVMCV':svm_cross_validation,
        'GBDT':gradient_boosting_classifier
    }


    for classifier in test_classifiers:
        print('******************* {} ********************'.format(classifier))
        if classifier == "GBDT":
            clf = GradientBoostingClassifier(learning_rate=0.1, max_depth=5)
            clf.fit(X_train, y_train)
        if classifier == "LR":
            clf = LogisticRegression()
            clf.fit(X_train, y_train)
        else:
            clf = classifiers[classifier](X_train, y_train)
        evaluate(clf, X, y, X_test, y_test)


    # SVR
    original_params = {}
    for i, setting in enumerate([{'C':0.125}, {'C': 0.25}, {'C':0.5}, {'C':1.0}]):
        print('******************* SVR-{} ********************'.format(i))
        print(setting)
        params = dict(original_params)
        params.update(setting)

        clf = LinearSVC(**params)
        clf.fit(X_train, y_train)
        evaluate(clf, X, y, X_test, y_test)

    # GBDT
    original_params = {'n_estimators': 1000, 'max_leaf_nodes': 4, 'max_depth': 3, 'random_state': 23,
                    'min_samples_split': 5}

    for i, setting in enumerate([{'learning_rate': 1.0, 'subsample': 1.0},
                    {'learning_rate': 0.1, 'subsample': 1.0},
                    {'learning_rate': 1.0, 'subsample': 0.5},
                    {'learning_rate': 0.1, 'subsample': 0.5},
                    {'learning_rate': 0.1, 'max_features': 2}]):
        print('******************* GBDT{} ********************'.format(i))
        print(setting)
        params = dict(original_params)
        params.update(setting)

        clf = GradientBoostingClassifier(**params)
        clf.fit(X_train, y_train)
        evaluate(clf, X, y, X_test, y_test)

    original_params = {}

    
def evaluate(clf, X, y, X_test, y_test):
    # CV
    print('accuracy of CV:', cross_val_score(clf, X, y, cv=5).mean())

    # 模型评估
    y_pred = []
    for i in range(len(X_test)):
        y_hat = clf.predict(X_test[i].reshape(1, -1))
        y_pred.append(y_hat[0])
    print(classification_report(y_test, y_pred))
    
    
def train_model():
    X, y = load_train_data('data/train/train_data_ACL-20180712.txt')
    clf = LogisticRegression(penalty='l2')
    print(X.shape, y.shape)
    clf.fit(X, y)
    # 保存模型
    joblib.dump(clf, "emo-LR-v1.model")
    
    
train()

(8246, 3000) (8246,)
******************* LR ********************
accuracy of CV: 0.6672339459345854
             precision    recall  f1-score   support

          0       0.73      0.89      0.80      1197
          1       0.33      0.15      0.21       453

avg / total       0.62      0.68      0.64      1650

******************* GBDT ********************
