## 情感分类

In [1]:
import os
import json
from collections import Counter
from tqdm import tqdm_notebook as tqdm

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report


from thulac import thulac
thu = thulac(seg_only=True)

def load_stopword():
    """
    加载停用词集合
    """
    return set(json.load(open('data/stopword-zh.json')))

Model loaded succeed


In [35]:
def load_label_sentence():
    """
    加载原始文本
    """
    sentences = []
    labels = []
    in_dir = 'data/labelled'

    for in_name in os.listdir(in_dir):
        _in = os.path.join(in_dir, in_name)
        # print(_in)
        for i, line in enumerate(open(_in)):
            if line.strip() == '':
                continue
            label = line.split('\t')[0]
            s= line.split('\t')[1]
            # 1234：四种情绪，-：没有情绪，x：不确定
            if label in ['1', '2', '3', '4', '-']:
                if label == '-':
                    labels.append('0')
                else:
                    labels.append(label)
            sentences.append(s)

    return labels, sentences


def get_word_freq():
    """
    统计高频词汇
    """
    stopwords = load_stopword()
    words_freq = {}
    words_ci = {} # 出现某个词，是某类的概率，此问题有五类
    class_num = 5
    labels_num = [0] * class_num
    labels, sentences = load_label_sentence()
    
    for y, s in zip(labels, sentences):
        
        # 统计每个类别的数量
        labels_num[int(y)] += 1
        # 分词
        for w in thu.cut(s):
            w = w[0]
            # 停用词等过滤
            if w == '' or w in stopwords or w.isdigit():
                continue
            elif w in words_freq:
                words_freq[w] += 1
                words_ci[w][int(y)] += 1
            else:
                words_freq[w] = 1
                words_ci[w] = [0] * class_num
                words_ci[w][int(y)] += 1
    
    # 数量转概率
    num2pro = lambda nums: [num / sum(nums) for num in nums]
    
    # 每类上的概率
    v_ci = num2pro(labels_num)
    
    word_gain = {}
    for w in words_ci.keys():
        word_ci = words_ci[w]
        
        v_ci_t = num2pro(word_ci) # 句子出现t是Ci类的概率
        
        non_word_ci = [labels_num[i] - word_ci[i] for i in range(class_num)] # 不是t时候的各类数量
        v_ci_non_t = num2pro(non_word_ci) # 句子不出现t是Ci的概率
        
        pr_t = words_freq[w] / sum(labels_num) # 存在t的概率
        
        Gt = Info_gain_of_term(v_ci, v_ci_t, v_ci_non_t, pr_t)
        
        word_gain[w] = Gt
        

    word_gain = sorted(word_gain.items(), key=lambda d: d[1], reverse=True) 
    with open('data/word_gain_freq.txt', 'w') as f:
        for w, gain in word_gain:
            if words_freq[w] >= 5:
                print(w, gain, words_freq[w], sep='\t', file=f)
            

            
def Info_gain_of_term(v_ci, v_ci_t, v_ci_non_t, pr_t):
    """
    计算信息增益，需要每类的概率，句子出现t是Ci类的概率，不出现t是Ci的概率，存在t的概率
    """
    def info_entropy(p):
        if p == 0:
            return 0
        else:
            return -p * np.log(p)
    
    gain = 0
    for i in range(len(v_ci)):
        gain = gain + (info_entropy(v_ci[i]) - pr_t * info_entropy(v_ci_t[i]) - (1 - pr_t) * info_entropy(v_ci_non_t[i]))
    return gain
    

def word_2_vec_one_hot():

    def load_word_list(first=2000):
        word_list = []
        for i, line in enumerate(open('data/word_gain_freq.txt')):
            if i >= first:
                break
            try:
                w, gain, freq = line.strip().split('\t')
            except ValueError:
                print('读取词向量出错：行 {}'.format(i))
            word_list.append(w)
        print('词向量大小', len(word_list))
        return word_list

    word_list = load_word_list()
    labels, sentences = load_label_sentence()
    i = 0
    for y, s in zip(labels, sentences):
        i += 1
        if not i % 1000:
            print(i)
        vec = np.zeros(len(word_list))
        for w in thu.cut(s):
            w = w[0]
            # print(w)
            try:
                _i = word_list.index(w)
                vec[_i] = 1
            except ValueError:
                pass

        print(y, ','.join(['{:.1f}'.format(num) for num in list(vec)]), sep='\t', file=open('train_data_one_hot-20180710.txt', 'a'))
    
# one-hot 
get_word_freq() # 词分析
word_2_vec_one_hot()


读取词向量出错：行 1012
词向量大小 2000
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000


In [42]:
def load_word_vec():
    """
    加载ACL2018词向量
    """
    word_vec = {}
    print('加载词向量中 ...')
    for i, line in enumerate(open('data/sgns.merge.word')):
#         if i <= 100:
#             continue
        if i > 10000:
            break
        words = line.strip().split(' ')
        word = words[0]
        vec = np.array([float(num) for num in words[1:]])
        word_vec[word] = vec
    print('加载词完成！')
    return word_vec


# 建立训练文件
word_vec = load_word_vec()
labels, sentences = load_label_sentence()
i = 0
for y, s in zip(labels, sentences):
    i += 1
    if not i % 1000:
        print(i)
    vec = np.zeros(300)
    count = 0
    for w in thu.cut(s):
        w = w[0]
        if w[0] in word_vec:
            vec += word_vec[w[0]]
            count += 1
    vec = vec / count
    if count > 0:
        print(y, ','.join(['{:.6f}'.format(num) for num in list(vec)]), sep='\t', file=open('train_data_ACL-20180710.txt', 'a'))

        

加载词向量中 ...
加载词完成！




1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000


## ~ ⬆️准备训练数据 ⬇️开始训练

In [48]:
def load_train_data(in_name):
    """
    加载训练数据
    """
    X = []
    y = []
    for line in open(in_name):
        label, vec = line.strip().split('\t')
        x = np.array([float(v) for v in vec.split(',')])
        y.append(label)
        X.append(x)
    return X, y


def train():
#     X, y = load_train_data('train_data_one_hot-20180710.txt')
    X, y = load_train_data('train_data_ACL-20180710.txt')
    
    # 划分数据集
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=41)
    
    # 初始化分类器
#     clf = RandomForestClassifier(max_depth=10, random_state=1)
#     clf = BernoulliNB()
    clf = SVC(C=0.5) # SVM较为耗时
    
    # 执行训练
    clf.fit(X_train, y_train)
    
    # 模型评估
    print(cross_val_score(clf, X, y, cv=10).mean())

    y_pred = []
    for i in range(len(X_test)):
        y = clf.predict(X_test[i].reshape(1, -1))
        # print(y[0])
        y_pred.append(y[0])
    print(classification_report(y_test, y_pred))
    
# train()