## 情感分类

停用词

加载原始文本，只考虑'1', '2', '3', '4', '-'五类，'x'不确定的暂时不考虑。

In [4]:
import os
import json
from collections import Counter
from tqdm import tqdm_notebook as tqdm
import numpy as np
# import jieba
from thulac import thulac
thu = thulac(seg_only=True)

def load_stopword():
    """
    加载停用词集合
    """
    return set(json.load(open('data/stopword-zh.json')))

stop_word = load_stopword()

def load_label_sentence():
    """
    加载原始文本
    """
    sentences = []
    labels = []
    in_dir = 'data/labelled'

    for in_name in os.listdir(in_dir):
        _in = os.path.join(in_dir, in_name)
        # print(_in)
        for i, line in enumerate(open(_in)):
            if line.strip() == '':
                continue
            label = line.split('\t')[0]
            s= line.split('\t')[1]
            # 1234：四种情绪，-：没有情绪，x：不确定
            if label in ['1', '2', '3', '4', '-']:
                if label == '-' or label == 'x':
                    labels.append('0')
                else:
                    labels.append(label)
            sentences.append(s)

    return labels, sentences


Model loaded succeed


UnicodeDecodeError: 'ascii' codec can't decode byte 0xe3 in position 2: ordinal not in range(128)

信息增益来计算特征词

one-hot表示法

In [35]:
def get_word_freq():
    """
    统计高频词汇
    """
    stopwords = load_stopword()
    words_freq = {}
    words_ci = {} # 出现某个词，是某类的概率，此问题有五类
    class_num = 5
    labels_num = [0] * class_num
    labels, sentences = load_label_sentence()
    
    for y, s in zip(labels, sentences):
        
        # 统计每个类别的数量
        labels_num[int(y)] += 1
        # 分词
        for w in thu.cut(s):
            w = w[0]
            # 停用词等过滤
            if w == '' or w in stopwords or w.isdigit():
                continue
            elif w in words_freq:
                words_freq[w] += 1
                words_ci[w][int(y)] += 1
            else:
                words_freq[w] = 1
                words_ci[w] = [0] * class_num
                words_ci[w][int(y)] += 1
    
    # 数量转概率
    num2pro = lambda nums: [num / sum(nums) for num in nums]
    
    # 每类上的概率
    v_ci = num2pro(labels_num)
    
    word_gain = {}
    for w in words_ci.keys():
        word_ci = words_ci[w]
        
        v_ci_t = num2pro(word_ci) # 句子出现t是Ci类的概率
        
        non_word_ci = [labels_num[i] - word_ci[i] for i in range(class_num)] # 不是t时候的各类数量
        v_ci_non_t = num2pro(non_word_ci) # 句子不出现t是Ci的概率
        
        pr_t = words_freq[w] / sum(labels_num) # 存在t的概率
        
        Gt = Info_gain_of_term(v_ci, v_ci_t, v_ci_non_t, pr_t)
        
        word_gain[w] = Gt
        

    word_gain = sorted(word_gain.items(), key=lambda d: d[1], reverse=True) 
    with open('data/word_gain_freq.txt', 'w') as f:
        for w, gain in word_gain:
            if words_freq[w] >= 5:
                print(w, gain, words_freq[w], sep='\t', file=f)
            

            
def Info_gain_of_term(v_ci, v_ci_t, v_ci_non_t, pr_t):
    """
    计算信息增益，需要每类的概率，句子出现t是Ci类的概率，不出现t是Ci的概率，存在t的概率
    """
    def info_entropy(p):
        if p == 0:
            return 0
        else:
            return -p * np.log(p)
    
    gain = 0
    for i in range(len(v_ci)):
        gain = gain + (info_entropy(v_ci[i]) - pr_t * info_entropy(v_ci_t[i]) - (1 - pr_t) * info_entropy(v_ci_non_t[i]))
    return gain
    

def word_2_vec_one_hot():

    def load_word_list(first=2000):
        word_list = []
        for i, line in enumerate(open('data/word_gain_freq.txt')):
            if i >= first:
                break
            try:
                w, gain, freq = line.strip().split('\t')
            except ValueError:
                print('读取词向量出错：行 {}'.format(i))
            word_list.append(w)
        print('词向量大小', len(word_list))
        return word_list

    word_list = load_word_list()
    labels, sentences = load_label_sentence()
    i = 0
    for y, s in zip(labels, sentences):
        i += 1
        if not i % 1000:
            print(i)
        vec = np.zeros(len(word_list))
        for w in thu.cut(s):
            w = w[0]
            # print(w)
            try:
                _i = word_list.index(w)
                vec[_i] = 1
            except ValueError:
                pass

        print(y, ','.join(['{:.1f}'.format(num) for num in list(vec)]), sep='\t', file=open('train_data_one_hot-20180710.txt', 'a'))
    
# one-hot 
get_word_freq() # 词分析
word_2_vec_one_hot()


读取词向量出错：行 1012
词向量大小 2000
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000


引入ACL2018词向量（财经方面）

因为该文件是按出现次数排序，那么考虑“掐头去尾”

停用词要不要去？也是要考虑的，停用词有时候也起到作用。

In [96]:
def load_word_vec():
    """
    加载ACL2018词向量，只加载信息增益筛选过后的词
    """
    word_vec = {}
    print('加载词向量中 ...')
    for i, line in enumerate(open('data/sgns.financial.word')):
        if i <= 10:
            continue
        if i > 150000:
            break
        words = line.strip().split(' ')
        word = words[0]
        word_vec[word] = np.array([float(num) for num in words[1:]])
#         except UnicodeDecodeError:
#             print("编码问题，行 {}".format(i))
    print('加载词完成！一共 {}个词'.format(len(word_vec)))
    return word_vec


word_vec = load_word_vec()
labels, sentences = load_label_sentence()

cnt = Counter(labels)
print(cnt)


i = 0
# 建立训练文件
with open('train_data_ACL-20180712.txt', 'w') as f:
    for y, s in zip(labels, sentences):
        i += 1
        if not i % 1000:
            print('行 -> {}'.format(i))
        count = 0
        vec = np.zeros(300)
        
        for w in thu.cut(s): # 对分词结果进行处理
            w = w[0]
#             if w in stop_word:
#                 continue
            if w in word_vec:
                vec += word_vec[w]
                count += 1
        
#         for w in jieba.cut(s): # 对分词结果进行处理
#             if w in stop_word:
#                 continue
#             if w in word_vec:
#                 vec += word_vec[w]
#                 count += 1

        if count != 0:
            vec = vec / count
            
        if count > 0:
            f.write(y + '\t' + ','.join(['{:.6f}'.format(num) for num in list(vec)]) + '\n')
            
            

加载词向量中 ...
加载词完成！一共 149960个词
Counter({'0': 4569, '3': 2251, '2': 2097, '1': 2095, '4': 982})
行 -> 1000
行 -> 2000
行 -> 3000
行 -> 4000
行 -> 5000
行 -> 6000
行 -> 7000
行 -> 8000
行 -> 9000
行 -> 10000
行 -> 11000


# 为lstm做准备，训练数据

句子不必对齐，不是生成模型，并不需要表示EOF；

In [None]:
def load_word_vec():
    """
    加载ACL2018财经类词向量
    """
    word_vec = {}
    print('加载词向量中 ...')
    for i, line in enumerate(open('data/sgns.financial.word')):
        if i > 150000:
            break
        words = line.strip().split(' ')
        word = words[0]
        word_vec[word] = np.array([float(num) for num in words[1:]])
    print('加载词完成！一共 {}个词'.format(len(word_vec)))
    return word_vec


word_vec = load_word_vec()
labels, sentences = load_label_sentence()

cnt = Counter(labels)
print(cnt)


i = 0
# 建立训练文件
sentense_vec = []
with open('lstm_data_ACL_180725.txt', 'w') as f:
    for y, s in zip(labels, sentences): # 遍历每句话
        sentense_vec = []
        i += 1
        
        if not i % 1000:
            print('行 -> {}'.format(i))
        
        count = 0
        for w in thu.cut(s): # 对分词结果进行处理
            w = w[0]
            if w in word_vec:
                vec = word_vec[w]
                count += 1
                sentense_vec.append(vec)
                
        # 全部0向量表示EOF
        # sentense_vec.append(np.zeros(300))
        
        # 至少命中3个词
        if count >= 3:
#             f.write(y + '\t' + ','.join(['{:.6f}'.format(num) for num in list(vec)]) + '\n')
            f.write(y + '\t' + '@@'.join([json.dumps(list(w)) for w in sentense_vec]))
            
            

加载词向量中 ...
加载词完成！一共 149971个词
Counter({'0': 3722, '3': 2565, '1': 2326, '2': 2247, '4': 1108})


## ~ ⬆️准备训练数据 ⬇️开始训练

机器学习算法包括：KNN、LR、随机森林、决策树、GBDT、SVM

In [None]:
from sklearn.neighbors import KNeighborsClassifier  
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.linear_model import LogisticRegression  
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC


from sklearn.externals import joblib

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

# Multinomial Naive Bayes Classifier  
def naive_bayes_classifier(train_x, train_y):  
    from sklearn.naive_bayes import MultinomialNB  
    model = MultinomialNB(alpha=0.01)  
    model.fit(train_x, train_y)  
    return model  
  
  
# KNN Classifier  
def knn_classifier(train_x, train_y):  
    from sklearn.neighbors import KNeighborsClassifier  
    model = KNeighborsClassifier()  
    model.fit(train_x, train_y)  
    return model  
  
  
# Logistic Regression Classifier  
def logistic_regression_classifier(train_x, train_y):  
    from sklearn.linear_model import LogisticRegression  
    model = LogisticRegression(penalty='l2')  
    model.fit(train_x, train_y)  
    return model  
  
  
# Random Forest Classifier  
def random_forest_classifier(train_x, train_y):  
    from sklearn.ensemble import RandomForestClassifier  
    model = RandomForestClassifier(n_estimators=8)  
    model.fit(train_x, train_y)  
    return model  
  
  
# Decision Tree Classifier  
def decision_tree_classifier(train_x, train_y):  
    from sklearn import tree  
    model = tree.DecisionTreeClassifier()  
    model.fit(train_x, train_y)  
    return model  
  
  
# GBDT(Gradient Boosting Decision Tree) Classifier  
def gradient_boosting_classifier(train_x, train_y):  
    from sklearn.ensemble import GradientBoostingClassifier  
    model = GradientBoostingClassifier(n_estimators=200)  
    model.fit(train_x, train_y)  
    return model  
  
  
# SVM Classifier  
def svm_classifier(train_x, train_y):  
    from sklearn.svm import SVC  
    model = SVC(kernel='rbf', probability=True)  
    model.fit(train_x, train_y)  
    return model  
  
# SVM Classifier using cross validation  
def svm_cross_validation(train_x, train_y):  
    from sklearn.grid_search import GridSearchCV  
    from sklearn.svm import SVC  
    model = SVC(kernel='rbf', probability=True)  
    param_grid = {'C': [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000], 'gamma': [0.001, 0.0001]}  
    grid_search = GridSearchCV(model, param_grid, n_jobs = 1, verbose=1)  
    grid_search.fit(train_x, train_y)  
    best_parameters = grid_search.best_estimator_.get_params()  
    for para, val in list(best_parameters.items()):  
        print(para, val)  
    model = SVC(kernel='rbf', C=best_parameters['C'], gamma=best_parameters['gamma'], probability=True)  
    model.fit(train_x, train_y)  

    
def load_train_data(in_name):
    """
    加载训练数据
    """
    X = []
    y = []
    for line in open(in_name):
        label, vec = line.strip().split('\t')
        # 高兴
        if label == '2':
            label = '1'
        # 没有情绪
        elif label == '0':
            label = '0'
        # 负面
        else:
            label = '-1'
        x = np.array([float(v) for v in vec.split(',')])
        y.append(label)
        X.append(x)
    X = np.array(X)
    y = np.array(y)
    return X, y


def train():
    """
    调参
    """
    X, y = load_train_data('data/train/train_data_ACL-20180712.txt')
    # X, y = load_train_data('data/train/train_data_one_hot-20180710.txt')
    print(X.shape, y.shape)

    # 划分数据集
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=41)

        
    # 初始化分类器
    test_classifiers = ['KNN', 'LR', 'RF', 'DT', 'GBDT']  
    classifiers = {'NB':naive_bayes_classifier,   
                  'KNN':knn_classifier,  
                   'LR':logistic_regression_classifier,  
                   'RF':random_forest_classifier,  
                   'DT':decision_tree_classifier,  
                  'SVM':svm_classifier,  
                'SVMCV':svm_cross_validation,  
                 'GBDT':gradient_boosting_classifier  
    } 
    
    for classifier in test_classifiers:  
        print('******************* {} ********************'.format(classifier))
        clf = classifiers[classifier](X_train, y_train)

        # CV
        print('accuracy of CV:', cross_val_score(clf, X, y, cv=5).mean())

#         # 执行训练
#         clf.fit(X_train, y_train)

        # 模型评估
        y_pred = []
        for i in range(len(X_test)):
            y_hat = clf.predict(X_test[i].reshape(1, -1))
            y_pred.append(y_hat[0])
        print(classification_report(y_test, y_pred))


def train_model():
    X, y = load_train_data('data/train/train_data_ACL-20180712.txt')
    clf = LogisticRegression(penalty='l2')
    print(X.shape, y.shape)
    clf.fit(X, y)
    # 保存模型
    joblib.dump(clf, "emo-LR-v1.model")

train()
# train_model()
