In [102]:
import logging
import sys
import os
import string
from pathlib import Path
from zhon import hanzi

import numpy as np
import torch
from torch import autograd, optim
import torch.nn as nn
import torch.nn.functional as F

from gensim.models import Word2Vec
from sklearn.metrics import classification_report
# from tensorboardX import SummaryWriter

from tqdm import tqdm_notebook as tqdm
from visdom import Visdom

from tweet_process import TwPro

In [103]:
# 计算字的信息熵

def Info_gain_of_term(v_ci, v_ci_t, v_ci_non_t, pr_t):
    """
    计算信息增益，需要每类的概率，句子出现t是Ci类的概率，不出现t是Ci的概率，存在t的概率
    """
    def info_entropy(p):
        if p <= 0:
            return 0
        else:
            return -p * np.log(p)

    gain = 0
    for i in range(len(v_ci)):
        gain = gain + \
            (info_entropy(v_ci[i]) - pr_t * info_entropy(v_ci_t[i]) - (1 - pr_t) * info_entropy(v_ci_non_t[i]))
    return gain


def get_word_freq():
    """
    统计高频字
    """
    words_freq = {}
    words_ci = {} # 出现某个词，是某类的概率，此问题有五类
    class_num = 4
    labels_num = np.zeros(class_num)

    for line in tqdm(open(config.train_file)):
        label, sentence = line.strip().split("\t")
        label = label.strip()
        if label == "-":
            label = 0
        elif label == "x":
            continue
        else:
            label = int(label)
                
        # 四分类
        if class_num == 4:
            if label == 0:
                continue
            label -= 1
            
            labels_num[label] += 1
            
        sentence = sentence.replace(" ", "")
        
        for char in sentence:
            if char not in string.punctuation and char not in string.digits and \
                char not in hanzi.punctuation and char not in string.ascii_letters:
                
                if char in words_freq:
                    words_freq[char] += 1
                    words_ci[char][label] += 1
                else:
                    words_freq[char] = 1
                    words_ci[char] = np.zeros(class_num)
                    words_ci[char][label] += 1
                    

    # 数量转概率
    num2pro = lambda nums: [num / sum(nums) for num in nums]

    # 每类上的概率
    v_ci = num2pro(labels_num)

    word_gain = {}
    for w in words_ci.keys():
        word_ci = words_ci[w]
        v_ci_t = num2pro(word_ci) # 句子出现t是Ci类的概率
        non_word_ci = [labels_num[i] - word_ci[i] for i in range(class_num)] # 不是t时候的各类数量
        v_ci_non_t = num2pro(non_word_ci) # 句子不出现t是Ci的概率
        pr_t = words_freq[w] / sum(labels_num) # 存在t的概率
        Gt = Info_gain_of_term(v_ci, v_ci_t, v_ci_non_t, pr_t)
        word_gain[w] = Gt


    word_gain = sorted(word_gain.items(), key=lambda d: d[1], reverse=True) # 根据信息增益排序
    with open('data/char_gain_freq.txt', 'w') as f:
        for w, gain in word_gain:
            if words_freq[w] >= 10:
                print(w, gain, words_freq[w], sep='\t', file=f)
                
get_word_freq()

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [104]:
class Config:
    def __init__(self):
        self.train_file = "data/2019-02-12/train.txt"
        self.batch_size = 128
        self.embedding_size = 64 # word embedding
        self.channel = 3
        self.learning_rate = 0.001
        self.window_size = 3
        self.num_classes = 4

        self.num_epochs = 20
        self.summary_interval = 20
        
config = Config()

In [105]:
from torch.utils.data import Dataset, DataLoader
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

class MYDataset(Dataset):
    
    def __init__(self):
        self._wv1 = None
        self._wv2 = None
        self._wv3 = self.read_wv3()
        
        if not Path("data/y.npy").exists():
            self._calc()
        else:
            self._load()
            return

        # 根据分类目标，调整X和y
        num_c = config.num_classes
        
        if num_c == 2:
            print(num_c, "...")
            tmp_X = []
            tmp_y = []
            for x, y in zip(self.X, self.y):
                if y == 0:
                    tmp_X.append(x)
                    tmp_y.append(0)
                else:
                    tmp_X.append(x)
                    tmp_y.append(1)
            self.X = np.array(tmp_X)
            self.y = np.array(tmp_y)
            
        elif num_c == 4:
            print(num_c, "...")
            tmp_X = []
            tmp_y = []
            for x, y in zip(self.X, self.y):
                if y == 0:
                    continue
                else:
                    tmp_X.append(x)
                    tmp_y.append(y - 1)
            self.X = np.array(tmp_X)
            self.y = np.array(tmp_y)
            
        self._save()
                    
        
        # 选择是否采样
        # self.sample()

    def __getitem__(self, index):
        return self.X[index], self.y[index]
    
    def __len__(self):
        return len(self.y)
    
    def _weight(self):
        return Counter(self.y)
        
    def read_wv1(self):
        print("Loading wv1 ...")
        return Word2Vec.load("model/guba_word2vec.model")

    def read_wv2(self):
        """
        加载ACL2018词向量
        """
        word_vec = {}
        print('Loading wv2 ...')
        for i, line in enumerate(open('model/sgns.financial.word')):
            if i > 200000:
                break
            words = line.strip().split(' ')
            word = words[0]
            word_vec[word] = np.array([float(num) for num in words[1:]])
    #         except UnicodeDecodeError:
    #             print("编码问题，行 {}".format(i))
        print('Loaded! There are {} words.'.format(len(word_vec)))
        return word_vec

    def read_wv3(self):
        char_list = []
        for line in open("data/char_gain_freq.txt"):
            char = line.strip().split("\t")[0]
            if char not in string.punctuation and char not in string.digits and \
                char not in hanzi.punctuation and char not in string.ascii_letters:
                char_list.append(char)

        char_list = char_list[: 300]
        # print(char_list)
        count_v = CountVectorizer(vocabulary=char_list, analyzer="char", binary=True)
        count_v.fit(["hello world"])
        return count_v

    def wv1(self, words):
        v = np.zeros(config.embedding_size * 300).reshape(config.embedding_size, 300)
        _index = 0
        for w in words:
            if _index >= config.embedding_size:
                break
            if w in self._wv1.wv:
                v[_index] = self._wv1.wv[w]
                _index += 1
        return v

    def wv2(self, words):
        v = np.zeros(config.embedding_size * 300).reshape(config.embedding_size, 300)
        _index = 0
        for w in words:
            if _index >= config.embedding_size:
                break
            if w in self._wv2:
                v[_index] = self._wv2[w]
                _index += 1
        return v
    
    def wv3(self, text):
        v = np.zeros(config.embedding_size * 300).reshape(config.embedding_size, 300)
        _index = 0
        for char in text:
            if _index >= config.embedding_size:
                break
            if char in self._wv3.vocabulary_:
                pos = self._wv3.vocabulary_.get(char)
                v[_index][pos] = 1 
                _index += 1
        return v
    
    def sample(self):
        X, y = self.X, self.y
        X = X.reshape(-1, 2 * 100 * 300)
        ros = RandomOverSampler(random_state=13)
        X_resampled, y_resampled = ros.fit_sample(X, y)
        X_resampled = X.reshape(-1, 2, 100, 300)
        self.X, self.y = X_resampled, y_resampled
        
    def _calc(self):
        if not self._wv1:
            self._wv1 = self.read_wv1()
        if not self._wv2:
            self._wv2 = self.read_wv2()

        X = []
        y = []
        for line in tqdm(open(config.train_file)):
            try:
                label, sentence = line.strip().split("\t")
            except ValueError:
                print("Error line:", line)
                continue

            # 5- 分类
            label = label.strip()
            if label == "-":
                label = 0
            elif label == "x":
                continue
            else:
                label = int(label)
            y.append(label)
            
            words = sentence.split()
            
            if config.channel == 2:
                X.append(np.array([self.wv1(words), self.wv2(words)]))
            elif config.channel == 3:
                X.append(np.array([self.wv1(words), self.wv2(words), self.wv3(sentence)]))
                
        self.X, self.y = X, y
    
    def _save(self):
        print("Saving ...")
        np.save("data/X.npy", np.array(self.X))
        np.save("data/y.npy", np.array(self.y))
    
    def _load(self):
        # self.clear()
        if Path("data/y.npy").exists():
            print("Loading X and y...")
            self.X = np.load("data/X.npy")
            self.y = np.load("data/X.npy")
        else:
            print("数据文件不存在。")

In [106]:
%time my_data = MYDataset()
len(my_data)

Loading wv1 ...
Loading wv2 ...
Loaded! There are 199948 words.


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


4 ...
Saving ...
CPU times: user 44.1 s, sys: 32.1 s, total: 1min 16s
Wall time: 1min 29s


9550

In [96]:
rst = my_data._weight()
print(rst)
sample_weight = np.array([1 / rst[i] for i in range(config.num_classes)])
sample_weight = torch.from_numpy(sample_weight).float()
# sampler = torch.utils.data.sampler.WeightedRandomSampler(sample_weight, num_samples=len(my_data))
sample_weight

Counter({2: 2703, 1: 2614, 0: 2473, 3: 1760})


tensor([0.0004, 0.0004, 0.0004, 0.0006])

In [97]:
test_split = 0.1

# dataset_size = len(my_data)
# indices = list(range(dataset_size))
# split = int(np.floor(validation_split * dataset_size))
# np.random.seed(13)
# np.random.shuffle(indices)
# train_indices, val_indices = indices[:split], indices[split:]
# train_data = my_data[train_indices]
# val_data = my_data[val_indices]

# test_size = int(test_split * len(my_data))
# train_size = len(my_data) - test_size
# train_data, test_data = torch.utils.data.random_split(my_data, [train_size, test_size])

from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(my_data, test_size=test_split, random_state=21)
print(train_data[0][0].shape)
# train_loader = torch.utils.data.DataLoader(train_data, batch_size=config.batch_size, 
#                                            sampler=sampler)
# test_loader = torch.utils.data.DataLoader(test_data, batch_size=1000,
#                                                 sampler=sampler)

train_loader = torch.utils.data.DataLoader(train_data, batch_size=config.batch_size)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=len(test_data))

(3, 50, 300)


In [98]:
class CNNClassifier(nn.Module):

    def __init__(self):
        super(CNNClassifier, self).__init__()
        # 2 in- channels, 32 out- channels, 3 * 300 windows size
        self.conv = torch.nn.Conv2d(config.channel, 32 * config.channel, kernel_size=(3, 300), groups=config.channel)
        
        self.f1 = nn.Linear(16 * config.channel * (config.embedding_size - 2), 256)
        self.f2 = nn.Linear(256, 128)
        self.f3 = nn.Linear(128, 64)
        self.f4 = nn.Linear(64, 32)
        self.f5 = nn.Linear(32, config.num_classes)

    def forward(self, x):
#         print(x.size())
        x = x.float()
        out = F.relu(self.conv(x))
#         print(out.size())
        out = torch.squeeze(out)
#         print(out.size())
        out = F.max_pool1d(out, 2)
#         print(out.size())
        out = out.view(-1, 16 * config.channel * (config.embedding_size - 2))
#         print(out.size())
        out = F.relu(out)
    
    
        out = F.relu(self.f1(out))
#         print(out.size())
        out = F.relu(self.f2(out))
        out = F.relu(self.f3(out))
        out = F.relu(self.f4(out))
        out = F.relu(self.f5(out))
        
        # print(out.size())
        
        if config.num_classes == 2:
            probs = F.log_softmax(out, dim=1)
        else:
            probs = out
            
        # print(probs, probs.size())
        # print(probs.size())
        classes = torch.max(probs, -1)[1]

        return probs, classes


In [99]:
def train(model):
#     vis = Visdom()
#     lay1 = dict(title="train loss", xaxis={'title': 'step'}, yaxis={'title': 'loss'})
#     win1 = vis.line(X = np.array([0]), Y = np.array([0]), opts=lay1)
#     lay2 = dict(title="avg precision", xaxis={'title': 'epoch'}, yaxis={'title': 'precision'})
#     win2 = vis.line(X = np.array([0]), Y = np.array([0]), opts=lay2)
    
    step_loss = []
    step_precision = []

    num_c = config.num_classes
    if num_c == 2:
        loss_function = nn.NLLLoss(sample_weight)
    else:
        # loss_function = nn.CrossEntropyLoss(sample_weight)
        loss_function = nn.CrossEntropyLoss(sample_weight)
        
    # optimizer = optim.SGD(model.parameters(), lr=config.learning_rate)
    optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)

    # writer = SummaryWriter(log_dir="log")

    epoch = 0
    step = 0

    for epoch in range(1, config.num_epochs + 1):
        print("============== Epoch: {} ==============".format(epoch))
        running_losses = []

        for sequences, labels in train_loader:
            # Predict
            # model.zero_grad()
            probs, classes = model(sequences)

            # Backpropagation
            optimizer.zero_grad()
            losses = loss_function(probs, labels)
            losses.backward()
            optimizer.step()

            # Log summary
            running_losses.append(losses.data.item())
            if step % config.summary_interval == 0:
                loss = sum(running_losses) / len(running_losses)
                # writer.add_scalar("train/loss", loss, step)
                step_loss.append([step, loss])
                
#                 vis.line(X=np.array([step]), Y=np.array([loss]), update='append', win=win1)
                print("step = {}, loss = {:.4f}".format(step, loss))
            if loss < 0.1:
            
                running_losses = []
            step += 1

        # Classification report
        for vali in test_loader:
            X_test = vali[0]
            y_test = vali[1]
            probs, y_pred = model(X_test)
            # print(y_pred)
            if num_c == 5:
                target_names = ['non-', 'anger', "joy", "sadness", "fear"]
            elif num_c == 4:
                target_names = ['anger', "joy", "sadness", "fear"]
            elif num_c == 2:
                target_names = ['non-', "emotinal"]
            # logging.info("{}".format(classification_report(y_test, y_pred, target_names=target_names)))
            # report = classification_report(y_test, y_pred, target_names=target_names, output_dict=True)
            print(classification_report(y_test, y_pred, target_names=target_names))
            
#             vis.line(X=np.array([epoch]), Y=np.array([report["micro avg"]["precision"]]), update='append', win=win2)
            break
    
        if loss < 0.1:
            print("loss < 0.1")
            break

        epoch += 1

    # Save
    # torch.save(model, "model/shit{}.pkl".format(epoch))
    

#     step_loss = np.array(step_loss)
#     step_precision = np.array(step_precision)
#     viz.line(X=step_loss[:, 0], Y=step_loss[:, 1])
#     viz.line(X=step_precision[:, 0], Y=step_precision[:, 1])

In [100]:
model = CNNClassifier()
print(model)
%time train(model)

CNNClassifier(
  (conv): Conv2d(3, 96, kernel_size=(3, 300), stride=(1, 1), groups=3)
  (f1): Linear(in_features=2304, out_features=256, bias=True)
  (f2): Linear(in_features=256, out_features=128, bias=True)
  (f3): Linear(in_features=128, out_features=64, bias=True)
  (f4): Linear(in_features=64, out_features=32, bias=True)
  (f5): Linear(in_features=32, out_features=4, bias=True)
)
step = 0, loss = 1.3803
step = 20, loss = 1.3652
step = 40, loss = 1.3061
step = 60, loss = 1.2359
              precision    recall  f1-score   support

       anger       0.48      0.68      0.57       226
         joy       0.73      0.42      0.53       272
     sadness       0.38      0.40      0.39       267
        fear       0.57      0.61      0.59       190

   micro avg       0.51      0.51      0.51       955
   macro avg       0.54      0.53      0.52       955
weighted avg       0.54      0.51      0.51       955

step = 80, loss = 0.9953
step = 100, loss = 0.9823
step = 120, loss = 0.9455
 

step = 960, loss = 0.1218
step = 980, loss = 0.1633
step = 1000, loss = 0.1894
              precision    recall  f1-score   support

       anger       0.53      0.67      0.59       226
         joy       0.61      0.68      0.64       272
     sadness       0.53      0.51      0.52       267
        fear       0.73      0.45      0.56       190

   micro avg       0.58      0.58      0.58       955
   macro avg       0.60      0.57      0.58       955
weighted avg       0.59      0.58      0.58       955

step = 1020, loss = 0.1479
step = 1040, loss = 0.0915
step = 1060, loss = 0.1042
step = 1080, loss = 0.0663
              precision    recall  f1-score   support

       anger       0.49      0.73      0.59       226
         joy       0.65      0.64      0.65       272
     sadness       0.51      0.42      0.46       267
        fear       0.72      0.49      0.59       190

   micro avg       0.57      0.57      0.57       955
   macro avg       0.59      0.57      0.57       95