In [16]:
import logging
import sys
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from gensim.models import Word2Vec
from sklearn.metrics import classification_report
from tensorboardX import SummaryWriter
from torch import autograd, optim
from tqdm import tqdm
from visdom import Visdom

from tweet_process import TwPro

In [17]:
class Config:
    def __init__(self):
        self.train_file = "data/2019-02-12/train.txt"
        self.batch_size = 128
        self.embedding_size = 50 # word embedding

        self.learning_rate = 0.001
        self.window_size = 3
        # self.num_classes = 2

        self.num_epochs = 10
        self.train_steps = None

        self.summary_interval = 10
        
config = Config()

In [18]:
from torch.utils.data import Dataset, DataLoader
from collections import Counter

num_c = 4

class MYDataset(Dataset):
    
    def __init__(self):
        self._wv1 = None
        self._wv2 = None
        self.X, self.y = self._load() 

        if num_c == 2:
            print(num_c, "...")
            tmp_X = []
            tmp_y = []
            for x, y in zip(self.X, self.y):
                if y == 0:
                    tmp_X.append(x)
                    tmp_y.append(0)
                else:
                    tmp_X.append(x)
                    tmp_y.append(1)
            self.X = np.array(tmp_X)
            self.y = np.array(tmp_y)
            
        elif num_c == 4:
            print(num_c, "...")
            tmp_X = []
            tmp_y = []
            for x, y in zip(self.X, self.y):
                if y == 0:
                    continue
                else:
                    tmp_X.append(x)
                    tmp_y.append(y - 1)
            self.X = np.array(tmp_X)
            self.y = np.array(tmp_y)
                    
        
        # 选择是否采样
        # self.sample()

    def __getitem__(self, index):
        return self.X[index], self.y[index]
    
    def __len__(self):
        return len(self.y)
    
    def _weight(self):
        return Counter(self.y)
        
    def read_wv1(self):
        print("Loading wv1 ...")
        return Word2Vec.load("model/guba_word2vec.model")

    def read_wv2(self):
        """
        加载ACL2018词向量
        """
        word_vec = {}
        print('Loading wv2 ...')
        for i, line in enumerate(open('model/sgns.financial.word')):
            if i > 500000:
                break
            words = line.strip().split(' ')
            word = words[0]
            word_vec[word] = np.array([float(num) for num in words[1:]])
    #         except UnicodeDecodeError:
    #             print("编码问题，行 {}".format(i))
        print('Loaded! There are {} words.'.format(len(word_vec)))
        return word_vec

    def wv1(self, words):
        v = np.zeros(config.embedding_size * 300).reshape(config.embedding_size, 300)
        _index = 0
        for w in words:
            if _index >= config.embedding_size:
                break
            if w in self._wv1.wv:
                v[_index] = self._wv1.wv[w]
                _index += 1
        return v

    def wv2(self, words):
        v = np.zeros(config.embedding_size * 300).reshape(config.embedding_size, 300)
        _index = 0
        for w in words:
            if _index >= config.embedding_size:
                break
            if w in self._wv2:
                v[_index] = self._wv2[w]
                _index += 1
        return v

    
    def sample(self):
        X, y = self.X, self.y
        X = X.reshape(-1, 2 * 100 * 300)
        ros = RandomOverSampler(random_state=13)
        X_resampled, y_resampled = ros.fit_sample(X, y)
        X_resampled = X.reshape(-1, 2, 100, 300)
        self.X, self.y = X_resampled, y_resampled
        
    def _save(self):
        
        if not self._wv1:
            self._wv1 = self.read_wv1()
        if not self._wv2:
            self._wv2 = self.read_wv2()

        X = []; y = []
        for line in open(config.train_file):
            try:
                label, sentence = line.strip().split("\t")
            except ValueError:
                print("Error line:", line)
                continue

            # 5- 分类
            label = label.strip()
            if label == "-":
                label = 0
            elif label == "x":
                continue
            else:
                label = int(label)
            y.append(label)
            
            words = sentence.split()
            X.append(np.array([self.wv1(words), self.wv2(words)]))
            
        np.save("data/X.npy", np.array(X))
        np.save("data/y.npy", np.array(y))
        
    def _load(self):
        if not Path("data/X.npy").exists():
            self._save()
        X = np.load("data/X.npy")
        y = np.load("data/y.npy")
        return X, y

In [19]:
my_data = MYDataset()
len(my_data)

Loading wv1 ...
Loading wv2 ...
Loaded! There are 467266 words.
4 ...


9603

In [20]:
rst = my_data._weight()
sample_weight = np.array([1 / rst[i] for i in range(num_c)])
sample_weight = torch.from_numpy(sample_weight).float()
# sampler = torch.utils.data.sampler.WeightedRandomSampler(sample_weight, num_samples=len(my_data))
sample_weight

tensor([0.0004, 0.0004, 0.0004, 0.0006])

In [25]:
test_split = 0.9

# dataset_size = len(my_data)
# indices = list(range(dataset_size))
# split = int(np.floor(validation_split * dataset_size))
# np.random.seed(13)
# np.random.shuffle(indices)
# train_indices, val_indices = indices[:split], indices[split:]
# train_data = my_data[train_indices]
# val_data = my_data[val_indices]

train_size = int(test_split * len(my_data))
test_size = len(my_data) - train_size
train_data, test_data = torch.utils.data.random_split(my_data, [train_size, test_size])

# train_loader = torch.utils.data.DataLoader(train_data, batch_size=config.batch_size, 
#                                            sampler=sampler)
# test_loader = torch.utils.data.DataLoader(test_data, batch_size=1000,
#                                                 sampler=sampler)

train_loader = torch.utils.data.DataLoader(train_data, batch_size=config.batch_size)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=len(test_data))

In [26]:
class CNNClassifier(nn.Module):

    def __init__(self):
        super(CNNClassifier, self).__init__()
        # 2 in- channels, 32 out- channels, 3 * 300 windows size
        self.conv = torch.nn.Conv2d(2, 64, kernel_size=(3, 300), groups=2)
        
        self.f1 = nn.Linear(32 * (config.embedding_size - 2), 128)
        self.f2 = nn.Linear(256, 128)
        self.f3 = nn.Linear(128, 64)
        self.f4 = nn.Linear(64, 32)
        self.f5 = nn.Linear(32, num_c)

    def forward(self, x):
        # print(x.size())
        x = x.float()
        
        out = self.conv(x)
        out = F.relu(out)
        # print(out.size())
        out = torch.squeeze(out)
        # print(out.size())
        out = F.max_pool1d(out, 2)
        # print(out.size())
        out = out.view(-1, 32 * (config.embedding_size - 2))
        # print(out.size())
        out = F.relu(out)

        out = F.relu(self.f1(out))
        # print(out.size())
#         out = F.relu(self.f2(out))
        out = F.relu(self.f3(out))
        out = F.relu(self.f4(out))
        out = F.relu(self.f5(out))
        
        # print(out.size())

        if num_c == 2:
            probs = F.log_softmax(out, dim=1)
        else:
            probs = out
            
        # print(probs, probs.size())
        # print(probs.size())
        classes = torch.max(probs, -1)[1]

        return probs, classes


In [27]:
def train(model):
    viz = Visdom()
    step_loss = []
    step_precision = []

    if num_c == 2:
        loss_function = nn.NLLLoss(sample_weight)
    else:
        # loss_function = nn.CrossEntropyLoss(sample_weight)
        loss_function = nn.CrossEntropyLoss(sample_weight)
        
    optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)

    writer = SummaryWriter(log_dir="log")

    epoch = 0
    step = 0

    for epoch in range(1, config.num_epochs + 1):
        print("================ Epoch: {} ================".format(epoch))
        running_losses = []

        for sequences, labels in train_loader:
            # Predict
            # model.zero_grad()
            probs, classes = model(sequences)

            # Backpropagation
            optimizer.zero_grad()
            losses = loss_function(probs, labels)
            losses.backward()
            optimizer.step()

            # Log summary
            running_losses.append(losses.data.item())
            if step % config.summary_interval == 0:
                loss = sum(running_losses) / len(running_losses)
                # writer.add_scalar("train/loss", loss, step)
                step_loss.append([step, loss])
                print("step = {}, loss = {:.4f}".format(step, loss))
                running_losses = []

            step += 1

        # Classification report
        for vali in test_loader:
            X_test = vali[0]
            y_test = vali[1]
            probs, y_pred = model(X_test)
            # print(y_pred)
            if num_c == 5:
                target_names = ['non-', 'anger', "joy", "sadness", "fear"]
            elif num_c == 4:
                target_names = ['anger', "joy", "sadness", "fear"]
            elif num_c == 2:
                target_names = ['robot', "human"]
            # logging.info("{}".format(classification_report(y_test, y_pred, target_names=target_names)))
            report = classification_report(y_test, y_pred, target_names=target_names, output_dict=True)
            print(classification_report(y_test, y_pred, target_names=target_names))
            step_precision.append([step, report["micro avg"]["precision"]])
            break

        epoch += 1

    # Save
    # torch.save(model, "model/shit{}.pkl".format(epoch))
    

    step_loss = np.array(step_loss)
    step_precision = np.array(step_precision)
    viz.line(X=step_loss[:, 0], Y=step_loss[:, 1])
    viz.line(X=step_precision[:, 0], Y=step_precision[:, 1])

In [28]:
model = CNNClassifier()
print(model)
train(model)



CNNClassifier(
  (conv): Conv2d(2, 64, kernel_size=(3, 300), stride=(1, 1), groups=2)
  (f1): Linear(in_features=1536, out_features=128, bias=True)
  (f2): Linear(in_features=256, out_features=128, bias=True)
  (f3): Linear(in_features=128, out_features=64, bias=True)
  (f4): Linear(in_features=64, out_features=32, bias=True)
  (f5): Linear(in_features=32, out_features=4, bias=True)
)
step = 0, loss = 1.4065
step = 10, loss = 1.3782
step = 20, loss = 1.3449
step = 30, loss = 1.2883
step = 40, loss = 1.1895
step = 50, loss = 1.1214
step = 60, loss = 1.0754
              precision    recall  f1-score   support

       anger       0.43      0.71      0.53       234
         joy       0.67      0.56      0.61       254
     sadness       0.48      0.19      0.27       294
        fear       0.54      0.72      0.62       179

   micro avg       0.52      0.52      0.52       961
   macro avg       0.53      0.55      0.51       961
weighted avg       0.53      0.52      0.49       961

ste