In [191]:
import jieba
import os
import numpy as np
import pandas as pd
import pickle as pkl

import time
from datetime import timedelta


from tqdm.auto import tqdm, trange
from collections import Counter
import random

from nltk.corpus import stopwords  # import the stopwords
from nltk.tokenize import RegexpTokenizer

from torch.utils.tensorboard import SummaryWriter


import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [140]:
class Corpus:
    """Tokenised text corpus with a vocabulary, embedding matrix and padded dataset.

    Typical usage is ``load_data`` -> ``load_pre_trained_embedding`` -> ``gen_dataset``.

    Attributes:
        comment_list: one list of tokens per input text (unpadded).
        word_to_index / index_to_word: vocabulary mappings; the two special
            tokens ``<UNK>`` and ``<PAD>`` occupy the last two indices.
        word_counts: Counter of token frequencies over ``comment_list``.
        word_embedd: ``(vocab_size, dim)`` numpy array, or None until
            ``load_pre_trained_embedding`` has been called.
        result: list of ``(token_id_array, label_array)`` pairs built by
            ``gen_dataset``.
    """

    UNK_TOKEN = '<UNK>'
    PAD_TOKEN = '<PAD>'

    def __init__(self):
        self.comment_list = []

        self.word_to_index = {}  # word to unique-id
        self.index_to_word = {}  # unique-id to word

        # How many times each word occurs in our data after filtering
        self.word_counts = Counter()
        # Filled by load_pre_trained_embedding(); None until then.
        self.word_embedd = None
        self.result = []
        self.data_train = []
        self.x_input = []

    def load_data(self, data_arr):
        """Segment every text with jieba and (re)build the vocabulary.

        Args:
            data_arr: iterable of raw text strings; each one is passed to
                ``jieba.lcut``.

        Tokens for which ``str.isalpha()`` is false (punctuation, digits,
        whitespace, ...) are dropped. The vocabulary is rebuilt from all
        comments loaded so far, so the method may be called more than once.
        """
        # 1: Use jieba to do word segmentation (jieba segments and tokenises).
        for text in data_arr:
            # isalpha must be *called*; the bare attribute is always truthy.
            words = [w for w in jieba.lcut(text) if w.isalpha()]
            self.comment_list.append(words)

        # Count incrementally instead of concatenating lists (which is quadratic).
        self.word_counts = Counter()
        for words in self.comment_list:
            self.word_counts.update(words)

        # 2: Create the word <-> id mappings in first-occurrence order.
        self.word_to_index = {}
        self.index_to_word = {}
        for i, word in enumerate(self.word_counts.keys()):
            self.word_to_index[word] = i
            self.index_to_word[i] = word

        # Compute the special-token ids once so both mappings agree.
        unk_id = len(self.word_to_index)
        pad_id = unk_id + 1
        self.word_to_index[self.UNK_TOKEN] = unk_id
        self.word_to_index[self.PAD_TOKEN] = pad_id
        self.index_to_word[unk_id] = self.UNK_TOKEN
        self.index_to_word[pad_id] = self.PAD_TOKEN

    def load_pre_trained_embedding(self, file_path, dimmension):
        """Fill ``word_embedd`` from a text embedding file and save it to disk.

        Each line is expected to be a token followed by space-separated
        floats. Rows of words that are not found in the file keep their random
        initialisation (assign zeros instead to see which words have no
        pretrained embedding).

        Args:
            file_path: path of the UTF-8 text embedding file.
            dimmension: number of values per vector.

        Raises:
            ValueError: if a vocabulary word's line has fewer than
                ``dimmension`` values or a value is not a float.

        Side effects:
            Writes ``pretrained_embedd.npz`` (key ``embeddings``) in the
            current working directory.
        """
        self.word_embedd = np.random.rand(len(self.word_to_index), dimmension)

        # Stream the file line by line; the context manager closes it on errors too.
        with open(file_path, "r", encoding='UTF-8') as f:
            for line_no, line in enumerate(f):
                vector = line.strip().split(" ")
                if line_no == 0 and len(vector) == 2:
                    # Presumably the "<vocab_size> <dim>" header of word2vec-style files.
                    continue
                index = self.word_to_index.get(vector[0])
                if index is None:
                    continue
                values = vector[1:dimmension + 1]  # extract the pretrained embedding
                if len(values) != dimmension:
                    raise ValueError(
                        "line %d of %s has %d values, expected %d"
                        % (line_no + 1, file_path, len(values), dimmension)
                    )
                self.word_embedd[index] = np.asarray(
                    [float(x) for x in values], dtype='float32'
                )
        np.savez_compressed('pretrained_embedd', embeddings=self.word_embedd)

    def gen_dataset(self, label_arr, pad_size=15):
        """Build ``self.result`` as fixed-length id sequences paired with labels.

        Every comment is truncated or right-padded with ``<PAD>`` to exactly
        ``pad_size`` tokens. ``self.comment_list`` is left untouched and
        ``self.result`` is rebuilt from scratch, so the method can be re-run
        (e.g. with another ``pad_size``) without duplicating samples.

        Args:
            label_arr: indexable of labels, one per comment, convertible
                with ``int()``.
            pad_size: length of every generated id sequence.
        """
        lookup = self.word_to_index
        # the format is [([1,2,3],2), ([2,3,4],0),,,,]
        dataset = []
        for c, comment in enumerate(self.comment_list):
            # Build a new list: never pad the stored comment in place.
            tokens = list(comment[:pad_size])
            tokens.extend([self.PAD_TOKEN] * (pad_size - len(tokens)))
            # convert word to id; unknown words map to <UNK>
            comment_id = [
                lookup[w] if w in lookup else lookup[self.UNK_TOKEN]
                for w in tokens
            ]
            dataset.append((np.array(comment_id), np.array(int(label_arr[c]))))
        self.result = dataset

In [174]:
# Load the raw training array: column 0 is handed to Corpus.load_data as the
# texts and column 1 is used as the labels.
# NOTE: allow_pickle=True unpickles arbitrary objects; only load trusted files.
data_pre = np.load("clf_train_2.npy", allow_pickle=True)
label_pre = data_pre[:, 1]

In [175]:
# Segment the texts (first column) and build the vocabulary.
corpus = Corpus()
corpus.load_data(data_pre[:, 0])

In [176]:
# Fill the embedding matrix from the pretrained vectors, then turn every
# comment into a fixed-length id sequence paired with its label.
corpus.load_pre_trained_embedding(file_path='data/sgns.sogou.char', dimmension=300)
corpus.gen_dataset(label_arr=label_pre)

In [177]:
# Split the generated samples into a training and a held-out part.
SPLIT_SEED = 42      # fixed seed so the split is reproducible across re-runs
TRAIN_SIZE = 20000   # number of samples used for training; the rest is held out

# Shuffle a copy: shuffling corpus.result itself would reorder the corpus in
# place, and an unseeded re-run would move samples between train and test.
whole_data = list(corpus.result)
random.Random(SPLIT_SEED).shuffle(whole_data)
train_data = whole_data[:TRAIN_SIZE]
test_data = whole_data[TRAIN_SIZE:]

In [188]:
# Wrap both splits in DataLoaders that yield shuffled mini-batches of 50 samples.
train = torch.utils.data.DataLoader(train_data, batch_size=50, shuffle=True)
test = torch.utils.data.DataLoader(test_data, batch_size=50, shuffle=True)

In [131]:
class TextCNN(nn.Module):
    """Convolutional sentence classifier with kernel heights 3, 4 and 5.

    Each convolution spans the full embedding width, is followed by ReLU and
    a max-pool over all positions, and the pooled features of the three
    branches are concatenated, passed through dropout and projected onto two
    output scores.

    Args:
        embedding_dimen: width of each token embedding.
        sentence_len: number of tokens per input; the pooling windows are
            sized from it, so inputs must have exactly this length.
        num_filters: output channels of each convolution.
        dropout: dropout probability applied before the linear layer.
    """

    def __init__(self, embedding_dimen, sentence_len, num_filters, dropout):
        super().__init__()
        kernel_heights = (3, 4, 5)
        # Register all convolutions first, then all pools, to keep the
        # module/parameter order conv3, conv4, conv5, Max3_pool, ...
        for height in kernel_heights:
            setattr(self, "conv%d" % height,
                    nn.Conv2d(1, num_filters, (height, embedding_dimen)))
        for height in kernel_heights:
            # The window covers every position the convolution produces.
            setattr(self, "Max%d_pool" % height,
                    nn.MaxPool2d((sentence_len - height + 1, 1)))
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(3 * num_filters, 2)

    def forward(self, x):
        """Return scores of shape ``(batch, 2)`` for embedded sentences.

        ``x`` is expected as ``(batch, sentence_len, embedding_dimen)``; a
        channel axis is inserted before the convolutions.
        """
        n_samples = x.shape[0]
        images = x.unsqueeze(1)

        # Convolution + ReLU + pooling, one branch per kernel height
        branches = (
            (self.conv3, self.Max3_pool),
            (self.conv4, self.Max4_pool),
            (self.conv5, self.Max5_pool),
        )
        pooled = [pool(F.relu(conv(images))) for conv, pool in branches]

        # capture and concatenate the features
        features = torch.cat(pooled, -1).view(n_samples, 1, -1)
        features = self.dropout(features)

        # project the features to the labels
        scores = self.fc(features)
        return scores.view(-1, 2)  # 2 is the number of the label


In [200]:
# Hyper-parameters
learning_rate = 0.0001
num_epoch = 10
LOG_EVERY = 1000  # number of batches averaged into each reported loss value

model = TextCNN(300, 15, 10, 0.5)  # embedding, sentence len, num_filter, dropout
# The embedding layer is kept outside the model and is not handed to the
# optimizer, so the pretrained vectors are used as fixed features.
weight = torch.FloatTensor(corpus.word_embedd)
embeds = nn.Embedding.from_pretrained(weight)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

writer = SummaryWriter()  # tensorboard summary writer

count = 0        # batches processed so far
loss_sum = 0.0   # running sum of batch losses since the last report

model.train()  # make sure dropout is active even if eval() was called earlier
for epoch in tqdm(range(num_epoch)):
    for data, label in train:

        input_data = embeds(data)
        out = model(input_data)
        loss = criterion(out, label)

        # .item() detaches the value; summing the tensor itself would keep
        # every batch's autograd graph alive until the next reset.
        loss_sum += loss.item()
        count += 1

        if count % LOG_EVERY == 0:
            # Print and log the same average (the log used to divide by 10000).
            mean_loss = loss_sum / LOG_EVERY
            print("epoch", epoch, end='  ')
            print("The loss is: %.5f" % mean_loss)
            writer.add_scalar("Loss/train", mean_loss, count+1)

            loss_sum = 0.0

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

writer.close()  # flush pending events to disk


  0%|          | 0/10 [00:00<?, ?it/s]

epoch 2  The loss is: 0.33983
epoch 4  The loss is: 0.30825
epoch 7  The loss is: 0.29362
epoch 9  The loss is: 0.28416


In [34]:
# Float32 copy of the corpus embedding matrix (the training cell builds the same tensor).
weight = torch.tensor(corpus.word_embedd, dtype=torch.float32)

In [35]:
# Frozen lookup table over the pretrained vectors (freeze=True is the default).
embeds = nn.Embedding.from_pretrained(weight, freeze=True)

In [None]:
# Evaluate the trained classifier on the held-out split.
model.eval()  # eval mode: disables dropout so predictions are deterministic
with torch.no_grad():
    # Plain Python ints: avoids tensor reprs in the printout and lets us
    # guard the divisions below.
    correct = 0
    total = 0
    TP = 0
    TN = 0
    FP = 0
    FN = 0

    # Use the `test` loader; iterating `train` would report training-set metrics.
    for texts, labels in test:
        outputs = model(embeds(texts))
        _, predicted = torch.max(outputs, 1)  # the location of the max outputs
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        TP += ((predicted == 1) & (labels == 1)).sum().item()
        TN += ((predicted == 0) & (labels == 0)).sum().item()
        FN += ((predicted == 0) & (labels == 1)).sum().item()
        FP += ((predicted == 1) & (labels == 0)).sum().item()

    # Report 0.0 instead of failing when a denominator is empty
    # (no samples, no positive predictions, or no positive labels).
    accuracy = correct / total if total else 0.0
    precision = TP / (TP + FP) if (TP + FP) else 0.0
    recall = TP / (TP + FN) if (TP + FN) else 0.0
    F1 = 2 * recall * precision / (recall + precision) if (recall + precision) else 0.0
    print('Accuracy: {} %'.format(100 * accuracy))
    print('Precision: {} %'.format(100 * precision))
    print('Recall: {} %'.format(100 * recall))
    print('F1 Score: {} %'.format(100 * F1))