In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np


### 加载词向量数据

In [2]:

import pickle as pkl
vocab = pkl.load(open('./rnn_data/vocab.pkl','rb'))
len(vocab)

4762

### 加载数据集

In [5]:
from tqdm import tqdm

def load_dataset(path, pad_size=32):
    contents = []
    with open(path, 'r', encoding='UTF-8') as f:
        for line in tqdm(f):
            lin = line.strip()
            if not lin:
                continue
            content, label = lin.split('\t')
            words_line = []
            token = [x for x in content]
            seq_len = len(token)
            if pad_size:
                if len(token) < pad_size:
                    token.extend([vocab.get('<PAD>')] * (pad_size - len(token)))
                else:
                    token = token[:pad_size]
                    seq_len = pad_size
            # word to id
            for word in token:
                words_line.append(vocab.get(word, vocab.get('<UNK>')))
            contents.append((words_line, int(label), seq_len))
    return contents  # [([...], 0), ([...], 1), ...]


train_ds = load_dataset('./rnn_data/train.txt')
dev_ds = load_dataset('./rnn_data/dev.txt')
test_ds = load_dataset('./rnn_data/test.txt')

180000it [00:01, 142112.75it/s]
10000it [00:00, 120334.41it/s]
10000it [00:00, 156114.76it/s]


### 数据集迭代器

In [6]:
class DatasetIterater(object):
    def __init__(self, batches, batch_size, device):
        self.batches = batches
        self.batch_size = batch_size
        self.n_batches = len(batches) // batch_size
        self.residue = (len(batches) % batch_size == 0)
        
        self.index = 0
        self.device = device
        
    def _to_tensor(self, datas):
        x = torch.LongTensor([_[0] for _ in datas]).to(self.device)
        y = torch.LongTensor([_[1] for _ in datas]).to(self.device)
        seq_len = torch.LongTensor([len(_[0]) for _ in datas]).to(self.device)
        
        return (x, seq_len),y
    
    def __next__(self):
        if self.residue and self.index == self.n_batches:
            batches = self.batches[self.index*self.batch_size : len(self.batches)]
            self.index += 1
            batches = self._to_tensor(batches)
            return batches
        
        if self.index > self.n_batches:
            self.index = 0
            raise StopIteration
            pass
        else:
            batches = self.batches[self.index*self.batch_size: (self.index+1)*self.batch_size]
            batches = self._to_tensor(batches)
            self.index += 1
#             print('n',batches)
            return batches
    
    def __iter__(self):
        return self
    
    def __len__(self):
        return self.n_batches + (1 if self.residue else 0)

In [7]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [8]:
train_dl = DatasetIterater(train_ds, 128, device)
dev_dl = DatasetIterater(dev_ds, 128, device)
test_dl = DatasetIterater(test_ds, 128, device)


### 加载已训练词向量

In [9]:
embeding_pretrained = torch.tensor(
    np.load('./rnn_data/embedding_Tencent.npz')['embeddings'].astype('float32')
)
embeding_pretrained.shape
class_list = [x.strip() for x in open(
        './rnn_data/class.txt'
    ).readlines()]
class_list
num_classes = len(class_list)

In [10]:
print(embeding_pretrained.shape)

torch.Size([4762, 200])


### 定义模型

In [21]:
class TextCNN(nn.Module):
    def __init__(self, embd_pretrained, classes_num):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(embd_pretrained, freeze=False)
        self.convs = nn.ModuleList(
            [nn.Conv2d(1, 256, (k, embd_pretrained.size(1))) for k in (2,3,4)]
            #          (input channels, output_channels, kerner_size
        )
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(256*3, classes_num)
        
    def conv_and_pool(self, x, conv):
        x = conv(x)   # [batch_size 1, 32, vocab_size] -> [batch_size, 256, 31/30/29, 1]
        x = F.relu(x).squeeze(3)  # [batch_size, 256, 31/29/28]
        x = F.max_pool1d(x, x.size(2)).squeeze(2)  # [batch_size, 256]
        return x
        
    def forward(self, x):
        # x[1] = seq_len
        out = self.embedding(x[0]) # [batch_size , 32 , vocab_size]
        out = out.unsqueeze(1)   # [batche_size , 1, 32, vocab_size]
        out = [self.conv_and_pool(out, conv) for conv in self.convs]
        out = torch.cat(out, 1)
        out = self.dropout(out)
        out = self.fc(out)
        return out
        

### 训练网络

In [25]:
from sklearn import metrics

def train(model, train_dl, dev_dl,epoches, writer = None):
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    total_batch = 0
    for epoch in (range(epoches)):
        print('{}/{}'.format(epoch, epoches))
        
        for x, labels in train_dl:
            model.train()
            optimizer.zero_grad()  # TODO ???
            outputs = model(x)
            loss = F.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()
            
            if total_batch % 100 == 0:
                true = labels.data.cpu()
                predict = torch.max(outputs.data, 1)[1].cpu()
#                 print(true)
#                 print(predict)
                train_acc = metrics.accuracy_score(true, predict)
                print('train_acc = {:.4f}'.format(train_acc))
                dev_acc, dev_loss = evalute(model, dev_dl )
                print('dev_acc = {:.4f}, dev_loss={:.4f}'.format(dev_acc, dev_loss))
                if writer is not None:
                    writer.add_scalar('loss/train', loss.item(), total_batch)
                    writer.add_scalar('loss/dev', dev_loss, total_batch)
                    writer.add_scalar('acc/train', train_acc, total_batch)
                    writer.add_scalar('acc/dev', dev_acc, total_batch)
                
            total_batch += 1

import numpy as np
def evalute(model, data_dl, test=False):
    model.eval()
    loss_total = 0
    predict_all = np.array([], dtype = int)
    labels_all = np.array([], dtype=int)
    with torch.no_grad():
        for texts, labels in data_dl:
            outputs = model(texts)
            loss = F.cross_entropy(outputs, labels)
            loss_total += loss
            labels = labels.data.cpu()
            predict = torch.max(outputs.data.cpu(), 1)[1].cpu().numpy()
            labels_all = np.append(labels_all, labels)
            predict_all = np.append(predict_all, predict)
            
        acc = metrics.accuracy_score(labels_all, predict_all)
        
    return acc, loss_total / len(data_dl)

In [23]:
model_txtcnn = TextCNN(embeding_pretrained, len(class_list))
model_txtcnn = model_txtcnn.to(device)

In [27]:
from tensorboardX import SummaryWriter
import time
writer = SummaryWriter(log_dir='./log/' + time.strftime('%m-%d_%H.%M', time.localtime()))

In [30]:
# import pdb
# pdb.set_trace()
train(model_txtcnn, train_dl, dev_dl,10, writer)

0/10
train_acc = 0.9062
dev_acc = 0.9010, dev_loss=0.3243
train_acc = 0.8906
dev_acc = 0.9034, dev_loss=0.3345
train_acc = 0.8984
dev_acc = 0.8986, dev_loss=0.3416
train_acc = 0.9219
dev_acc = 0.9033, dev_loss=0.3355
train_acc = 0.8594
dev_acc = 0.9052, dev_loss=0.3324
train_acc = 0.9453
dev_acc = 0.9044, dev_loss=0.3316
train_acc = 0.9375
dev_acc = 0.9053, dev_loss=0.3304
train_acc = 0.9219
dev_acc = 0.9010, dev_loss=0.3339
train_acc = 0.9375
dev_acc = 0.9017, dev_loss=0.3370
train_acc = 0.9375
dev_acc = 0.9031, dev_loss=0.3288
train_acc = 0.9453
dev_acc = 0.9042, dev_loss=0.3376
train_acc = 0.9297
dev_acc = 0.9029, dev_loss=0.3377
train_acc = 0.9375
dev_acc = 0.9002, dev_loss=0.3374
train_acc = 0.9219
dev_acc = 0.9002, dev_loss=0.3326
train_acc = 0.9141
dev_acc = 0.9011, dev_loss=0.3353
1/10
train_acc = 0.8984
dev_acc = 0.9036, dev_loss=0.3265
train_acc = 0.9453
dev_acc = 0.9015, dev_loss=0.3397
train_acc = 0.9219
dev_acc = 0.9022, dev_loss=0.3371
train_acc = 0.9531
dev_acc = 0.8995,

In [33]:
print(torch.cuda.memory_summary())
torch.cuda.empty_cache()

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |   33415 KB |   76636 KB |    2083 GB |    2083 GB |
|       from large pool |   22323 KB |   59340 KB |    1865 GB |    1865 GB |
|       from small pool |   11092 KB |   22387 KB |     218 GB |     218 GB |
|---------------------------------------------------------------------------|
| Active memory         |   33415 KB |   76636 KB |    2083 GB |    2083 GB |
|       from large pool |   22323 KB |   59340 KB |    1865 GB |    1865 GB |
|       from small pool |   11092 KB |   22387 KB |     218 GB |     218 GB |
|---------------------------------------------------------------