In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter, OrderedDict
import nltk
import re
from copy import deepcopy
random.seed(1024)

In [2]:
USE_CUDA = torch.cuda.is_available()
if USE_CUDA:
    gpus = [0]
    torch.cuda.set_device(gpus[0])

FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor

In [3]:
#将data随机排序在分成若干个size相同的batch(最后一个batch的size不确定)
def get_batch(batch_size, train_data):
    random.shuffle(train_data)      #将列表随机排序
    start_index = 0
    end_index = batch_size
    while end_index < len(train_data):
        batch = train_data[start_index: end_index]
        start_index = end_index
        end_index += batch_size
        yield batch
    if end_index >= len(train_data):
        batch = train_data[start_index:]
        yield batch

In [4]:
def pad_batch(batch):
    x, y = zip(*batch)
    max_x = max([s.size(1) for s in x])
    x_p = []
    for i in range(len(batch)):
        if x[i].size(1) < max_x:
#             [x[i], Variable(LongTensor([word2index['<PAD>']] * (max_x - x[i].size(1)))).view(1, -1)]
            x_p.append(torch.cat([x[i], Variable(LongTensor([word2index['<PAD>']] * (max_x - x[i].size(1)))).view(1, -1)], 1))
        else:
            x_p.append(x[i])
    return torch.cat(x_p), torch.cat(y).view(-1)

In [5]:
def prepare_sequence(seq, to_index):
    idxs = list(map(lambda w: to_index[w] if to_index[w] is not None else to_index['<UNK>'], seq))
    return Variable(LongTensor(idxs))

In [6]:
#Data load & Preprocessing      http://cogcomp.org/Data/QA/QC/
data = open('./train_5500.label.txt', 'r', encoding='latin-1').readlines()
data = [[d.split(':')[1][:], d.split(':')[0]] for d in data]
X, y = list(zip(*data))  #将每行的标记和问题分离
X = list(X)
for i, x in enumerate(X):
    X[i] = re.sub('\d', '#', x).split()     #将问题中的数字用#代替，换行符去掉，并将句子的单词划开

In [7]:
#构造字典
flatten = lambda l: [item for sublist in l for item in sublist]
vocab = list(set(flatten(X)))   #将X降维去重后转成list

word2index={'<PAD>': 0, '<UNK>': 1}
for v in vocab:
    if word2index.get(v) is None:
        word2index[v] = len(word2index)
index2word = {v:k for k, v in word2index.items()}

target2index = {}
for cl in set(y):
    if target2index.get(cl) is None:
        target2index[cl] = len(target2index)
index2target = {v:k for k, v in target2index.items()}

In [12]:
#划分数据集
X_p, y_p = [], []
# print(X[:3], y[:3])
for pair in zip(X, y):
    X_p.append(prepare_sequence(pair[0], word2index).view(1, -1))
    y_p.append(Variable(LongTensor([target2index[pair[1]]])).view(1, -1))
data_p = list(zip(X_p, y_p))
random.shuffle(data_p)   #改变顺序

#划分训练集和测试集
train_data = data_p[: int(len(data_p) * 0.9)]
test_data = data_p[int(len(data_p) * 0.9):]

[['manner', 'How', 'did', 'serfdom', 'develop', 'in', 'and', 'then', 'leave', 'Russia', '?'], ['cremat', 'What', 'films', 'featured', 'the', 'character', 'Popeye', 'Doyle', '?'], ['manner', 'How', 'can', 'I', 'find', 'a', 'list', 'of', 'celebrities', "'", 'real', 'names', '?']] ('DESC', 'ENTY', 'DESC')


In [13]:
#Load Pretrained word vector   https://github.com/mmihaltz/word2vec-GoogleNews-vectors
# import gensim
# model = gensim.models.KeyedVectors.load_word2vec_format('../dataset/GoogleNews-vectors-negative300.bin', binary=True)
# pretrained = []
# for key in word2index.keys():
#     try:
#         pretrained.append(model[word2index[key]])
#     except:
#         pretrained.append(np.random.randn(300))
# pretrained_vectors = np.vstack(pretrained)

In [14]:
#Model
class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, output_size, kernel_dim=100, kernel_sizes=(3, 4, 5), dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.convs = nn.ModuleList(nn.Conv2d(1, kernel_dim, (k, embedding_dim)) for k in kernel_sizes)
        self.dropout = nn.Dropout(dropout)
        self.decoder = nn.Linear(len(kernel_sizes) * kernel_dim, output_size)
#         self.pool = F.max_pool1d()

    def init_wight(self):
        pass

    def forward(self, inputs, is_training=False):
        inputs = self.embedding(inputs).unsqueeze(1)   #增加一个维度
        inputs = [F.relu(conv(inputs)).squeeze(3) for conv in self.convs]
        inputs = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in inputs]
        concated = torch.cat(inputs, 1)

        if is_training:
            concated = self.dropout(concated)
        out = self.decoder(concated)
        return out

In [15]:
#Train
EPOCH = 5
BATCH_SIZE = 50
KERNEL_SIZES = [3,4,5]
KERNEL_DIM = 100

model = CNN(len(word2index), 300, len(target2index), KERNEL_DIM, KERNEL_SIZES)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr = 0.001)

for epoch in range(EPOCH):
    losses = []
    for i, batch in enumerate(get_batch(BATCH_SIZE, train_data)):
        inputs, targets = pad_batch(batch)
        optimizer.zero_grad()
        
        preds = model(inputs, True)
        loss = criterion(preds, targets)
        losses.append(loss.detach().numpy())
        loss.backward()
        optimizer.step()

        if i % 100 == 0:
            print("[%d/%d] mean_loss: %0.2f" %(epoch, EPOCH, np.mean(losses)))
            losses = []

[0/5] mean_loss: 1.93
[1/5] mean_loss: 0.21
[2/5] mean_loss: 0.05
[3/5] mean_loss: 0.01
[4/5] mean_loss: 0.02


In [17]:
#Test
acc = 0
inputs, targets = pad_batch(test_data)

predicts = model(inputs).data.max(1, keepdim=True)[1]
predicts = predicts.view(-1)
targets = targets.view(-1)
for pred, t in zip(predicts, targets):
    if pred == t:
        acc += 1
    
print(acc/len(test_data) * 100)

97.98534798534799


In [18]:
'''完成'''

'完成'