In [6]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter, OrderedDict
import nltk
from copy import deepcopy
import os
import re
import unicodedata
flatten = lambda l: [item for sublist in l for item in sublist]

from torch.nn.utils.rnn import PackedSequence, pack_padded_sequence
random.seed(1024)

In [7]:
# CPU 환경에서 실행시
USE_CUDA = torch.cuda.is_available()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

FloatTensor = torch.FloatTensor
LongTensor = torch.LongTensor
ByteTensor = torch.ByteTensor

# train data path
path = "./data/train/[최종]졸업이수학점.txt"

In [8]:
def getBatch(batch_size, train_data):
    random.shuffle(train_data)
    sindex = 0
    eindex = batch_size
    while eindex < len(train_data):
        batch = train_data[sindex: eindex]
        temp = eindex
        eindex = eindex + batch_size
        sindex = temp
        yield batch
    
    if eindex >= len(train_data):
        batch = train_data[sindex:]
        yield batch

In [9]:
def pad_to_batch(batch, w_to_ix): # for bAbI dataset
    fact,q,a = list(zip(*batch))
    max_fact = max([len(f) for f in fact])
    max_len = max([f.size(1) for f in flatten(fact)])
    max_q = max([qq.size(1) for qq in q])
    max_a = max([aa.size(1) for aa in a])
    
    facts, fact_masks, q_p, a_p = [], [], [], []
    for i in range(len(batch)):
        fact_p_t = []
        for j in range(len(fact[i])):
            if fact[i][j].size(1) < max_len:
                fact_p_t.append(torch.cat([fact[i][j], Variable(LongTensor([w_to_ix['<PAD>']] * (max_len - fact[i][j].size(1)))).view(1, -1)], 1))
            else:
                fact_p_t.append(fact[i][j])

        while len(fact_p_t) < max_fact:
            fact_p_t.append(Variable(LongTensor([w_to_ix['<PAD>']] * max_len)).view(1, -1))

        fact_p_t = torch.cat(fact_p_t)
        facts.append(fact_p_t)
        fact_masks.append(torch.cat([Variable(ByteTensor(tuple(map(lambda s: s ==0, t.data))), volatile=False) for t in fact_p_t]).view(fact_p_t.size(0), -1))

        if q[i].size(1) < max_q:
            q_p.append(torch.cat([q[i], Variable(LongTensor([w_to_ix['<PAD>']] * (max_q - q[i].size(1)))).view(1, -1)], 1))
        else:
            q_p.append(q[i])

        if a[i].size(1) < max_a:
            a_p.append(torch.cat([a[i], Variable(LongTensor([w_to_ix['<PAD>']] * (max_a - a[i].size(1)))).view(1, -1)], 1))
        else:
            a_p.append(a[i])

    questions = torch.cat(q_p)
    answers = torch.cat(a_p)
    question_masks = torch.cat([Variable(ByteTensor(tuple(map(lambda s: s ==0, t.data))), volatile=False) for t in questions]).view(questions.size(0), -1)
    
    return facts, fact_masks, questions, question_masks, answers

In [10]:
def prepare_sequence(seq, to_index):
    idxs = list(map(lambda w: to_index[w] if to_index.get(w) is not None else to_index["<UNK>"], seq))
    return Variable(LongTensor(idxs))

In [11]:
# Data text파일을 읽어서 팩트, 질문 분할 및 저장하여 반환하는 코드
def bAbI_data_load(path):
    try:
        data = open(path, 'r' ,encoding='utf8').readlines()
    except:
        print("Such a file does not exist at %s".format(path))
        return None
    
    data = [d[:-1] for d in data]
    data_p = []
    fact = []
    qa = []
    try:
        for d in data:
            index = d.split(' ')[0]
            if index == '1':
                fact = []
                qa = []
            if '?' in d:
                temp = d.split('\t')
                q = temp[0].strip().replace('?', '').split(' ')[1:] + ['?']
                a = temp[1].split() + ['</s>']
                stemp = deepcopy(fact)
                data_p.append([stemp, q, a])
            else:
                tokens = d.replace('.', '').split(' ')[1:] + ['</s>']
                fact.append(tokens)
    except Exception as e:
        print(e)
        print("Please check the data is right")
        return None
    return data_p

In [12]:
train_data = bAbI_data_load(path)

In [13]:
train_data[0]

[[['17학번', '영어회화', '필수', '안', '들어도', '돼', '</s>'],
  ['18학번', '영어회화', '필수', '안', '들어도', '돼', '</s>'],
  ['17학번', '대학영어', '영어레벨테스트', '통과하지', '못한', '사람만', '들으면', '돼', '</s>'],
  ['18학번', '대학영어', '영어레벨테스트', '통과하지', '못한', '사람만', '들으면', '돼', '</s>'],
  ['필수교양,', '균형교양,', '기초교양', '', '들어야', '해', '</s>'],
  ['17학번', '필수교양', '', '광운인되기,', '영어,', '정보가', '있어', '</s>'],
  ['18학번', '필수교양', '', '광운인되기,', '영어,', '정보가', '있어', '</s>'],
  ['19학번', '필수교양', '', '광운인되기,', '영어,', '정보가', '있어', '</s>'],
  ['균형교양', '3학점', '과목만', '인정돼', '</s>'],
  ['19학번', '대학영어', '', '필수', '', '들어야', '돼', '</s>'],
  ['20학번', '필수교양', '', '광운인되기,', '대학영어,', '정보,', '융합적사고와글쓰기가', '있어', '</s>'],
  ['20학번', '균형교양', '', '5영역', '중', '3영역x3학점', '', '의무', '이수해야', '해', '</s>'],
  ['17학번', '졸업', '이수학점', '133학점', '야', '</s>'],
  ['18학번', '졸업', '이수학점', '133학점', '야', '</s>'],
  ['17학번', '필수+균형', '교양', '', '19~22학점', '들어야', '해', '</s>'],
  ['18학번', '필수+균형', '교양', '', '19~22학점', '들어야', '해', '</s>'],
  ['컴퓨터정보공학부', '17학번', '', '기초교양', '', '24학

# data 문장들을 쪼개서 vaca를 dict 형태로 저장하기 위한 코드

In [14]:
fact,q,a = list(zip(*train_data))

In [15]:
vocab = list(set(flatten(flatten(fact)) + flatten(q) + flatten(a)))

In [16]:
word2index={'<PAD>': 0, '<UNK>': 1, '<s>': 2, '</s>': 3}
for vo in vocab:
    if word2index.get(vo) is None:
        word2index[vo] = len(word2index)
index2word = {v:k for k, v in word2index.items()}

In [17]:
len(word2index)

84

In [32]:
for t in train_data:
    for i,fact in enumerate(t[0]):
        t[0][i] = prepare_sequence(fact, word2index).view(1, -1)
    
    t[1] = prepare_sequence(t[1], word2index).view(1, -1)
    t[2] = prepare_sequence(t[2], word2index).view(1, -1)

# DMN Model 코드 

In [19]:
class DMN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, dropout_p=0.1):
        super(DMN, self).__init__()
        
        self.hidden_size = hidden_size
        self.embed = nn.Embedding(input_size, hidden_size, padding_idx=0) #sparse=True)
        self.input_gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.question_gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        
        self.gate = nn.Sequential(
                            nn.Linear(hidden_size * 4, hidden_size),
                            nn.Tanh(),
                            nn.Linear(hidden_size, 1),
                            nn.Sigmoid()
                        )
        
        self.attention_grucell =  nn.GRUCell(hidden_size, hidden_size)
        self.memory_grucell = nn.GRUCell(hidden_size, hidden_size)
        self.answer_grucell = nn.GRUCell(hidden_size * 2, hidden_size)
        self.answer_fc = nn.Linear(hidden_size, output_size)
        
        self.dropout = nn.Dropout(dropout_p)
        
    def init_hidden(self, inputs):
        hidden = Variable(torch.zeros(1, inputs.size(0), self.hidden_size))
        return hidden.cuda() if USE_CUDA else hidden
    
    def init_weight(self):
        nn.init.xavier_uniform(self.embed.state_dict()['weight'])
        
        for name, param in self.input_gru.state_dict().items():
            if 'weight' in name: nn.init.xavier_normal(param)
        for name, param in self.question_gru.state_dict().items():
            if 'weight' in name: nn.init.xavier_normal(param)
        for name, param in self.gate.state_dict().items():
            if 'weight' in name: nn.init.xavier_normal(param)
        for name, param in self.attention_grucell.state_dict().items():
            if 'weight' in name: nn.init.xavier_normal(param)
        for name, param in self.memory_grucell.state_dict().items():
            if 'weight' in name: nn.init.xavier_normal(param)
        for name, param in self.answer_grucell.state_dict().items():
            if 'weight' in name: nn.init.xavier_normal(param)
        
        nn.init.xavier_normal(self.answer_fc.state_dict()['weight'])
        self.answer_fc.bias.data.fill_(0)
        
    def forward(self, facts, fact_masks, questions, question_masks, num_decode, episodes=3, is_training=False):
        """
        facts : (B,T_C,T_I) / LongTensor in List # batch_size, num_of_facts, length_of_each_fact(padded)
        fact_masks : (B,T_C,T_I) / ByteTensor in List # batch_size, num_of_facts, length_of_each_fact(padded)
        questions : (B,T_Q) / LongTensor # batch_size, question_length
        question_masks : (B,T_Q) / ByteTensor # batch_size, question_length
        """
        # Input Module
        C = [] # encoded facts
        for fact, fact_mask in zip(facts, fact_masks):
            embeds = self.embed(fact)
            if is_training:
                embeds = self.dropout(embeds)
            hidden = self.init_hidden(fact)
            outputs, hidden = self.input_gru(embeds, hidden)
            real_hidden = []

            for i, o in enumerate(outputs): # B,T,D
                real_length = fact_mask[i].data.tolist().count(0) 
                real_hidden.append(o[real_length - 1])

            C.append(torch.cat(real_hidden).view(fact.size(0), -1).unsqueeze(0))
        
        encoded_facts = torch.cat(C) # B,T_C,D
        
        # Question Module
        embeds = self.embed(questions)
        if is_training:
            embeds = self.dropout(embeds)
        hidden = self.init_hidden(questions)
        outputs, hidden = self.question_gru(embeds, hidden)
        
        if isinstance(question_masks, torch.autograd.Variable):
            real_question = []
            for i, o in enumerate(outputs): # B,T,D
                real_length = question_masks[i].data.tolist().count(0) 
                real_question.append(o[real_length - 1])
            encoded_question = torch.cat(real_question).view(questions.size(0), -1) # B,D
        else: # for inference mode
            encoded_question = hidden.squeeze(0) # B,D
            
        # Episodic Memory Module
        memory = encoded_question
        T_C = encoded_facts.size(1)
        B = encoded_facts.size(0)
        for i in range(episodes):
            hidden = self.init_hidden(encoded_facts.transpose(0, 1)[0]).squeeze(0) # B,D
            for t in range(T_C):
                #TODO: fact masking
                #TODO: gate function => softmax
                z = torch.cat([
                                    encoded_facts.transpose(0, 1)[t] * encoded_question, # B,D , element-wise product
                                    encoded_facts.transpose(0, 1)[t] * memory, # B,D , element-wise product
                                    torch.abs(encoded_facts.transpose(0,1)[t] - encoded_question), # B,D
                                    torch.abs(encoded_facts.transpose(0,1)[t] - memory) # B,D
                                ], 1)
                g_t = self.gate(z) # B,1 scalar
                hidden = g_t * self.attention_grucell(encoded_facts.transpose(0, 1)[t], hidden) + (1 - g_t) * hidden
                
            e = hidden
            memory = self.memory_grucell(e, memory)
        
        # Answer Module
        answer_hidden = memory
        start_decode = Variable(LongTensor([[word2index['<s>']] * memory.size(0)])).transpose(0, 1)
        y_t_1 = self.embed(start_decode).squeeze(1) # B,D
        
        decodes = []
        for t in range(num_decode):
            answer_hidden = self.answer_grucell(torch.cat([y_t_1, encoded_question], 1), answer_hidden)
            decodes.append(F.log_softmax(self.answer_fc(answer_hidden),1))
        return torch.cat(decodes, 1).view(B * num_decode, -1)

# 모델의 하이퍼파라미터

In [20]:
HIDDEN_SIZE = 80
BATCH_SIZE = 64
LR = 0.001
EPOCH = 10000
NUM_EPISODE = 3
EARLY_STOPPING = False

In [21]:
# 모델 선언
model = DMN(len(word2index), HIDDEN_SIZE, len(word2index))
model.init_weight()
if USE_CUDA:
    model = model.cuda()

# loss 함수와 최적화 함수 선언
loss_function = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=LR)

  nn.init.xavier_uniform(self.embed.state_dict()['weight'])
  if 'weight' in name: nn.init.xavier_normal(param)
  if 'weight' in name: nn.init.xavier_normal(param)
  if 'weight' in name: nn.init.xavier_normal(param)
  if 'weight' in name: nn.init.xavier_normal(param)
  if 'weight' in name: nn.init.xavier_normal(param)
  if 'weight' in name: nn.init.xavier_normal(param)
  nn.init.xavier_normal(self.answer_fc.state_dict()['weight'])


In [22]:
# 모델 training
for epoch in range(EPOCH):
    losses = []
    if EARLY_STOPPING:
        #torch.save(model.state_dict(), 'C:/Users/82104/Desktop/데이터/모델 데이터/model/[0601]crayon.pth')
        break
        
    for i,batch in enumerate(getBatch(BATCH_SIZE, train_data)):
        facts, fact_masks, questions, question_masks, answers = pad_to_batch(batch, word2index)
        
        model.zero_grad()
        pred = model(facts, fact_masks, questions, question_masks, answers.size(1), NUM_EPISODE, True)
        loss = loss_function(pred, answers.view(-1))
        #losses.append(loss.data.tolist()[0])
        losses.append(loss.data)
        
        loss.backward()
        optimizer.step()
        
        if i % 100 == 0:
            print("[%d/%d] mean_loss : %0.2f" %(epoch, EPOCH, np.mean(losses)))
            
            if np.mean(losses) < 0.01:
                EARLY_STOPPING = True
                print("Early Stopping!")
                break
            losses = []

[0/10000] mean_loss : 4.36
[1/10000] mean_loss : 4.27
[2/10000] mean_loss : 4.19
[3/10000] mean_loss : 4.10
[4/10000] mean_loss : 4.01
[5/10000] mean_loss : 3.90
[6/10000] mean_loss : 3.78
[7/10000] mean_loss : 3.65
[8/10000] mean_loss : 3.50
[9/10000] mean_loss : 3.34
[10/10000] mean_loss : 3.17
[11/10000] mean_loss : 3.01
[12/10000] mean_loss : 2.89
[13/10000] mean_loss : 2.79
[14/10000] mean_loss : 2.73
[15/10000] mean_loss : 2.68
[16/10000] mean_loss : 2.62
[17/10000] mean_loss : 2.54
[18/10000] mean_loss : 2.47
[19/10000] mean_loss : 2.41
[20/10000] mean_loss : 2.35
[21/10000] mean_loss : 2.31
[22/10000] mean_loss : 2.29
[23/10000] mean_loss : 2.26
[24/10000] mean_loss : 2.24
[25/10000] mean_loss : 2.21
[26/10000] mean_loss : 2.17
[27/10000] mean_loss : 2.14
[28/10000] mean_loss : 2.11
[29/10000] mean_loss : 2.09
[30/10000] mean_loss : 2.06
[31/10000] mean_loss : 2.05
[32/10000] mean_loss : 2.03
[33/10000] mean_loss : 2.01
[34/10000] mean_loss : 1.99
[35/10000] mean_loss : 1.97
[3

[287/10000] mean_loss : 0.04
[288/10000] mean_loss : 0.04
[289/10000] mean_loss : 0.04
[290/10000] mean_loss : 0.03
[291/10000] mean_loss : 0.04
[292/10000] mean_loss : 0.03
[293/10000] mean_loss : 0.03
[294/10000] mean_loss : 0.03
[295/10000] mean_loss : 0.03
[296/10000] mean_loss : 0.03
[297/10000] mean_loss : 0.03
[298/10000] mean_loss : 0.03
[299/10000] mean_loss : 0.03
[300/10000] mean_loss : 0.03
[301/10000] mean_loss : 0.03
[302/10000] mean_loss : 0.03
[303/10000] mean_loss : 0.03
[304/10000] mean_loss : 0.03
[305/10000] mean_loss : 0.03
[306/10000] mean_loss : 0.03
[307/10000] mean_loss : 0.03
[308/10000] mean_loss : 0.03
[309/10000] mean_loss : 0.03
[310/10000] mean_loss : 0.03
[311/10000] mean_loss : 0.03
[312/10000] mean_loss : 0.03
[313/10000] mean_loss : 0.03
[314/10000] mean_loss : 0.03
[315/10000] mean_loss : 0.03
[316/10000] mean_loss : 0.03
[317/10000] mean_loss : 0.03
[318/10000] mean_loss : 0.03
[319/10000] mean_loss : 0.03
[320/10000] mean_loss : 0.02
[321/10000] me

In [23]:
# TEST

In [24]:
def pad_to_fact(fact, x_to_ix): # this is for inference
    
    max_x = max([s.size(1) for s in fact])
    x_p = []
    for i in range(len(fact)):
        if fact[i].size(1) < max_x:
            x_p.append(torch.cat([fact[i], Variable(LongTensor([x_to_ix['<PAD>']] * (max_x - fact[i].size(1)))).view(1, -1)], 1))
        else:
            x_p.append(fact[i])
        
    fact = torch.cat(x_p)
    fact_mask = torch.cat([Variable(ByteTensor(tuple(map(lambda s: s ==0, t.data))), volatile=False) for t in fact]).view(fact.size(0), -1)
    return fact, fact_mask

test data 정의

In [25]:
p = './data/test/[TEST]교양교과목안내.txt'
test_data = bAbI_data_load(p)

In [26]:
for t in test_data:
    for i, fact in enumerate(t[0]):
        t[0][i] = prepare_sequence(fact, word2index).view(1, -1)
    
    t[1] = prepare_sequence(t[1], word2index).view(1, -1)
    t[2] = prepare_sequence(t[2], word2index).view(1, -1)

# 모델 정확도 계산 

In [27]:
accuracy = 0

In [28]:
for t in test_data:
    fact, fact_mask = pad_to_fact(t[0], word2index)
    question = t[1]
    question_mask = Variable(ByteTensor([0] * t[1].size(1)), volatile=False).unsqueeze(0)
    answer = t[2].squeeze(0)
    
    model.zero_grad()
    pred = model([fact], [fact_mask], question, question_mask, answer.size(0), NUM_EPISODE)
    if pred.max(1)[1].data.tolist() == answer.data.tolist():
        accuracy += 1
    #print("")
    #print("Question : ",' '.join(list(map(lambda x: index2word[x], question.data.tolist()[0]))))
    #print("")
    #print("Answer : ",' '.join(list(map(lambda x: index2word[x], answer.data.tolist()))))
    #print("Prediction : ",' '.join(list(map(lambda x: index2word[x], pred.max(1)[1].data.tolist()))))

print(accuracy/len(test_data) * 100)

100.0


In [None]:
# sample test
t = random.choice(test_data)
fact, fact_mask = pad_to_fact(t[0], word2index)
question = t[1]
question_mask = Variable(ByteTensor([0] * t[1].size(1)), volatile=False).unsqueeze(0)
answer = t[2].squeeze(0)

model.zero_grad()

pred = model([fact], [fact_mask], question, question_mask, answer.size(0), NUM_EPISODE)

print("Facts : ")
print('\n'.join([' '.join(list(map(lambda x: index2word[x],f))) for f in fact.data.tolist()]))
print("")
print("Question : ",' '.join(list(map(lambda x: index2word[x], question.data.tolist()[0]))))
print("")
print("Answer : ",' '.join(list(map(lambda x: index2word[x], answer.data.tolist()))))
print("Prediction : ",' '.join(list(map(lambda x: index2word[x], pred.max(1)[1].data.tolist()))))

In [30]:
# model의 파라미터들의 사이즈 확인

# Print model's state_dict
print("Model's state_dict:")
for param_tensor in model.state_dict():
    print(param_tensor, "\t", model.state_dict()[param_tensor].size())

# Print optimizer's state_dict
print("Optimizer's state_dict:")
for var_name in optimizer.state_dict():
    print(var_name, "\t", optimizer.state_dict()[var_name])

Model's state_dict:
embed.weight 	 torch.Size([84, 80])
input_gru.weight_ih_l0 	 torch.Size([240, 80])
input_gru.weight_hh_l0 	 torch.Size([240, 80])
input_gru.bias_ih_l0 	 torch.Size([240])
input_gru.bias_hh_l0 	 torch.Size([240])
question_gru.weight_ih_l0 	 torch.Size([240, 80])
question_gru.weight_hh_l0 	 torch.Size([240, 80])
question_gru.bias_ih_l0 	 torch.Size([240])
question_gru.bias_hh_l0 	 torch.Size([240])
gate.0.weight 	 torch.Size([80, 320])
gate.0.bias 	 torch.Size([80])
gate.2.weight 	 torch.Size([1, 80])
gate.2.bias 	 torch.Size([1])
attention_grucell.weight_ih 	 torch.Size([240, 80])
attention_grucell.weight_hh 	 torch.Size([240, 80])
attention_grucell.bias_ih 	 torch.Size([240])
attention_grucell.bias_hh 	 torch.Size([240])
memory_grucell.weight_ih 	 torch.Size([240, 80])
memory_grucell.weight_hh 	 torch.Size([240, 80])
memory_grucell.bias_ih 	 torch.Size([240])
memory_grucell.bias_hh 	 torch.Size([240])
answer_grucell.weight_ih 	 torch.Size([240, 160])
answer_grucell.

torch.save(model.state_dict(), 'C:/Users/82104/Desktop/데이터/모델 데이터/model/[0602]crayon.pth')

# model, loss_function, optimizer의 parameter등을 나누어 저장

In [27]:
torch.save({
            'epoch': epoch,
            'model': DMN(len(word2index), HIDDEN_SIZE, len(word2index)),
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'criterion_state_dict': loss_function.state_dict(),
            'loss': loss,
            }, 'C:/Users/82104/Desktop/데이터/모델 데이터/model/[졸업이수학점0603]crayon.pth')



for param in model.state_dict():
    print(param)