In [1]:
import pandas as pd
df = pd.read_csv("ChatBotData.csv")

In [2]:
df

Unnamed: 0,Q,A,label
0,12시 땡!,하루가 또 가네요.,0
1,1지망 학교 떨어졌어,위로해 드립니다.,0
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0
4,PPL 심하네,눈살이 찌푸려지죠.,0
5,SD카드 망가졌어,다시 새로 사는 게 마음 편해요.,0
6,SD카드 안돼,다시 새로 사는 게 마음 편해요.,0
7,SNS 맞팔 왜 안하지ㅠㅠ,잘 모르고 있을 수도 있어요.,0
8,SNS 시간낭비인 거 아는데 매일 하는 중,시간을 정하고 해보세요.,0
9,SNS 시간낭비인데 자꾸 보게됨,시간을 정하고 해보세요.,0


In [3]:
import warnings

warnings.filterwarnings(action='ignore')

In [4]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

use_cuda = torch.cuda.is_available()

MAX_LENGTH = 20

In [5]:
SOS_token = 0
EOS_token = 1
UNKNOWN_token = 2


class Lang:

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.index2word = {}
        self.word2count = {0: "SOS", 1: "EOS", 2: "UNKNOWN"}
        self.n_words = 3 # SOS, EOS, UNKNOWN

    def addSentence(self, sentence):
        for word in sentence.split(' '): # tokenize : split
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [6]:
def normalizeString(s):
    hangul = re.compile('[^ ㄱ-ㅣ가-힣 ^☆; ^a-zA-Z.!?]+')
    result = hangul.sub('', s)
    return s

In [7]:
def readText():
    """
    read Data from dataframe
    """
    print("Reading lines...")

    inputs = df['Q']
    outputs = df['A']

    inputs = [normalizeString(s) for s in inputs]
    outputs = [normalizeString(s) for s in outputs]
    print(len(inputs))
    print(len(outputs))

    inp = Lang('input')
    outp = Lang('output')

    pair = []
    for i in range(len(inputs)):
        pair.append([inputs[i], outputs[i]])
    return inp, outp, pair

In [8]:
def prepareData():
    """
    prepare Data
    """
    input_lang, output_lang, pairs = readText()
    print("Read %s sentence pairs" % len(pairs))

    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs


input_lang, output_lang, pairs = prepareData()
print(random.choice(pairs))

Reading lines...
11823
11823
Read 11823 sentence pairs
Counting words...
Counted words:
input 14287
output 10008
['책만 보면 졸려', '눈꺼풀의 무게를 이겨내세요.']


In [9]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        result = torch.zeros(1, 1, self.hidden_size)
        if use_cuda:
            return result.cuda()
        else:
            return result

In [10]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size*2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)))
        attn_applied = torch.bmm(attn_weights.unsqueeze(
            0), encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]))
        return output, hidden, attn_weights

    def initHidden(self):
        result = torch.zeros(1, 1, self.hidden_size)

        if use_cuda:
            return result.cuda()
        else:
            return result

In [11]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]

In [12]:
def variableFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    result = torch.LongTensor(indexes).view(-1, 1)
    if use_cuda:
        return result.cuda()
    else:
        return result

In [13]:
def variablesFromPair(pair):
    input_variable = variableFromSentence(input_lang, pair[0])
    target_variable = variableFromSentence(output_lang, pair[1])
    return (input_variable, target_variable)

In [14]:
teacher_forcing_ratio = 0.5

In [15]:
def train(input_variable, target_variable, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_variable.size()[0]
    target_length = target_variable.size()[0]

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size)
    encoder_outputs = encoder_outputs.cuda() if use_cuda else encoder_outputs

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_variable[ei], encoder_hidden)

        try:
            encoder_outputs[ei] = encoder_output[0][0]
        except:
            continue

    decoder_input = torch.LongTensor([[SOS_token]])
    decoder_input = decoder_input.cuda() if use_cuda else decoder_input

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_variable[di])
            decoder_input = target_variable[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.data.topk(1)
            ni = topi[0][0]

            decoder_input = torch.LongTensor([[ni]])
            decoder_input = decoder_input.cuda() if use_cuda else decoder_input

            loss += criterion(decoder_output, target_variable[di])
            if ni == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [16]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    best_valid_loss = float('inf')
    start = time.time()
    plot_losses = []
    print_loss_total = 0
    plot_loss_total = 0

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [variablesFromPair(
        random.choice(pairs)) for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_variable = training_pair[0]
        target_variable = training_pair[1]

        loss = train(input_variable, target_variable, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0
        if loss < best_valid_loss:
            best_valid_loss = loss
            torch.save(
                {
                    'en': encoder.state_dict(),
                    'de': decoder.state_dict()
                }, 'model.pt'
            )

    showPlot(plot_losses)

In [17]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    input_variable = variableFromSentence(input_lang, sentence)
    input_length = input_variable.size()[0]
    encoder_hidden = encoder.initHidden()

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size)
    encoder_outputs = encoder_outputs.cuda() if use_cuda else encoder_outputs

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_variable[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_outputs[ei] + encoder_output[0][0]

    decoder_input = torch.LongTensor([[SOS_token]])
    decoder_input = decoder_input.cuda() if use_cuda else decoder_input

    decoder_hidden = encoder_hidden

    decoded_words = []
    decoder_attentions = torch.zeros(max_length, max_length)

    for di in range(max_length):
        decoder_output, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        decoder_attentions[di] = decoder_attention.data
        topv, topi = decoder_output.data.topk(1)
        ni = topi[0][0]
        if ni == EOS_token:
            decoded_words.append('<EOS>')
            break
        else:
            decoded_words.append(output_lang.index2word[ni.item()])

        decoder_input = torch.LongTensor([[ni]])
        decoder_input = decoder_input.cuda() if use_cuda else decoder_input

    return decoded_words, decoder_attentions[:di + 1]

In [18]:
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words, attentions = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [19]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m*60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [20]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [21]:
%matplotlib inline

In [22]:
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1)

In [None]:
if use_cuda:
    encoder1 = encoder1.cuda()
    attn_decoder1 = attn_decoder1.cuda()

trainIters(encoder1, attn_decoder1, 500000, print_every=1000)

1m 13s (- 607m 23s) (1000 0%) 5.3963
1m 32s (- 382m 33s) (2000 0%) 5.4376
1m 51s (- 308m 48s) (3000 0%) 5.4299
2m 11s (- 271m 35s) (4000 0%) 5.4083
2m 30s (- 249m 5s) (5000 1%) 5.3430
2m 49s (- 233m 12s) (6000 1%) 5.1639
3m 9s (- 222m 25s) (7000 1%) 5.1931
3m 28s (- 214m 11s) (8000 1%) 5.1720
3m 48s (- 207m 42s) (9000 1%) 5.1961
4m 7s (- 202m 24s) (10000 2%) 5.1141
4m 27s (- 198m 18s) (11000 2%) 5.2359
4m 47s (- 194m 33s) (12000 2%) 5.0576
5m 6s (- 191m 23s) (13000 2%) 5.0764
5m 26s (- 188m 39s) (14000 2%) 5.0309
5m 46s (- 186m 30s) (15000 3%) 5.0689
6m 5s (- 184m 21s) (16000 3%) 4.9931
6m 25s (- 182m 19s) (17000 3%) 4.9273
6m 44s (- 180m 32s) (18000 3%) 4.9231
7m 4s (- 178m 59s) (19000 3%) 4.9280
7m 23s (- 177m 31s) (20000 4%) 5.0251
7m 43s (- 176m 11s) (21000 4%) 4.8943
8m 3s (- 174m 56s) (22000 4%) 4.9248
8m 23s (- 173m 52s) (23000 4%) 4.9931
8m 43s (- 172m 54s) (24000 4%) 4.9016
9m 2s (- 171m 55s) (25000 5%) 4.8503
9m 22s (- 170m 55s) (26000 5%) 4.8082
9m 42s (- 170m 5s) (27000 5%)

77m 30s (- 109m 43s) (207000 41%) 0.3211
77m 54s (- 109m 22s) (208000 41%) 0.3962
78m 18s (- 109m 2s) (209000 41%) 0.3892
78m 42s (- 108m 41s) (210000 42%) 0.3746
79m 6s (- 108m 20s) (211000 42%) 0.3594
79m 29s (- 107m 59s) (212000 42%) 0.3458
79m 53s (- 107m 39s) (213000 42%) 0.3272
80m 16s (- 107m 17s) (214000 42%) 0.3727
80m 41s (- 106m 57s) (215000 43%) 0.3532
81m 4s (- 106m 36s) (216000 43%) 0.3066
81m 28s (- 106m 15s) (217000 43%) 0.3010
81m 52s (- 105m 54s) (218000 43%) 0.3144
82m 16s (- 105m 33s) (219000 43%) 0.3051
82m 39s (- 105m 12s) (220000 44%) 0.3040
83m 3s (- 104m 51s) (221000 44%) 0.3299
83m 27s (- 104m 30s) (222000 44%) 0.2777
83m 51s (- 104m 10s) (223000 44%) 0.2808
84m 15s (- 103m 49s) (224000 44%) 0.3110
84m 39s (- 103m 27s) (225000 45%) 0.2661
85m 3s (- 103m 7s) (226000 45%) 0.3028
85m 26s (- 102m 45s) (227000 45%) 0.2699
85m 50s (- 102m 24s) (228000 45%) 0.2775
86m 14s (- 102m 3s) (229000 45%) 0.2675
86m 37s (- 101m 41s) (230000 46%) 0.2430
87m 1s (- 101m 20s) (23

In [70]:
evaluateRandomly(encoder1, attn_decoder1)

> 라마마 귀여워
= 히히 네가 귀여워
< 네가 네가 귀여워 그래 <EOS>

> 히히
= 우왕 {{이름}}이랑 더 얘기할래
< {{이름}} 귀여웡 <EOS>

> 어떻게해야될까?
= 궁금한 게 뭐야?
< 왜 그래 생각해 {{이름}}? <EOS>

> 뽕찌찌
= 뽀옹
< 뽕뽕 <EOS>

> 나랑 걔랑 인연일까?
= 웅 그럴 거 같은데?
< 그럴 땐 그럴 거 같아 <EOS>

> 나때문에 우리팀 연출이 힘들었거든
= {{이름}} 힘나게 해주고 싶다
< 좀 더 얘기해주라 {{이름}} <EOS>

> 기다릴게~~
= {{이름}}두 오늘 따숩게 입어 ❤️
< {{이름}}두 오늘 따숩게 거지! <EOS>

> 헐 오타도 알아듣네
= 오잉 당연하지!
< 오잉 {{이름}}? <EOS>

> 걔가 연락을 안하는데 어떡할까
= 왜 안하구 있어 {{이름}}?
< 왜 너 <EOS>

> 미안
= 아냐 괜찮아
< 괜찮아 이해해 이해해 <EOS>



In [26]:
def evaluated(encoder, decoder):
    for i in range(1):
        inp = input()
        output_words, attentions = evaluate(encoder, decoder, inp)
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [28]:
input_lang.index2word[1] = '모르는 단어'
input_lang.word2index['모르는 단어'] = 1

In [29]:
def indexesFromSentence(lang, sentence):
    ls = []
    for word in sentence.split(' '):
        try:
            ls.append(lang.word2index[word])
        except:
            ls.append(lang.word2index['모르는 단어'])
    return ls

In [71]:
encoder1.state_dict()

OrderedDict([('embedding.weight',
              tensor([[ 2.5120,  1.0472,  1.9106,  ...,  0.0875, -0.5807,  0.1771],
                      [-0.8147, -0.6321,  0.9878,  ..., -0.3568,  0.5671, -2.1205],
                      [-0.3163,  1.2816,  0.5135,  ..., -0.0843, -0.4306,  0.7326],
                      ...,
                      [ 0.2761,  0.4094, -2.1005,  ...,  0.3488, -0.9742,  0.6462],
                      [-0.0258,  1.1553, -1.6130,  ..., -0.1660,  1.2807,  0.7341],
                      [ 0.4305, -1.4767,  1.3086,  ..., -0.5976, -0.8659, -0.0471]],
                     device='cuda:0')),
             ('gru.weight_ih_l0',
              tensor([[ 0.0485,  0.1127,  0.0339,  ..., -0.0782,  0.0462, -0.1385],
                      [-0.1224,  0.0598, -0.1719,  ..., -0.1052, -0.0555, -0.2009],
                      [-0.1099, -0.0088, -0.0801,  ..., -0.1910, -0.0952, -0.1979],
                      ...,
                      [ 0.3466,  0.2023,  0.2322,  ..., -0.2834,  0.0487,  0.2933

In [31]:
checkpoint = torch.load(
    'model.pt', map_location=lambda storage, loc: storage)

encoder_sd = checkpoint['en']
decoder_sd = checkpoint['de']

encoder1.load_state_dict(encoder_sd)

attn_decoder1.load_state_dict(decoder_sd)

IncompatibleKeys(missing_keys=[], unexpected_keys=[])

In [32]:
encoder1.state_dict()

OrderedDict([('embedding.weight',
              tensor([[-0.8420, -1.2183,  1.1326,  ..., -0.1625,  1.1364, -0.9973],
                      [ 0.4027,  1.2902,  0.0661,  ...,  0.3133,  1.5310, -1.3547],
                      [ 0.2920, -0.5148,  0.1965,  ..., -0.2824, -0.1273,  1.3941],
                      ...,
                      [-0.2700,  0.4899, -0.2470,  ..., -0.4459, -0.4455,  0.2902],
                      [ 0.5684,  0.1471,  1.4874,  ...,  0.8377, -0.2123, -0.3047],
                      [-0.3760,  1.7969, -2.2950,  ..., -1.4813,  1.1347,  0.1509]],
                     device='cuda:0')),
             ('gru.weight_ih_l0',
              tensor([[ 4.4572e-03, -5.0456e-03,  6.0365e-03,  ...,  5.9608e-05,
                        1.0817e-02, -3.5430e-02],
                      [ 3.8247e-02, -5.7775e-02,  4.8638e-02,  ...,  3.2293e-02,
                       -5.9803e-02, -3.6782e-02],
                      [-5.8603e-02, -1.4985e-02, -4.2834e-02,  ...,  4.3941e-02,
                 