In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import string
import unicodedata
import random
import re

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

from keras.preprocessing.sequence import pad_sequences

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Detect once whether a CUDA device is usable and report the outcome
train_on_gpu = torch.cuda.is_available()
print(
    'Training on GPU!'
    if train_on_gpu
    else 'No GPU available, training on CPU; consider making n_epochs very small.'
)

Training on GPU!


In [None]:
# Number of preceding words used as the context vector for each prediction

context_num = 2

In [None]:
# Read the whole corpus and split it into one entry per line. The context
# manager closes the file handle even if reading raises.
with open("/content/textbook", "r") as corpus_file:
    lines = corpus_file.read().split('\n')

In [None]:
# Tokenize every corpus line into a list of word tokens and show the first one
test_sentence = list(map(word_tokenize, lines))
print(test_sentence[0])

In [None]:
# Distinct tokens found in the tokenized corpus.
vocab = {word for sent in test_sentence for word in sent}

# Id 0 is reserved for padding: pad_sequences fills short sentences with 0, so
# a real word must never share that id. Real words are numbered from 1 in
# sorted order, which keeps the mapping identical across kernel restarts
# (plain set iteration order for strings is not stable between runs).
PAD_TOKEN = "<pad>"
word_to_ix = {PAD_TOKEN: 0}
for word in sorted(vocab - {PAD_TOKEN}):
    word_to_ix[word] = len(word_to_ix)

# Size of the id space (padding id included); used to size the model layers.
voc_len = len(word_to_ix)

print(voc_len)

1552


In [None]:
# Every sentence is padded or truncated to this many token ids
MAX_LEN = 75

In [None]:
# Translate each tokenized sentence into its list of vocabulary ids
input_ids = [[word_to_ix[token] for token in sent] for sent in test_sentence]

# Pad / truncate at the end of each sentence so all rows have MAX_LEN ids
input_ids = pad_sequences(
    input_ids,
    maxlen=MAX_LEN,
    dtype="long",
    truncating="post",
    padding="post",
)
print(input_ids[0])

[1243   56 1323  699  570  219  300  345  838    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 

In [None]:
inp = []
tar = []

# Slide a window of context_num ids over every padded sentence; the id that
# immediately follows the window is the prediction target.
for sentence in input_ids:
    window_count = len(sentence) - context_num
    for start in range(window_count):
        stop = start + context_num
        context, target = sentence[start:stop], sentence[stop]
        # Keep the sample unless the context ids and the target id sum to zero
        if sum(context.tolist()) + target != 0:
            inp.append(context)
            tar.append(torch.tensor([target]))

In [None]:
# Stack the per-window id arrays into a single (num_samples, context_num)
# numpy array first: building a tensor straight from a Python list of numpy
# arrays is the documented slow path in PyTorch. The dtype is pinned to long
# because the ids are used as embedding indices.
inp = torch.as_tensor(np.asarray(inp), dtype=torch.long)
data_len = len(inp)
print(data_len)

2205126


In [None]:
# Show the first (context, target) pair, if there is one
first_pair = next(zip(inp, tar), None)
if first_pair is not None:
    i, t = first_pair
    print(i.size())
    print(i)
    print(t)

torch.Size([4])
tensor([1243,   56, 1323,  699])
tensor([570])


In [None]:
class GRU(nn.Module):
    """Next-word predictor: embedding -> GRU -> linear projection.

    The ids of one context window are embedded, the embeddings are flattened
    into a single feature vector, passed through the GRU as a one-step
    sequence with batch size 1, and projected to ``output_size`` scores.

    Args:
        input_size: Number of rows in the embedding table.
        hidden_size: Embedding width and GRU hidden width.
        output_size: Number of scores produced by the final linear layer.
        n_layers: Number of stacked GRU layers.
        context_size: Number of ids per context window. When omitted, the
            notebook-level ``context_num`` is used.
    """

    def __init__(self, input_size, hidden_size, output_size, n_layers=1, context_size=None):
        # Zero-argument super(): the previous super(RNN, self) referenced an
        # undefined name and raised NameError on construction.
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.context_size = context_num if context_size is None else context_size

        self.encoder = nn.Embedding(input_size, hidden_size)
        # The GRU consumes all context embeddings concatenated into one vector
        self.gru = nn.GRU(hidden_size * self.context_size, hidden_size, n_layers,
                          batch_first=True, bidirectional=False)
        self.decoder = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        """Score the next word for one context window.

        Args:
            input: Tensor holding the ids of one context window.
            hidden: GRU hidden state, e.g. from ``init_hidden``.

        Returns:
            Tuple ``(output, hidden)``: scores viewed as ``(1, output_size)``
            and the updated hidden state.
        """
        embedded = self.encoder(input.view(1, -1))
        # Flatten the window into a single time step for a batch of one
        output, hidden = self.gru(embedded.view(1, 1, -1), hidden)
        output = self.decoder(output.view(1, -1))
        return output, hidden

    def init_hidden(self):
        """Return a zero hidden state of shape (n_layers, 1, hidden_size).

        The tensor is created on the same device as the model parameters.
        """
        return torch.zeros(self.n_layers, 1, self.hidden_size,
                           device=self.decoder.weight.device)

In [None]:
import time, math

def time_since(since):
    """Format the wall-clock time elapsed since ``since`` as '<m>m <s>s'.

    Args:
        since: A timestamp previously obtained from ``time.time()``.

    Returns:
        The elapsed time as whole minutes and whole remaining seconds.
    """
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '%dm %ds' % (minutes, seconds)

In [None]:
def train(inp, target):
    """Run one optimisation step on a single (context, target) sample.

    Uses the notebook-level ``decoder``, ``criterion`` and
    ``decoder_optimizer``. Tensors are moved to whatever device the model
    lives on, so the step works on CPU as well as on GPU.

    Args:
        inp: Tensor with the ids of one context window.
        target: Tensor with the id of the word that follows the window.

    Returns:
        The loss of this step as a Python float.
    """
    device = next(decoder.parameters()).device
    # A fresh hidden state per sample: samples are trained independently
    hidden = decoder.init_hidden().to(device)
    decoder.zero_grad()

    output, hidden = decoder(inp.to(device), hidden)

    loss = criterion(output, target.to(device))
    loss.backward()
    decoder_optimizer.step()

    return loss.item()

In [None]:
def generate(prime_str='the name of the dog', predict_len=175, temperature=0.8):
    """Extend ``prime_str`` by sampling words from the trained ``decoder``.

    At each step the last ``context_num`` word ids are fed to the model, the
    scores are divided by ``temperature`` and turned into a probability
    distribution, and the next word is sampled from it.

    Args:
        prime_str: Whitespace-separated seed text; every word must be a key
            of ``word_to_ix`` (a KeyError is raised otherwise).
        predict_len: Maximum number of words to append.
        temperature: Sampling temperature; values above 1 flatten the
            distribution, values below 1 sharpen it.

    Returns:
        ``prime_str`` followed by the sampled words, space-separated.
    """
    device = next(decoder.parameters()).device
    # Build the id -> word lookup once instead of scanning the dict per step
    ix_to_word = {ix: word for word, ix in word_to_ix.items()}
    token_ids = [word_to_ix[w] for w in prime_str.split()]

    # no_grad must be entered as a context manager to disable autograd
    with torch.no_grad():
        hidden = decoder.init_hidden().to(device)

        for _ in range(predict_len):
            inp = torch.tensor(token_ids[-context_num:], dtype=torch.long, device=device)

            if torch.sum(inp) == 0:
                break  # stop condition: the whole context window is id 0

            output, hidden = decoder(inp, hidden)

            # Temperature scales the raw scores before normalisation
            output_dist = F.softmax(output.view(-1) / temperature, dim=-1)
            top_i = torch.multinomial(output_dist, 1).item()

            # Append the sampled word and use it as part of the next context
            token_ids.append(top_i)
            prime_str += " " + ix_to_word[top_i]

    return prime_str

In [None]:
# Hyper-parameters
n_epochs = 10
print_every = 2   # print timing and a generated sample every N epochs
plot_every = 2    # record one averaged loss point every N epochs
hidden_size = 100
n_layers = 2
lr = 0.015

decoder = GRU(voc_len, hidden_size, voc_len, n_layers)
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=lr)
criterion = torch.nn.CrossEntropyLoss()

decoder.train()
# Only move the model to the GPU when one was detected earlier
if train_on_gpu:
    decoder.cuda()

start = time.time()
all_losses = []
# Running loss since the last recorded plot point. It must persist across
# epochs so that dividing by plot_every below yields a per-epoch average.
loss_avg = 0

for epoch in range(1, n_epochs + 1):
    print("Epoch: ", epoch)
    for i, t in zip(inp, tar):
        loss = train(i, t)
        loss_avg += loss

    if epoch % print_every == 0:
        print('[%s (%d %d%%)]' % (time_since(start), epoch, epoch / n_epochs * 100))
        print(generate())

    if epoch % plot_every == 0:
        # Mean per-sample loss, averaged over the last plot_every epochs
        all_losses.append((loss_avg / data_len) / plot_every)
        loss_avg = 0

In [None]:
# Persist the trained model by saving the whole module object (not a state_dict)
torch.save(decoder, 'grumodel.pt')

## Generate sentence

In [None]:
# Sample up to 75 more words from a custom prompt, using temperature 4 instead of the default 0.8
print(generate('the name of the dog is', 75, temperature=4))

  del sys.path[0]


the name of the dog is choe paintings supermarket eating ella jim ha moves ketchup 160 looked matter him gim glasses sour yu chris lot raise rain selina too anything ice here take hikers excellent hamburgers 3,000 bees africa so p.e t5 reporter used now learning floor ant 5th program dasom ... once tired skating father quiz happychildren willy isstv kiwis grandma kangaroo baby stars happychildren hooray market yuna semi traveler cakes sunny andong gim date would very course hand cooking boring calling asia yourself having arm earth job rain flowers gomawoyo ahh b-e-n loved foot stories 23rd no brothers talking borte fever beacause teach gim medicine small eat gang line health design more sleeping mr.allan blind plane right faster scissors 15th shopkeeper elephant move kajuru lke up lion picures helen pants ha.ben andy brothers save suah front places ray ... see miso writing expensive ahh done anu again does finger cook ? pierre juho ladybug 30 officer tanay usually 25th ivan weekends j