# Character-Level Text Generation

## 1. Prepare Dataset

In [1]:
from functools import reduce
import glob
import math
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import os
import os.path
import pprint
import random
import string
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import tarfile
import time
import unicodedata
import urllib.request
import zipfile

In [2]:
def bundle_dataset(list):
    """Describe a group of dataset files together with their combined size.

    Args:
        list: Iterable of file names, each resolved relative to the
            module-level ``data_path`` directory. (The parameter name shadows
            the builtin; it is kept so existing keyword callers keep working.)

    Returns:
        A dict with two keys:
          * ``'files'``: one dict per file holding its ``'name'``, ``'path'``,
            ``'size'`` (bytes, from ``os.stat``) and a ``'handle'`` slot that
            starts as ``None``.
          * ``'total_size'``: the sum of all file sizes (``0`` for no files).

    Raises:
        OSError: If one of the files cannot be stat'ed.
    """
    files = []
    for name in list:
        # Build the path once and reuse it for both the record and the stat.
        path = os.path.join(data_path, name)
        files.append({
            'name': name,
            'path': path,
            'size': os.stat(path).st_size,
            'handle': None,
        })

    return {
        'files': files,
        # sum() yields 0 for an empty bundle, where reduce() without an
        # initial value would raise TypeError.
        'total_size': sum(f['size'] for f in files),
    }

In [3]:
# Directory that holds the dataset text files; bundle_dataset() resolves
# file names relative to it.
data_path = os.path.join('data', 'CBTest', 'data')
input_files =  os.listdir(data_path)

# One bundle covering every file in the directory, and one covering only the
# file that the training cells read from.
# NOTE(review): train_dataset is built from 'cbt_test.txt' rather than
# 'cbt_train.txt' -- confirm this is intentional.
all_dataset = bundle_dataset([f for f in input_files])
train_dataset = bundle_dataset(['cbt_test.txt'])

# Pretty-print both bundles for inspection.
pp = pprint.PrettyPrinter(indent=2)
pp.pprint(all_dataset)
pp.pprint(train_dataset)

{ 'files': [ { 'handle': None,
               'name': 'cbt_test.txt',
               'path': 'data/CBTest/data/cbt_test.txt',
               'size': 1528744},
             { 'handle': None,
               'name': 'cbt_train.txt',
               'path': 'data/CBTest/data/cbt_train.txt',
               'size': 25742364},
             { 'handle': None,
               'name': 'cbt_valid.txt',
               'path': 'data/CBTest/data/cbt_valid.txt',
               'size': 1182697},
             { 'handle': None,
               'name': 'cbtest_CN_test_2500ex.txt',
               'path': 'data/CBTest/data/cbtest_CN_test_2500ex.txt',
               'size': 6018376},
             { 'handle': None,
               'name': 'cbtest_CN_train.txt',
               'path': 'data/CBTest/data/cbtest_CN_train.txt',
               'size': 295933246},
             { 'handle': None,
               'name': 'cbtest_CN_valid_2000ex.txt',
               'path': 'data/CBTest/data/cbtest_CN_valid_2000ex.txt',
    

## 2. Train

In [4]:
# Alphabet of the model: lowercase ASCII letters followed by space, period,
# comma and apostrophe. A character's position in this string is its index.
all_letters = string.ascii_lowercase + " .,'"
# Number of symbols in the alphabet; used as the width of one-hot vectors.
n_letters = len(all_letters)

In [5]:
def findFiles(path):
    """Return the list of paths matching the glob pattern ``path``."""
    matches = glob.glob(path)
    return matches

# Reduce a Unicode string to the model alphabet (accent stripping based on
# https://stackoverflow.com/a/518232/2809427).
# O'Néàl => o'neal
def transform(s):
    """Normalize ``s`` to lowercase text made only of ``all_letters``.

    Steps, in order: lowercase; NFD-decompose and drop combining marks
    (category ``Mn``) so accented letters keep their base letter; drop every
    character not in ``all_letters``; collapse runs of spaces and trim the
    ends; remove the space in front of periods and commas.
    """
    decomposed = unicodedata.normalize('NFD', s.lower())
    kept = [
        ch for ch in decomposed
        if unicodedata.category(ch) != 'Mn' and ch in all_letters
    ]
    collapsed = ' '.join(''.join(kept).split())
    return collapsed.replace(' .', '.').replace(' ,', ',')

# Demo: accents are stripped, the quote and colon are dropped, and the spaces
# before the commas and the period are removed.
print(transform("O'Néàl\":    , , ."))

o'neal,,.


In [6]:
def letterToIndex(letter):
    """Return the position of ``letter`` in ``all_letters``.

    Uses ``str.find``, so the result is ``-1`` when ``letter`` is not part of
    the alphabet.
    """
    position = all_letters.find(letter)
    return position

# Demonstration helper: encode one letter as a <1 x n_letters> one-hot Tensor
def letterToTensor(letter):
    """Return a ``1 x n_letters`` tensor with a 1 at the letter's index."""
    one_hot = torch.zeros(1, n_letters)
    one_hot[0, letterToIndex(letter)] = 1
    return one_hot

# Encode a text as a <len(text) x 1 x n_letters> Tensor,
# i.e. a sequence of one-hot letter vectors
def textToTensor(text):
    """Return the one-hot encoding of ``text``, one ``1 x n_letters`` row per character."""
    one_hot = torch.zeros(len(text), 1, n_letters)
    for position, letter in enumerate(text):
        one_hot[position, 0, letterToIndex(letter)] = 1
    return one_hot

def textToIndexTensor(text):
    """Return a ``len(text) x 1`` long tensor holding each character's alphabet index."""
    indices = torch.zeros(len(text), 1, dtype=torch.long)
    for position, letter in enumerate(text):
        indices[position, 0] = letterToIndex(letter)
    return indices

In [7]:
# Sanity checks for the encoding helpers: the alphabet and its size, the
# index of 'a', a single one-hot letter, a one-hot sequence, and the
# corresponding index tensor.
print(all_letters, len(all_letters))
print(letterToIndex('a'))
print(letterToTensor('b'))
print(textToTensor('ab'))
print(textToIndexTensor('ab'))

abcdefghijklmnopqrstuvwxyz .,' 30
0
tensor([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])
tensor([[[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]])
tensor([[0],
        [1]])


In [None]:
# Upper bound on the number of lines next_inputs() reads for one sample.
max_lines = 100
# Once the accumulated text has at least this many characters, next_inputs()
# stops after each further line with probability 0.5.
min_size = 20

def next_inputs(bundled_dataset):
    """Read the next training sample from a randomly chosen file of the bundle.

    A file is picked with probability proportional to its size and opened
    lazily; its handle is kept in the bundle so successive calls continue
    where the previous one stopped. Up to ``max_lines`` lines are read; once
    at least ``min_size`` characters have been collected, reading stops after
    each line with probability 0.5.

    When the end of the file is reached the handle is closed and reset to
    ``None``, so a later call reopens the file from the beginning.

    Args:
        bundled_dataset: A dict as produced by ``bundle_dataset``.

    Returns:
        The collected text passed through ``transform``. May be the empty
        string (for example when the file was already at its end).
    """
    files = bundled_dataset['files']
    total_size = bundled_dataset['total_size']

    weights = [f['size'] / total_size for f in files]
    chosen = files[np.random.choice(len(files), p=weights)]

    if chosen['handle'] is None:
        chosen['handle'] = open(chosen['path'], encoding='utf-8')

    text = ''
    for _ in range(max_lines):
        line = chosen['handle'].readline()
        if line == '':
            # readline() signals EOF with an empty string (never None).
            # Close and forget the handle so the next call starts over
            # instead of reading from an exhausted or closed file.
            chosen['handle'].close()
            chosen['handle'] = None
            break

        stripped = line.strip()
        # Separate consecutive lines with a space so the last word of one
        # line is not fused with the first word of the next; transform()
        # collapses any surplus spaces.
        text = text + ' ' + stripped if text else stripped

        if len(text) >= min_size and random.random() < 0.5:
            break
    return transform(text)

def close_dataset_files(bundled_dataset):
    """Close every open file handle in the bundle and reset it to ``None``.

    Entries whose ``'handle'`` is already ``None`` are left untouched.
    """
    for entry in bundled_dataset['files']:
        handle = entry['handle']
        if handle is None:
            continue
        handle.close()
        entry['handle'] = None

# Demo: draw one sample, then close the file that next_inputs() opened so the
# training cells start with fresh handles.
print(next_inputs(train_dataset))
close_dataset_files(train_dataset)

booktitle andrewlangtheyellowfairybook.txt.out


In [None]:
class LSTM(nn.Module):
    """A single LSTM cell assembled from four linear gate layers.

    Every gate sees the concatenation of the current input and the previous
    output, so each layer maps ``input_size + output_size`` features to
    ``output_size`` features.
    """

    def __init__(self, input_size, output_size):
        combined_size = input_size + output_size
        # Report the number of gate weights (four matrices; biases are not
        # included in this figure).
        print('LSTM init()', combined_size * output_size * 4)
        super().__init__()

        self.wf = nn.Linear(combined_size, output_size)  # forget gate
        self.wi = nn.Linear(combined_size, output_size)  # input gate
        self.wc = nn.Linear(combined_size, output_size)  # candidate context
        self.wo = nn.Linear(combined_size, output_size)  # output gate

    def forward(self, input, output_in, context_in):
        """Advance the cell by one step.

        Args:
            input: Current input; concatenated with ``output_in`` along dim 1.
            output_in: Output of the previous step.
            context_in: Cell context of the previous step.

        Returns:
            ``(context_out, output_out)`` -- note that the context comes first.
        """
        gate_input = torch.cat((input, output_in), 1)

        forget_gate = torch.sigmoid(self.wf(gate_input))
        input_gate = torch.sigmoid(self.wi(gate_input))
        candidate = torch.tanh(self.wc(gate_input))
        output_gate = torch.sigmoid(self.wo(gate_input))

        # Keep part of the old context and add the gated candidate.
        context_out = context_in * forget_gate + input_gate * candidate
        output_out = torch.tanh(context_out) * output_gate

        return context_out, output_out

class MyNet(nn.Module):
    """Two stacked LSTM cells with dropout in between and a log-softmax output.

    Args:
        input_size: Width of the per-step input vector.
        output_size: Width of the per-step output vector.
        hidden_size: Width of the first cell's output (the hidden layer).
    """

    def __init__(self, input_size, output_size, hidden_size):
        super(MyNet, self).__init__()

        self.hidden_size = hidden_size

        # Size the cells from the constructor arguments rather than from the
        # global alphabet size, so the arguments actually take effect.
        self.lstm1 = LSTM(input_size, hidden_size)
        self.dropout = nn.Dropout()
        self.lstm2 = LSTM(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, output_in, hidden_in, context_in):
        """Run one time step through both cells.

        Args:
            input: Input for this step.
            output_in: Output returned by the previous step.
            hidden_in: Hidden value returned by the previous step.
            context_in: Combined context of both cells. Row 0 is split at
                ``hidden_size``: the first part belongs to the first cell,
                the remainder to the second cell.

        Returns:
            ``(output_out, hidden_out, context_out)`` where ``output_out`` is
            the log-softmax of the second cell's output, ``hidden_out`` is the
            first cell's output after dropout, and ``context_out`` is both
            cells' new contexts concatenated along dim 1.
        """
        context1_in = context_in[0][:self.hidden_size]
        context2_in = context_in[0][self.hidden_size:]

        context1_out, hidden_out = self.lstm1(input, hidden_in, context1_in)
        hidden_out = self.dropout(hidden_out)
        context2_out, output = self.lstm2(hidden_out, output_in, context2_in)
        output_out = self.softmax(output)
        return output_out, hidden_out, torch.cat((context1_out, context2_out), 1)

# Width of the hidden layer between the two LSTM cells.
hidden_size = 40
# The network reads and predicts symbols of the same alphabet, so the input
# and output sizes are both n_letters.
my_net = MyNet(n_letters, n_letters, hidden_size)

LSTM init() 11200
LSTM init() 8400


In [None]:
# Step size handed to the optimizer.
learning_rate = 0.005 # If you set this too high, it might explode. If too low, it might not learn
# Number of iterations of the training loop.
n_iters = 30000
# Print progress every `print_every` iterations.
print_every = 100
# Record an averaged loss every `plot_every` iterations.
plot_every = 100
# Loss accumulated since the last recorded point.
current_loss = 0
# Averaged losses collected during training.
all_losses = []

In [None]:
optimizer = optim.Adam(my_net.parameters(), lr = learning_rate)

def train(text):
    """Run one optimization step on ``text`` using next-character prediction.

    The network is fed the characters of ``text`` one at a time; at step ``i``
    it is trained to predict character ``i + 1``. The loss is the mean
    negative log-likelihood over all ``len(text) - 1`` predictions.

    Args:
        text: Training text, made of characters from ``all_letters``.

    Returns:
        A tuple ``(outputs, loss, params)``: the string of most likely
        predicted characters (one per step), the mean loss as a float, and
        the network's parameter iterator. For a text shorter than two
        characters there is nothing to predict; no step is taken and
        ``('', 0.0, params)`` is returned.
    """
    n_iteration = len(text) - 1
    if n_iteration < 1:
        # With no prediction step the loss would have no graph attached and
        # backward() would raise; skip the sample instead.
        return "", 0.0, my_net.parameters()

    # Initial recurrent state: combined context of both cells, hidden value
    # of the first cell, and previous output of the second cell.
    context = torch.zeros(1, hidden_size + n_letters)
    hidden = torch.zeros(1, hidden_size)
    output = torch.zeros(1, n_letters)
    loss = torch.zeros(1)
    text_tensor = textToTensor(text)
    text_index_tensor = textToIndexTensor(text)
    predicted = []

    my_net.zero_grad()

    for i in range(n_iteration):
        output, hidden, context = my_net(text_tensor[i], output, hidden, context)
        _, idx = output.max(1)
        predicted.append(all_letters[idx.item()])
        loss = loss + F.nll_loss(output, text_index_tensor[i + 1])

    loss = loss / n_iteration
    loss.backward()

    # Apply the accumulated gradients.
    optimizer.step()

    return ''.join(predicted), loss.item(), my_net.parameters()


def timeSince(since):
    """Format the time elapsed since ``since`` (a ``time.time()`` value) as ``'<m>m <s>s'``."""
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '%dm %ds' % (minutes, seconds)

# Reference time for the elapsed-time display in the training loop.
start = time.time()

In [None]:
# Main training loop: draw a sample, train on it, and periodically report
# progress and record the average loss.
current_loss = 0
trained_in_window = 0  # samples actually trained since the last recorded point
try:
    for iteration in range(1, n_iters + 1):
        text = next_inputs(train_dataset)
        if text != '':
            outputs, loss, params = train(text)
            current_loss += loss
            trained_in_window += 1

            # Print iteration number, progress, elapsed time, loss, the
            # target text and the network's prediction
            if iteration % print_every == 0:
                print('[%d] %d%% (%s) %.4f' % (iteration, iteration / n_iters * 100, timeSince(start), loss))
                print('- TO BE: %s' % text)
                print('- AS IS: %s' % outputs)

        # Record the average loss of the window. This runs even when the
        # sample was empty, so a skipped iteration on a window boundary
        # cannot merge two windows, and the average is taken over the
        # samples that were really trained.
        if iteration % plot_every == 0:
            if trained_in_window:
                all_losses.append(current_loss / trained_in_window)
            print(current_loss)
            current_loss = 0
            trained_in_window = 0
    print(all_losses)
finally:
    # Release the dataset file handles even if training is interrupted.
    close_dataset_files(train_dataset)

## 3. Demo

## References

- https://pytorch.org/tutorials/intermediate/char_rnn_generation_tutorial.html
- x