# Character Level Text Generation

## 1. Prepare Dataset

In [1]:
from functools import reduce
import glob
import math
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import os
import os.path
import pprint
import random
import string
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import tarfile
import time
import unicodedata
import urllib.request
import zipfile

In [2]:
def bundle_dataset(list):
    files = [
        {
            'name': f,
            'path': os.path.join(data_path, f),
            'size': os.stat(os.path.join(data_path, f)).st_size,
            'handle': None
        }
        for f in list
    ]
    total_size = reduce((lambda x, y: x + y), map((lambda x: x['size']), files))
    return {
        'files': files,
        'total_size': total_size
    }

In [3]:
data_path = os.path.join('data', 'CBTest', 'data')
input_files =  os.listdir(data_path)

all_dataset = bundle_dataset([f for f in input_files])
train_dataset = bundle_dataset(['cbt_test.txt'])

pp = pprint.PrettyPrinter(indent=2)
pp.pprint(all_dataset)
pp.pprint(train_dataset)

{ 'files': [ { 'handle': None,
               'name': 'cbtest_V_train.txt',
               'path': 'data/CBTest/data/cbtest_V_train.txt',
               'size': 247098043},
             { 'handle': None,
               'name': 'cbtest_V_test_2500ex.txt',
               'path': 'data/CBTest/data/cbtest_V_test_2500ex.txt',
               'size': 5686625},
             { 'handle': None,
               'name': 'cbtest_V_valid_2000ex.txt',
               'path': 'data/CBTest/data/cbtest_V_valid_2000ex.txt',
               'size': 4460425},
             { 'handle': None,
               'name': 'cbt_valid.txt',
               'path': 'data/CBTest/data/cbt_valid.txt',
               'size': 1182697},
             { 'handle': None,
               'name': 'cbtest_CN_train.txt',
               'path': 'data/CBTest/data/cbtest_CN_train.txt',
               'size': 295933246},
             { 'handle': None,
               'name': 'cbt_test.txt',
               'path': 'data/CBTest/data/cbt_test.txt

## 2. Train

In [4]:
end_letter = "$"
_all_letters = "abcdefghijklmnopqrstuvwxyz .,'"
all_letters = _all_letters + end_letter
n_letters = len(all_letters)

In [5]:
def letterToIndex(letter):
    return all_letters.find(letter)

# Just for demonstration, turn a letter into a <1 x n_letters> Tensor
def letterToTensor(letter):
    tensor = torch.zeros(1, n_letters)
    tensor[0][letterToIndex(letter)] = 1
    return tensor

def indexToLetter(index):
    return all_letters[index]

# Turn a line into a <line_length x 1 x n_letters>,
# or an array of one-hot letter vectors
def textToTensor(text):
    tensor = torch.zeros(len(text), 1, n_letters)
    for li, letter in enumerate(text):
        tensor[li][0][letterToIndex(letter)] = 1
    return tensor

def textToIndexTensor(text):
    tensor = torch.zeros(len(text), 1, dtype=torch.long)
    for li, letter in enumerate(text):
        tensor[li][0] = letterToIndex(letter)
    return tensor

In [6]:
def findFiles(path): return glob.glob(path)

# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
# O'Néàl => O'Neal
def transform(s):
    s = s.lower()
    s = ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )
    s = ' '.join(s.split())
    s = s.replace(
        ' .', '.'
    ).replace(
        ' ,', ','
    )
    return s

print(transform("O'Néàl\":    , , ."))

o'neal,,.


In [7]:
print(all_letters, n_letters)
print(letterToIndex('a'))
print(letterToTensor('b'))
print(textToTensor('ab'))
print(textToIndexTensor('ab'))

abcdefghijklmnopqrstuvwxyz .,'$ 31
0
tensor([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])
tensor([[[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]])
tensor([[0],
        [1]])


In [8]:
max_lines = 50
min_size = 10

def next_inputs(bundled_dataset):
    files = bundled_dataset['files']
    total_size = bundled_dataset['total_size']
    
    file_idx = np.random.choice(len(files), p = [f['size'] / total_size for f in files])
    
    if files[file_idx]['handle'] == None:
        files[file_idx]['handle'] = open(files[file_idx]['path'], encoding='utf-8')
    
    cnt_lines = 0
    text = ''
    for i in range(max_lines):
        line = files[file_idx]['handle'].readline()
        if line == None:
            files[file_idx]['handle'].close()
            break
        text += line.strip()
        cnt_lines += 1

        if len(text) >= min_size:
            if random.random() < 0.5:
                break
    return transform(text)

def close_dataset_files(bundled_dataset):
    files = bundled_dataset['files']
    
    for f in files:
        if f['handle'] is not None:
            f['handle'].close()
            f['handle'] = None

print(next_inputs(train_dataset))
close_dataset_files(train_dataset)

booktitle andrewlangtheyellowfairybook.txt.out


In [9]:
class LSTM(nn.Module):
    def __init__(self, input_size, output_size):
        print('LSTM init()', (input_size + output_size) * output_size * 4)
        super(LSTM, self).__init__()

        self.wf = nn.Linear(input_size + output_size, output_size)
        self.wi = nn.Linear(input_size + output_size, output_size)
        self.wc = nn.Linear(input_size + output_size, output_size)
        self.wo = nn.Linear(input_size + output_size, output_size)

    def forward(self, input, output_in, context_in):
        input_combined = torch.cat((input, output_in), 1)
        f = torch.sigmoid(self.wf(input_combined))
        i = torch.sigmoid(self.wi(input_combined))
        c = torch.tanh(self.wc(input_combined))
        o = torch.sigmoid(self.wo(input_combined))
        
        ic = torch.mul(i,c)
        
        context_out = torch.mul(context_in, f) + ic
        output_out = torch.mul(torch.tanh(context_out), o)

        return context_out, output_out

class MyNet(nn.Module):
    def __init__(self, input_size, output_size, hidden_size):
        super(MyNet, self).__init__()
        
        self.hidden_size = hidden_size

        self.lstm1 = LSTM(n_letters, hidden_size)
        self.dropout = nn.Dropout()
        self.lstm2 = LSTM(hidden_size, n_letters)
        self.softmax = nn.LogSoftmax(dim=1)
        
    def forward(self, input, output_in, hidden_in, context_in):
        context1_out, hidden_out = self.lstm1(input, hidden_in, context_in[0][:self.hidden_size])
        hidden_out = self.dropout(hidden_out)
        context2_out, output = self.lstm2(hidden_out, output_in, context_in[0][self.hidden_size:])
        output_out = self.softmax(output)
        return output_out, hidden_out, torch.cat((context1_out, context2_out), 1)

hidden_size = 25
my_net = MyNet(n_letters, n_letters, hidden_size)

LSTM init() 5600
LSTM init() 6944


In [10]:
learning_rate = 0.005 # If you set this too high, it might explode. If too low, it might not learn
n_iters = 50000
print_every = 500
plot_every = 500
current_loss = 0
all_losses = []

In [11]:
optimizer = optim.Adam(my_net.parameters(), lr = learning_rate)

def train(text):
    context = torch.zeros(1, hidden_size + n_letters) # memory
    hidden = torch.zeros(1, hidden_size)
    output = torch.zeros(1, n_letters)
    loss = torch.zeros(1)
    text_tensor = textToTensor(text)
    text_index_tensor = textToIndexTensor(text)
    outputs = ""
    n_iteration = len(text) - 1

    my_net.zero_grad()

    for i in range(n_iteration):
        output, hidden, context = my_net(text_tensor[i], output, hidden, context)
        _, idx = output.max(1)
        c = all_letters[idx[0].data]
        outputs += c
        l = F.nll_loss(output, text_index_tensor[i + 1])
        loss += l
    
    loss = loss / n_iteration
    loss.backward()

    # Add parameters' gradients to their values, multiplied by learning rate
#     for p in lstm.parameters():
#         p.data.add_(-learning_rate, p.grad.data)
    optimizer.step()

    return outputs, loss.item()

In [12]:
def sample(context=torch.zeros(1, hidden_size + n_letters)):
    output_text = ''
    hidden = torch.zeros(1, hidden_size)
    output = torch.zeros(1, n_letters)
    input_ = torch.zeros(1, n_letters)

    n_max_len = 500

    for i in range(n_max_len):
        tmp_output, hidden, context = my_net(input_, output, hidden, context)
        val,idx = torch.max(tmp_output, 1)
        c = all_letters[idx[0].data]
        output_text += c
        if c == end_letter:
            break
        
        output = tmp_output
        input_ = textToTensor(c)[0]

    return output_text, context

def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

start = time.time()

In [13]:
current_loss = 0
for iter in range(1, n_iters + 1):
    text = next_inputs(train_dataset) + end_letter
    if text == '':
        continue;
    outputs, loss = train(text + end_letter)
    current_loss += loss

    # Print iter number, loss, name and guess
    if iter % print_every == 0:
        print('[%d] %d%% (%s) %.4f' % (iter, iter / n_iters * 100, timeSince(start), loss))
        print('- TO BE: %s' % text)
        print('- AS IS: %s' % outputs)
        print('- Sample: %s' % sample()[0])

    # Add current loss avg to list of losses
    if iter % plot_every == 0:
        all_losses.append(current_loss / plot_every)
        print(current_loss)
        current_loss = 0

print(all_losses)
close_dataset_files(train_dataset)

[500] 1% (0m 59s) 2.8371
- TO BE: go back, ' she said, to the castle, and bury this little diamond close to the door.$
- AS IS: h                                                                                   
- Sample: e                            a     h                  a                            h       h         a     h          h   a                                h h               a h                                     t                   h        h  h          a          h                                            a                                          h           h   h             a            h         h                                 h            h   a      a       a   t                     
1516.558836221695
[1000] 2% (2m 17s) 2.8008
- TO BE: impatient at not being recognised, fairerthanafairy now drew out her third present, and on opening the crystal scentbottle a little syren flew out, who silenced the violins and then sang close to the prince 's ear the story

[5000] 10% (9m 1s) 2.7789
- TO BE: the little room was flooded with a mellow light from the pinkglobed lamp on the table, and in the soft, shadowy radiance she was as beautiful as a dream.she wore a dress of crepe, cut low in the neck.$
- AS IS: he he     aeo  ae  aee     aen  anae     aa    aoe eahe aon   e    ae eeanehee ah     a   a  hhe ao    aae     ao  n  h aae aan an aea       a  hnaee  e    aa   anaee   a  aoeaea aa  ae  a  ahe ae e   
- Sample: o he an an  an an an an an a an a an an a a he an a he a an an an an a he an he a an a an he he an a he an a a ne a a an a an a a  a a an an a $
1394.6834952831268
[5500] 11% (9m 41s) 2.8105
- TO BE: the next day mrs. elwell said, ches, abner stearns wants you to go up there for a fortnight while tom bixby is away, and drive the milk wagon of mornings and do the chores for mrs. stearns.$
- AS IS: he ae   ao  aeee aa     ao    hhe   a     aae     ao    aa  ah aa a eahe   he  hnhe        aae   ahoeaen e a  a     a   aoe e ahe aen eae h  a

[11500] 23% (14m 10s) 2.0528
- TO BE: $
- AS IS: $
- Sample: e an na an an an a an an an an an an an an an a  an nea  an an an an an an a  an an an an an a an non a nea ne an an an an an an an no an na an an an a an an an an an an an an an an an a an  an ne an an an an a an an no an a nean an an an an a an an an a  an an a an an an no an an an ne an an an an a an a a a an an  an a a  on a a an an  no an an an an  an a  an a an an an an an a no an an  an  an an an an an an an an a an a noon an a an nea an an an an  an non an an an an an an an an an an an a
1026.615356206894
[12000] 24% (14m 11s) 2.0529
- TO BE: $
- AS IS: $
- Sample: he a an an an an an an an a an an an an an an an an an an an an a  an an an an an an an a  on an a an an nean a an ne an an a  an an an no a a an an an  an an an an a an an an an an an an  an an a  on a an an an an a a  an a an an an an a a no a an an  an a  an ne an an an a an an an an an an an an an a an an an an an an an an no nea a an an an an nee  one 

[43500] 87% (15m 13s) 2.0203
- TO BE: $
- AS IS: $
- Sample: $
1010.1420891284943
[44000] 88% (15m 14s) 2.0203
- TO BE: $
- AS IS: $
- Sample: $
1010.1420888900757
[44500] 89% (15m 15s) 2.0203
- TO BE: $
- AS IS: $
- Sample: $
1010.1420893669128
[45000] 90% (15m 16s) 2.0203
- TO BE: $
- AS IS: $
- Sample: $
1010.1421797275543
[45500] 91% (15m 17s) 2.0203
- TO BE: $
- AS IS: $
- Sample: $
1010.14209151268
[46000] 92% (15m 18s) 2.0203
- TO BE: $
- AS IS: $
- Sample: $
1010.1420974731445
[46500] 93% (15m 19s) 2.0203
- TO BE: $
- AS IS: $
- Sample: $
1010.1420893669128
[47000] 94% (15m 20s) 2.0203
- TO BE: $
- AS IS: $
- Sample: $
1010.1421117782593
[47500] 95% (15m 21s) 2.0203
- TO BE: $
- AS IS: $
- Sample: $
1010.1420912742615
[48000] 96% (15m 21s) 2.0203
- TO BE: $
- AS IS: $
- Sample: $
1010.1420965194702
[48500] 97% (15m 22s) 2.0203
- TO BE: $
- AS IS: $
- Sample: $
1010.14208984375
[49000] 98% (15m 23s) 2.0203
- TO BE: $
- AS IS: $
- Sample: $
1010.1420922279358
[49500] 99% (15m 24s

## 3. Demo

In [14]:
context = torch.zeros(1, hidden_size + n_letters)
for i in range(10):
    output_text, context = sample(context=context)
    print(output_text)

$
$
$
$
$
$
$
$
$
$


## References

- https://pytorch.org/tutorials/intermediate/char_rnn_generation_tutorial.html
- x