# 2019-10-03_fundamentallearning_pytorchCharRNN_generateName

see here https://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html

In [1]:
import requests
import os
import glob
import torch
import unicodedata
import string
import random
import time
import math
import logging

import torch.nn as nn
import torch.nn.functional as F

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

In [2]:
torch.cuda.is_available()

True

### Logging

In [3]:
log = logging.getLogger('DeepPockets')

# the base info stream
formatter1 = logging.Formatter('[%(asctime)s][%(levelname)s][%(name)s][%(funcName)s] %(message)s')
stream_handler1 = logging.StreamHandler()
stream_handler1.setLevel('DEBUG') # will print for anythin, provided
stream_handler1.setFormatter(formatter1)

# the base info stream
formatter2 = logging.Formatter('[%(asctime)s][%(levelname)s][%(name)s] %(message)s')
stream_handler2 = logging.StreamHandler()
stream_handler2.setLevel('INFO') # will print for anythin, provided
stream_handler2.setFormatter(formatter2)

log.addHandler(stream_handler1)
log.addHandler(stream_handler2)


In [4]:
log.setLevel('INFO')

In [5]:
log.info('hello')

[2019-12-03 21:38:28,720][INFO][DeepPockets][<module>] hello
[2019-12-03 21:38:28,720][INFO][DeepPockets] hello


In [6]:
log.debug('test')

## Data in

### Data download

In [7]:
url = 'https://download.pytorch.org/tutorial/data.zip'
if not os.path.isfile('data/pytorch_tutorial/data.zip'):
    r = requests.get(url)
    with open('data/pytorch_tutorial/data.zip', 'wb') as f:
        f.write(r.content)
fnames = glob.glob('data/pytorch_tutorial/data/names/*.txt')

### Data Cleaning and organising

In [8]:
all_letters = string.ascii_letters + " .,;'-"
n_letters = len(all_letters)+1 #accounts for the EOS character

def unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

def clean_names(name):
    return unicode_to_ascii(name.strip())
    

In [9]:
# dict matching category (language) to list of names
category_lines = {}
# list of all languages
all_categories = []
# number of categories
n_categories = 0
for fname in fnames:
    with open(fname, 'r') as f:
        category = os.path.basename(fname)[:-4]
        names = [clean_names(line) for line in f]
        all_categories.append(category)
        category_lines[category] = names
n_categories = len(all_categories)
print(all_categories, n_categories)

['Spanish', 'German', 'Polish', 'Russian', 'Chinese', 'Portuguese', 'Japanese', 'French', 'English', 'Korean', 'Irish', 'Arabic', 'Vietnamese', 'Dutch', 'Italian', 'Scottish', 'Czech', 'Greek'] 18


### Data to tensor

Need to code the words as one hot vectors of dimension [1,n_letters], the first dimension is the batch size (1)

thus each word needs to be a [word_length,1,n_letters] tensor of one hot vectors

In [10]:
def letter_to_tensor(l):
    l_t = torch.zeros(1, n_letters)
    l_t[0,all_letters.find(l)] = 1
    return l_t
    
def category_tensor(category):
    li = all_categories.index(category)
    tensor = torch.zeros(1, n_categories)
    tensor[0,li] = 1
    return tensor

def input_tensor(line):
    tensor = torch.zeros(len(line), 1, n_letters)
    for i,letter in enumerate(line):
        tensor[i,0,all_letters.find(letter)] = 1
    return tensor

def target_tensor(line):
    # all letter indices except first
    letter_index = [all_letters.find(letter) for letter in line][1:]
    # appends the EOF char, represented by the index number +1 from the end
    # of all_letters
    letter_index.append(n_letters - 1)
    return torch.LongTensor(letter_index)

In [11]:
input_tensor('abc').shape

torch.Size([3, 1, 59])

In [12]:
target_tensor('aaaZ-')

tensor([ 0,  0, 51, 57, 58])

In [25]:
def random_choice(l):
    return l[random.randint(0,len(l)-1)]

def random_example(subset=False):
    category = subset if subset else random_choice(all_categories)
    word = random_choice(category_lines[category])
    cat_t = category_tensor(category)
    in_t = input_tensor(word)
    target_t = target_tensor(word)
    return category, word,  cat_t, in_t, target_t
    

In [27]:
for i in range(10):
    category, word,  cat_t, in_t, target_t  = random_example()
    print(category, word,  cat_t.shape, in_t.shape, target_t.shape)

Japanese Kuramochi torch.Size([1, 18]) torch.Size([9, 1, 59]) torch.Size([9])
Russian Paidyshev torch.Size([1, 18]) torch.Size([9, 1, 59]) torch.Size([9])
Portuguese Pereira torch.Size([1, 18]) torch.Size([7, 1, 59]) torch.Size([7])
Russian Bagdasarov torch.Size([1, 18]) torch.Size([10, 1, 59]) torch.Size([10])
Portuguese Simoes torch.Size([1, 18]) torch.Size([6, 1, 59]) torch.Size([6])
Vietnamese Than torch.Size([1, 18]) torch.Size([4, 1, 59]) torch.Size([4])
German Boehler torch.Size([1, 18]) torch.Size([7, 1, 59]) torch.Size([7])
Polish Gajos torch.Size([1, 18]) torch.Size([5, 1, 59]) torch.Size([5])
Irish O'Sullivan torch.Size([1, 18]) torch.Size([10, 1, 59]) torch.Size([10])
Arabic Daher torch.Size([1, 18]) torch.Size([5, 1, 59]) torch.Size([5])


## make the network

In [15]:
import torch.nn as nn 

class RNN(nn.Module):
    
    def __init__(self, n_categories, data_size, hidden_size, output_size, gpu=False):
        super(RNN,self).__init__()
        self.combined_size = n_categories + data_size + hidden_size
        self.hidden_size = hidden_size
        self.i2o = nn.Linear(self.combined_size,output_size)
        self.i2h = nn.Linear(self.combined_size,hidden_size)
        self.o2o = nn.Linear(hidden_size + output_size, output_size)
        self.dropout = nn.Dropout(0.1)
        self.softmax = nn.LogSoftmax(dim=1)
        self.criterion = nn.NLLLoss()
        self.gpu=gpu
        if self.gpu:
            self.cuda()
    
    def forward(self, category, x, last_hidden):
        combined_data = torch.cat((category, x, last_hidden), 1)
        output = self.i2o(combined_data)
        hidden = self.i2h(combined_data)
        out_combined = torch.cat((output, hidden), 1)
        output = self.o2o(out_combined)
        output = self.dropout(output)
        output = self.softmax(output)
        return output, hidden
    
    def init_hidden(self):
        return torch.zeros(1, self.hidden_size)
    
    
    def train(self, cat_t,word_t,target_t,lr=0.0005):
        hidden = self.init_hidden()
        target_t.unsqueeze_(-1)
        if self.gpu:
            target_t = target_t.cuda()
            cat_t = cat_t.cuda()
            word_t = word_t.cuda()
            hidden = hidden.cuda()

        self.zero_grad()
        loss = 0

        for i in range(word_t.size()[0]):
            output, hidden = self(cat_t, word_t[i],hidden)
            l = self.criterion(output, target_t[i])
            loss += l

        loss.backward()

        for p in self.parameters():
            p.data.add_(-lr,p.grad.data)
        return output, loss.item()/word_t.size()[0]
    
    def generate(self, start = '', category='English'):
        cat_t = category_tensor(category)
        
        hidden = self.init_hidden()
        if self.gpu:
            hidden = hidden.cuda()
            cat_t = cat_t.cuda()
        if not(start):
            char = random_choice(all_letters)
            word = char
        else:
            char = start[-1]
            word = start
        scores = 0
        for i in range(20):
            char_t = letter_to_tensor(char)
            if self.gpu:
                char_t = char_t.cuda()
            output, hidden = self(cat_t, char_t,hidden)
            prediction = category_from_output(output)
            char = prediction[0]
            scores += prediction[2]
            if char == '#':
                break
            word = word + char
            
            
        return (word, scores/len(word))


In [16]:
n_hidden = 128
hidden = torch.zeros(1, n_hidden)
rnn = RNN(n_categories, n_letters, n_hidden, n_letters)
output, next_hidden = rnn(category_tensor('English'),letter_to_tensor('z'),hidden)

In [17]:
print(output.shape, next_hidden.shape)

torch.Size([1, 59]) torch.Size([1, 128])


In [18]:
def category_from_output(out):
    top_n, top_i = out.topk(1)
    index = top_i[0].item()
    score = top_n[0].item()
    if index < len(all_letters):
        category = all_letters[index]
    else:
        category ='#'
        
    return category, index, score

In [19]:
category_from_output(output)

('V', 47, -3.9471349716186523)

In [20]:
rnn.generate('','Japanese')

('mVVVVVVVIIVDIIVVVVVVI', -3.7515671366737005)

In [21]:
def time_since(start):
    now = time.time()
    dt = now - start
    mins = int(dt/60)
    return f'{mins:>5} min {dt - mins*60:>5.2f} secs'

In [42]:
n_hidden = 128
hidden = torch.zeros(1, n_hidden)
rnn = RNN(n_categories, n_letters, n_hidden, n_letters,gpu=False)

In [43]:
n_iters = 10000
print_every = 200

In [45]:
lang_restriction = 'Japanese'
losses = []
start = time.time()
for i in range(n_iters):
    cat,word,cat_t,word_t,target_t = random_example(lang_restriction)
    output, loss = rnn.train(cat_t,word_t,target_t)
    losses.append(loss)
    
    if (i+1)%print_every==0:
        av_loss = sum(losses[-print_every:])/print_every
        test_cat =lang_restriction
        testgen = rnn.generate(category=test_cat)
        test_string = (f'category:{test_cat:<15s} generated: {testgen[0]:<20s} score: {testgen[1]:<5.3f}')
        print(f'iter:{i+1:>6} | time:{time_since(start)} | done:{(i+1)/n_iters*100:>5.0f}% | loss:{av_loss:>10.4f} | {test_string}')

iter:   200 | time:    0 min  1.12 secs | done:    2% | loss:    2.2513 | category:Japanese        generated: zaka                 score: -1.354
iter:   400 | time:    0 min  2.25 secs | done:    4% | loss:    2.3097 | category:Japanese        generated: xakama               score: -1.172
iter:   600 | time:    0 min  3.35 secs | done:    6% | loss:    2.2443 | category:Japanese        generated: Maka                 score: -1.323
iter:   800 | time:    0 min  4.43 secs | done:    8% | loss:    2.2308 | category:Japanese        generated: Gaka                 score: -1.294
iter:  1000 | time:    0 min  5.55 secs | done:   10% | loss:    2.2849 | category:Japanese        generated: Gaka                 score: -1.416
iter:  1200 | time:    0 min  6.64 secs | done:   12% | loss:    2.2550 | category:Japanese        generated: goka                 score: -1.579
iter:  1400 | time:    0 min  7.75 secs | done:   14% | loss:    2.2454 | category:Japanese        generated: Mokata              

In [None]:
rnn.generate()

run on batches? see https://towardsdatascience.com/taming-lstms-variable-sized-mini-batches-and-why-pytorch-is-good-for-your-health-61d35642972e