In [4]:
import os

In [5]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [None]:
from sklearn.model_selection import train_test_split

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, ConcatDataset

In [None]:
from tqdm import tqdm

## Path

In [6]:
# Input data directory, given relative to the notebook's working directory.
DATA_PATH = '../data'
# Corpus file path: DATA_PATH/vietnamese_names.txt.
CORPUS = os.path.join(DATA_PATH, 'vietnamese_names.txt')
# Output directory where the tokenizer and the model weights are saved.
WEIGHT_PATH = '../weight'

In [None]:
# Create the output directory if it is missing. exist_ok=True removes the
# check-then-create race and keeps the cell safe to re-run.
os.makedirs(WEIGHT_PATH, exist_ok=True)

## Preprocess data

In [3]:
# Read one entry per line. The encoding is pinned to UTF-8 so accented
# characters decode the same way on every platform, and blank lines (including
# the empty string produced by a trailing newline) are dropped so they do not
# turn into empty '^$' samples in the next step.
with open('mini.txt', encoding='utf-8') as f:
    data = [line for line in f.read().split('\n') if line.strip()]

In [4]:
def preprocess(sent):
    """Return ``sent`` wrapped in the start ('^') and end ('$') marker characters."""
    return ''.join(('^', sent, '$'))

In [5]:
# Add the '^'/'$' boundary markers to every line.
# NOTE: this rebinds `data`, so re-running the cell wraps the strings again.
data = list(map(preprocess, data))

## Tokenize

In [6]:
# Character-level tokenizer: no characters are filtered out, and '*' is the
# placeholder for characters that were not seen while fitting.
tokenizer = Tokenizer(char_level=True, filters='', split='', oov_token='*')

In [7]:
# Index used for padding; it is later passed as `ignore_index` to the loss so
# padded positions do not contribute.
pad_token = 0
# Index reserved for the out-of-vocabulary token.
# NOTE(review): assumes the Keras Tokenizer assigns index 1 to `oov_token` — confirm for the installed version.
unknow_token = 1

In [8]:
# Build the character vocabulary from every line; the train/validation split
# happens later, so both splits contribute to the vocabulary.
tokenizer.fit_on_texts(data)

In [9]:
# Encode each line as a list of integer character indices.
seq_all = tokenizer.texts_to_sequences(data)

In [10]:
# Length of every encoded sequence, and the longest one (used as the padded width).
len_all = list(map(len, seq_all))
max_len = max(len_all)

In [11]:
# Pad at the end ('post') so every sequence has length `max_len`; the result is
# a 2-D array, which the column slicing in the next cell relies on.
seq_all = pad_sequences(seq_all, maxlen=max_len, padding='post')

In [12]:
# Shifted pairs for next-character prediction: inputs drop the last column and
# targets drop the first, so y_all[:, t] is the character following X_all[:, t].
X_all = seq_all[:,:-1]
y_all = seq_all[:,1:]

In [13]:
# Hold out 20% of the sequences for validation; the fixed random_state keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

In [None]:
# Save the fitted tokenizer so the same character-to-index mapping can be reloaded later.
# NOTE: torch.save pickles the object; despite the '.h5' extension this is not
# an HDF5 file, and it should only be loaded (unpickled) from a trusted source.
torch.save(tokenizer, os.path.join(WEIGHT_PATH, 'vocab.h5'))

## Data loader

In [15]:
def build_dataset_from_tensors(X, y):
    """Pair inputs ``X`` with targets ``y`` in a ``TensorDataset`` and return it."""
    return TensorDataset(X, y)

In [16]:
# Convert the four split arrays to int64 tensors in one pass.
X_train, y_train, X_val, y_val = (
    torch.tensor(arr).long() for arr in (X_train, y_train, X_val, y_val)
)

In [17]:
# Wrap each split as a dataset of (input, target) pairs.
train_ds = build_dataset_from_tensors(X_train, y_train)
val_ds = build_dataset_from_tensors(X_val, y_val)

In [18]:
# Number of sequences per mini-batch.
batch_size = 32
# Shuffle flag passed to DataLoader.
shuffle = True

In [19]:
# Shuffle only the training batches. Validation is iterated in a fixed order:
# shuffling it adds nothing to the aggregate loss and makes per-batch output
# harder to compare between runs.
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=shuffle)
val_dl = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

## Model

In [9]:
class Model(nn.Module):
    """Character-level sequence model: embedding -> stacked LSTM -> linear projection.

    Args:
        vocab_size: Number of distinct token indices (embedding rows and output classes).
        embeding_size: Width of each token embedding.
        hidden_size: LSTM hidden units per direction.
        num_layers: Number of stacked LSTM layers (default 2, the original setting).
        bidirectional: Whether the LSTM also runs right-to-left (default True,
            the original setting, so existing saved weights keep loading).
        dropout: Dropout between LSTM layers (default 0.1, the original setting).
            It is ignored when ``num_layers`` is 1, where PyTorch applies no
            inter-layer dropout.

    NOTE(review): with ``bidirectional=True`` the backward direction reads
    positions after ``t``. When the targets are the inputs shifted by one
    position (next-token prediction), the network can therefore see the token
    it is asked to predict. Pass ``bidirectional=False`` for a causal model.
    """

    def __init__(self, vocab_size, embeding_size, hidden_size,
                 num_layers=2, bidirectional=True, dropout=0.1):
        super(Model, self).__init__()
        self.embeding = nn.Embedding(vocab_size, embeding_size)
        self.lstm = nn.LSTM(
            embeding_size,
            hidden_size,
            num_layers=num_layers,
            bidirectional=bidirectional,
            batch_first=True,
            # Inter-layer dropout only exists when there is more than one layer.
            dropout=dropout if num_layers > 1 else 0.0,
        )
        # The LSTM output concatenates one hidden vector per direction.
        num_directions = 2 if bidirectional else 1
        self.linear = nn.Linear(num_directions * hidden_size, vocab_size)

    def forward(self, x):
        """Return unnormalised scores of shape BxSxV for index input ``x`` of shape BxS."""
        x = self.embeding(x)  # BxSxE
        x, _ = self.lstm(x)   # BxSx(D*H), D = number of directions
        x = self.linear(x)    # BxSxV
        return x

    def predict(self, x):
        """Return per-position probabilities over the vocabulary, shape BxSxV."""
        x = self.forward(x)
        x = F.softmax(x, dim=-1)  # BxSxV, each vector along the last axis sums to 1
        return x

In [34]:
def forward_and_loss(model, x, y, loss_fn, pad_token):
    """Run ``model`` on ``x`` and score its output against ``y``.

    Args:
        model: Callable returning scores whose last dimension is the class dimension.
        x: Input batch, passed to ``model`` unchanged.
        y: Target indices, one per prediction position in the model output.
        loss_fn: Callable invoked as ``loss_fn(scores, targets, ignore_index=...)``,
            e.g. ``torch.nn.functional.cross_entropy``.
        pad_token: Target index that is excluded from the loss.

    Returns:
        Tuple ``(out, loss)``: the raw model output and the loss value.
    """
    out = model(x)
    # reshape (rather than view) also flattens non-contiguous tensors, e.g.
    # transposed or sliced targets, instead of raising a RuntimeError.
    scores = out.reshape(-1, out.size(-1))
    targets = y.reshape(-1)
    loss = loss_fn(scores, targets, ignore_index=pad_token)
    return out, loss

In [3]:
def train_model(model, optim, train_iter, loss_fn, pad_token, weight_path=None):
    """Train ``model`` for one pass over ``train_iter``.

    Args:
        model: Module to train; it is switched to training mode.
        optim: Optimizer that updates ``model``'s parameters.
        train_iter: Sized iterable that yields ``(x, y)`` batches.
        loss_fn: Loss callable forwarded to ``forward_and_loss``.
        pad_token: Target index ignored by the loss.
        weight_path: If given, the model and optimizer state dicts are saved
            to this path after the pass.

    Returns:
        The sum of the per-batch loss values for this pass.
    """
    total_loss = 0.0
    model.train()
    with tqdm(total=len(train_iter)) as pbar:
        for x, y in train_iter:
            # Use the optimizer that was passed in, not a notebook-level global.
            optim.zero_grad()
            _, loss = forward_and_loss(model, x, y, loss_fn, pad_token=pad_token)
            loss.backward()
            optim.step()
            total_loss += loss.item()
            pbar.update(1)
            # The displayed value is the running sum of batch losses.
            pbar.set_description("%-10s = %.6f  " % ('loss', total_loss))

    # Save model
    if weight_path is not None:
        state = {
            "model": model.state_dict(),
            "optim": optim.state_dict()
        }

        torch.save(state, weight_path)

    return total_loss

In [2]:
def evaluate_model(model, val_iter, pad_token, loss_fn=F.cross_entropy):
    """Compute the loss of ``model`` over ``val_iter`` without updating it.

    Args:
        model: Module to evaluate; it is switched to evaluation mode.
        val_iter: Sized iterable that yields ``(x, y)`` batches.
        pad_token: Target index ignored by the loss.
        loss_fn: Loss callable forwarded to ``forward_and_loss``. Defaults to
            ``F.cross_entropy``, the function this routine always used before.

    Returns:
        The sum of the per-batch loss values over ``val_iter``.
    """
    model.eval()
    total_loss = 0.0
    # Gradients are not needed for evaluation.
    with torch.no_grad(), tqdm(total=len(val_iter)) as pbar:
        for x, y in val_iter:
            _, loss = forward_and_loss(model, x, y, loss_fn, pad_token=pad_token)
            total_loss += loss.item()
            pbar.update(1)
            # The displayed value is the running sum of batch losses.
            pbar.set_description("%-10s = %.6f  " % ('val_loss', total_loss))
    return total_loss

## Training

In [26]:
# One more than the number of fitted tokens, leaving room for index 0 (padding).
# NOTE(review): assumes tokenizer indices start at 1 — confirm.
vocab_size = len(tokenizer.word_index) + 1
embedding_size = 200  # width of each character embedding
hidden_size = 512  # LSTM hidden units per direction
learning_rate = 0.0001  # step size given to the optimizer
loss_fn = F.cross_entropy  # invoked with ignore_index=pad_token in forward_and_loss

In [None]:
# Build the network; `embedding_size` is bound positionally to Model's `embeding_size` parameter.
model = Model(vocab_size, embedding_size, hidden_size)

In [28]:
# Adam over all model parameters, with explicitly chosen betas and eps.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, betas=(0.9, 0.98), eps=1e-9)

In [1]:
# Number of passes over the training set made by the training loop.
num_epoch = 100

In [None]:
# One training pass followed by one validation pass per epoch. No weight_path
# is given to train_model, so nothing is checkpointed inside the loop.
for i in range(1, num_epoch+1):
    print("\nEpoch %02d" % i, flush=True)
    train_model(model, optimizer, train_dl, loss_fn, pad_token)
    evaluate_model(model, val_dl, pad_token)    

In [None]:
# Persist the final parameters. torch.save writes a pickle-based file; the
# '.h5' extension does not make it HDF5.
torch.save(model.state_dict(), os.path.join(WEIGHT_PATH, 'model.h5'))