<a href="https://colab.research.google.com/github/harryypham/MyMLPractice/blob/main/nlp/lang_translation_draft.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
import os
import torch
from collections import Counter

In [40]:
# Root directory of the corpus; load_data() joins it with the split name.
data_path = 'data'
# Language codes; load_data() opens '<code>.txt' for each of them.
src_lang, tgt_lang = 'en', 'vi'

def load_data(data_path, split):
  """Read the raw source and target text of one dataset split.

  The files are expected at ``<data_path>/<split>/<src_lang>.txt`` and
  ``<data_path>/<split>/<tgt_lang>.txt`` (``src_lang`` / ``tgt_lang`` are
  module-level settings) and are decoded as UTF-8.

  Args:
    data_path: root directory of the dataset.
    split: sub-directory name of the split, e.g. 'train'.

  Returns:
    A ``(src_data, tgt_data)`` tuple holding the full contents of each file
    as a single string.
  """
  split_dir = os.path.join(data_path, split)

  def read_text(lang):
    # One file per language, named after its language code.
    with open(os.path.join(split_dir, lang + '.txt'), 'r', encoding='utf-8') as f:
      return f.read()

  return read_text(src_lang), read_text(tgt_lang)

def load_dataset(src_data, tgt_data, src_encode, tgt_encode, tokenizer):
  """Turn two parallel newline-separated corpora into pairs of id lists.

  Lines are paired positionally with ``zip``, so any surplus lines of the
  longer corpus are dropped.

  Args:
    src_data: source corpus as one string, one sentence per line.
    tgt_data: target corpus as one string, one sentence per line.
    src_encode: callable mapping a list of source tokens to ids.
    tgt_encode: callable mapping a list of target tokens to ids.
    tokenizer: callable mapping a sentence string to a list of tokens.

  Returns:
    A list of ``[src_ids, tgt_ids]`` pairs, one per line pair.
  """
  line_pairs = zip(src_data.split('\n'), tgt_data.split('\n'))
  return [
    [src_encode(tokenizer(src_line)), tgt_encode(tokenizer(tgt_line))]
    for src_line, tgt_line in line_pairs
  ]

def tokenizer(s):
  """Split a sentence on single spaces and wrap it in '<bos>' / '<eos>'.

  Consecutive spaces are not collapsed, so they produce empty-string tokens.
  """
  tokens = s.split(' ')
  tokens.insert(0, '<bos>')
  tokens.append('<eos>')
  return tokens

def build_vocab(data, min_freq=2):
  """Collect the words of a corpus that occur at least ``min_freq`` times.

  Newlines are treated like spaces and the text is split on single spaces,
  so runs of spaces contribute empty-string tokens that are counted like
  any other word.

  Args:
    data: the whole corpus as one string.
    min_freq: minimum number of occurrences for a word to be kept.

  Returns:
    The kept words as a list, in order of first appearance.
  """
  tokens = data.replace('\n', ' ').split(' ')
  counts = Counter(tokens)
  return [word for word, count in counts.items() if count >= min_freq]

def build_tokenizer(vocab):
  """Create ``encode`` / ``decode`` functions for a vocabulary.

  Ids 0-3 are reserved for '<unk>', '<bos>', '<eos>' and '<pad>'; the words
  of ``vocab`` are numbered from 4 upward in list order.

  Args:
    vocab: list of words.

  Returns:
    ``(encode, decode)`` where ``encode`` maps a list of tokens to a list of
    ids (unknown tokens become the '<unk>' id) and ``decode`` maps a list of
    ids back to a space-joined string, skipping padding.
  """
  stoi = {word: idx for idx, word in enumerate(vocab, start=4)}
  # Special tokens are written last so they always own ids 0-3.
  stoi.update({'<unk>': 0, '<bos>': 1, '<eos>': 2, '<pad>': 3})
  itos = dict(zip(stoi.values(), stoi.keys()))

  unk_id = stoi['<unk>']
  pad_id = stoi['<pad>']

  def encode(s):
    return [stoi.get(token, unk_id) for token in s]

  def decode(l):
    words = [itos[idx] for idx in l if idx != pad_id]
    return ' '.join(words)

  return encode, decode

def get_batch(data, batch_size, pad_id=3):
  """Draw a random batch (with replacement) and right-pad it into tensors.

  Args:
    data: sequence of ``[src_ids, tgt_ids]`` pairs, each a list of ints.
    batch_size: number of examples to sample.
    pad_id: id appended to shorter sequences. Defaults to 3, the '<pad>' id
      assigned by ``build_tokenizer``.

  Returns:
    ``(x, y)``: two ``torch.long`` tensors of shape
    ``(batch_size, longest source in the batch)`` and
    ``(batch_size, longest target in the batch)``.
  """
  # Convert the sampled indices to plain ints once instead of indexing the
  # Python list with 0-d tensors on every access.
  picks = torch.randint(len(data), (batch_size,)).tolist()
  pairs = [data[i] for i in picks]

  def pad_stack(seqs):
    # Right-pad every sequence to the longest one in this batch.
    width = max(len(seq) for seq in seqs)
    padded = [seq + [pad_id] * (width - len(seq)) for seq in seqs]
    return torch.tensor(padded, dtype=torch.long)

  x = pad_stack([pair[0] for pair in pairs])
  y = pad_stack([pair[1] for pair in pairs])
  return x, y

In [41]:
data = load_data(data_path, 'train')
src_vocab = build_vocab(data[0])
tgt_vocab = build_vocab(data[1])
# build_tokenizer() reserves ids 0-3 for <unk>/<bos>/<eos>/<pad> and numbers
# the vocabulary words from 4 upward, so the largest id is len(vocab) + 3.
# The sizes handed to embedding / output layers must therefore include the
# four special tokens, otherwise the last four words index out of range.
src_vocab_size = len(src_vocab) + 4
tgt_vocab_size = len(tgt_vocab) + 4
# Report the number of distinct words kept (special tokens excluded).
print(len(src_vocab), len(tgt_vocab))
src_encode, src_decode = build_tokenizer(src_vocab)
tgt_encode, tgt_decode = build_tokenizer(tgt_vocab)
train_data = load_dataset(data[0], data[1], src_encode, tgt_encode, tokenizer)
# Sanity check: sample one small padded batch.
xb, yb = get_batch(train_data, 4)
print(xb, yb)

32258 14318
tensor([[    1,  3627,  2260,     7,  4827,    29,   516, 12885,  1393,    47,
          1736, 13221,    13, 21634, 21635,   600,     7,  8084,    57,   621,
           441,    51,   662,  3976,  5481,    70,  5160,  7780,    51,  3607,
           122,  5436,   150,   370, 21636,    13, 21637, 21638,    70,  4777,
          1662,    20, 21639, 21640,    47,     2,     3],
        [    1,     4, 22612, 10838,   127,   647,   983,   138,   200,  4087,
            20, 14232,    13,   212,   460,    48,   133,   329,  2492,    13,
           478,   173,   380,    53,     7,   692,  1862,    20,     7, 30498,
            47,     2,     3,     3,     3,     3,     3,     3,     3,     3,
             3,     3,     3,     3,     3,     3,     3],
        [    1,   110,   516,    33,   214,  1280,   387,  4406,   166, 15946,
           113,    13,   211,    13,   138,  2379,  1893,    13,    21,   130,
           670,  3853,  5352,   149,    47,     2,     3,     3,     3,     3,
 

In [38]:
# Sanity check: decode the first source row of the batch back to text
# (src_decode drops the padding id).
print(src_decode(xb[0].tolist()))

<bos> In fact , right now -- plug it from here , and then plug it in here , and now let &apos;s see if it gets my facial expressions . <eos>


In [46]:
import torch.nn as nn
import torch.nn.functional as F

# Use the first CUDA GPU when one is available, otherwise fall back to CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

class LSTM(nn.Module):
  """Token embedding followed by a multi-layer LSTM and a linear head.

  With ``encoder=True`` the head maps hidden states to ``hidden_size``
  features; otherwise it maps them to ``vocab_size`` logits.

  Args:
    hidden_size: size of the LSTM hidden state.
    emb_dim: size of the token embeddings.
    num_layers: number of stacked LSTM layers.
    vocab_size: number of rows in the embedding table (and the output size
      of the head when ``encoder`` is False).
    encoder: selects the output size of the head, see above.
  """

  def __init__(self, hidden_size, emb_dim, num_layers, vocab_size, encoder=True):
    super().__init__()
    self.embedding_table = nn.Embedding(vocab_size, emb_dim)
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.lstm = nn.LSTM(emb_dim, hidden_size, num_layers, batch_first=True)
    if encoder:
      self.fc = nn.Linear(hidden_size, hidden_size)
    else:
      self.fc = nn.Linear(hidden_size, vocab_size)

  def forward(self, x, targets=None):
    """Run the network on a batch of token ids.

    Args:
      x: integer tensor of token ids, batch dimension first.
      targets: optional integer tensor with one class id per position of
        ``x``. When given, a cross-entropy loss is computed over every
        position (padding positions included; no ignore index is set).

    Returns:
      ``(logits, loss)``. Without targets, ``logits`` is the head applied
      to the last time step only, shape ``(batch, out_features)``, and
      ``loss`` is None. With targets, ``logits`` has shape
      ``(batch * time, out_features)`` and ``loss`` is a scalar tensor.
    """
    emb = self.embedding_table(x)

    # Create the initial states from the embedding output so they always
    # share its device and dtype, rather than relying on a module-level
    # `device` global that may differ from where the model actually lives.
    state_shape = (self.num_layers, x.size(0), self.hidden_size)
    h0 = emb.new_zeros(state_shape)
    c0 = emb.new_zeros(state_shape)

    out, _ = self.lstm(emb, (h0, c0))

    if targets is None:
      # Inference / encoding: only the final time step is projected.
      return self.fc(out[:, -1, :]), None

    B, T, C = out.shape
    logits = self.fc(out.reshape(B * T, C))
    # reshape (not view) so non-contiguous target tensors are accepted too.
    loss = F.cross_entropy(logits, targets.reshape(-1))
    return logits, loss


In [47]:
# Move both the model and the batch to `device` so that parameters, inputs
# and LSTM states all live on the same device (required when CUDA is used).
encoder = LSTM(256, 64, 2, src_vocab_size).to(device)
logits, loss = encoder(xb.to(device))
print(logits.shape)

torch.Size([4, 256])
