<a href="https://colab.research.google.com/github/jdasam/aat3020-2023/blob/main/notebooks/3_Language_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
import matplotlib.pyplot as plt
from tqdm.auto import tqdm


In [None]:
!wget "https://raw.githubusercontent.com/karpathy/makemore/master/names.txt"

In [None]:
def read_txt(txt_path):
  """Return the lines of a text file as a list of strings.

  Args:
    txt_path: Path of the text file to read.

  Returns:
    A list with one string per line; each line keeps its trailing
    newline, exactly as produced by ``readlines()``.
  """
  # Explicit encoding so the result does not depend on the platform default.
  with open(txt_path, 'r', encoding='utf-8') as f:
    return f.readlines()

# Read every line of names.txt (lines still carry their trailing newline).
txt_string = read_txt('names.txt')

In [None]:
# Strip only the line terminator; slicing off the last character would also
# chop a real letter from a final line that has no trailing newline.
names = [line.rstrip('\n') for line in txt_string]
len(names)

In [None]:
# 2-gram (bigram) extraction.
# The cells below index the counter with two-character keys ('ab',
# start+end), so n must be 2 for those lookups to find anything.
entire_ngram = []
n = 2

for name in names:
  # A name shorter than n gives an empty range and contributes nothing.
  entire_ngram.extend(name[i:i+n] for i in range(len(name) - (n - 1)))

In [None]:
from collections import Counter

# Count how many times each distinct n-gram occurs.
n_gram_counter = Counter(entire_ngram)

In [None]:
# How many distinct n-grams exist?
len(n_gram_counter)

In [None]:
# Show the 10 most frequent n-grams together with their counts.
n_gram_counter.most_common(10)

In [None]:
# Distinct n-grams in sorted order (iterating a Counter yields its keys).
n_gram_keys = sorted(n_gram_counter)
n_gram_keys

In [None]:
# Occurrence count of the n-gram 'ab' (a Counter returns 0 for unseen keys).
n_gram_counter['ab']

In [None]:
# Estimate P(n-gram | first character) for one example starting character.
# Sorting makes the chosen example reproducible; the iteration order of a
# set of strings is arbitrary and may differ between runs.
starting_characters = sorted(set(x[0] for x in n_gram_keys))

example_chr = starting_characters[0]
# All n-grams that begin with the example character, and their total count.
corresp_ngram_words = [x for x in n_gram_keys if x[0] == example_chr]
total_chr_appearance = sum(n_gram_counter[word] for word in corresp_ngram_words)

# Relative frequency of each of those n-grams; the values sum to 1.
prob_of_words = [n_gram_counter[x] / total_chr_appearance for x in corresp_ngram_words]


In [None]:
# Display the probabilities of the n-grams that start with example_chr.
prob_of_words

In [None]:
# Sorted distinct first characters over all n-grams, and how many there are.
first_chars = {key[0] for key in n_gram_keys}
starting_characters = sorted(first_chars)
len(starting_characters)

In [None]:
# Make transition matrix
import numpy as np

# Alphabet: every character that starts some n-gram, in sorted order.
chrs = sorted(set(x[0] for x in n_gram_keys))
# np.int was removed in NumPy 1.24; the builtin int is its replacement.
# Size the matrix from the alphabet instead of hard-coding 26.
transition = np.zeros([len(chrs), len(chrs)], dtype=int)
chrs

In [None]:
# Store, for every (first, second) character pair, how often that
# two-character n-gram was counted.
for row_idx, first_chr in enumerate(chrs):
  for col_idx, second_chr in enumerate(chrs):
    pair = first_chr + second_chr
    transition[row_idx, col_idx] = n_gram_counter[pair]

In [None]:
import torch

# Convert the NumPy count matrix into a torch tensor.
transition = torch.tensor(transition)
# Print tensors in plain decimal notation instead of scientific notation.
torch.set_printoptions(sci_mode=False)

In [None]:
# Normalize every row by its total so each row becomes a probability
# distribution over the following character.
row_totals = transition.sum(dim=1, keepdim=True)
trans_prob = transition / row_totals
trans_prob

In [None]:
# Sanity check: after row normalization the first row should sum to 1.
trans_prob[0].sum()

In [None]:
# Inspect one entry: the characters at indices 16 and 20, and the
# probability stored at trans_prob[16, 20].
chrs[16], chrs[20], trans_prob[16, 20]

In [None]:
import matplotlib.pyplot as plt

# Visualize the transition probability matrix as an image.
plt.imshow(trans_prob)

## Neural Network

In [None]:
import torch
import torch.nn

class Dataset:
  """Character-level dataset that serves names as tensors of token indices.

  The vocabulary ``chrs`` is ``['0', '.']`` followed by the sorted distinct
  characters found in the names, and ``tok2idx`` maps each entry to its
  position. Every item is wrapped in '.' on both sides before encoding.
  """

  def __init__(self, list_of_names):
    self.names = list_of_names
    # Gather every distinct character used by any name.
    unique_chars = set()
    for name in self.names:
      unique_chars.update(name)
    self.chrs = ['0', '.'] + sorted(unique_chars)
    self.tok2idx = {tok: pos for pos, tok in enumerate(self.chrs)}

  def __len__(self):
    return len(self.names)

  def __getitem__(self, idx):
    # Surround the name with '.' markers, then map characters to indices.
    wrapped = ''.join(('.', self.names[idx], '.'))
    indices = [self.tok2idx[tok] for tok in wrapped]
    return torch.tensor(indices, dtype=torch.long)

dataset = Dataset(names)
# Inspect the first name encoded as a tensor of token indices.
dataset[0]

In [None]:
vocab_size = len(dataset.chrs)
emb_dim = 8

# Embedding table holding one emb_dim-sized vector per vocabulary entry.
word_emb = nn.Embedding(vocab_size, emb_dim)

In [None]:
# Embedding vector stored for token index 0.
word_emb.weight[0]

In [None]:
name_tensor = dataset[0]
# Look up the embedding vector of every token index in the name.
emb = word_emb(name_tensor)

In [None]:
# One embedding row per token: (number of tokens, emb_dim).
emb.shape

In [None]:
# Slice (rather than index) the first token's embedding so the leading
# dimension is kept.
emb[0:1]

In [None]:
hidden_size = 12
# Input-to-hidden projection, created without a bias term.
weight_xh = nn.Linear(emb_dim, hidden_size, bias=False)

# Apply the projection to all token embeddings of the name at once.
weight_xh(emb)

In [None]:
# Hidden-to-hidden projection (nn.Linear includes a bias unless disabled).
weight_hh = nn.Linear(hidden_size, hidden_size)

# Hidden state before any input has been processed: all zeros.
initial_hidden = torch.zeros(hidden_size)

$h_1 = \tanh(W_{hh}h_{0} + W_{xh}x_{1} + b_{h})$


In [None]:
# One recurrent step: add the projected previous hidden state to the
# projected first token embedding and squash the sum with tanh.
next_hidden = (weight_hh(initial_hidden) + weight_xh(emb[0:1])).tanh()
next_hidden

In [None]:
# Unroll the recurrence by hand over the whole name, starting from a zero
# hidden state and keeping the hidden state produced at every step.
timestep = 0
hidden = torch.zeros(hidden_size)
total_hidden = []

for token in name_tensor:
  recurrent_part = weight_hh(hidden)
  input_part = weight_xh(word_emb(token))
  hidden = torch.tanh(recurrent_part + input_part)
  total_hidden.append(hidden)
total_hidden

In [None]:
import matplotlib.pyplot as plt
# Plot tanh for inputs from -5.0 to 4.9 in steps of 0.1.
plt.plot(((torch.arange(100) - 50)/10).tanh())

# Language Model

In [None]:
class LanguageModel(nn.Module):
  """Embedding -> single-layer RNN -> linear projection over the vocabulary.

  Args:
    vocab_size: Number of entries in the embedding table and size of the
      output projection.
    emb_dim: Size of each token embedding (the RNN input size).
    hidden_size: Size of the RNN hidden state.
  """

  def __init__(self, vocab_size, emb_dim, hidden_size):
    super().__init__()
    self.emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_dim)
    self.rnn = nn.RNN(
      input_size=emb_dim,
      hidden_size=hidden_size,
      num_layers=1,
      batch_first=True,
    )
    self.proj = nn.Linear(in_features=hidden_size, out_features=vocab_size)

  def forward(self, x):
    """Return unnormalized scores over the vocabulary for every position.

    x is a tensor of token indices; since the RNN uses batch_first=True a
    batched input is presumably (batch, seq) -- confirm against callers.
    """
    # Only the per-step outputs are needed; the final hidden state is dropped.
    hidden_states, _ = self.rnn(self.emb(x))
    return self.proj(hidden_states)
  
dataset = Dataset(names)

# Model sizes; the vocabulary size comes from the dataset's character list.
vocab_size = len(dataset.chrs)
emb_dim = 16
hidden_size = 32
model = LanguageModel(vocab_size, emb_dim, hidden_size)
  

In [None]:
def get_nll(pred, target):
  """Mean negative log-likelihood of the target classes, skipping index 0.

  Args:
    pred: Probabilities (not logits) with the class dimension last.
    target: Class indices, one per position of ``pred``.

  Returns:
    A scalar tensor: the mean of ``-log(p + 1e-8)`` over all positions
    whose target is not 0, where ``p`` is the probability assigned to the
    target class.
  """
  flat_pred = pred.reshape(-1, pred.shape[-1])
  flat_target = target.reshape(-1)

  # Positions whose target is 0 are excluded from the loss.
  keep = flat_target != 0
  kept_pred = flat_pred[keep]
  kept_target = flat_target[keep]

  # Pick, for every kept row, the probability of its target class.
  rows = torch.arange(len(kept_target))
  picked = kept_pred[rows, kept_target]

  # The small constant keeps log() finite when a probability is exactly 0.
  return -(picked + 1e-8).log().mean()

In [None]:
def pad_collate_fn(batch):
  """Stack variable-length sequences into one batch-first tensor.

  Shorter sequences are right-padded with 0 up to the longest length in
  ``batch``.
  """
  return torch.nn.utils.rnn.pad_sequence(
    batch,
    batch_first=True,
    padding_value=0,
  )

In [None]:

# Train the language model with next-character prediction.
dataloader = torch.utils.data.DataLoader(dataset, batch_size=64, shuffle=True, collate_fn=pad_collate_fn)

model = LanguageModel(vocab_size, emb_dim, hidden_size)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
model = model.train()
# Fall back to the CPU when no GPU is present instead of failing on 'cuda'.
dev = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(dev)
loss_fn = get_nll
n_epoch = 10
loss_record = []  # one loss value per processed batch

for epoch in range(n_epoch):
  for batch in tqdm(dataloader, leave=False):
    batch = batch.to(dev)
    # Inputs drop the last position and targets drop the first, so each
    # position is trained to predict the character that follows it.
    x = batch[:, :-1]
    y = batch[:, 1:]

    optimizer.zero_grad()
    out = model(x)
    # get_nll takes probabilities, so turn the model outputs into a
    # distribution with softmax first.
    loss = loss_fn(out.softmax(dim=-1), y)
    loss.backward()
    optimizer.step()
    loss_record.append(loss.item())

In [None]:
# Plot the loss recorded for every training batch.
plt.plot(loss_record)