In [None]:
import os
import random

import requests
import torch
import torch.nn.functional as F

In [None]:
# Download the input file from GitHub on first use; later runs read the local copy.
filename = 'Birds_scientific_names.txt'
input_file_path = os.path.join(os.path.dirname(filename), filename)
if not os.path.exists(input_file_path):
    data_url = 'https://raw.githubusercontent.com/laiamr/nlp_ex2/refs/heads/main/Birds_scientific_names.txt'
    # Fetch and validate BEFORE opening the file for writing, so a failed download
    # never leaves an empty or error-page file that the exists-check above would
    # accept on the next run.
    response = requests.get(data_url, timeout=30)  # timeout: do not hang forever on a dead connection
    response.raise_for_status()  # raise on 4xx/5xx instead of saving the error body
    with open(input_file_path, 'w', encoding='utf-8') as f:
        f.write(response.text)
# Read the local copy: one entry per line, without the trailing newlines.
with open(input_file_path, 'r', encoding='utf-8') as f:
    words = f.read().splitlines()

In [None]:
# Peek at the first eight loaded lines as a quick sanity check of the file contents
words[:8]

In [None]:
# Number of lines read from the input file
len(words)

In [None]:
# Lowercase everything so upper- and lower-case letters share one symbol (smaller vocabulary)
words = [line.lower() for line in words]

# Build the character vocabulary and the char <-> integer lookup tables.
# Index 0 is reserved for '.', which marks the start/end of a name.
# '.' is removed from the data-derived characters first: if the corpus itself
# contains a '.', it would otherwise receive an index in 1..N that is then
# overwritten by 0, leaving a hole in the index range (len(stoi) would be smaller
# than the largest index + 1 and itos would miss a key).
# NOTE: a literal '.' inside a name therefore shares index 0 with the end marker.
chars = sorted(set(''.join(words)) - {'.'})
stoi = {s: i + 1 for i, s in enumerate(chars)}
stoi['.'] = 0
itos = {i: s for s, i in stoi.items()}
print(itos)

In [None]:
# Hyperparameters -- every tunable knob of the model and the training run lives here
block_size = 3          # context length: number of preceding characters used to predict the next one
vocab_size = len(stoi)  # number of distinct symbols in the vocabulary
emb_dim = 10            # size of the vector each character is embedded into
batch_size = 32         # number of examples drawn per training step
num_neurons = 200       # width of the hidden layer
max_iters = 200_000     # number of training steps

In [None]:
# build the dataset
def build_dataset(words):
    """Turn a list of names into (context, next-character) training pairs.

    Every name is terminated with '.', encoded through the global ``stoi``
    table and left-padded with ``block_size`` zeros (0 is the index of '.').
    Each encoded character becomes one target; its input is the
    ``block_size`` indices immediately preceding it.

    Args:
        words: iterable of strings whose characters are all keys of ``stoi``.

    Returns:
        Tuple ``(X, Y)`` of tensors. For non-empty input, ``X`` has one row of
        ``block_size`` context indices per example and ``Y`` holds the
        matching target index. Both shapes are printed as a side effect.
    """
    inputs, targets = [], []
    for name in words:
        encoded = [stoi[symbol] for symbol in name + '.']
        # Left-pad so the first real character sees an all-'.' context
        padded = [0] * block_size + encoded
        for pos, target in enumerate(encoded):
            inputs.append(padded[pos:pos + block_size])
            targets.append(target)

    X = torch.tensor(inputs)
    Y = torch.tensor(targets)
    print(X.shape, Y.shape)
    return X, Y

# Shuffle a COPY of the word list rather than `words` itself: shuffling in place
# would make this cell non-idempotent (re-running it would reshuffle the already
# shuffled list and silently produce a different train/dev split).
# Seeding right before the shuffle keeps the split reproducible.
shuffled_words = list(words)
random.seed(42)
random.shuffle(shuffled_words)
n = int(0.9*len(shuffled_words))  # first 90% -> training split, remaining 10% -> dev split

Xtr, Ytr = build_dataset(shuffled_words[:n])
Xdev, Ydev = build_dataset(shuffled_words[n:])

In [None]:
# Dedicated generator with a fixed seed so the initial weights are reproducible
g = torch.Generator().manual_seed(2147483647)

# Embedding table C: one row per symbol in the vocabulary, each row an
# emb_dim-dimensional vector. Indexing C with character indices looks up
# their vector representations.
C = torch.randn(vocab_size, emb_dim, generator=g)

# Hidden layer. Its input is the concatenation of the block_size context
# embeddings (block_size * emb_dim values); its width num_neurons is a free choice.
W1 = torch.randn(block_size * emb_dim, num_neurons, generator=g)
# One bias per hidden neuron
b1 = torch.randn(num_neurons, generator=g)

# Output layer. Rows must match the hidden width; columns give one logit
# (unnormalised score) per symbol in the vocabulary.
W2 = torch.randn(num_neurons, vocab_size, generator=g)
b2 = torch.randn(vocab_size, generator=g)

# Everything the optimiser has to update
parameters = [C, W1, b1, W2, b2]

In [None]:
# Model size: add up the element counts of every tensor in `parameters`
sum(p.nelement() for p in parameters) # number of parameters in total

In [None]:
# Turn on autograd tracking for each parameter tensor: operations involving
# them are recorded so that gradients can be computed for the update step
for param in parameters:
    param.requires_grad = True

In [None]:
# Mini-batch SGD training loop. Each step:
#   1. draws a random mini-batch from the training split,
#   2. forward pass: embeddings -> tanh hidden layer -> logits -> cross-entropy loss,
#   3. backward pass: gradient of the loss w.r.t. every parameter,
#   4. moves each parameter a small step against its gradient.

for i in range(max_iters): # how many times to go through training

    # minibatch construct - we don't pass the whole training dataset through every
    # step, only batch_size randomly selected examples. The seeded generator is
    # passed explicitly so batch selection (and hence the run) is reproducible.
    ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)

    # forward pass
    # Look up the embedding of every context character
    emb = C[Xtr[ix]] # (batch_size, block_size, emb_dim)
    B, T, E = emb.shape
    # 1st layer: concatenate the context embeddings, then affine + tanh
    h = torch.tanh(emb.view(-1, T*E) @ W1 + b1) # (batch_size, num_neurons)
    # 2nd layer - output logits
    logits = h @ W2 + b2 # (batch_size, vocab_size)
    # Loss against the true next characters
    loss = F.cross_entropy(logits, Ytr[ix])

    # backward pass: clear stale gradients first, otherwise they would accumulate
    for p in parameters:
        p.grad = None
    loss.backward()

    # update: step decay - higher learning rate for the first half of training,
    # ten times lower for the second half (tied to max_iters, not a fixed step count)
    lr = 0.1 if i < max_iters // 2 else 0.01
    # The update itself must not be recorded by autograd
    with torch.no_grad():
        for p in parameters:
            p -= lr * p.grad

In [None]:
# training loss: one forward pass over the whole training split
with torch.no_grad(): # evaluation only - no need to record an autograd graph
    emb = C[Xtr] # (N, block_size, emb_dim)
    # Flatten each example's context embeddings; -1 infers block_size*emb_dim,
    # so this keeps working when the hyperparameters change
    h = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1) # (N, num_neurons)
    logits = h @ W2 + b2 # (N, vocab_size)
    loss = F.cross_entropy(logits, Ytr) # Loss: the lower (closer to 0) the better
loss

In [None]:
# validation loss: one forward pass over the held-out dev split
with torch.no_grad(): # evaluation only - no need to record an autograd graph
    emb = C[Xdev] # (N, block_size, emb_dim)
    # Flatten each example's context embeddings; -1 infers block_size*emb_dim,
    # so this keeps working when the hyperparameters change
    h = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1) # (N, num_neurons)
    logits = h @ W2 + b2 # (N, vocab_size)
    loss = F.cross_entropy(logits, Ydev) # Loss: the lower (closer to 0) the better
loss

In [None]:
# sample from the model: generate 20 new names, one character at a time
g = torch.Generator().manual_seed(2147483647 + 10) # separate fixed seed so the samples are reproducible

# Pure inference - disable autograd so no graph is recorded for each sampled character
with torch.no_grad():
    for _ in range(20):

        out = []
        context = [0] * block_size # initialize with all ...
        while True:
            # Forward pass for the single current context
            emb = C[torch.tensor([context])] # (1, block_size, emb_dim)
            h = torch.tanh(emb.view(1, -1) @ W1 + b1)
            logits = h @ W2 + b2
            probs = F.softmax(logits, dim=1)
            # Draw the next character index from the predicted distribution
            ix = torch.multinomial(probs, num_samples=1, generator=g).item()
            # Slide the context window forward by one character
            context = context[1:] + [ix]
            out.append(ix)
            # Index 0 is '.', the end-of-name marker
            if ix == 0:
                break

        print(''.join(itos[i] for i in out))