In [17]:
from torch import tensor
import torch
import numpy as np
import matplotlib.pyplot as plt

In [89]:
# Implementing stochastic gradient descent for now; will update to mini-batch gradient descent later.

In [19]:
# Read all the names, one per line. The context manager closes the file
# handle as soon as its contents have been read instead of leaking it.
with open('names.txt', 'r') as names_file:
    names = names_file.read().splitlines()
print('total no. of names', len(names))
names[:8]

total no. of names 32033


['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [20]:
# Character vocabulary: every distinct character in the names gets an id
# starting at 1 (in sorted order); id 0 is reserved for the '.' token.
chars = sorted(set(''.join(names)))
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
# Inverse lookup (id -> character), built in the same insertion order as stoi.
itos = {idx: ch for ch, idx in stoi.items()}
vocab_size = len(itos)
print(itos, vocab_size, sep='\n')

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}
27


In [21]:
# build the dataset
# The input and target sequences differ by a one-character shift: for the name 'emma' the input is '.emm' and the target is 'mma.'.

def build_dataset(names):
  """Turn each name into an (input, target) pair of index tensors.

  For a name w the input is the id 0 followed by the ids of w[:-1], and the
  target is the ids of w[1:] followed by the id 0. Character ids are looked
  up in the notebook-level `stoi` mapping.

  Prints the number of inputs and targets, then returns (X, Y): two lists of
  1-D tensors, one entry per name.
  """
  def to_ids(piece):
    # Map a (possibly empty) string to its list of character ids.
    return [stoi[ch] for ch in piece]

  pairs = [
    (tensor([0] + to_ids(word[:-1])), tensor(to_ids(word[1:]) + [0]))
    for word in names
  ]
  X = [inp for inp, _ in pairs]
  Y = [tgt for _, tgt in pairs]

  print(len(X), len(Y))
  return X, Y

import random

# Deterministic in-place shuffle, then an 80% / 10% / 10% split.
random.seed(42)
random.shuffle(names)
n1, n2 = (int(frac * len(names)) for frac in (0.8, 0.9))

# Order matters: build_dataset prints the sizes as train, dev, test.
(Xtr, Ytr), (Xdev, Ydev), (Xte, Yte) = [
    build_dataset(part)
    for part in (names[:n1], names[n1:n2], names[n2:])
]

25626 25626
3203 3203
3204 3204


In [22]:
# One-hot encoding of the data set
def encoding(X: list, Y: list = None, num_classes: int = None) -> list:
    """
    One-hot encode every index sequence in X.

    Args:
        X: list of 1-D integer index tensors (one tensor per example).
        Y: unused. Accepted only so that existing ``encoding(X, Y)`` calls
           keep working; the targets are not one-hot encoded and no longer
           limit how many entries of X are processed.
        num_classes: width of each one-hot vector. Defaults to the
           notebook-level ``vocab_size`` when omitted.

    Returns:
        List of 2-D float tensors; entry i has shape (len(X[i]), num_classes).
    """
    if num_classes is None:
        # Fall back to the vocabulary size computed earlier in the notebook.
        num_classes = vocab_size
    return [
        torch.nn.functional.one_hot(x_example, num_classes=num_classes).float()
        for x_example in X
    ]

In [23]:
# One-hot encode the training inputs: one (n_chars, vocab_size) tensor per training name.
Xtr_emb = encoding(X=Xtr, Y=Ytr)

In [24]:
# Shapes for the first training name: the whole example is (n_chars, vocab_size),
# e.g. [6, 27] in the recorded run, and a single character row is (vocab_size,) = [27].
print(Xtr_emb[0].shape, Xtr_emb[0][0].shape, sep='\n')

torch.Size([6, 27])
torch.Size([27])


<img src="What-is-Recurrent-Neural-Network-660.webp">

In [25]:
# Forward pass of a vanilla RNN over the first training example.
state_size = 100

# Parameters are torch tensors (float32) so they share type and dtype with the
# one-hot inputs; NumPy float64 arrays cannot be mixed into torch matmul/tanh.
U = torch.randn(vocab_size, state_size)   # input  -> hidden
W = torch.randn(state_size, state_size)   # hidden -> hidden (recurrence)
V = torch.randn(state_size, vocab_size)   # hidden -> output
b = torch.randn(state_size)               # hidden bias
c = torch.randn(vocab_size)               # output bias
s_prev = torch.zeros((state_size, ))      # initial hidden state

# The input projection does not depend on the hidden state, so it is computed
# for all time steps at once: (n_chars, state_size).
h_preact = Xtr_emb[0] @ U + b

# Recurrence: the state at step t depends on the state at step t-1, so the
# time steps are processed in order and each new state is fed into the next.
s_t = s_prev
states = []
for h_t in h_preact:
    s_t = torch.tanh(s_t @ W + h_t)
    states.append(s_t)
s_curr = torch.stack(states)              # (n_chars, state_size)

y_hat = s_curr @ V + c                    # raw logits, (n_chars, vocab_size)
probs = y_hat.softmax(dim=1)              # per-step distribution over the next character

In [26]:
# cross_entropy applies log-softmax internally, so it must receive the raw
# logits (y_hat); passing the already-normalised probs would apply softmax twice.
loss = torch.nn.functional.cross_entropy(y_hat, Ytr[0])