### Importing the input dataset

In [1]:
words = open('names.txt').read().splitlines()

In [2]:
words[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [3]:
len(words)

32033

### Bigrams

In [None]:
b = {}

for word in words:
    chs = ["<S>"] + list(word) + ["<E>"]
    for char1, char2 in zip(chs, chs[1:]):
        bigram = (char1, char2)
        b[bigram] = b.get(bigram, 0) + 1
        # print(char1, char2)

In [None]:
sorted(b.items(), key=lambda kv: kv[1], reverse=True)

#### Storing bigrams as a 2D Tensor

In [6]:
import torch

a = torch.zeros((3, 5), dtype=torch.int32)
a

tensor([[0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0]], dtype=torch.int32)

In [None]:
N = torch.zeros((27, 27), dtype=torch.int32)

chars = ["."] + sorted(list(set("".join(words))))
s_to_i = {s: i for i, s in enumerate(chars)}

s_to_i

In [None]:
for word in words:
    chs = ["."] + list(word) + ["."]
    for char1, char2 in zip(chs, chs[1:]):
        bigram = (char1, char2)
        i, j = s_to_i[char1], s_to_i[char2]
        N[i, j] += 1

In [None]:
N

##### Visualizing the bigrams

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
i_to_s = {i: s for s, i in s_to_i.items()}

In [None]:
plt.figure(figsize=(16, 16))
plt.imshow(N, cmap="Blues")

for i in range(len(chars)):
    for j in range(len(chars)):
        char_string = i_to_s[i] + i_to_s[j]
        plt.text(j, i, char_string, ha="center", va="bottom", color="gray")
        plt.text(j, i, N[i, j].item(), ha="center", va="top", color="gray")

plt.axis("off")

plt.show()

#### Using the bigram language model

##### Generating it manually

In [None]:
N[0, :]

In [None]:
p = N[0].float()
p = p / p.sum()
p

In [None]:
g = torch.Generator().manual_seed(2147483647)

ix = torch.multinomial(p, 1, generator=g).item()
i_to_s[ix]

##### Creating a loop

In [None]:
# Probablity matrix
# P_ij = N_ij / sum(N_i)
P = (N + 1).float()
P /= P.sum(dim=1, keepdim=True)

In [None]:
# Looping over the generation process
g = torch.Generator().manual_seed(2147483647)
for i in range(5):
    ix = 0
    generated = [i_to_s[ix]]
    while True:
        # Get the row of the current character and calculate the probabilities
        p = P[ix]
        # p = N[ix].float()
        # p = p / p.sum()

        # If the model were completely random, the probability distribution would be uniform over the set of defined characters
        # p = torch.ones(27) / 27.0

        # Sample the next character
        ix = torch.multinomial(p, 1, replacement=True, generator=g).item()
        generated.append(i_to_s[ix])

        # Stop if special character is sampled
        if ix == 0:
            break

    print("".join(generated))

#### Measuring performance using log-likelihood

In [None]:
log_likelihood = 0.0
n = 0

for word in words:
    chs = ["."] + list(word) + ["."]
    for char1, char2 in zip(chs, chs[1:]):
        bigram = (char1, char2)
        i, j = s_to_i[char1], s_to_i[char2]
        prob = P[i, j]
        logprob = torch.log(prob)

        log_likelihood += logprob
        n += 1

        # print(f"{char1}{char2}, {prob:.4f} {logprob:.4f}")

print(f"{log_likelihood=}")
print(f"NLL: {-log_likelihood=}")
print(f"Average NLL: {-log_likelihood/n}")

### Training a neural network

In [None]:
# Creating the training set of bigrams (x, y)
character_set = ["."]
character_set.extend(sorted(list(set("".join(words)))))

s_to_i = {s: i for i, s in enumerate(character_set)}

dims = len(character_set)

In [None]:
xs, ys = [], []

for word in words[:1]:
    # Adding start and end tokens to correctly capture all bigrams
    chs = ['.'] + list(word) + ['.']
    for c1, c2 in zip(chs, chs[1:]):
        print(c1, c2)
        i, j = s_to_i[c1], s_to_i[c2]
        xs.append(i)
        ys.append(j)

xs = torch.tensor(xs)
ys = torch.tensor(ys)

In [None]:
xs

In [None]:
ys

#### One-hot encoding of the input vector

In [11]:
import torch.nn.functional as F

x_enc = F.one_hot(xs, num_classes=dims).float()

plt.imshow(x_enc)
x_enc

NameError: name 'plt' is not defined

#### One layer of the neural network

In [None]:
# Construct a matrix of weights (d x d), setting all initial values randomly
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((dims, dims), generator=g)

Once we generate the weight matrix, we feed forward the input vector to the neural network and get the output vector. The steps to do this are as follows:
1. Multiply the input vector with the weight matrix to get the logits
2. Exponentiate the logits to get the log counts
3. Normalize the log counts to get the probabilities

> Steps 2 and 3 above correspond to the `softmax` function.

Doing this gives us a vector of probabilities for each input vector. This helps us predict the next token of the bigram. 

Then, we calculate the loss by optimizing the predicted probability of the actual token, and use backpropagation to optimize the loss function by tuning the weights of the neural network.

In [None]:
## Forward pass
x_enc = F.one_hot(xs, num_classes=dims).float()
logits = (
    x_enc @ W
)  # logits_ij refers to the firing rate of the j-th character given the i-th character

counts = logits.exp()  # log-counts, equivalent to the Tensor `N` defined above.
probs = counts / counts.sum(
    dim=1, keepdim=True
)  # probabilities, equivalent to the Tensor `P` defined above.
probs

In [None]:
nlls = torch.zeros(len(xs))

for i in range(len(xs)):
    x = xs[i].item()
    y = ys[i].item()

    print("-"*10)
    print(f'Bigram example {i+1}: "{i_to_s[x]}{i_to_s[y]}", {x=}, {y=}')
    print(f'Input to NN: {x}')
    print(f'Output probabilities: {probs[i]}')
    print(f'Actual next character: {y}')
    p = probs[i, y]
    print(f'Probability assigned to actual next character: {p}')
    logp = torch.log(p)
    print(f'Log-probability assigned to actual next character: {logp}')
    nll = -logp
    print(f'NLL: {nll}')

    nlls[i] = nll

print("="*100)
print(f'Average NLL: ', nlls.mean().item())

#### Optimization

In [None]:
xs

In [None]:
ys

##### Manual optimization

In [None]:
# Construct a matrix of weights (d x d), setting all initial values randomly
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((dims, dims), generator=g, requires_grad=True)

In [None]:
## Forward pass
x_enc = F.one_hot(xs, num_classes=dims).float()
logits = x_enc @ W

counts = logits.exp()
probs = counts / counts.sum(dim=1, keepdim=True)

## Calculate loss
# probs[i, j] refers to the probability of the j-th character given the i-th character
# This is the probability assigned by the model that we want to maximize
loss = probs[torch.arange(len(xs)), ys].log().neg().mean()
loss

In [None]:
## Backward pass
W.grad = None
loss.backward()

In [None]:
## Update weights in the opposite direction of the gradient
learning_rate = 1e-1
W.data += -learning_rate * W.grad

#### Gradient descent

In [4]:
# Creating the training set of bigrams (x, y)
character_set = ["."]
character_set.extend(sorted(list(set("".join(words)))))

s_to_i = {s: i for i, s in enumerate(character_set)}
i_to_s = {i: s for s, i in s_to_i.items()}

dims = len(character_set)

In [7]:
xs, ys = [], []

for word in words:
    chs = ["."] + list(word) + ["."]
    for c1, c2 in zip(chs, chs[1:]):
        # print(c1, c2)
        i, j = s_to_i[c1], s_to_i[c2]
        xs.append(i)
        ys.append(j)

xs = torch.tensor(xs)
ys = torch.tensor(ys)
num_examples = xs.nelement()
print(f"Number of examples: {num_examples}")

# Initialize the network
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((dims, dims), generator=g, requires_grad=True)

Number of examples: 228146


In [12]:
xs.shape
F.one_hot(xs, num_classes=dims).float().shape

torch.Size([228146, 27])

In [None]:
learning_rate = 50

for k in range(200):
    x_enc = F.one_hot(xs, num_classes=dims).float()
    logits = x_enc @ W

    counts = logits.exp()
    probs = counts / counts.sum(dim=1, keepdim=True)

    loss = (
        probs[torch.arange(num_examples), ys].log().neg().mean()
        + 1e-2 * (W**2).mean()
    )

    W.grad = None
    loss.backward()

    W.data += -learning_rate * W.grad

    print(f"Iteration {k}: {loss.item()}")

In [None]:
# Predicting the next character
g = torch.Generator().manual_seed(2147483647)

for i in range(5):
    ix = 0
    out = [i_to_s[ix]]
    
    while True:
        # print(f"Input character: {x}")
        # print(f"Probabilities: {probs[x]}")

        x_enc = F.one_hot(torch.tensor([ix]), num_classes=dims).float()
        logits = x_enc @ W

        counts = logits.exp()
        probs = counts / counts.sum(dim=1, keepdim=True)

        ix = torch.multinomial(
            probs, 1, replacement=True, generator=g
        ).item()
        out.append(i_to_s[ix])

        # print(f"Predicted next character: {next_char}")

        if ix == 0:
            break

    print("=" * 100)
    print("".join(out))