In [None]:
import random
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn.functional as F
from sklearn.model_selection import train_test_split

# Makemore 

## PyTorch

In [None]:
# Read in all the words (one name per line). The context manager closes the
# file handle as soon as its contents have been read.
with open('names.txt', 'r') as f:
    words = f.read().splitlines()
words[:8]

In [None]:
# Vocabulary: the '.' boundary marker plus every lower-cased character found
# in the corpus, sorted so each character gets a stable integer id.
vocab = sorted({'.'} | set(''.join(words).lower()))
vocab_size = len(vocab)
# Two-way lookup tables between characters and their integer ids.
char_to_idx = dict(zip(vocab, range(vocab_size)))
idx_to_char = dict(enumerate(vocab))

In [None]:
# build the dataset
block_size = 3  # context length: number of preceding characters fed to the model

def build_dataset(words):
  """Convert words into (context, next-character) training examples.

  Each word is terminated with '.', and every character (terminator included)
  becomes one target whose input is the ids of the `block_size` characters
  before it. Contexts start as all zeros and slide forward one character at
  a time.

  Relies on the module-level `char_to_idx` and `block_size`.

  Args:
    words: iterable of strings whose characters are all keys of `char_to_idx`.

  Returns:
    (X, y): tensors built from the collected contexts and target ids. Their
    shapes are printed as a side effect.
  """
  contexts, targets = [], []
  for word in words:
    window = [0] * block_size
    for target in (char_to_idx[ch] for ch in word + '.'):
      contexts.append(window)
      targets.append(target)
      # A new list is built on every step, so contexts already stored are not mutated.
      window = window[1:] + [target]

  X, y = torch.tensor(contexts), torch.tensor(targets)
  print(X.shape, y.shape)
  return X, y

In [None]:
# Build the full (context, target) dataset from every word; build_dataset prints the shapes.
X, y = build_dataset(words)

In [None]:
# Display the context tensor (one row of block_size character ids per example).
X

In [None]:
TRAIN_SIZE = 0.8
VAL_SIZE = 0.1
TEST_SIZE = 0.1

def train_val_test_split(X, y, train_size):
    """Split a dataset into train, validation and test subsets.

    Args:
        X: input examples.
        y: labels aligned with X; also used to stratify both splits.
        train_size: size of the training split, forwarded to sklearn's
            train_test_split as its `train_size` argument.

    Returns:
        X_train, X_val, X_test, y_train, y_val, y_test
    """
    # Honour the caller's train_size. Previously the global TRAIN_SIZE was used
    # here and the argument was silently ignored.
    X_train, X_, y_train, y_ = train_test_split(X, y, train_size=train_size, stratify=y)
    # The held-out remainder is divided evenly between validation and test.
    X_val, X_test, y_val, y_test = train_test_split(X_, y_, train_size=0.5, stratify=y_)
    return X_train, X_val, X_test, y_train, y_val, y_test

In [None]:
# Partition the full dataset into train / validation / test subsets.
(X_train, X_val, X_test,
 y_train, y_val, y_test) = train_val_test_split(X=X, y=y, train_size=TRAIN_SIZE)

# Report the size of every split as a quick sanity check.
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_val: {X_val.shape}, y_val: {y_val.shape}")
print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

In [None]:
g = torch.Generator().manual_seed(2147483647) # for reproducibility
n_embd = 10     # dimensionality of each character embedding
n_hidden = 200  # number of hidden-layer units
# Sizes are derived from vocab_size / block_size instead of the literals 27 / 30,
# so the parameters stay consistent with the vocabulary and context length above.
C = torch.randn((vocab_size, n_embd), generator=g)              # embedding table
W1 = torch.randn((block_size * n_embd, n_hidden), generator=g)  # hidden-layer weights
b1 = torch.randn(n_hidden, generator=g)                         # hidden-layer bias
W2 = torch.randn((n_hidden, vocab_size), generator=g)           # output-layer weights
b2 = torch.randn(vocab_size, generator=g)                       # output-layer bias
parameters = [C, W1, b1, W2, b2]

In [None]:
# Total number of scalar values across all parameter tensors.
sum(p.nelement() for p in parameters)

In [None]:
# Turn on gradient tracking for every parameter tensor.
for p in parameters:
  p.requires_grad_(True)

In [None]:
# Learning-rate sweep scaffolding: 1000 exponents from -3 to 0 and the matching
# rates 10**exponent. Only relevant when the sweep is switched on (use lrs[i]
# as the step size and record lre[i] in lri); the loop below uses a fixed rate.
lre = torch.linspace(-3, 0, 1000)
lrs = 10**lre
lri = []
lossi = []
stepi = []

for i in range(10000):

  # minibatch: 32 row indices drawn uniformly at random (repeats are possible)
  ix = torch.randint(0, X_train.shape[0], (32,))

  # forward pass
  emb = C[X_train[ix]] # (32, 3, 10)
  h = torch.tanh(emb.view(-1, 30) @ W1 + b1) # (32, 200)
  logits = h @ W2 + b2 # (32, 27)
  loss = F.cross_entropy(logits, y_train[ix])

  # backward pass: clear stale gradients, then backpropagate
  for p in parameters:
    p.grad = None
  loss.backward()

  # update
  # NOTE(review): this loop runs 10000 steps, so the i >= 100000 branch is never
  # taken and lr is always 0.1 -- confirm whether a decay was intended.
  lr = 0.01 if i >= 100000 else 0.1
  for p in parameters:
    p.data -= lr * p.grad

  # track stats
  stepi.append(i)
  lossi.append(torch.log10(loss).item())

In [None]:
# Training curve: log10 of the minibatch loss recorded at every step.
plt.plot(stepi, lossi)

In [None]:
# Loss over the whole training split. no_grad() avoids building an autograd
# graph, which is not needed for evaluation.
with torch.no_grad():
  emb = C[X_train] # (N_train, 3, 10)
  h = torch.tanh(emb.view(-1, 30) @ W1 + b1) # (N_train, 200)
  logits = h @ W2 + b2 # (N_train, 27)
  loss = F.cross_entropy(logits, y_train)
loss

In [None]:
# Loss over the full dataset X. F.cross_entropy was called without its target
# argument (a TypeError at runtime); y is the label tensor aligned with X.
# NOTE(review): X includes the training examples -- switch to X_val / y_val if a
# held-out estimate is what this cell is meant to report.
with torch.no_grad():
  emb = C[X] # (N, 3, 10)
  h = torch.tanh(emb.view(-1, 30) @ W1 + b1) # (N, 200)
  logits = h @ W2 + b2 # (N, 27)
  loss = F.cross_entropy(logits, y)
loss

In [None]:
# Scatter the first two embedding dimensions (columns 0 and 1 of C), labelling
# each point with its character.
plt.figure(figsize=(8,8))
plt.scatter(C[:,0].detach(), C[:,1].detach(), s=200)
for i in range(C.shape[0]):
    plt.text(C[i,0].item(), C[i,1].item(), idx_to_char[i], ha="center", va="center", color='white')
# The first positional argument of plt.grid is `visible`, not `which`; the old
# 'minor' string only acted as a truthy flag, so state the intent directly.
plt.grid(True)

In [None]:
g = torch.Generator().manual_seed(2147483647 + 10)

# Draw 20 samples. Each is generated one character at a time until the model
# emits index 0 (the same id used to pad the initial context).
for _ in range(20):

    out = []
    context = [0] * block_size # start from an all-zero context
    ix = None
    while ix != 0:
      emb = C[torch.tensor([context])] # (1,block_size,d)
      h = torch.tanh(emb.view(1, -1) @ W1 + b1)
      logits = h @ W2 + b2
      probs = logits.softmax(dim=1)
      ix = torch.multinomial(probs, num_samples=1, generator=g).item()
      # slide the window: drop the oldest id, append the one just sampled
      context = context[1:] + [ix]
      out.append(ix)

    print(''.join(map(idx_to_char.__getitem__, out)))

## NumPy

### Generate data

In [None]:
# Read the corpus (one name per line); the context manager closes the file
# handle as soon as its contents have been read.
with open('names.txt', 'r') as f:
    names = f.read().splitlines()
names[:10]

In [None]:
# Number of names in the corpus.
len(names)

In [None]:
# Vocabulary for the NumPy section: the '.' marker plus every lower-cased
# character found in `names`, in sorted order.
vocab = sorted({'.'} | set(''.join(names).lower()))
# Two-way lookup tables between characters and their integer ids.
char_to_idx = dict(zip(vocab, range(len(vocab))))
idx_to_char = dict(enumerate(vocab))

In [None]:
# Vocabulary size (distinct characters, including the '.' marker).
len(vocab)

In [None]:
# Show the character -> index mapping.
char_to_idx

In [None]:
# build the dataset
block_size = 3  # context length: number of preceding characters fed to the model

def build_dataset(words):
    """Convert words into (context, next-character) NumPy training arrays.

    Each word is terminated with '.', and every character (terminator included)
    becomes one target whose input is the ids of the `block_size` characters
    before it. Contexts start as all zeros and slide forward one character at
    a time.

    Relies on the module-level `char_to_idx` and `block_size`. Running this
    cell replaces the torch-based `build_dataset` defined earlier in the notebook.

    Args:
        words: iterable of strings whose characters are all keys of `char_to_idx`.

    Returns:
        (X, y): arrays built from the collected contexts and target ids. Their
        shapes are printed as a side effect.
    """
    contexts, targets = [], []
    for word in words:
        window = [0] * block_size
        for ch in word + '.':
            target = char_to_idx[ch]
            contexts.append(window)
            targets.append(target)
            # A new list is built on every step, so stored contexts are not mutated.
            window = window[1:] + [target]

    X, y = np.array(contexts), np.array(targets)
    print(X.shape, y.shape)
    return X, y

In [None]:
# Build the NumPy (context, target) dataset from every name; build_dataset prints the shapes.
X, y = build_dataset(names)

### Split data

In [None]:
# Fraction of the dataset assigned to each split.
TRAIN_SIZE = 0.8
VAL_SIZE = 0.1
TEST_SIZE = 0.1

In [None]:
# Shuffle examples and labels with one shared index order so pairs stay aligned.
# NOTE(review): no seed is set, so the order differs between runs, and re-running
# this cell reshuffles the already-shuffled arrays.
NUM_SAMPLES = X.shape[0]
indices = [*range(NUM_SAMPLES)]
np.random.shuffle(indices)
X, y = X[indices], y[indices]

In [None]:
# Split indices: contiguous [start, end) boundaries into the shuffled arrays.
train_start = 0
# Use TRAIN_SIZE rather than a hard-coded 0.7, which left training at 70% and
# silently enlarged the validation split to 20%.
train_end = int(TRAIN_SIZE*NUM_SAMPLES)
val_start = train_end
val_end = int((TRAIN_SIZE+VAL_SIZE)*NUM_SAMPLES)
test_start = val_end

In [None]:
# Split data: slice the shuffled arrays at the boundaries computed above.
X_train = X[train_start:train_end]
y_train = y[train_start:train_end]
X_val = X[val_start:val_end]
y_val = y[val_start:val_end]
X_test = X[test_start:]
y_test = y[test_start:]
print (f"X_train: {X_train.shape}, y_train: {y_train.shape}")
# The label used to read "y_test" although the value printed is y_val.
print (f"X_val: {X_val.shape}, y_val: {y_val.shape}")
print (f"X_test: {X_test.shape}, y_test: {y_test.shape}")

### Init Weights 

In [None]:
def init_parameters(vocab_size, embedding_dim, hidden_dim, output_dim, block_size=3):
    """Create the initial parameter dictionary for the character-level MLP.

    Weights are drawn from a standard normal and scaled by 0.01; biases start
    at zero.

    Args:
        vocab_size: number of rows in the embedding table 'C'.
        embedding_dim: width of each embedding vector.
        hidden_dim: number of hidden units.
        output_dim: number of output logits.
        block_size: number of context positions whose embeddings are
            concatenated as input to the hidden layer. Defaults to 3, the
            value that was previously hard-coded.

    Returns:
        dict with keys 'C' (vocab_size, embedding_dim),
        'W1' (block_size * embedding_dim, hidden_dim), 'b1' (1, hidden_dim),
        'W2' (hidden_dim, output_dim) and 'b2' (1, output_dim).
    """
    return {
        'C': np.random.randn(vocab_size, embedding_dim) * 0.01,
        'W1': np.random.randn(block_size * embedding_dim, hidden_dim) * 0.01,
        'b1': np.zeros((1, hidden_dim)),
        'W2': np.random.randn(hidden_dim, output_dim) * 0.01,
        'b2': np.zeros((1, output_dim))
    }

### Model

In [None]:
def tanh(x):
    """Element-wise hyperbolic tangent (thin wrapper around np.tanh)."""
    activated = np.tanh(x)
    return activated

def softmax(x):
    """Softmax along axis 1 of a 2-D array of scores.

    The per-row maximum is subtracted before exponentiating so large scores
    do not overflow; this does not change the result.
    """
    row_max = np.max(x, axis=1, keepdims=True)
    weights = np.exp(x - row_max)
    normaliser = np.sum(weights, axis=1, keepdims=True)
    return weights / normaliser

def cross_entropy_loss(logits, y):
    """Mean cross-entropy between row-wise softmax(logits) and integer targets.

    Log-probabilities are computed with the log-sum-exp form instead of
    log(softmax(...)). When a target's probability underflows to 0 the old form
    returned an infinite loss; this form stays finite.

    Args:
        logits: 2-D array of scores, one row per example.
        y: integer class index for each row of `logits`.

    Returns:
        The average negative log-probability of the target classes.
    """
    shifted = logits - np.max(logits, axis=1, keepdims=True)
    log_norm = np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
    log_probs = shifted - log_norm
    return -np.mean(log_probs[np.arange(len(y)), y])

def forward(X, params):
    """Forward pass: embedding lookup -> tanh hidden layer -> output logits.

    Args:
        X: integer index array used to select rows of params['C'].
        params: dict holding 'C', 'W1', 'b1', 'W2' and 'b2'.

    Returns:
        (emb, h, logits): the looked-up embeddings, the hidden activations and
        the output scores. The first two are returned so backward() can reuse them.
    """
    emb = params['C'][X]  # (batch_size, 3, embedding_dim)
    n_examples = emb.shape[0]
    # Flatten each example's context embeddings into a single row.
    flat = np.reshape(emb, (n_examples, -1))
    pre_activation = flat @ params['W1'] + params['b1']
    h = tanh(pre_activation)
    logits = h @ params['W2'] + params['b2']
    return emb, h, logits

### Loss, Gradients and Update weights

In [None]:
def backward(X, y, emb, h, logits, params):
    """Gradients of the mean cross-entropy loss w.r.t. every parameter.

    Args:
        X: 2-D integer array of context indices, (batch_size, context_len).
        y: integer target index for each example.
        emb, h, logits: the values returned by forward(X, params).
        params: dict holding 'C', 'W1', 'b1', 'W2' and 'b2'.

    Returns:
        dict with the same keys as `params`, each holding the gradient for
        that parameter.
    """
    # The context length is read from X instead of being hard-coded to 3.
    batch_size, context_len = X.shape

    # d(loss)/d(logits) for mean cross-entropy: (softmax - one_hot) / batch_size.
    dlogits = softmax(logits)
    dlogits[np.arange(batch_size), y] -= 1
    dlogits /= batch_size

    dW2 = h.T @ dlogits
    db2 = np.sum(dlogits, axis=0, keepdims=True)
    dh = dlogits @ params['W2'].T

    # Back through tanh: d/dz tanh(z) = 1 - tanh(z)**2. Computed once and reused.
    dpre = dh * (1 - h**2)

    dW1 = np.reshape(emb, (batch_size, -1)).T @ dpre
    db1 = np.sum(dpre, axis=0, keepdims=True)

    demb = np.reshape(dpre @ params['W1'].T, (batch_size, context_len, -1))

    # Scatter-add the per-position gradients into the embedding rows that were
    # looked up. np.add.at accumulates over repeated indices, which a plain
    # `dC[X] += demb` would not.
    dC = np.zeros_like(params['C'])
    np.add.at(dC, X, demb)

    return {'C': dC, 'W1': dW1, 'b1': db1, 'W2': dW2, 'b2': db2}

def update_parameters(params, grads, lr):
    """Apply one gradient-descent step to every entry of `params`.

    Each value is updated with an augmented assignment (`-=`), and the same
    dict object that was passed in is returned.

    Args:
        params: dict of parameter values.
        grads: dict with a gradient for every key in `params`.
        lr: step size.

    Returns:
        `params`, after the update.
    """
    for name in params:
        step = lr * grads[name]
        params[name] -= step
    return params

### Training

In [None]:
def train_step(X, y, params, lr):
    """Run one optimisation step on a single batch.

    Performs the forward pass, evaluates the loss, backpropagates and applies
    the parameter update.

    Returns:
        (loss, params): the loss measured before the update, and the updated
        parameter dict.
    """
    activations = forward(X, params)  # (emb, h, logits)
    loss = cross_entropy_loss(activations[-1], y)
    grads = backward(X, y, *activations, params)
    return loss, update_parameters(params, grads, lr)

def train_model(Xtr, Ytr, vocab_size, embedding_dim, hidden_dim, output_dim, num_iterations, batch_size,
                lr=0.1, lr_decay_step=100000, lr_decayed=0.01, log_every=1000):
    """Train the MLP with minibatch gradient descent.

    The step-size schedule and logging interval used to be magic numbers in
    the loop body. They are now keyword arguments whose defaults reproduce the
    previous behaviour.

    Args:
        Xtr, Ytr: training inputs and integer targets.
        vocab_size, embedding_dim, hidden_dim, output_dim: forwarded to
            init_parameters.
        num_iterations: number of minibatch steps to run.
        batch_size: examples drawn (with replacement) per step.
        lr: step size used while the step index is below `lr_decay_step`.
        lr_decay_step: step index from which `lr_decayed` is used instead.
        lr_decayed: step size used from `lr_decay_step` onward.
        log_every: record and print the loss every this many steps (must be > 0).

    Returns:
        (params, losses, steps): trained parameters, the log10 loss at each
        logged step, and the matching step indices.
    """
    params = init_parameters(vocab_size, embedding_dim, hidden_dim, output_dim)
    losses = []
    steps = []

    for i in range(num_iterations):
        ix = np.random.randint(0, Xtr.shape[0], (batch_size,))
        X_batch, y_batch = Xtr[ix], Ytr[ix]

        step_lr = lr if i < lr_decay_step else lr_decayed
        loss, params = train_step(X_batch, y_batch, params, step_lr)

        if i % log_every == 0:
            losses.append(np.log10(loss))
            steps.append(i)
            print(f"Iteration {i}, Loss: {loss:.4f}")

    return params, losses, steps

In [None]:
vocab_size = len(vocab)
embedding_dim = 10 # dimensionality of the character embedding vectors.
hidden_dim = 200 # number of neurons in the hidden layer
# One logit per vocabulary entry. Targets index into the logits, and sampling
# draws from vocab_size probabilities, so this must equal vocab_size rather
# than a hard-coded 27.
output_dim = vocab_size
num_iterations = 200000
batch_size = 32

trained_params, losses, steps = train_model(X_train, y_train, 
                                            vocab_size, embedding_dim, 
                                            hidden_dim, output_dim, 
                                            num_iterations, batch_size)

# Plot the learning curve
plt.plot(steps, losses)
plt.xlabel('Step')
plt.ylabel('Log10 Loss')
plt.title('Training Loss over Time')
plt.show()

In [None]:
def get_minibatch(X, Y, batch_size):
    """Return `batch_size` aligned (X, Y) rows chosen uniformly at random.

    Row indices are drawn independently, so the same row can appear more than
    once in a batch.
    """
    picked = np.random.randint(X.shape[0], size=batch_size)
    X_batch = X[picked]
    Y_batch = Y[picked]
    return X_batch, Y_batch

def plot_histogram(h, bins=50):
    """Show a histogram of every value in `h`, flattened to one dimension.

    Args:
        h: array of values to plot.
        bins: number of histogram bins.
    """
    values = np.ravel(h)  # collapse to 1-D so all entries share one histogram
    plt.hist(values, bins=bins)
    plt.title('Histogram of Flattened h')
    plt.xlabel('Values')
    plt.ylabel('Frequency')
    plt.show()

# Histogram of hidden-layer activations on one random training minibatch.
# Uses trained_params: `params` only exists as a function argument in the
# visible notebook, so the old call raised NameError on a fresh kernel.
X_batch, _ = get_minibatch(X_train, y_train, batch_size)
_, h, _ = forward(X_batch, trained_params)
plot_histogram(h)

### Evaluation

In [None]:
# Scatter the first two embedding dimensions learned by the NumPy model.
# The module-level `C` is still the PyTorch tensor from the first section, so
# the embedding table is read from trained_params instead.
C_np = trained_params['C']
plt.figure(figsize=(8,8))
plt.scatter(C_np[:,0], C_np[:,1], s=200)
for i in range(C_np.shape[0]):
    plt.text(C_np[i,0].item(), C_np[i,1].item(), idx_to_char[i], ha="center", va="center", color='white')
# plt.grid's first positional argument is `visible`; pass the flag explicitly
# instead of the string 'minor', which only acted as a truthy value.
plt.grid(True)

In [None]:
def generate_name(params, vocab_size, block_size, idx_to_char):
    """Sample one string from the model, one character at a time.

    Generation starts from an all-zero context and stops as soon as index 0 is
    sampled. The character for that final index is included in the output.

    Args:
        params: dict holding 'C', 'W1', 'b1', 'W2' and 'b2'.
        vocab_size: number of classes to sample from; must match the length of
            the probability vector the model produces.
        block_size: number of context indices fed to the model.
        idx_to_char: mapping from sampled index to character.

    Returns:
        The sampled characters joined into a single string.
    """
    C, W1, b1 = params['C'], params['W1'], params['b1']
    W2, b2 = params['W2'], params['b2']

    window = [0] * block_size
    sampled = []
    finished = False
    while not finished:
        flat_emb = C[window].reshape(1, -1)  # (1, block_size * d)
        hidden = np.tanh(flat_emb @ W1 + b1)
        probs = softmax(hidden @ W2 + b2)
        token = np.random.choice(vocab_size, p=probs.ravel())
        # Slide the context: drop the oldest index, append the new one.
        window = window[1:] + [token]
        sampled.append(token)
        finished = token == 0

    return ''.join(idx_to_char[t] for t in sampled)

# Sample and print 20 names from the trained NumPy model.
for _ in range(20):
    print(generate_name(trained_params, vocab_size, block_size, idx_to_char))