<a href="https://colab.research.google.com/github/kaifkh20/nn_z_to_h/blob/main/makemore_wavenet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [37]:
# Colab-only helper: upload the dataset file from the local machine into the runtime
# so that later cells can read it from the working directory.
# NOTE(review): the recorded output shows the upload being saved as 'names(1) (1).txt',
# while the loading cell reads 'names(1).txt' — confirm the intended copy is the one read.
from google.colab import files
uploaded = files.upload()

Saving names(1).txt to names(1) (1).txt


In [38]:
import torch
import torch.nn.functional as F

In [39]:
# Read the dataset, one name per line. The context manager closes the file handle
# as soon as the read finishes instead of leaving it open for the kernel's lifetime.
with open('names(1).txt', 'r') as names_file:
  words = names_file.read().splitlines()
len(words)

32033

In [40]:
# Character vocabulary: every distinct character seen in the words, in sorted order.
chars = sorted(set(''.join(words)))

# Character -> integer index. Real characters start at 1 so that index 0 stays
# free for the special '.' token.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0

# Inverse lookup (index -> character) and the total number of tokens.
itos = {idx: ch for ch, idx in stoi.items()}
vocab_size = len(itos)

In [58]:
# Building the dataset

block_size = 8 # context length: number of preceding characters fed to the model per prediction (the author's note records an earlier value of 3)
X,Y = [],[] # NOTE(review): these module-level lists do not appear to be used by any visible cell; build_dataset creates its own local X and Y

#build training, dev and test dataset

def build_dataset(words):
  """Turn a list of words into (context, next-character) training examples.

  Each word is terminated with '.' and scanned left to right. For every
  character, the current context window (a list of `block_size` token indices,
  initially all zeros) is stored as an input and the character's index as the
  target; the window then slides one position to the right.

  Relies on the module-level `block_size` and `stoi`.

  Returns:
    (X, Y): X holds one context window per example, Y the matching target index.
  """
  contexts, targets = [], []

  for word in words:
    window = [0] * block_size  # start from an all-padding context
    for token in (stoi[ch] for ch in word + '.'):
      contexts.append(window)
      targets.append(token)
      window = window[1:] + [token]  # drop the oldest token, append the newest

  return torch.tensor(contexts), torch.tensor(targets)

import random
random.seed(42) # fixed seed so that a fresh run always produces the same shuffle
random.shuffle(words) # shuffles `words` in place; NOTE(review): re-running this cell reshuffles the already-shuffled list and therefore changes the split
n1 = int(0.8*len(words)) # index where the training slice ends (first 80% of the words)
n2 = int(0.9*len(words)) # index where the dev slice ends (next 10% of the words)

Xtr, Ytr = build_dataset(words[:n1]) # training split
Xdev, Ydev = build_dataset(words[n1:n2]) # dev / validation split
Xte, Yte = build_dataset(words[n2:]) # test split (the remaining words)


In [59]:
# Sanity check: print the first 20 training examples as "context ---> target".
for x, y in zip(Xtr[:20], Ytr[:20]):
  context_chars = [itos[token] for token in x.tolist()]
  print(''.join(context_chars), '--->', itos[y.item()])

........ ---> e
.......e ---> l
......el ---> i
.....eli ---> a
....elia ---> n
...elian ---> y
..eliany ---> s
.elianys ---> .
........ ---> t
.......t ---> r
......tr ---> o
.....tro ---> y
....troy ---> .
........ ---> m
.......m ---> a
......ma ---> r
.....mar ---> k
....mark ---> u
...marku ---> s
..markus ---> .


In [81]:
# Near copy paste of the layers we have developed in Part 3

# -----------------------------------------------------------------------------------------------
class Linear:
  """Fully connected layer computing `x @ weight (+ bias)`.

  Args:
    fan_in: size of the last dimension of the input.
    fan_out: size of the last dimension of the output.
    bias: when truthy, a zero-initialised bias vector is added to the output.
  """

  def __init__(self, fan_in, fan_out, bias=True):
    # Gaussian weights scaled by 1/sqrt(fan_in) (the original author labels this "kaiming init")
    scale = fan_in**0.5
    self.weight = torch.randn((fan_in, fan_out)) / scale
    self.bias = torch.zeros(fan_out) if bias else None

  def __call__(self, x):
    """Apply the layer to `x`; the result is also kept in `self.out`."""
    result = x @ self.weight
    if self.bias is not None:
      result += self.bias  # in-place add on the freshly created product
    self.out = result
    return result

  def parameters(self):
    """Return the trainable tensors: the weight, plus the bias when present."""
    params = [self.weight]
    if self.bias is not None:
      params.append(self.bias)
    return params

# -----------------------------------------------------------------------------------------------
class BatchNorm1d:
  """Batch normalisation over the last (feature) dimension.

  In training mode the statistics are computed over every dimension except the
  last one, so inputs of shape (N, C), (N, L, C) or any higher rank ending in C
  are supported. In evaluation mode (`self.training = False`) the running
  statistics collected during training are used instead.

  Args:
    dim: number of features C (size of the last input dimension).
    eps: constant added to the variance before the square root for stability.
    momentum: weight given to the current batch when updating running statistics.

  Raises:
    ValueError: in training mode, if the input has fewer than 2 dimensions.
  """

  def __init__(self, dim, eps=1e-5, momentum=0.1):
    self.eps = eps
    self.momentum = momentum
    self.training = True
    # parameters (trained with backprop)
    self.gamma = torch.ones(dim)
    self.beta = torch.zeros(dim)
    # buffers (updated with an exponential moving average, not by backprop)
    self.running_mean = torch.zeros(dim)
    self.running_var = torch.ones(dim)

  def __call__(self, x):
    """Normalise `x`; the result is also kept in `self.out`."""
    if self.training:
      if x.ndim < 2:
        raise ValueError(
          f'BatchNorm1d expects an input with at least 2 dimensions, got {x.ndim}')
      # reduce over all leading dims: (0,) for 2D input, (0, 1) for 3D input, ...
      reduce_dims = tuple(range(x.ndim - 1))
      xmean = x.mean(reduce_dims, keepdim=True) # batch mean
      xvar = x.var(reduce_dims, keepdim=True) # batch variance
    else:
      xmean = self.running_mean
      xvar = self.running_var
    xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to unit variance
    self.out = self.gamma * xhat + self.beta
    # update the buffers
    if self.training:
      with torch.no_grad():
        # flatten the keepdim statistics so the buffers keep their (dim,) shape
        # instead of silently turning into (1, dim) / (1, 1, dim)
        self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * xmean.reshape(-1)
        self.running_var = (1 - self.momentum) * self.running_var + self.momentum * xvar.reshape(-1)
    return self.out

  def parameters(self):
    """Return the trainable tensors (scale and shift)."""
    return [self.gamma, self.beta]

# -----------------------------------------------------------------------------------------------
class Tanh:
  """Element-wise hyperbolic tangent activation; has no trainable parameters."""

  def __call__(self, x):
    """Apply tanh to `x`; the result is also kept in `self.out`."""
    activated = torch.tanh(x)
    self.out = activated
    return activated

  def parameters(self):
    """Return an empty list: this layer has nothing to train."""
    return list()

class Embedding:
  """Lookup table mapping integer indices to learned vectors.

  Args:
    num_embeddings: number of rows in the table (distinct indices).
    embedding_dim: length of each embedding vector.
  """

  def __init__(self, num_embeddings, embedding_dim):
    table_shape = (num_embeddings, embedding_dim)
    self.weight = torch.randn(table_shape)

  def __call__(self, IX):
    """Index the table with `IX`; the result is also kept in `self.out`."""
    looked_up = self.weight[IX]
    self.out = looked_up
    return looked_up

  def parameters(self):
    """Return the trainable tensors (just the embedding table)."""
    return [self.weight]

class FlattenConsecutive:
  """Concatenate every `n` consecutive time steps into one wider vector.

  A plain flatten would merge all time steps at once, e.g. 8 characters with
  10-dim embeddings -> (32, 80). WaveNet-style, neighbouring positions are
  grouped instead: with n=2 an input of shape (32, 8, 10) becomes (32, 4, 20),
  i.e. pairs [(a, b), (c, d), ...] with each pair's embeddings side by side.
  When only one group remains, the middle dimension is dropped.
  """

  def __init__(self, n):
    self.n = n

  def __call__(self, x):
    """Regroup a (B, T, C) tensor into (B, T//n, C*n); the result is also kept in `self.out`."""
    batch, steps, channels = x.shape
    grouped = x.view(batch, steps // self.n, channels * self.n)
    # (B, 1, C*n) collapses to (B, C*n) so a following layer sees a 2D input
    self.out = grouped.squeeze(1) if grouped.shape[1] == 1 else grouped
    return self.out

  def parameters(self):
    """Return an empty list: this layer has nothing to train."""
    return []

# -----------------------------------------------------------------------------------------------

class Sequential:
  """Container that applies a list of layers one after another.

  Args:
    layers: callables that each also expose a `parameters()` method.
  """

  def __init__(self,layers):
    self.layers = layers

  def __call__(self, x):
    """Feed `x` through every layer in order; the final result is also kept in `self.out`."""
    activation = x
    for module in self.layers:
      activation = module(activation)
    self.out = activation
    return activation

  def parameters(self):
    """Return the parameters of all layers as one flat list, in layer order."""
    collected = []
    for module in self.layers:
      collected.extend(module.parameters())
    return collected

In [78]:
# Seed torch's default random number generator; the returned generator is kept in `g`
# and passed explicitly to the minibatch sampler in the training loop.
g = torch.manual_seed(42)

In [82]:
n_embd = 10     # dimensionality of each character embedding
n_hidden = 200  # number of units in every hidden layer

# Hierarchical (WaveNet-style) network: after the embedding, three identical stages
# each fuse pairs of neighbouring positions (8 -> 4 -> 2 -> 1 for a context of 8)
# and pass them through Linear -> BatchNorm1d -> Tanh. The first stage sees pairs of
# embeddings, the later ones pairs of hidden vectors.
wavenet_layers = [Embedding(vocab_size, n_embd)]
for stage_fan_in in (n_embd * 2, n_hidden * 2, n_hidden * 2):
  wavenet_layers += [
      FlattenConsecutive(2),
      Linear(stage_fan_in, n_hidden, bias=False),
      BatchNorm1d(n_hidden),
      Tanh(),
  ]
wavenet_layers.append(Linear(n_hidden, vocab_size))  # output logits, one per token
model = Sequential(wavenet_layers)

# Scale down the output layer's initial weights by a factor of 10
with torch.no_grad():
  model.layers[-1].weight *= 0.1

parameters = model.parameters()
print(sum(p.nelement() for p in parameters)) # number of parameters in total
for p in parameters:
  p.requires_grad = True


170897


In [84]:
# Debug pass: run 4 randomly chosen training examples through the model
# (this draw uses torch's default RNG state; no explicit generator is passed here)
ix = torch.randint(0, Xtr.shape[0], (4,))
Xb, Yb = Xtr[ix], Ytr[ix] # batch X,Y
logits = model(Xb)

# every layer keeps its latest output in `.out`, so the shape after each layer can be printed
for layer in model.layers:
  print(layer.__class__.__name__,':',tuple(layer.out.shape))

Embedding : (4, 8, 10)
FlattenConsecutive : (4, 4, 20)
Linear : (4, 4, 200)
BatchNorm1d : (4, 4, 200)
Tanh : (4, 4, 200)
FlattenConsecutive : (4, 2, 400)
Linear : (4, 2, 200)
BatchNorm1d : (4, 2, 200)
Tanh : (4, 2, 200)
FlattenConsecutive : (4, 400)
Linear : (4, 200)
BatchNorm1d : (4, 200)
Tanh : (4, 200)
Linear : (4, 27)


torch.Size([4, 8])


torch.Size([4, 200])

In [52]:
# Training loop: minibatch SGD with a single step decay of the learning rate
max_steps = 200000
batch_size = 32
lossi = [] # log10 of the minibatch loss, one entry per step
ud = [] # per step: log10 of (std of the update) / (std of the parameter values), for every parameter

for i in range(max_steps):

  # minibatch construct: draw batch_size random example indices using generator g
  ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
  Xb, Yb = Xtr[ix], Ytr[ix] # batch X,Y

  # forward pass
  logits = model(Xb)
  loss = F.cross_entropy(logits, Yb) # loss function

  # backward pass: clear the previous step's gradients, then backpropagate
  for p in parameters:
    p.grad = None
  loss.backward()

  # update
  lr = 0.1 if i < 150000 else 0.01 # step learning rate decay
  for p in parameters:
    p.data += -lr * p.grad # plain SGD step, applied through .data

  # track stats
  if i % 10000 == 0: # print every once in a while
    print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
  lossi.append(loss.log10().item())
  with torch.no_grad():
    ud.append([((lr*p.grad).std() / p.data.std()).log10().item() for p in parameters])

  # debugging aid: uncomment the two lines below to stop after the first 1000 steps
  # if i >= 1000:
  #   break

      0/ 200000: 2.3852
  10000/ 200000: 2.3233
  20000/ 200000: 2.2366
  30000/ 200000: 2.3198
  40000/ 200000: 2.0599
  50000/ 200000: 2.0731
  60000/ 200000: 1.9926
  70000/ 200000: 2.1057
  80000/ 200000: 2.2649
  90000/ 200000: 1.8797
 100000/ 200000: 2.1898
 110000/ 200000: 2.5004
 120000/ 200000: 2.1082
 130000/ 200000: 2.5984
 140000/ 200000: 2.5720
 150000/ 200000: 2.3032
 160000/ 200000: 2.1624
 170000/ 200000: 1.8057
 180000/ 200000: 1.7378
 190000/ 200000: 1.9618


In [48]:
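# torch.tensor(lossi).view(-1,1000).mean(1).shape
# mean(1) averages across the columns of each row (one value per row);
# mean(0) averages down the rows (one value per column).
# Example: for [[1,2,3],[2,3,4]], mean(0) -> [1.5,2.5,3.5] and mean(1) -> [2.0,3.0]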

In [49]:
# Switch to evaluation mode: BatchNorm1d checks `training` and uses its running
# statistics when it is False. The flag is set on every layer for simplicity.
for layer in model.layers:
  layer.training = False

In [50]:
# evaluate the loss
@torch.no_grad() # no gradients are needed for evaluation
def split_loss(split):
  """Print the cross-entropy loss of `model` over one whole dataset split.

  Args:
    split: one of 'train', 'val' or 'test'; any other key raises KeyError.
  """
  datasets = {
    'train': (Xtr, Ytr),
    'val': (Xdev, Ydev),
    'test': (Xte, Yte),
  }
  inputs, targets = datasets[split]
  loss = F.cross_entropy(model(inputs), targets)
  print(split, loss.item())

# report the loss on the training and validation splits (the test split is not evaluated here)
split_loss('train')
split_loss('val')

train 2.4253151416778564
val 2.4242467880249023


In [51]:
# Sample 20 names from the model, one character at a time.
# Assumes the layers were switched to evaluation mode beforehand: each forward pass
# below sees a single example, and BatchNorm1d's training branch would compute the
# variance of one sample at the last stage.
with torch.no_grad(): # sampling needs no gradients, so skip building the autograd graph
  for _ in range(20):
    out = []
    context = [0] * block_size # initialize with all ...
    while True:
      # forward pass the neural net on a batch holding just the current context
      logits = model(torch.tensor([context]))
      probs = F.softmax(logits, dim=1)
      # sample from the distribution
      ix = torch.multinomial(probs, num_samples=1).item()
      # shift the context window and track
      context = context[1:] + [ix]
      out.append(ix)
      # if we sample the special '.' token, break
      if ix == 0:
        break
    print(''.join(itos[i] for i in out))

jafaularid.
anb.
anle.
eli.
elo.
mer.
amayyel.
xec.
loneshax.
awssol.
azcqesik.
tison.
rasalurde.
alayil.
cferiodakilo.
kevtonley.
jaxah.
anerin.
nre.
anu.
