In [1]:
import torch
import torch.nn.functional as F
import csv
import itertools
import operator
import numpy as np
import nltk
import sys
from datetime import datetime
from utils import *

import matplotlib.pyplot as plt
%matplotlib inline

# Download NLTK model data (you need to do this once)
nltk.download("book")

[nltk_data] Downloading collection 'book'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /Users/kenny/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package brown to
[nltk_data]    |     /Users/kenny/nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package chat80 to
[nltk_data]    |     /Users/kenny/nltk_data...
[nltk_data]    |   Package chat80 is already up-to-date!
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /Users/kenny/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package conll2000 to
[nltk_data]    |     /Users/kenny/nltk_data...
[nltk_data]    |   Package conll2000 is already up-to-date!
[nltk_data]    | Downloading package conll2002 to
[nltk_data]    |     /Users/kenny/nltk_data...
[nltk_data]    |   Package conll2002 is already up-to-date!
[nltk_data]    | Downloading package dependency_t

True

In [2]:
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

In [7]:
with open('../data/random.csv', 'r', newline='', encoding='utf-8') as f:
    # Initalize a reader object
    reader = csv.reader(f, skipinitialspace=True)
    # Skip the header row
    # next(reader)  
    # Split full comments into sentences  - [nltk.sent_tokenize(x[0].lower()) for x in reader] - for the paragraph x[0] from the csv file, make it lowercase and tokenize all sentence
    # For all pararaphs in the csv file. * operator unpacks the list into individual sentences, and creates a single iterable
    # sentences = itertools.chain(*[nltk.sent_tokenize(x[0].lower()) for x in reader])
    sentences = itertools.chain(*[nltk.sent_tokenize(str(x).lower()) for x in reader])
    # Append SENTENCE_START and SENTENCE_END
    # Replace all sentence x in sentences with the start token, sentence body, and text token"
    sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]
print (f"Parsed {len(sentences)} sentences.")

Parsed 646 sentences.


In [54]:
# Tokenize the sentences into words
tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

In [76]:
tokenized_sentences = [[word for word in sentence if word not in {'[', ']', '(', ')'}] for sentence in tokenized_sentences]

In [77]:
# # List of lists
print(tokenized_sentences[10])

['SENTENCE_START', "'she", 'tilted', 'her', 'head', 'back', 'and', 'let', 'whip', 'cream', 'stream', 'into', 'her', 'mouth', 'while', 'taking', 'a', 'bath', '.', "'", 'SENTENCE_END']


In [78]:
b = {}

for i in range (len(tokenized_sentences)):
    for word1, word2, in zip(tokenized_sentences[i], tokenized_sentences[i][1:]):
        # Create a tuple
        bigram = (word1, word2)
        # Index into the dictionary, update it by one
        b[bigram] = b.get(bigram, 0) + 1


In [79]:
# Taking the first 20,000 entries as our vocabulary
sorted(b.items(), key = lambda kv: -kv[1])

[(("'", 'SENTENCE_END'), 542),
 (('.', "'"), 528),
 (("'", ','), 116),
 (('SENTENCE_START', "'the"), 109),
 (("''", 'SENTENCE_END'), 102),
 (('SENTENCE_START', '``'), 102),
 (('.', "''"), 97),
 (('SENTENCE_START', "'he"), 70),
 (('in', 'the'), 50),
 (('SENTENCE_START', "'"), 46),
 (('SENTENCE_START', "'she"), 41),
 (("'", 'i'), 41),
 (('of', 'the'), 40),
 ((',', "'but"), 29),
 (('on', 'the'), 25),
 (('to', 'be'), 19),
 (('``', 'it'), 18),
 (('from', 'the'), 18),
 (('``', 'i'), 17),
 (('at', 'the'), 16),
 ((',', "'he"), 16),
 (('was', 'the'), 16),
 (('SENTENCE_START', "'it"), 15),
 ((',', '``'), 14),
 (("''", ','), 14),
 (('to', 'the'), 14),
 (('by', 'the'), 14),
 (('had', 'a'), 13),
 (('as', 'a'), 13),
 (('it', '.'), 13),
 (('he', 'was'), 13),
 (('for', 'the'), 12),
 (('it', 'was'), 12),
 (('in', 'a'), 12),
 (('SENTENCE_START', "'there"), 12),
 (('did', "n't"), 12),
 (('SENTENCE_START', "'when"), 11),
 (('``', 'he'), 11),
 (("'he", 'was'), 11),
 (('SENTENCE_START', "'they"), 11),
 (('w

In [80]:
word_set = set(word for bigram, count in sorted(b.items(), key=lambda kv: -kv[1])[:20000] for word in bigram)

In [81]:
dim = len(word_set)
dim

2764

In [82]:
stoi = {s:i for i,s in enumerate(word_set)} # string to index
itos = {i:s for s, i in stoi.items()}
stoi

{"'car": 0,
 'hot': 1,
 'barbed': 2,
 'daily': 3,
 'mountains': 4,
 'game': 5,
 'into': 6,
 'mere': 7,
 'soul': 8,
 'gnome': 9,
 'mum': 10,
 "'iguanas": 11,
 "'plans": 12,
 'debate': 13,
 'raccoon': 14,
 'complete': 15,
 '500': 16,
 'looks': 17,
 'grumbling': 18,
 'dangerous': 19,
 'glaze': 20,
 'makes': 21,
 'coffee': 22,
 'cook': 23,
 'surgical': 24,
 'birds': 25,
 'goodbye': 26,
 'pesticides': 27,
 'pressed': 28,
 'useless': 29,
 'flat': 30,
 'exchanged': 31,
 'powerful': 32,
 'couldnâ€™t': 33,
 'bowling': 34,
 'seemed': 35,
 'orchard': 36,
 'icon': 37,
 'gnu': 38,
 'platypus': 39,
 'songs': 40,
 'stairway': 41,
 'dancing': 42,
 'easily': 43,
 "'check": 44,
 'closet': 45,
 'the': 46,
 'vacant': 47,
 'why': 48,
 'level': 49,
 'rather': 50,
 'manuals': 51,
 'sister': 52,
 'hacked': 53,
 'chosen': 54,
 'popping': 55,
 'pass': 56,
 'explaining': 57,
 'it': 58,
 'come': 59,
 'nair': 60,
 'race': 61,
 'indicated': 62,
 'monday': 63,
 'lady': 64,
 'grew': 65,
 'entered': 66,
 'useful': 67,

In [83]:
for i, sentence in enumerate(tokenized_sentences):
    for j, word in enumerate(sentence):
        if word not in word_set:
            tokenized_sentences[i][j] = 'UNKNOWN_TOKEN'

In [84]:
def build_dataset(tokenized_sentences):
    block_size = 8
    X, Y = [], []
    for sentence in tokenized_sentences:
        context = [0] * block_size
        for word in sentence:
            if word == 'UNKNOWN_TOKEN':
                continue
            ix = stoi[word]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix]
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    print(X.shape, Y.shape)
    return X, Y

import random
random.seed(42)
random.shuffle(tokenized_sentences)
n1 = int(0.8 * len(tokenized_sentences))
n2 = int(0.9 * len(tokenized_sentences))

Xtr, Ytr = build_dataset(tokenized_sentences[:n1])
Xdev, Ydev = build_dataset(tokenized_sentences[n1:n2])
Xte, Yte = build_dataset(tokenized_sentences[n2:])

    

torch.Size([8784, 8]) torch.Size([8784])
torch.Size([2082, 8]) torch.Size([2082])
torch.Size([1090, 8]) torch.Size([1090])


In [85]:
for x, y in zip( Xtr[:20], Ytr[:20]):
    print(' '.join(itos[ix.item()] for ix in x), '-->', itos[y.item()])

'car 'car 'car 'car 'car 'car 'car 'car --> SENTENCE_START
'car 'car 'car 'car 'car 'car 'car SENTENCE_START --> 'he
'car 'car 'car 'car 'car 'car SENTENCE_START 'he --> decided
'car 'car 'car 'car 'car SENTENCE_START 'he decided --> to
'car 'car 'car 'car SENTENCE_START 'he decided to --> live
'car 'car 'car SENTENCE_START 'he decided to live --> his
'car 'car SENTENCE_START 'he decided to live his --> life
'car SENTENCE_START 'he decided to live his life --> by
SENTENCE_START 'he decided to live his life by --> the
'he decided to live his life by the --> big
decided to live his life by the big --> beats
to live his life by the big beats --> manifesto
live his life by the big beats manifesto --> .
his life by the big beats manifesto . --> '
life by the big beats manifesto . ' --> SENTENCE_END
'car 'car 'car 'car 'car 'car 'car 'car --> SENTENCE_START
'car 'car 'car 'car 'car 'car 'car SENTENCE_START --> '100
'car 'car 'car 'car 'car 'car SENTENCE_START '100 --> years
'car 'car 'car 'c

In [86]:
# ----------------------------------------------------------------------------------------------------------------
class Linear:
    def __init__(self, fan_in, fan_out, bias = True):
        self.weight = torch.randn((fan_in, fan_out)) / fan_in ** 0.5
        self.bias = torch.zeros(fan_out) if bias else None

    def __call__(self, x):
        self.out = x @ self.weight
        if self.bias is not None:
            self.out += self.bias
        return self.out
    def parameters(self):
        return [self.weight] + ([] if self.bias is None else [self.bias])

# ----------------------------------------------------------------------------------------------------------------
class BatchNorm1d:
    def __init__(self, dim, eps = 1e-5, momentum = 0.1):
        self.eps = eps # epsilon
        self.momentum = momentum
        self.training = True
        # Parameters trained in backprop
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)
        # Buffers (trained with a running 'momentum update')
        self.running_mean = torch.zeros(dim)
        self.running_var = torch.ones(dim)

    def __call__(self, x):
        # Forward pass
        if self.training:
            if x.ndim ==2:
                dim = 0
            elif x.ndim == 3:
                dim = (0,1)
            xmean = x.mean(0, keepdim = True)
            xvar = x.var(0, keepdim = True, unbiased = True)
        else:
            xmean = self.running_mean
            xvar = self.running_var
        xhat = (x-xmean) / torch.sqrt(xvar + self.eps) 
        self.out = self.gamma * xhat + self.beta
        # Update our buffers
        if self.training:
            with torch.no_grad():
                self.running_mean = (1-self.momentum) * self.running_mean + self.momentum * xmean
                self.running_var = (1-self.momentum) * self.running_var + self.momentum * xvar
        return self.out

    def parameters(self):
        return [self.gamma, self.beta]

# ----------------------------------------------------------------------------------------------------------------

class Tanh:
  def __call__(self, x):
    self.out = torch.tanh(x)
    return self.out
  def parameters(self):
    return []

# ----------------------------------------------------------------------------------------------------------------

class Embedding:
    def __init__(self, num_embeddings, embeddings_dim):
        self.weight = torch.randn((num_embeddings, embeddings_dim));
    def __call__(self, IX):
        self.out = self.weight[IX]
        return self.out
    def parameters(self):
        return [self.weight]

# ----------------------------------------------------------------------------------------------------------------

class FlattenConsecutive:
    def __init__(self, n):
        self.n = n
        
    def __call__(self, x):
        B, T, C = x.shape
        x = x.view(B, T//self.n, C*self.n)
        if x.shape[1] == 1:
            x = x.squeeze(1)
        self.out = x
        return self.out
    def parameters(self):
        return []

# ----------------------------------------------------------------------------------------------------------------

class Sequential:
    def __init__(self, layers):
        self.layers = layers
    def __call__(self, x):
        for layer in self.layers:
            x = layer(x)
        self.out = x
        return self.out
    def parameters(self):
        # Get all parameters and put them in a list
        return [p for layer in self.layers for p in layer.parameters()]

In [87]:
vocab_size = dim
n_embd = 24 # the dimensionality of the character embedding vectors
n_hidden = 128 # the number of neurons in the hidden layer of the MLP

model = Sequential([
    Embedding(vocab_size, n_embd),
    FlattenConsecutive(2), Linear(n_embd * 2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
    FlattenConsecutive(2), Linear(n_hidden * 2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
    FlattenConsecutive(2), Linear(n_hidden * 2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
    Linear(n_hidden, vocab_size),
])

# # Initalize parameters
# with torch.no_grad():
#   layers[-1].weight *= 0.1

parameters = model.parameters()
print(sum(p.nelement() for p in parameters)) # number of parameters in total
for p in parameters:
  p.requires_grad = True

495340


In [109]:
# same optimization as last time
max_steps = 200000
batch_size = 32
lossi = []
ud = []

for i in range(max_steps):
  
    # minibatch construct
    ix = torch.randint(0, Xtr.shape[0], (batch_size,))
    Xb, Yb = Xtr[ix], Ytr[ix] # batch X,Y
    
    # forward pass
    logits = model(Xb)
    loss = F.cross_entropy(logits, Yb) # loss function
    
    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()
    
    # update
    lr = 0.1 if i < 150000 else 0.01 # step learning rate decay
    for p in parameters:
        p.data += -lr * p.grad

    # track stats
    if i % 10000 == 0: # print every once in a while
        print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
    lossi.append(loss.log10().item())
    # break

      0/ 200000: 7.5114
  10000/ 200000: 0.5206
  20000/ 200000: 0.5838
  30000/ 200000: 0.1938
  40000/ 200000: 0.2704
  50000/ 200000: 0.6689


KeyboardInterrupt: 

In [92]:
torch.tensor(lossi).view(-1, 1000).mean(1).shape
plt.plot(torch.tensor(lossi).view(-1,100).mean(1))

RuntimeError: shape '[-1, 1000]' is invalid for input of size 1

In [93]:
# Save tensors to text file with labels
def save_tensors_to_txt(filename, tensor_names, tensors):
    with open(filename, 'w') as file:
        for name, tensor in zip(tensor_names, tensors):
            file.write(f'{name}\n') 
            flattened_tensor = tensor.detach().numpy()  # Flatten the tensor and convert to numpy array
            flattened_tensor_str = ','.join(map(str, flattened_tensor))
            file.write(f'{flattened_tensor_str}\n') 

parameters = model.parameters()
parameter_names = ['embedding_weight', 'linear1_weight', 'linear1_gamma', 'linear1_beta', 'batchnorm1_running_mean', 'batchnorm1_running_var',
                   'linear2_weight', 'linear2_gamma', 'linear2_beta', 'batchnorm2_running_mean', 'batchnorm2_running_var',
                   'linear3_weight', 'linear3_gamma', 'linear3_beta', 'batchnorm3_running_mean', 'batchnorm3_running_var',
                   'linear4_weight', 'linear4_bias']

save_tensors_to_txt('model2_parameters.txt', parameter_names, parameters)


In [94]:
# Put all layers into eval mode
for layer in model.layers:
    layer.training = False

In [112]:
# Evaluate the loss
@torch.no_grad() # this decorator disables gradient tracking
def split_loss(split):
    x,y = {
    'train': (Xtr, Ytr),
    'val': (Xdev, Ydev),
    'test': (Xte, Yte),
    }[split]
    logits = model(x)
    loss = F.cross_entropy(logits, y)
    print(split, loss.item())

split_loss('train')
split_loss('val')

train 0.39030706882476807
val 8.892152786254883


In [113]:
# sample from the model
block_size = 8
for _ in range(5):
    counter = 0
    out = []
    context = [0] * block_size
    while True and counter < 20:
        logits = model(torch.tensor([context]))
        probs = F.softmax(logits, dim=1)
        # Sample from the distribution
        ix = torch.multinomial(probs, num_samples=1).item()
        # shift the context window and track the samples
        context = context[1:] + [ix]
        out.append(ix)
        counter = counter + 1
        if ix == stoi['SENTENCE_END']:
            break
    print('  '.join(itos[i] for i in out)) # decode and print the generated word





# # Sample from the model

# for _ in range (5):
#     out = []
#     context = [0] * block_size
#     counter = 0
#     while True & counter < 20:
#         # Embed current context using embedding table
#         emb = C[torch.tensor([context])] # (1, block_size, d)
#         h = torch.tanh(emb.view(1, -1) @ W1 + b1)
#         logits = h @ W2 + b2
#         probs = F.softmax(logits, dim = 1)
#         ix = torch.multinomial(probs, num_samples = 1, generator = g).item()
#         context = context[1:] + [ix]
#         out.append(ix)
#         counter +=1
#         if ix == stoi['SENTENCE_END']:
#             break
#         if ix == stoi['SENTENCE_START']:
#             break
#     print(' '.join(itos[i] for i in out))

SENTENCE_START  'he  figured  a  few  sticks  of  dynamite  were  easier  than  a  fishing  pole  to  catch  fish  .  '  SENTENCE_END
SENTENCE_START  'lucifer  was  surprised  at  the  amount  of  life  at  death  valley  .  '  SENTENCE_END
SENTENCE_START  'at  that  moment  she  realized  she  had  a  sixth  sense  .  '  SENTENCE_END
SENTENCE_START  'in  that  instant  '  ,  'everything  changed  .  '  SENTENCE_END
SENTENCE_START  ``  it  would  have  been  a  better  night  if  the  guys  next  to  us  were  n't  in  the  splash
