In [3]:
def read_file(_file, encoding="utf-8"):
    """Return the full text content of a file, or None if it does not exist.

    Args:
        _file: Path of the file to read.
        encoding: Text encoding used to decode the file. Passed explicitly so
            the result does not depend on the platform's default encoding.

    Returns:
        The file content as a single string, or None when the file is missing.
    """
    try:
        with open(_file, 'r', encoding=encoding) as f:
            return f.read()
    except FileNotFoundError:
        # Deliberate best-effort: callers must check for None themselves.
        return None

In [4]:
# Load the whole training corpus as one string.
# NOTE(review): absolute, machine-specific path; read_file returns None when the
# file is missing, so later cells will fail if this path does not exist.
contents = read_file('/Users/UI0627/Projects/genai/input.txt')

In [18]:
# In our case the vocab is character level.
# In practice, it can be word or sub-word (token) level.
def get_vocab(contents):
    """Return the distinct items of ``contents`` as a sorted list."""
    return sorted(set(contents))

# Sorted list of the unique characters in the corpus, and how many there are.
vocab = get_vocab(contents)
vocab_size = len(vocab)

In [6]:
# Now let's define the encoder and decoder functions.
# Alternatives: open-source tokenizers. For example, GPT uses the tiktoken library.

# Character -> integer id, and the inverse integer id -> character.
# An id is the character's position in the sorted vocab.
encoder_map = { ch: i for i, ch in enumerate(vocab) }
decoder_map = { i: ch for i, ch in enumerate(vocab) }

def encoder(str):
    """Translate each character of ``str`` into its integer id.

    Ids are looked up in the module-level ``encoder_map``; a character that is
    not a key of that map raises KeyError.
    """
    ids = []
    for symbol in str:
        ids.append(encoder_map[symbol])
    return ids

def decoder(idx_arr):
    """Translate an iterable of integer ids back into a single string.

    Ids are looked up in the module-level ``decoder_map``; an id that is not a
    key of that map raises KeyError.
    """
    symbols = (decoder_map[token_id] for token_id in idx_arr)
    return "".join(symbols)

# sample_arr = encoder("Hello!")
# back = decoder(sample_arr)
# print(sample_arr, back)


In [7]:
# Convert to Tensor
import torch

# Encode the whole corpus into integer ids and wrap the flat id list in a tensor.
data = torch.tensor(encoder(contents))

# print(data.dtype, data.shape)
# print(data[:100])

In [8]:
# Convert dataset to Training and Testing Data

def split_data(data, split_ratio=0.9):
    """Cut ``data`` into a leading and a trailing part.

    Args:
        data: Any sliceable sequence with a length.
        split_ratio: Fraction of ``data`` that goes into the first part; the
            cut index is ``int(len(data) * split_ratio)``.

    Returns:
        A ``(first_part, remainder)`` tuple of slices of ``data``.
    """
    cut = int(split_ratio * len(data))
    return data[:cut], data[cut:]

# With the default split_ratio, the first 90% of the encoded corpus is used for
# training and the remaining 10% is held out.
train_data, test_data = split_data(data)
# print(len(train_data), len(test_data))
# print(train_data[:10], test_data[:10])

In [9]:
# A transformer is never fed the entire dataset at once; that is computationally expensive.
# Instead, the data is fed in fixed-size chunks (blocks).

# Time dimension: number of consecutive tokens in each chunk
block_size = 8

In [94]:
# Our hardware can work on several things at the same time,
# so we feed the transformer multiple chunks together as one batch.
# A suitable value depends on how capable the GPU is.

# Batch dimension: number of chunks processed in parallel
batch_size = 32

In [90]:
# import random

# def get_batches_v1(data):

#     # Get a random index from data
#     random_idx = random.randint(0, len(data) - block_size)
#     x = data[random_idx:random_idx+block_size]
#     y = data[random_idx+1:random_idx+block_size+1]

#     return x, y

# # get_batch(train_data)

# def get_batches(data):

#     x = []
#     y = []

#     for _ in range(batch_size):
#         batch_x, batch_y = get_batch(data)
#         x.append(batch_x)
#         y.append(batch_y)

#     return x, y


# If you want same random numbers every time
# torch.manual_seed(234)

# Alternative Tensor version for same
def get_batches_v2(data):
    """Sample a random batch of input/target chunks from ``data``.

    Uses the module-level ``batch_size`` (number of chunks) and ``block_size``
    (length of each chunk). Each target chunk is the input chunk shifted one
    position to the right, so ``y[b, t]`` is the element following ``x[b, t]``.

    Returns:
        ``(x, y)``: two stacked tensors, each with ``batch_size`` rows of
        ``block_size`` consecutive elements of ``data``.
    """
    # Exclusive upper bound keeps the shifted target slice inside ``data``.
    n_starts = len(data) - block_size
    starts = torch.randint(n_starts, size=(batch_size,)).tolist()

    inputs, targets = [], []
    for start in starts:
        stop = start + block_size
        inputs.append(data[start:stop])
        targets.append(data[start + 1:stop + 1])

    return torch.stack(inputs), torch.stack(targets)

def get_batch(split):
    """Return a random ``(x, y)`` batch from the requested split.

    ``"train"`` samples from ``train_data``; any other value samples from
    ``test_data``.
    """
    if split == "train":
        return get_batches_v2(train_data)
    return get_batches_v2(test_data)

# xb, yb = get_batches(train_data)
# Draw one random training batch: xb holds the input chunks and yb the same
# chunks shifted one position to the right (the next-token targets).
xb, yb = get_batch("train")

In [59]:
import torch
import torch.nn as nn
from torch.nn import functional as F


class BigramModel(nn.Module):
    """Bigram language model: next-token logits are a table lookup on the current token.

    The only parameters are an embedding table with ``vocab_size`` rows and
    ``embedding_dim`` columns; the row for token ``i`` is used directly as the
    logits for whatever follows token ``i``.

    NOTE(review): because the rows are used as scores over the vocabulary
    (cross-entropy targets and sampled ids index into them), the model is only
    meaningful when ``embedding_dim == vocab_size``, which is the default.
    """

    def __init__(self, vocab_size, embedding_dim=None):
        """
        Args:
            vocab_size: Number of distinct token ids.
            embedding_dim: Width of each embedding row; defaults to ``vocab_size``.
        """
        super().__init__()

        if embedding_dim is None:
            embedding_dim = vocab_size

        self.embeddings = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, idx, targets=None):
        """Compute logits and, when targets are given, the cross-entropy loss.

        Args:
            idx: Integer token ids of shape (Batch, Time).
            targets: Optional integer token ids with the same shape as ``idx``.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape
            (Batch, Time, Channel) and ``loss`` is None. With targets,
            ``logits`` is flattened to (Batch*Time, Channel) and ``loss`` is
            the scalar cross-entropy.
        """
        # Batch X Time  ->  Batch X Time X Embedding (channel)
        logits = self.embeddings(idx)

        if targets is None:
            return logits, None

        # cross_entropy expects (N, C) logits and (N,) targets, so fold the
        # batch and time dimensions together. reshape (unlike view) also
        # accepts non-contiguous targets.
        B, T, C = logits.shape
        logits = logits.reshape(B * T, C)
        targets = targets.reshape(B * T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens=1):
        """Extend each sequence in ``idx`` by sampling ``max_new_tokens`` tokens.

        Args:
            idx: Integer token ids of shape (Batch, Time) used as the context.
            max_new_tokens: Number of tokens to append to every sequence.

        Returns:
            Tensor of shape (Batch, Time + max_new_tokens): the original
            context followed by the sampled tokens.
        """
        # Sampling needs no gradients; disabling autograd avoids building a
        # graph for every generation step.
        with torch.no_grad():
            for _ in range(max_new_tokens):

                # Logits for every position of the current sequences
                logits, _ = self(idx)

                # Only the last time step predicts the next token
                logits = logits[:, -1, :]

                # Turn the scores into a probability distribution
                probs = F.softmax(logits, dim=1)

                # Multiple ways of picking the next token:
                # 1. Take the most likely token (greedy)
                # idx_next = torch.argmax(probs, dim=1, keepdim=True)
                # 2. Sample one from the distribution
                idx_next = torch.multinomial(probs, num_samples=1)

                # Append the new token along the time dimension
                idx = torch.cat([idx, idx_next], dim=1)

        return idx


In [86]:
# Untrained bigram model; embedding_dim is left at its default, so the
# embedding table is vocab_size x vocab_size.
model = BigramModel(vocab_size)
# logits, loss = model(xb, yb)
# print(loss)

# Extend every sequence in xb by 5 sampled tokens and print the first sequence
# as text (its original context followed by the 5 new characters).
print(decoder(model.generate(xb, 5)[0].tolist()))

promise.EbZJk


In [127]:
# AdamW optimizer over all model parameters with learning rate 1e-3;
# train() calls optimizer.step() after each backward pass to update them.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

# Generation seed: a batch of one sequence containing only token id 0.
test_d = torch.zeros( (1, 1), dtype=torch.long)

def train(num_steps=1000, log_every=100):
    """Run a simple training loop on the module-level ``model``.

    Each step draws a random training batch with ``get_batch("train")``,
    computes the loss, and applies one update with the module-level
    ``optimizer``.

    Args:
        num_steps: Number of optimisation steps to run.
        log_every: Print ``loss step`` every this many steps; a falsy value
            disables logging.

    Returns:
        The loss tensor of the last step, or None when ``num_steps`` is 0.
    """
    loss = None  # stays None if the loop body never runs

    for step in range(num_steps):
        xb, yb = get_batch("train")

        _, loss = model(xb, yb)
        # Clear gradients from the previous step before accumulating new ones.
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

        if log_every and step % log_every == 0:
            print(loss.item(), step)

    return loss

# Report the loss on a fresh training batch before this round of training.
# Computing it here (instead of printing a `loss` variable left over from an
# earlier execution) keeps the cell working after Restart & Run All.
with torch.no_grad():
    loss_before = model(*get_batch("train"))[1]
print(loss_before.item())
loss = train()

2.520458459854126
2.485140085220337 0
2.56502366065979 100
2.5035088062286377 200
2.4271862506866455 300
2.3636176586151123 400
2.396932363510132 500
2.363022804260254 600
2.467040538787842 700
2.3500454425811768 800
2.5277249813079834 900


In [130]:
# Loss of the last training step, as returned by train()
print(loss.item())
# Sample 10000 new tokens starting from test_d and print the result as text
print(decoder(model.generate(test_d, 10000)[0].tolist()))

2.4259490966796875

NUS:
RUS:
Sthed m My ppalicouromy hom hes.
an lawin ceres: f nopesindom f d? y s at it OLAn frilin ll he: u th,
ANOnce il tir heds:
Je:

Wistlant the,
YON wa hithe lle t tina my s ARo sta mbe chor watan t vetoam wi knd,
ABuirs y areredouis surato o thencindeit oile ben We akist thig urinoutoof chng tr ize:
agrd b sn t Wha s.
TENENuroves tholely
Fit cherd w me the, echourirores RKICOucce t
'ss;

ARYOMavioute ge hantut:
Whthaus INGo ther t ad s:
Anomely pril aken;
F Jar lllver ta tobessere venewe to bert t't omithingoureerorllighorille r ig ferif eponghe f iser st;
I gho:

ME:

Ande ad mexscot nor LAn in, pais:
I h viveerol Yorothay ond hathare d whalo tse y,
T:
Wid he



USotitlime nclke g p llloetre
TE:
UTHUSockn
ARivinthizerur ave R:
Hererizid s LO:
Ay bousondintesthu ghnus me ndur n rly flikiraniowhare.
WI nso herdarit cay, g so jufind lot yor ar'seng purenoofteel ex s OUSes. brie?
Seant OLERENCHed pp bry thit 'se bon hurm s ws t il;
Sendes
Yor aithuens th mean'so