In [2]:
def read_file(_file):
    """Return the entire text content of ``_file``.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default locale encoding.

    Args:
        _file: Path of the text file to read.

    Returns:
        The file content as a single string, or ``None`` when the file does
        not exist. Callers must check for ``None`` before using the result.
    """
    try:
        with open(_file, 'r', encoding='utf-8') as f:
            return f.read()
    except FileNotFoundError:
        # Deliberate best-effort: a missing file is reported as None, not raised.
        return None

In [3]:
# Load the raw corpus as one string; read_file returns None if the path is missing.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
contents = read_file('/Users/UI0627/Projects/genai/input.txt')

In [21]:
# In our case the vocab is character-level
# In practice, it can be word-level or sub-word-level
def get_vocab(contents):
    """Build the character-level vocabulary of ``contents``.

    Returns a sorted list holding each distinct character exactly once.
    """
    # sorted() accepts any iterable, so the set can be passed in directly.
    return sorted(set(contents))

# Sorted list of the unique characters found in `contents`.
vocab = get_vocab(contents)

In [22]:
# Now let's define the encoder and decoder functions
# Alternatives: open-source tokenizers. For example, GPT uses the tiktoken library

# Lookup tables between characters and their position in the sorted vocab.
encoder_map = dict(zip(vocab, range(len(vocab))))  # character -> integer id
decoder_map = dict(enumerate(vocab))               # integer id -> character

def encoder(str):
    """Translate text into a list of integer ids, one per character.

    Each character is looked up in ``encoder_map``; a character that is not in
    the map raises ``KeyError``.
    """
    ids = []
    for ch in str:
        ids.append(encoder_map[ch])
    return ids

def decoder(idx_arr):
    """Translate an iterable of integer ids back into a string.

    Each id is looked up in ``decoder_map``; an id that is not in the map
    raises ``KeyError``.
    """
    chars = (decoder_map[idx] for idx in idx_arr)
    return "".join(chars)

# sample_arr = encoder("Hello!")
# back = decoder(sample_arr)
# print(sample_arr, back)


[20, 43, 50, 50, 53, 2] Hello!


In [24]:
# Convert to Tensor
import torch

def get_tensor_representation(arr):
    """Copy ``arr`` into a new tensor of dtype ``torch.long`` (int64)."""
    as_long = torch.tensor(arr, dtype=torch.long)
    return as_long

# Encode the whole corpus to integer ids and hold it as a torch.long tensor.
data = get_tensor_representation(encoder(contents))

# print(data.dtype, data.shape)
# print(data[:100])

torch.int64 torch.Size([746442])
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [25]:
# Convert dataset to Training and Testing Data

def split_data(data, train_fraction=0.9):
    """Split ``data`` into a leading training part and a trailing test part.

    The split is positional (no shuffling): the first ``train_fraction`` of the
    elements become the training data and the remainder the test data.

    Args:
        data: Any sliceable sequence that supports ``len()``.
        train_fraction: Share of ``data`` assigned to training, between 0 and 1
            inclusive. Defaults to 0.9.

    Returns:
        A ``(train_data, test_data)`` tuple of slices of ``data``.

    Raises:
        ValueError: If ``train_fraction`` is outside ``[0, 1]``.
    """
    if not 0.0 <= train_fraction <= 1.0:
        raise ValueError(
            f"train_fraction must be between 0 and 1, got {train_fraction!r}"
        )
    # int() truncates, so any fractional element goes to the test side.
    split_point = int(len(data) * train_fraction)
    return data[:split_point], data[split_point:]

# Leading 90% of the encoded corpus for training, the trailing 10% held out.
train_data, test_data = split_data(data)
# print(len(train_data), len(test_data))
# print(train_data[:10], test_data[:10])

671797 74645
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47]) tensor([52, 43, 43, 42, 57,  1, 39, 40, 47, 42])


In [26]:
# A transformer is never fed the entire dataset at once; that is computationally expensive.
# So, the data is fed in chunks, or blocks

block_size = 8

# The block size determines how many predictions the NN can make per chunk in one iteration
# For example, take the chunk 18, 47, 56, 57, 58,  1, 15, 47
# --> In this chunk:
#     Given 18, predict 47
#     Given 18, 47, predict 56
#     ...and so on for each longer prefix

In [28]:
# However, since our hardware can work on several things at the same time,
# we want to feed the transformer multiple chunks at the same time.
# How large this value can be depends on how capable the GPU is
batch_size = 4

In [43]:
# import random

# def get_batch(data):
        
#     # Get a random index from data
#     random_idx = random.randint(0, len(data) - block_size)
#     x = data[random_idx:random_idx+block_size]
#     y = data[random_idx+1:random_idx+block_size+1]

#     return x, y

# # get_batch(train_data)

# def get_batches(data):

#     x = []
#     y = []

#     for _ in range(batch_size):
#         batch_x, batch_y = get_batch(data)
#         x.append(batch_x)
#         y.append(batch_y)

#     return x, y


# If you want the same random numbers every time
# torch.manual_seed(234)

# Alternative tensor-based version of the same batching logic
def get_batches_v2(data):
    """Sample a random batch of input chunks and their next-element targets.

    Draws ``batch_size`` random start offsets into ``data``. For each offset the
    input is the ``block_size`` elements starting there, and the target is the
    same window shifted one position to the right. Reads the module-level
    ``block_size`` and ``batch_size``.

    Returns:
        ``(x, y)``: the input chunks and the target chunks, each stacked into
        one tensor along a new leading dimension.
    """
    # randint's upper bound is exclusive, so start + block_size + 1 <= len(data).
    starts = torch.randint(len(data) - block_size, (batch_size,)).tolist()
    inputs, targets = [], []
    for start in starts:
        stop = start + block_size
        inputs.append(data[start:stop])
        targets.append(data[start + 1:stop + 1])
    return torch.stack(inputs), torch.stack(targets)


# xb, yb = get_batches(train_data)
# Draw one random batch of inputs (xb) and next-element targets (yb) from the training split.
xb, yb = get_batches_v2(train_data)
