In [31]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset

import matplotlib.pyplot as plt
import seaborn as sns
import logging

In [4]:
# Download the Tiny Shakespeare corpus used as the training text for this notebook.
# wget saves it in the current working directory as `input.txt` (plain text, ~1.1 MB).
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2025-05-16 18:44:05--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2025-05-16 18:44:06 (13.0 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [None]:
# set up logging


In [6]:
# loading the dataset
def load_dataset(filname):
    """Read a whole text file into memory.

    Args:
        filname: Path of the text file to read.

    Returns:
        The file contents as a single ``str``, or ``None`` when the file
        could not be read (missing file, permission problem, undecodable
        bytes, ...). The failure reason is logged as a warning so that a
        ``None`` result can be diagnosed instead of being silently swallowed.
    """
    try:
        # Explicit encoding keeps the decoded text (and therefore the
        # vocabulary built from it) identical across platforms/locales.
        with open(filname, 'r', encoding='utf-8') as f:
            return f.read()
    except Exception as e:
        # Best-effort contract is preserved (callers still receive None),
        # but the cause is no longer hidden.
        logging.warning("Could not load dataset from %r: %s", filname, e)
        return None

In [None]:
class preprocessing_training():
    """Character-level tokenizer and random mini-batch sampler for a text corpus.

    The corpus is split chronologically into train (81%), validation (9%) and
    test (10%). Batches are made of ``Batch`` randomly positioned chunks of
    ``Time`` consecutive tokens (inputs ``x``) together with the same chunks
    shifted one position to the right (targets ``y``).

    Args:
        text: The full corpus as a single string.
        Batch: Number of chunks per batch (processed in parallel by the model).
        Time: Length of each chunk in tokens (the context/block size).
        seed: Seed of the private random generator used to sample chunk
            positions. The same seed reproduces the same sequence of batches.
    """

    def __init__(self, text, Batch, Time, seed=1337):
        self.text = text
        self.dataset_size = len(self.text)
        # Distinct tokens in the dataset (for this model a token is a character).
        self.vocab = sorted(set(self.text))
        self.vocab_size = len(self.vocab)
        self.Batch = Batch
        self.Time = Time
        # Private generator: seeded once, so runs are reproducible while
        # successive batches still differ and the global torch RNG (used for
        # weight init, dropout, ...) is left untouched.
        self._generator = torch.Generator().manual_seed(seed)
        # Cache of encoded splits, filled lazily: split name -> 1-D LongTensor.
        self._encoded_splits = {}
        # Build the lookup tables up front so encoding()/decoding() work
        # without a separate manual call.
        self.hash_for_tokenization()

    def hash_for_tokenization(self):
        """Build the token <-> id lookup tables from the vocabulary.

        Tokenization is the process of splitting text into units (tokens) like
        characters, words, or subwords, and mapping each token to a unique
        numerical ID for model input.
        """
        self.str_to_int = {char: index for index, char in enumerate(self.vocab)}
        self.int_to_str = {index: char for index, char in enumerate(self.vocab)}

    def encoding(self, st: str):
        """Map a string to its list of token ids.

        Raises:
            KeyError: If ``st`` contains a character that is not in the vocabulary.
        """
        return [self.str_to_int[x] for x in st]

    def decoding(self, li: list):
        """Map a sequence of token ids back to a string.

        Ids are passed through ``int()`` so that 0-d tensors (e.g. obtained by
        iterating over a 1-D tensor) are accepted as well as plain ints.
        """
        return ''.join([self.int_to_str[int(x)] for x in li])

    def train_test_validation_split(self):
        """
        Splits the dataset into train (81%), validation (9%), and test (10%).

        - First 90% is training+validation
        - Last 10% is test
        - From the 90%, 10% is taken as validation (i.e., 9% of total)

        Returns:
            Tuple ``(train_text, val_text, test_text)`` of strings.
        """
        split_index_test = int(0.9 * len(self.text))
        train_val_text = self.text[:split_index_test]
        test_text = self.text[split_index_test:]

        split_index_val = int(0.9 * len(train_val_text))
        train_text = train_val_text[:split_index_val]
        val_text = train_val_text[split_index_val:]

        return train_text, val_text, test_text

    def _encoded_split(self, split: str):
        """Return the requested split as a cached 1-D tensor of token ids.

        ``'train'`` and ``'validation'`` select those splits; any other value
        selects the test split.
        """
        key = split if split in ('train', 'validation') else 'test'
        if key not in self._encoded_splits:
            self.train_text, self.val_text, self.test_text = self.train_test_validation_split()
            texts = {
                'train': self.train_text,
                'validation': self.val_text,
                'test': self.test_text,
            }
            # Encode once; slicing a tensor per batch is far cheaper than
            # re-splitting and re-encoding the corpus on every call.
            self._encoded_splits[key] = torch.tensor(self.encoding(texts[key]), dtype=torch.long)
        return self._encoded_splits[key]

    def batch_index(self, split: str):
        """Sample ``Batch`` random chunk start positions inside a split.

        Args:
            split: ``'train'``, ``'validation'``, or anything else for test.

        Returns:
            1-D integer tensor of shape ``(Batch,)`` with values in
            ``[0, len(split) - Time)``, so that both the input chunk and its
            one-step-shifted target fit inside the split.

        Raises:
            ValueError: If the split is not longer than ``Time`` tokens.
        """
        data = self._encoded_split(split)
        high = len(data) - self.Time
        if high <= 0:
            raise ValueError(
                f"split {split!r} has {len(data)} tokens, which is too short "
                f"for chunks of length Time={self.Time}"
            )
        return torch.randint(high, (self.Batch,), generator=self._generator)

    def batch_of_chunks(self, split: str = 'train'):
        """Create one batch of input (x) and target (y) tensors.

        Args:
            split: Which split to sample from (defaults to ``'train'``).

        Returns:
            Tuple ``(x, y)`` of integer tensors, each of shape ``(Batch, Time)``;
            ``y`` holds the tokens of ``x`` shifted by one position, i.e. the
            next-token targets.
        """
        data = self._encoded_split(split)
        starts = self.batch_index(split).tolist()
        x = torch.stack([data[ix: ix + self.Time] for ix in starts])
        y = torch.stack([data[ix + 1: ix + self.Time + 1] for ix in starts])
        return x, y

    
        

        
        
        
        
        