## Import Libraries

In [None]:
# Print the NVIDIA driver's view of the attached GPU(s) via the `nvidia-smi` shell command.
!nvidia-smi

Tue Nov 28 03:19:37 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   46C    P0    26W /  70W |  13923MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!pip install wandb --quiet
!pip install torchsummaryX -q
!pip install datasets
!pip install zstandard
!pip install tiktoken



In [None]:
import torch
import random
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torchsummaryX import summary
from torch.utils.data import Dataset, DataLoader
import torchaudio.transforms as tat

from sklearn.metrics import accuracy_score
import gc

import zipfile
import pandas as pd
from tqdm import tqdm
import os
import datetime
import zstandard
import datasets
import tiktoken
import random
import wandb
import math

import warnings
warnings.filterwarnings('ignore')

# Use the GPU when PyTorch can see one, otherwise run on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print("Device: ", device)

Device:  cuda


In [None]:
### If you are using colab, you can import google drive to save model checkpoints in a folder.
### This is used when connecting to GCE VMs, but the user still wants to connect to Google Drive
import os.path as path
# The whole setup is skipped when /content/drive already exists, so re-running the
# cell after a successful mount does nothing.
if not path.exists("/content/drive"):
  # Add the PPA and install the google-drive-ocamlfuse package.
  # NOTE(review): `2>&1 > /dev/null` only discards stdout; stderr is still shown.
  # If the intent was to silence both streams the order would be `> /dev/null 2>&1`.
  !sudo add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
  !sudo apt-get update -qq 2>&1 > /dev/null
  !sudo apt -y install -qq google-drive-ocamlfuse 2>&1 > /dev/null
  # First invocation without a mount point — presumably triggers the account
  # authorisation flow; TODO confirm.
  !google-drive-ocamlfuse

  !sudo apt-get install -qq w3m # to act as web browser
  !xdg-settings set default-web-browser w3m.desktop # to set default browser
  # Create the mount point /content/drive/MyDrive.
  %cd /content
  !mkdir drive
  %cd drive
  !mkdir MyDrive
  # Step back up two levels (from /content/drive to /) before mounting.
  %cd ..
  %cd ..
  # Mount Google Drive at the directory created above.
  !google-drive-ocamlfuse /content/drive/MyDrive

## Download OpenWebText Dataset

In [None]:
# 13GB dataset: https://huggingface.co/datasets/Skylion007/openwebtext
# Small dataset: stas/openwebtext-10k
# NOTE(review): the two datasets listed above are alternatives only; the call below
# loads "kerpr/cc_openwebtext" instead.
from datasets import load_dataset

# The split cell further down reads dataset["train"], so a "train" split is expected here.
dataset = load_dataset("kerpr/cc_openwebtext")

In [None]:
# Hyperparameters shared by the data pipeline, the model definition and the
# training loop. The same dictionary is also handed to wandb as the run config.
config = dict(
    epochs=5,            # base epoch count (the training loop adds 20 on top)
    batch_size=32,       # samples per DataLoader batch
    init_lr=3e-5,        # initial AdamW learning rate
    block_size=256,      # tokens per training sample / number of position embeddings
    dropout=0.1,         # dropout probability used throughout the model
    vocab_size=50257,    # rows in the token embedding / width of the output logits
    bias=True,           # forwarded to the LayerNorm modules
    n_layer=12,          # number of transformer blocks
    n_head=10,           # attention heads per block
    n_embd=250,          # embedding width (split evenly across the heads)
    end_token=50256,     # token id appended after every document
)

In [None]:
# Run Python's garbage collector, then ask PyTorch to release its cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

## Dataloader / Train Test Split

In [None]:
# Hold out 5% of the "train" split, then halve the hold-out into validation and test.
# Both splits are seeded: the second one used to be unseeded, so the validation/test
# membership changed on every kernel restart and results were not comparable
# between runs.
data = dataset["train"].train_test_split(test_size=0.05, seed=1200, shuffle=True)
test_valid = data['test'].train_test_split(test_size=0.5, seed=1200, shuffle=True)
train_test_valid_dataset = datasets.DatasetDict({
    'train': data['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

In [None]:
# Dataset class to load train and validation data

class OpenWebTextDataset(torch.utils.data.Dataset):
    """Next-token-prediction samples cut from one split of the tokenized corpus.

    Every document of the split is tokenized with the tiktoken "gpt2" encoding,
    terminated with ``config['end_token']`` and concatenated into one flat token
    array. Sample ``i`` is the ``i``-th non-overlapping window of
    ``config['block_size']`` tokens together with the same window shifted one
    token to the right.
    """

    def __init__(self, prefix):
        """Tokenize the split named ``prefix`` of ``train_test_valid_dataset``.

        Args:
            prefix: key of the split to load (e.g. "train" or "valid").
        """
        prev_data = train_test_valid_dataset[prefix]
        enc = tiktoken.get_encoding("gpt2") # encoding using the tiktoken library

        # Read every row once (the previous version fetched each row twice), encode
        # its text and append the end token. Rows whose text is None contribute
        # only the end token.
        documents = []
        for row in prev_data:
            text = row["text"]
            tokens = enc.encode_ordinary(text) if text is not None else []
            documents.append(np.array(tokens + [config['end_token']]))
        self.values = np.concatenate(documents, axis=0)

    def __len__(self):
        # The target window reaches one token past the input window, so one token
        # is reserved for it. Without the "- 1" the last sample's target is a token
        # short whenever the corpus length is an exact multiple of block_size,
        # which breaks batching.
        return max(0, (len(self.values) - 1) // config['block_size'])

    def __getitem__(self, ind):
        """Return ``(x, y)`` int64 tensors of ``block_size`` tokens; ``y`` is ``x`` shifted by one."""
        start = ind * config['block_size']
        stop = start + config['block_size']

        x = torch.from_numpy(self.values[start:stop].astype(np.int64))
        y = torch.from_numpy(self.values[start + 1:stop + 1].astype(np.int64))

        return x, y

In [None]:
class OpenWebTextTestDataset(torch.utils.data.Dataset):
    """Input-only samples (no targets) cut from one split of the tokenized corpus.

    Documents are tokenized with the tiktoken "gpt2" encoding, terminated with
    ``config['end_token']`` and concatenated; sample ``i`` is the ``i``-th
    non-overlapping window of ``config['block_size']`` tokens.
    """

    def __init__(self, prefix="test"):
        """Tokenize the split named ``prefix`` of ``train_test_valid_dataset``.

        Args:
            prefix: key of the split to load. It used to be accepted but ignored
                (the "test" split was hard-coded); it is now honoured and defaults
                to "test".
        """
        prev_data = train_test_valid_dataset[prefix]
        enc = tiktoken.get_encoding("gpt2") # encoding using the tiktoken library

        # Read every row once, encode its text and append the end token. Rows whose
        # text is None contribute only the end token.
        documents = []
        for row in prev_data:
            text = row["text"]
            tokens = enc.encode_ordinary(text) if text is not None else []
            documents.append(np.array(tokens + [config['end_token']]))
        self.values = np.concatenate(documents, axis=0)

    def __len__(self):
        # Number of complete block_size windows; trailing tokens are dropped.
        return len(self.values) // config['block_size']

    def __getitem__(self, ind):
        """Return the ``ind``-th window as an int64 tensor of ``block_size`` tokens."""
        start = ind * config['block_size']
        x = torch.from_numpy(self.values[start:start + config['block_size']].astype(np.int64))
        return x

In [None]:
# Tokenize each split up front. The training and validation datasets yield
# (input, target) pairs; the test dataset yields inputs only.
train_data, val_data = (OpenWebTextDataset(prefix=split) for split in ("train", "valid"))
test_data = OpenWebTextTestDataset(prefix="test")

In [None]:
import multiprocessing

# The three loaders share batch size, worker count and pinned-memory settings;
# only the training loader shuffles its samples.
train_loader = DataLoader(
    train_data,
    batch_size=config['batch_size'],
    shuffle=True,
    num_workers=1,
    pin_memory=True,
)

val_loader = DataLoader(
    val_data,
    batch_size=config['batch_size'],
    shuffle=False,
    num_workers=1,
    pin_memory=True,
)

test_loader = DataLoader(
    test_data,
    batch_size=config['batch_size'],
    shuffle=False,
    num_workers=1,
    pin_memory=True,
)


In [None]:
# Testing code to check if your data loaders are working
# One batch is pulled from each loader with next(iter(...)). The previous
# `for i, data in enumerate(...)` form rebound the notebook-level name `data`
# (the train/test split object created earlier) to a tensor batch.
x, y = next(iter(train_loader))
print(x.shape, y.shape)
print(x, y)

# NOTE: `x` is deliberately left bound to the test batch — the model-summary cell
# further down passes `x` to `summary(...)`.
x = next(iter(test_loader))
print(x.shape)
print(x)

torch.Size([32, 256]) torch.Size([32, 256])
tensor([[  513,    13,    15,  ...,  2810,   262,   636],
        [14196,   526,   198,  ...,   257,   471,    13],
        [  447,   247,   303,  ...,    12,    33,    12],
        ...,
        [13742,   357,    16,  ...,   262,  2520,  2807],
        [13449,   532,  3267,  ...,    12,    23,  3648],
        [ 1919,  2324,  1271,  ...,  4179,   534,  7111]]) tensor([[   13,    15, 33721,  ...,   262,   636,   468],
        [  526,   198,   198,  ...,   471,    13,    50],
        [  247,   303,  1775,  ...,    33,    12,    34],
        ...,
        [  357,    16,     8,  ...,  2520,  2807,  6896],
        [  532,  3267,   860,  ...,    23,  3648,  9595],
        [ 2324,  1271,  1906,  ...,   534,  7111,    11]])
torch.Size([32, 256])
tensor([[15592,  1578,   402,  ...,    14,   940,    14],
        [ 1238,  1828,   860,  ...,  7841,  1008,    13],
        [10366,   654,    13,  ..., 28682,   329,   691],
        ...,
        [  481,  2148, 

## Blocks

In [None]:
# Layer normalization over the last dimension, with an optional learnable bias
class LayerNorm(nn.Module):
  """LayerNorm whose additive bias can be switched off.

  Args:
    ndim: size of the (last) dimension that is normalized.
    bias: when truthy a learnable bias initialised to zeros is created; otherwise
      no bias is applied. (The flag used to be ignored and a bias was always
      created.)
  """

  def __init__(self, ndim, bias):
    super().__init__()
    self.weight = nn.Parameter(torch.ones(ndim))
    self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None

  def forward(self, input):
    """Normalize ``input`` over its last dimension and apply the affine parameters."""
    return nn.functional.layer_norm(input=input,
                                    normalized_shape=self.weight.shape,
                                    weight=self.weight,
                                    bias=self.bias,
                                    eps=1e-5)

In [None]:
class AttentionLayer(nn.Module):
  """Causal multi-head self-attention followed by an output projection.

  Reads ``n_embd``, ``n_head`` and ``dropout`` from the global ``config``.
  Input and output are both ``(batch, sequence, n_embd)``.
  """

  def __init__(self):
    super().__init__()
    # Each head receives n_embd // n_head channels, so the split must be exact.
    if config['n_embd'] % config['n_head'] != 0:
      raise ValueError("config['n_embd'] must be divisible by config['n_head']")
    # A single linear layer produces query, key and value side by side.
    self.attention_layer = nn.Linear(config['n_embd'], 3 * config['n_embd'])
    self.projection_layer = nn.Linear(config['n_embd'], config['n_embd'])
    self.dropout = nn.Dropout(config['dropout'])

  def attention_calculation (self, x):
    """Return the per-head attention output, shape ``(batch, n_head, sequence, head_dim)``."""
    batch, seq_len, width = x.size()
    head_dim = width // config['n_head']

    query, key, value = self.attention_layer(x).split(config['n_embd'], dim=2)
    # (batch, sequence, width) -> (batch, n_head, sequence, head_dim)
    query, key, value = (
        t.view(batch, seq_len, config['n_head'], head_dim).transpose(1, 2)
        for t in (query, key, value)
    )

    # is_causal=True stops a position from attending to later positions; without
    # it the language model can read the token it is asked to predict.
    # Attention dropout is applied only while training.
    return nn.functional.scaled_dot_product_attention(
        query, key, value,
        dropout_p=config['dropout'] if self.training else 0.0,
        is_causal=True)

  def forward(self, x):
    y = self.attention_calculation(x)
    # The transpose leaves the tensor non-contiguous, so it has to be made
    # contiguous before .view() can merge the heads back into the channel dim.
    out = y.transpose(1, 2).contiguous().view(x.size(0), x.size(1), x.size(2))
    out = self.projection_layer(out)
    out = self.dropout(out)
    return out

In [None]:
class Block(nn.Module):
    """One transformer block: normalize -> attention -> residual add, then
    normalize -> feed-forward -> residual add.

    Sizes and the dropout probability are read from the global ``config``.
    """

    def __init__(self):
        super().__init__()
        width = config['n_embd']
        hidden = 4 * width  # the feed-forward network widens to four times the embedding size

        self.ln_1 = LayerNorm(width, bias=config['bias'])
        self.attn = AttentionLayer()
        self.ln_2 = LayerNorm(width, bias=config['bias'])

        feed_forward = [
            nn.Linear(width, hidden),
            nn.GELU(),
            nn.Linear(hidden, width),
            nn.Dropout(config['dropout']),
        ]
        self.mlp = nn.Sequential(*feed_forward)

    def forward(self, x):
        # Attention sub-layer with residual connection.
        attended = self.attn(self.ln_1(x))
        x = x + attended
        # Feed-forward sub-layer with residual connection.
        return x + self.mlp(self.ln_2(x))

## Model

In [None]:
class GPT(nn.Module):
  """Decoder-only transformer language model configured from the global ``config``.

  ``forward`` maps token ids of shape ``(batch, sequence)`` to logits of shape
  ``(batch, sequence, vocab_size)``.
  """

  def __init__(self):
    super().__init__()

    self.token_embed = nn.Embedding(config['vocab_size'], config['n_embd'])
    self.pos_embed = nn.Embedding(config['block_size'], config['n_embd'])
    self.dropout = nn.Dropout(config['dropout'])

    modules = [Block() for b in range(config['n_layer'])]
    self.blocks = nn.Sequential(*modules)
    self.layernorm = LayerNorm(config['n_embd'], bias=config['bias'])

    self.lin1 = nn.Linear(config['n_embd'], config['vocab_size'])
    # Weight tying: the token embedding shares its matrix with the output
    # projection (both are vocab_size x n_embd). The previous line referenced
    # `self.transformer.wte` / `self.lm_head`, which do not exist on this class.
    self.token_embed.weight = self.lin1.weight

  def forward(self, idx):
    """Return next-token logits for every position of ``idx``.

    Raises:
      ValueError: if the sequence is longer than ``config['block_size']`` (there
        are no position embeddings beyond that length).
    """
    seq_len = idx.size(1)
    if seq_len > config['block_size']:
      raise ValueError(
          "sequence length {} exceeds block_size {}".format(seq_len, config['block_size']))

    # Positions must live on the same device as the token ids, otherwise the
    # embedding lookup fails as soon as the model is on the GPU.
    position = torch.arange(0, seq_len, device=idx.device)

    tok_emb = self.token_embed(idx)
    pos_emb = self.pos_embed(position)
    x = self.dropout(tok_emb + pos_emb)

    x = self.blocks(x)
    x = self.layernorm(x)

    return self.lin1(x)

  def generate(self, idx, max_new_tokens, temperature=1.0):
    """Sample ``max_new_tokens`` tokens autoregressively and append them to ``idx``.

    Args:
      idx: ``(batch, sequence)`` tensor of prompt token ids.
      max_new_tokens: number of tokens to sample.
      temperature: the logits are divided by this value before the softmax;
        values below 1 sharpen and values above 1 flatten the distribution.

    Returns:
      ``(batch, sequence + max_new_tokens)`` tensor of token ids.
    """
    # Sample without dropout and without building a graph; the previous
    # train/eval mode is restored afterwards.
    was_training = self.training
    self.eval()
    try:
      with torch.no_grad():
        for _ in range(max_new_tokens):
          # Only the last block_size tokens fit into the position embedding.
          idx_cond = idx[:, -config['block_size']:]
          logits = self(idx_cond)
          next_logits = logits[:, -1, :] / temperature
          probs = F.softmax(next_logits, dim=-1)
          idx_next = torch.multinomial(probs, num_samples=1)
          idx = torch.cat((idx, idx_next), dim=1)
    finally:
      self.train(was_training)

    return idx

In [None]:
# Instantiate the model and move it to the selected device.
model = GPT().to(device)
# NOTE: `x` is not defined in this cell — it is whatever the data-loader check cell
# left bound to `x`, so that cell must have been run first.
summary(model, x.to(device))

                                                 Kernel Shape  \
Layer                                                           
0_transformer.Embedding_wte                      [250, 50257]   
1_transformer.Embedding_wpe                        [250, 256]   
2_transformer.Dropout_drop                                  -   
3_transformer.h.0.LayerNorm_ln_1                        [250]   
4_transformer.h.0.attn.Linear_c_attn               [250, 750]   
5_transformer.h.0.attn.Linear_c_proj               [250, 250]   
6_transformer.h.0.attn.Dropout_resid_dropout                -   
7_transformer.h.0.LayerNorm_ln_2                        [250]   
8_transformer.h.0.mlp.Linear_c_fc                 [250, 1000]   
9_transformer.h.0.mlp.GELU_gelu                             -   
10_transformer.h.0.mlp.Linear_c_proj              [1000, 250]   
11_transformer.h.0.mlp.Dropout_dropout                      -   
12_transformer.h.1.LayerNorm_ln_1                       [250]   
13_transformer.h.1.attn.L

Unnamed: 0_level_0,Kernel Shape,Output Shape,Params,Mult-Adds
Layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0_transformer.Embedding_wte,"[250, 50257]","[32, 256, 250]",12564250.0,12564250.0
1_transformer.Embedding_wpe,"[250, 256]","[256, 250]",64000.0,64000.0
2_transformer.Dropout_drop,-,"[32, 256, 250]",,
3_transformer.h.0.LayerNorm_ln_1,[250],"[32, 256, 250]",500.0,250.0
4_transformer.h.0.attn.Linear_c_attn,"[250, 750]","[32, 256, 750]",188250.0,187500.0
...,...,...,...,...
108_transformer.h.11.mlp.GELU_gelu,-,"[32, 256, 1000]",,
109_transformer.h.11.mlp.Linear_c_proj,"[1000, 250]","[32, 256, 250]",250250.0,250000.0
110_transformer.h.11.mlp.Dropout_dropout,-,"[32, 256, 250]",,
111_transformer.LayerNorm_ln_f,[250],"[32, 256, 250]",500.0,250.0


In [None]:
criterion = torch.nn.CrossEntropyLoss() # Defining Loss function.

optimizer = torch.optim.AdamW(model.parameters(), lr=config['init_lr']) # Defining Optimizer
# The scheduler class is spelled `ReduceLROnPlateau`; the former `ReduceLRonPlateau`
# raised AttributeError. The learning rate is multiplied by 0.75 once the monitored
# value has not improved for more than `patience` consecutive scheduler steps.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=2, factor=0.75)
scaler = torch.cuda.amp.GradScaler()

In [None]:
def train(model, dataloader, optimizer, criterion, scaler=None):
    """Run one training epoch and return ``(mean_loss, mean_accuracy)``.

    Args:
        model: module mapping a ``(batch, sequence)`` batch of token ids to
            ``(batch, sequence, vocab)`` logits.
        dataloader: sized iterable of ``(inputs, targets)`` batches.
        optimizer: optimizer that updates ``model``'s parameters.
        criterion: loss callable applied to the flattened ``(N, vocab)`` logits and
            ``(N,)`` targets. (It used to be accepted but ignored.)
        scaler: optional gradient scaler to reuse across epochs; a fresh one is
            created when omitted.

    Returns:
        Tuple of the per-batch mean loss and the per-batch mean token accuracy
        (a fraction in ``[0, 1]``).
    """
    model.train()
    tloss, tacc = 0, 0 # Monitoring loss and accuracy

    # Size the progress bar and the averages from the loader that was passed in,
    # not from the notebook-level `train_loader`.
    num_batches = len(dataloader)
    batch_bar   = tqdm(total=num_batches, dynamic_ncols=True, leave=False, position=0, desc='Train')

    # Run on whatever device the model lives on; mixed precision only on CUDA.
    model_device = next(model.parameters()).device
    use_amp = model_device.type == 'cuda'
    if scaler is None:
        scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    for i, (inputs, targets) in enumerate(dataloader):

        ### Initialize Gradients
        optimizer.zero_grad()

        ### Move Data to Device (Ideally GPU)
        inputs  = inputs.to(model_device)
        targets = targets.to(model_device)

        with torch.amp.autocast(device_type="cuda", dtype=torch.float16, enabled=use_amp):
            ### Forward Propagation
            logits = model(inputs)

            ### Loss Calculation: flatten to (tokens, vocab) vs (tokens,)
            logits  = logits.view(-1, logits.size(-1))
            targets = targets.view(-1)
            loss    = criterion(logits, targets)

        ### Backward Propagation (scaled to avoid fp16 gradient underflow)
        scaler.scale(loss).backward()

        ### Gradient Descent
        scaler.step(optimizer)
        scaler.update()

        tloss += loss.item()
        tacc  += torch.sum(torch.argmax(logits, dim=1) == targets).item() / logits.shape[0]

        batch_bar.set_postfix(loss="{:.04f}".format(float(tloss / (i + 1))),
                              acc="{:.04f}%".format(float(tacc*100 / (i + 1))))
        batch_bar.update()

        # Periodic running averages. The learning rate is read from the optimizer
        # rather than from a notebook global.
        if (i + 1) % 1000 == 0:
            wandb.log({'train_acc': (tacc / (i + 1)) * 100,
                       'train_loss': tloss / (i + 1),
                       'lr': float(optimizer.param_groups[0]['lr'])})

        ### Release references to this batch's tensors
        del inputs, targets, logits

    batch_bar.close()

    # Return cached blocks once per epoch; doing so after every step forces the
    # allocator to re-request memory each iteration and slows training down.
    if use_amp:
        torch.cuda.empty_cache()

    tloss /= max(num_batches, 1)
    tacc  /= max(num_batches, 1)

    return tloss, tacc

In [None]:
def eval(model, dataloader, lr=None):
    """Evaluate ``model`` on ``dataloader`` and return ``(mean_loss, mean_accuracy)``.

    Args:
        model: module mapping a ``(batch, sequence)`` batch of token ids to
            ``(batch, sequence, vocab)`` logits.
        dataloader: sized iterable of ``(inputs, targets)`` batches.
        lr: optional learning rate to attach to the periodic wandb logs. When
            omitted, the notebook-level ``curr_lr`` is used if it exists.

    Returns:
        Tuple of the per-batch mean loss and the per-batch mean token accuracy
        (a fraction in ``[0, 1]``).

    NOTE: this function shadows Python's builtin ``eval`` in the notebook namespace.
    """
    model.eval() # set model in evaluation mode
    vloss, vacc = 0, 0 # Monitoring loss and accuracy

    # Size the progress bar and the averages from the loader that was passed in,
    # not from the notebook-level `val_loader`.
    num_batches = len(dataloader)
    batch_bar   = tqdm(total=num_batches, dynamic_ncols=True, position=0, leave=False, desc='Val')

    model_device = next(model.parameters()).device
    if lr is None:
        lr = globals().get('curr_lr')

    for i, (inputs, targets) in enumerate(dataloader):

        ### Move data to device (ideally GPU)
        inputs  = inputs.to(model_device)
        targets = targets.to(model_device)

        # makes sure that there are no gradients computed as we are not training the model now
        with torch.inference_mode():
            ### Forward Propagation
            logits = model(inputs)
            ### Loss Calculation: flatten to (tokens, vocab) vs (tokens,)
            logits  = logits.view(-1, logits.size(-1))
            targets = targets.view(-1)
            loss    = F.cross_entropy(logits, targets, ignore_index=-1)

        vloss += loss.item()
        vacc  += torch.sum(torch.argmax(logits, dim=1) == targets).item() / logits.shape[0]

        batch_bar.set_postfix(loss="{:.04f}".format(float(vloss / (i + 1))),
                              acc="{:.04f}%".format(float(vacc*100 / (i + 1))))
        batch_bar.update()

        # Periodic running averages; 'lr' is attached only when it is known.
        if (i + 1) % 100 == 0:
            metrics = {'val_acc': (vacc / (i + 1)) * 100, 'val_loss': vloss / (i + 1)}
            if lr is not None:
                metrics['lr'] = lr
            wandb.log(metrics)

        ### Release references to this batch's tensors
        del inputs, targets, logits

    batch_bar.close()

    # Return cached blocks once per pass instead of after every batch.
    if model_device.type == 'cuda':
        torch.cuda.empty_cache()

    vloss /= max(num_batches, 1)
    vacc  /= max(num_batches, 1)

    return vloss, vacc

In [None]:
# Authenticate with Weights & Biases without embedding the API key in the notebook.
# The key is taken from the WANDB_API_KEY environment variable; when that is unset,
# `key=None` is passed and wandb uses its own credential lookup instead.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

[34m[1mwandb[0m: Currently logged in as: [33mkkmittal[0m ([33midl-f23[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# Start the Weights & Biases run that the training loop logs to.
run = wandb.init(
    project = "hw5",                  # the run is filed under this project
    name    = "low-batch-high-block", # display name of this run
    config  = config,                 # hyperparameters recorded with the run
    reinit  = True,                   # allow re-running this cell to start a new run
)

In [None]:
# Iterate over number of epochs to train and evaluate your model
torch.cuda.empty_cache()
gc.collect()

# Total passes over the training data: the configured epochs plus 20 extra.
total_epochs = config['epochs'] + 20

for epoch in range(total_epochs):

    # 1-based epoch counter so the last line reads "N/N" rather than "N-1/N".
    print("\nEpoch {}/{}".format(epoch + 1, total_epochs))

    # Learning rate in effect for this epoch (the scheduler may lower it below).
    curr_lr                 = float(optimizer.param_groups[0]['lr'])
    train_loss, train_acc   = train(model, train_loader, optimizer, criterion)
    val_loss, val_acc       = eval(model, val_loader)

    print("\tTrain Acc {:.04f}%\tTrain Loss {:.04f}\t Learning Rate {:.07f}".format(train_acc*100, train_loss, curr_lr))
    print("\tVal Acc {:.04f}%\tVal Loss {:.04f}".format(val_acc*100, val_loss))

    # Same key ('val_loss') as the step-level validation logs; this used to be
    # 'valid_loss', which split the validation loss across two wandb charts.
    wandb.log({'train_acc': train_acc*100, 'train_loss': train_loss,
               'val_acc': val_acc*100, 'val_loss': val_loss, 'lr': curr_lr})

    # Lower the learning rate when the validation loss stops improving.
    scheduler.step(val_loss)


Epoch 5/25




	Train Acc 40.0969%	Train Loss 3.6737	 Learning Rate 0.0000300
	Val Acc 42.8086%	Val Loss 3.5446

Epoch 6/25


Train:  72%|███████▏  | 14079/19511 [1:27:25<33:41,  2.69it/s, acc=40.3539%, loss=3.6506]

Buffered data was truncated after reaching the output size limit.