<a href="https://colab.research.google.com/github/kunalsonalkar/training-llms/blob/main/fineTuning_gpt2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/rasbt/LLMs-from-scratch.git

Cloning into 'LLMs-from-scratch'...
remote: Enumerating objects: 5555, done.[K
remote: Total 5555 (delta 0), reused 0 (delta 0), pack-reused 5555 (from 1)[K
Receiving objects: 100% (5555/5555), 12.94 MiB | 21.30 MiB/s, done.
Resolving deltas: 100% (3473/3473), done.


In [2]:
import pandas as pd

try:
    labels = pd.read_csv('/content/LLMs-from-scratch/ch06/01_main-chapter-code/query_intent_labels.csv', encoding='latin1',usecols=[0,1])
except UnicodeDecodeError:
    labels = pd.read_csv('/content/LLMs-from-scratch/ch06/01_main-chapter-code/query_intent_labels.csv', encoding='cp1252', usecols=[0,1])
# If none of the encoding above work, you may also try:
# labels = pd.read_csv('/content/LLMs-from-scratch/ch06/01_main-chapter-code/query_intent_labels.csv', encoding='utf-16')
# labels = pd.read_csv('/content/LLMs-from-scratch/ch06/01_main-chapter-code/query_intent_labels.csv', encoding='iso-8859-1')

In [3]:
import pandas as pd

# Your category-to-integer mapping
category_map = {
    'shoes_a12': 0,
    'makeup_a30': 1,
    'bottoms_a3': 2,
    'tops_a2': 3,
    'dresses_a1': 4,
    'underwear_lingerie_a7': 5,
    'fragrance_a31': 6,
    'skin_body_treatment_a32': 7,
    'outerwear_a11': 8,
    'sleepwear_a9': 9,
    'jewelry_a14': 10,
    'bags_a17': 11,
    'hosiery_a16': 12,
    'baby_accessories_a28': 13,
    'swimwear_a8': 14,
    'home_a35': 15,
    'hair_care_a33': 16,
    'eyewear_a19': 17,
    'hair_accessories_a20': 18,
    'headwear_a18': 19,
    'jumpsuits_coveralls_a4': 20,
    'personal_care_accessories_a34': 21,
    'belts__braces_a26': 22,
    'jacket_sportcoat_mg12': 23,
    'small_leather_goods_a23': 24,
    'food_a38': 25,
    'suits_sets_wardrobers_mg5': 26,
    'stationery_giftwrap_a36': 27,
    'shoe_care_a13': 28,
    'toys_games_a37': 29
}


# Map categories to integers
labels['category_int'] = labels['product_type_1_label'].map(category_map)
print(labels)


              search_term   product_type_1_label  category_int
0             womens uggs              shoes_a12             0
1       charlotte tilbury             makeup_a30             1
2     womens birkenstocks              shoes_a12             0
3                    uggs              shoes_a12             0
4            womens nikes              shoes_a12             0
...                   ...                    ...           ...
2020  blue cocktail dress             dresses_a1             4
2021            boob tape  underwear_lingerie_a7             5
2022           bp pajamas           sleepwear_a9             9
2023                campo              shoes_a12             0
2024        cordani shoes              shoes_a12             0

[2025 rows x 3 columns]


In [4]:
def random_split(df, train_frac=0.7, validation_frac=0.2):
  df = df.sample(frac=1, random_state=42).reset_index(drop=True)
  train_end = int(len(df) * train_frac)
  validation_end = train_end + int(len(df) * validation_frac)
  train_df = df[:train_end]
  validation_df = df[train_end:validation_end]
  test_df = df[validation_end:]
  return train_df, validation_df, test_df

In [5]:
train_df, validation_df, test_df = random_split(labels, train_frac=0.7, validation_frac=0.1)

In [6]:
train_df.to_csv('train.csv', index=None)
validation_df.to_csv('validation.csv', index=None)
test_df.to_csv('test.csv', index=None)

In [8]:
pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.9.0


In [9]:
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")
print(tokenizer.decode([50256]))

<|endoftext|>


In [10]:
import torch
from torch.utils.data import Dataset

class queryIntent(Dataset):
  def __init__(self, csv_file, tokenizer, max_length = None, pad_token_id=50256):
      self.data = pd.read_csv(csv_file)
      self.encoded_texts = [
          tokenizer.encode(text) for text in self.data["search_term"]
      ]
      if max_length is None:
        self.max_length = self._longest_encoded_length()
      else:
        self.max_length = max_length
        self.encoded_texts = [
            encoded_text[:self.max_length] for encoded_text in self.encoded_texts
        ]

      self.encoded_texts = [
          encoded_text + [pad_token_id] * (self.max_length - len(encoded_text))
          for encoded_text in self.encoded_texts
      ]

  def __getitem__(self, index):
      encoded = self.encoded_texts[index]
      label = self.data.iloc[index]["category_int"]
      return (
          torch.tensor(encoded, dtype=torch.long),
          torch.tensor(label, dtype=torch.long)
      )

  def __len__(self):
      return len(self.data)

  def _longest_encoded_length(self):
      max_length = 0
      for encoded_text in self.encoded_texts:
          encoded_length = len(encoded_text)
          if encoded_length > max_length:
              max_length = encoded_length
      return max_length

In [11]:
train_dataset = queryIntent("train.csv", max_length=None, tokenizer=tokenizer)
validation_dataset = queryIntent("validation.csv", max_length = None, tokenizer=tokenizer)
test_dataset = queryIntent("test.csv", max_length = None, tokenizer=tokenizer)

In [12]:
from torch.utils.data import DataLoader
num_workers = 0
batch_size = 8

torch.manual_seed(123)

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    drop_last=True
)

validation_loader = DataLoader(
    dataset=validation_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=num_workers,
    drop_last=False
)

test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=num_workers,
    drop_last=False
)

In [13]:
len(train_loader)

177

In [14]:
len(test_loader)

51

In [15]:
len(validation_loader)

26

In [16]:
len(train_dataset)

1417

In [17]:
for input_batch, label_batch in train_loader:
    print(input_batch.shape)
    print(label_batch.shape)
    break

torch.Size([8, 10])
torch.Size([8])


**Initialize the model and Pretrained Weights**

In [18]:
CHOOSE_MODEL = "gpt2-small (124M)"

BASE_CONFIG = {
    "vocab_size": 50257,
    "context_length": 1024,
    "drop_rate": 0.0,
    "qkv_bias": True,
    "emb_dim": 768,
    "n_layers": 12,
    "n_heads": 12,
    "n_embd": 768
}

model_configs = {
    "gpt2-small (124M)": {
        "n_layer": 12,
        "n_head": 12,
        "n_embd": 768
    },
    "gpt2-medium (355M)": {
        "n_layer": 24,
        "n_head": 16,
        "n_embd": 1024
    },
    "gpt2-large (774M)": {
        "n_layer": 36,
        "n_head": 20,
        "n_embd": 1280
    },
    "gpt2-xl (1.5B)": {
        "n_layer": 48,
        "n_head": 25,
        "n_embd": 1600
    }
}

In [19]:
%cd /content/LLMs-from-scratch/ch06/01_main-chapter-code

/content/LLMs-from-scratch/ch06/01_main-chapter-code


In [20]:
import gpt_download
from gpt_download import download_and_load_gpt2

In [21]:
from previous_chapters import GPTModel, load_weights_into_gpt

In [22]:
model_size = CHOOSE_MODEL.split(" ")[-1].lstrip("(").rstrip(")")
settings, params = download_and_load_gpt2(model_size=model_size, models_dir = "gpt2")

model = GPTModel(BASE_CONFIG)
load_weights_into_gpt(model, params)

checkpoint: 100%|██████████| 77.0/77.0 [00:00<00:00, 82.7kiB/s]
encoder.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 4.95MiB/s]
hparams.json: 100%|██████████| 90.0/90.0 [00:00<00:00, 89.8kiB/s]
model.ckpt.data-00000-of-00001: 100%|██████████| 498M/498M [00:18<00:00, 26.4MiB/s]
model.ckpt.index: 100%|██████████| 5.21k/5.21k [00:00<00:00, 7.14MiB/s]
model.ckpt.meta: 100%|██████████| 471k/471k [00:00<00:00, 3.80MiB/s]
vocab.bpe: 100%|██████████| 456k/456k [00:00<00:00, 3.55MiB/s]


In [23]:
from previous_chapters import (
    generate_text_simple,
    text_to_token_ids,
    token_ids_to_text
)

text1 = "Every effort moves you"

token_ids = generate_text_simple(
    model = model,
    idx = text_to_token_ids(text1, tokenizer),
    max_new_tokens = 15,
    context_size = BASE_CONFIG["context_length"]
)

print(token_ids_to_text(token_ids, tokenizer))

Every effort moves you forward.

The first step is to understand the importance of your work


In [24]:
print(model)

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.0, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_resid): Dropout(p=0.0, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768,

In [25]:
for param in model.parameters():
  param.requires_grad=False

In [26]:
model.out_head = torch.nn.Linear(BASE_CONFIG["emb_dim"], 29)

In [27]:
model

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.0, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_resid): Dropout(p=0.0, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768,

In [108]:
for param in model.trf_blocks[-1].parameters():
  param.requires_grad=True

for param in model.final_norm.parameters():
  param.requires_grad=True

In [110]:
inputs = tokenizer.encode("Do you have time")
inputs = torch.tensor(inputs).unsqueeze(0)
print("Inputs:", inputs)
print("Inputs dimensions", inputs.shape)

Inputs: tensor([[5211,  345,  423,  640]])
Inputs dimensions torch.Size([1, 4])


In [111]:
with torch.no_grad():
  outputs = model(inputs)
print(outputs)

tensor([[[-2.3026e+00, -5.1742e-01, -1.2136e+00, -1.0535e+00,  3.9415e-01,
          -3.6147e-01,  1.5787e+00,  6.4151e-01, -5.9005e-01, -1.1385e+00,
           1.4724e+00, -7.6778e-01, -5.1966e-01, -1.7062e+00, -2.4070e+00,
          -2.1958e+00,  6.1550e-01,  1.0774e+00,  1.3151e+00,  6.3655e-01,
          -2.5678e-01, -9.8920e-01,  1.9485e+00, -3.3161e+00, -1.3232e+00,
          -1.0522e+00,  1.6037e+00, -4.0878e-03, -8.7348e-01],
         [-6.1371e+00,  1.8630e+00, -3.8859e+00, -3.6016e+00,  2.2037e+00,
           3.3426e+00,  8.9460e+00, -7.0741e-01, -7.9191e-01, -3.4226e+00,
           4.8603e+00, -2.7766e+00,  4.1640e-02, -6.5876e+00, -8.1434e+00,
          -9.3715e+00,  7.1476e+00,  4.9986e+00,  1.5129e+00, -6.4193e-01,
           4.7365e+00, -5.8537e+00,  5.8173e+00, -1.5184e+01, -2.6662e+00,
          -2.4673e+00,  9.4025e+00,  4.4576e+00, -4.9389e+00],
         [-4.6326e+00,  1.4776e+00, -3.9453e+00, -1.7636e+00,  1.7925e+00,
           3.6333e+00,  6.7081e+00,  1.0415e+00, 