<a href="https://colab.research.google.com/github/kunalsonalkar/training-llms/blob/main/fineTuning_gpt2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Clone the LLMs-from-scratch repository into the working directory.
# The CSV-loading cell reads from /content/LLMs-from-scratch/ch06/01_main-chapter-code/.
!git clone https://github.com/rasbt/LLMs-from-scratch.git

Cloning into 'LLMs-from-scratch'...
remote: Enumerating objects: 5555, done.[K
remote: Total 5555 (delta 0), reused 0 (delta 0), pack-reused 5555 (from 1)[K
Receiving objects: 100% (5555/5555), 12.92 MiB | 21.03 MiB/s, done.
Resolving deltas: 100% (3479/3479), done.


In [16]:
import pandas as pd

# Labelled search queries, located under the cloned LLMs-from-scratch directory.
LABELS_CSV = '/content/LLMs-from-scratch/ch06/01_main-chapter-code/query_intent_labels.csv'

# latin1 (a.k.a. iso-8859-1) assigns a character to every possible byte value,
# so decoding with it can never raise UnicodeDecodeError -- an
# `except UnicodeDecodeError` fallback after a latin1 read would be dead code.
# If the text comes out garbled, change `encoding` here instead
# (e.g. 'cp1252' or 'utf-16').
# usecols=[0, 1] keeps only the first two columns of the file.
labels = pd.read_csv(LABELS_CSV, encoding='latin1', usecols=[0, 1])

In [20]:
import pandas as pd

# Category-to-integer mapping: every product-type label gets a fixed class id.
category_map = {
    'shoes_a12': 0,
    'makeup_a30': 1,
    'bottoms_a3': 2,
    'tops_a2': 3,
    'dresses_a1': 4,
    'underwear_lingerie_a7': 5,
    'fragrance_a31': 6,
    'skin_body_treatment_a32': 7,
    'outerwear_a11': 8,
    'sleepwear_a9': 9,
    'jewelry_a14': 10,
    'bags_a17': 11,
    'hosiery_a16': 12,
    'baby_accessories_a28': 13,
    'swimwear_a8': 14,
    'home_a35': 15,
    'hair_care_a33': 16,
    'eyewear_a19': 17,
    'hair_accessories_a20': 18,
    'headwear_a18': 19,
    'jumpsuits_coveralls_a4': 20,
    'personal_care_accessories_a34': 21,
    'belts__braces_a26': 22,
    'jacket_sportcoat_mg12': 23,
    'small_leather_goods_a23': 24,
    'food_a38': 25,
    'suits_sets_wardrobers_mg5': 26,
    'stationery_giftwrap_a36': 27,
    'shoe_care_a13': 28,
    'toys_games_a37': 29
}


# Map categories to integers.
# Series.map() returns NaN for any label that is missing from category_map,
# which would silently turn the column into floats and corrupt the class ids
# later on. Fail fast and name the offending labels instead.
category_int = labels['product_type_1_label'].map(category_map)
unmapped_labels = labels.loc[category_int.isna(), 'product_type_1_label'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"No entry in category_map for label(s): {list(unmapped_labels)}"
    )
labels['category_int'] = category_int.astype('int64')
print(labels)


              search_term   product_type_1_label  category_int
0             womens uggs              shoes_a12             0
1       charlotte tilbury             makeup_a30             1
2     womens birkenstocks              shoes_a12             0
3                    uggs              shoes_a12             0
4            womens nikes              shoes_a12             0
...                   ...                    ...           ...
2020  blue cocktail dress             dresses_a1             4
2021            boob tape  underwear_lingerie_a7             5
2022           bp pajamas           sleepwear_a9             9
2023                campo              shoes_a12             0
2024        cordani shoes              shoes_a12             0

[2025 rows x 3 columns]


In [21]:
def random_split(df, train_frac=0.7, validation_frac=0.2, random_state=42):
    """Shuffle ``df`` and cut it into train, validation and test partitions.

    Args:
        df: DataFrame to split; it is not modified.
        train_frac: Fraction of rows that go to the training partition.
        validation_frac: Fraction of rows that go to the validation partition.
        random_state: Seed passed to ``DataFrame.sample`` for the shuffle.
            The default (42) reproduces the previously hard-coded seed.

    Returns:
        Tuple ``(train_df, validation_df, test_df)``. The test partition
        receives every row that is left after the first two cuts.

    Raises:
        ValueError: If a fraction is negative or the two fractions add up to
            more than 1.
    """
    if train_frac < 0 or validation_frac < 0:
        raise ValueError("train_frac and validation_frac must be non-negative")
    # Small tolerance so that float sums such as 0.9 + 0.1 are not rejected.
    if train_frac + validation_frac > 1 + 1e-9:
        raise ValueError("train_frac + validation_frac must not exceed 1")

    # frac=1 returns every row in shuffled order; the index is renumbered so
    # the positional slices below line up with the new order.
    shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    n_rows = len(shuffled)
    train_end = int(n_rows * train_frac)
    validation_end = train_end + int(n_rows * validation_frac)

    train_df = shuffled[:train_end]
    validation_df = shuffled[train_end:validation_end]
    test_df = shuffled[validation_end:]
    return train_df, validation_df, test_df

In [22]:
# 70% of the rows for training, 10% for validation; the remaining rows
# become the test set.
train_df, validation_df, test_df = random_split(
    labels,
    train_frac=0.7,
    validation_frac=0.1,
)

In [39]:
# Write each partition to its own CSV file, without the DataFrame index.
for csv_name, split_df in (
    ('train.csv', train_df),
    ('validation.csv', validation_df),
    ('test.csv', test_df),
):
    split_df.to_csv(csv_name, index=None)

In [31]:
import tiktoken
# Load tiktoken's "gpt2" encoding.
tokenizer = tiktoken.get_encoding("gpt2")

# Decode token id 50256 on its own to show which token it stands for.
decoded_token = tokenizer.decode([50256])
print(decoded_token)

<|endoftext|>


In [38]:
import torch
from torch.utils.data import Dataset

class queryIntent(Dataset):
    """Dataset of tokenized search queries paired with integer labels.

    The CSV is read once in ``__init__``. Every query is tokenized, optionally
    truncated to ``max_length`` tokens, and right-padded with ``pad_token_id``
    so that all encoded sequences have the same length.

    Args:
        csv_file: Path of the CSV file to read with ``pandas.read_csv``.
        tokenizer: Object with an ``encode(text)`` method returning a list of
            token ids.
        max_length: Sequence length to truncate/pad to. ``None`` uses the
            longest encoded query in this file (no truncation).
        pad_token_id: Token id appended to shorter sequences.
        text_column: Name of the CSV column holding the query text.
        label_column: Name of the CSV column holding the integer label.
            ``None`` picks ``"label"`` if the CSV has it, otherwise
            ``"category_int"``.
    """

    # Column names tried, in order, when ``label_column`` is not given.
    # "label" is the name that was originally hard-coded; "category_int" is
    # the integer column added to the labels frame earlier in this notebook.
    _LABEL_COLUMN_CANDIDATES = ("label", "category_int")

    def __init__(self, csv_file, tokenizer, max_length = None, pad_token_id=50256,
                 text_column="search_term", label_column=None):
        self.data = pd.read_csv(csv_file)

        if label_column is None:
            # Falling back to "label" when no candidate exists keeps the
            # original behaviour: construction succeeds and item access
            # raises KeyError.
            label_column = next(
                (name for name in self._LABEL_COLUMN_CANDIDATES
                 if name in self.data.columns),
                "label",
            )
        self.label_column = label_column

        self.encoded_texts = [
            tokenizer.encode(text) for text in self.data[text_column]
        ]

        if max_length is None:
            self.max_length = self._longest_encoded_length()
        else:
            self.max_length = max_length
            # Only an explicit max_length can be shorter than a sequence,
            # so truncation is needed in this branch only.
            self.encoded_texts = [
                token_ids[:self.max_length] for token_ids in self.encoded_texts
            ]

        # Right-pad every sequence up to max_length.
        self.encoded_texts = [
            token_ids + [pad_token_id] * (self.max_length - len(token_ids))
            for token_ids in self.encoded_texts
        ]

    def __getitem__(self, index):
        """Return ``(token_ids, label)`` for row ``index`` as ``torch.long`` tensors."""
        encoded = self.encoded_texts[index]
        # Select the column first: this avoids building a full row Series
        # for every item access.
        label = self.data[self.label_column].iloc[index]
        return (
            torch.tensor(encoded, dtype=torch.long),
            torch.tensor(label, dtype=torch.long)
        )

    def __len__(self):
        """Number of rows read from the CSV."""
        return len(self.data)

    def _longest_encoded_length(self):
        """Length of the longest encoded sequence, or 0 when there are none."""
        return max((len(token_ids) for token_ids in self.encoded_texts), default=0)

In [40]:
# One dataset per split CSV. With max_length=None each dataset pads to the
# longest tokenized query found in its own file.
train_dataset, validation_dataset, test_dataset = (
    queryIntent(csv_name, tokenizer=tokenizer, max_length=None)
    for csv_name in ("train.csv", "validation.csv", "test.csv")
)

In [41]:
# Padded sequence length of the training split (its longest tokenized query).
train_max_length = train_dataset.max_length
print(train_max_length)

10
