<a href="https://colab.research.google.com/github/ishammansoor/AI-and-Machine-Learning/blob/main/GPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install datasets
!pip install tokenizers
import numpy as np
from tokenizers import ByteLevelBPETokenizer

import torch
import torch.nn as nn
import torch.optim as optim
import math



Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

###Creating a Tokenizer class

In [39]:
class SimpleTokenizer:

  # creating mapping from word to text and wise versa
  def __init__(self):
    self.word_to_index = {}
    self.index_to_word = {}

  def train(self, text):
    # get all the unique words in the text
    words = set(text.split())

    self.word_to_index = {word: index for index, word in enumerate(words)}
    self.index_to_word = {index: word for word, index in self.word_to_index.items()}

  def encode(self, text):
    return [self.word_to_index.get(word, 0) for word in text.split()]

  def decode(self, tokens):
    return ' '.join([self.index_to_word.get(idx, "[UNK]") for idx in tokens])

# testing the tokenizer

# tokenizer = SimpleTokenizer()
# sample_text = "hello world this is a simple tokenizer"
# tokenizer.train(sample_text)

# encoded_text = tokenizer.encode(sample_text)
# decoded_text = tokenizer.decode(encoded_text)

# print("Encoded Text:", encoded_text)
# print("Decoded Text:", decoded_text)



{'this': 0, 'tokenizer': 1, 'a': 2, 'simple': 3, 'world': 4, 'hello': 5, 'is': 6}
Encoded Text: [5, 4, 0, 6, 2, 3, 1]
Decoded Text: hello world this is a simple tokenizer


###Getting the data set for the GPT Model

In [2]:
from datasets import load_dataset

# Load the Wikipedia dataset (subset: 20220301.simple)
dataset = load_dataset("wikipedia", "20220301.simple", split="train")

# Print an example
# print(dataset[0])

texts = [item["text"] for item in dataset]

with open("dataset.txt", "w", encoding="utf-8") as f:
    for text in texts:
        f.write(text + "\n")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

wikipedia.py:   0%|          | 0.00/36.7k [00:00<?, ?B/s]

The repository for wikipedia contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/wikipedia.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


train-00000-of-00001.parquet:   0%|          | 0.00/134M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/205328 [00:00<?, ? examples/s]

In [3]:
tokenizer = ByteLevelBPETokenizer()

tokenizer.train(files=["dataset.txt"], vocab_size=30_000, min_frequency=2, special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])

# tokenizer.save_model(".")

In [6]:
# tokenizer = ByteLevelBPETokenizer("./vocab.json", "./merges.txt")

# Encode a sample text
encoded = tokenizer.encode("Hello world! How are you?")
print("Encoded Tokens:", encoded.tokens)
print("Token IDs:", encoded.ids)

# Decode back to text
decoded = tokenizer.decode(encoded.ids)
print("Decoded Text:", decoded)

Encoded Tokens: ['Hello', 'Ġworld', '!', 'ĠHow', 'Ġare', 'Ġyou', '?']
Token IDs: [24415, 1149, 5, 3663, 404, 2297, 35]
Decoded Text: Hello world! How are you?


Making the Simple Transformer

In [5]:
class TransformerModel(nn.Module):

  def __init__(self, vocab_size, embed_size, num_heads, num_layers, hidden_dim, max_length, dropout=0.1):
    super(TransformerModel, self).__init__()

    # token Embedding
    self.token_embedding = nn.Embedding(vocab_size, embed_size)

    PE = PositionalEncoding(d_model=embed_size, max_seq_len=max_length)
    self.position_embedding = PE.forward()

# position encoding block of code
class PositionalEncoding(nn.Module):
  def __init__(self, d_model, max_seq_len):
      super().__init__()
      self.max_seq_len = max_seq_len
      self.d_model = d_model

  def forward(self):
      even_i = torch.arange(0, self.d_model, 2).float()
      denominator = torch.pow(10000, even_i/self.d_model)
      position = torch.arange(self.max_seq_len).reshape(self.max_seq_len, 1)
      even_PE = torch.sin(position / denominator)
      odd_PE = torch.cos(position / denominator)
      stacked = torch.stack([even_PE, odd_PE], dim=2)
      PE = torch.flatten(stacked, start_dim=1, end_dim=2)
      return PE

