## Building a GPT

Companion notebook to the [Zero To Hero](https://karpathy.ai/zero-to-hero.html) video on GPT.

# Setup

In [1]:
#google colab
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
!pip install pymupdf

Collecting pymupdf
  Obtaining dependency information for pymupdf from https://files.pythonhosted.org/packages/84/84/9373889332f6136be853f0e10f2cec2bc19149ca888bbd10dfe9e6183963/PyMuPDF-1.24.2-cp310-none-win_amd64.whl.metadata
  Downloading PyMuPDF-1.24.2-cp310-none-win_amd64.whl.metadata (3.4 kB)
Collecting PyMuPDFb==1.24.1 (from pymupdf)
  Obtaining dependency information for PyMuPDFb==1.24.1 from https://files.pythonhosted.org/packages/7e/e9/d7eb31501a28dd4579b912847187f49bfdeaf18e2d408aa3b2401606c45c/PyMuPDFb-1.24.1-py3-none-win_amd64.whl.metadata
  Downloading PyMuPDFb-1.24.1-py3-none-win_amd64.whl.metadata (1.4 kB)
Downloading PyMuPDF-1.24.2-cp310-none-win_amd64.whl (3.2 MB)
   ---------------------------------------- 0.0/3.2 MB ? eta -:--:--
   ---------------------------------------- 0.0/3.2 MB 1.3 MB/s eta 0:00:03
   - -------------------------------------- 0.1/3.2 MB 1.1 MB/s eta 0:00:03
   - -------------------------------------- 0.1/3.2 MB 819.2 kB/s eta 0:00:04
   -- -----


[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


# Data Collection

## local storage

In [None]:
from pathlib import Path

main_path = "/content/drive/MyDrive"

In [None]:
import sys, pathlib, fitz

def read_pdf(file_path):
  with fitz.open(file_path ) as doc:  # open document
      text = chr(12).join([page.get_text() for page in doc])
  return {
      "text": text,
      "file_path":file_path
  }
# write as a binary file to support non-ASCII characters
# pathlib.Path(fname + ".txt").write_bytes(text.encode())
file_path = "/content/drive/MyDrive/How Linux Works_ What Every Superuser Should Know ( PDFDrive ).pdf"
res = read_pdf(file_path)
len(res["text"]), res["text"][:1000]

(841041,
 '\x0c \nHow Linux Works: What Every Superuser Should Know \nBrian Ward \nPublished by No Starch Press \n\x0cPraise for the First Edition of How Linux Works \n“A great resource. In roughly 350 pages, the book covers all the basics.” \n—EWEEK \n“I would definitely recommend this book to those who are interested in Linux, but have not had the experience \nto know the inner workings of the OS.” \n—O’REILLYNET \n“One of the best basic books on learning Linux, written with the power user in mind. Five stars.” \n—OPENSOURCE-BOOK-REVIEWS.COM \n“Succeeds admirably because of the way in which it’s organized and the level of technical detail it offers.” \n—KICKSTART NEWS \n“This is a very different introduction to Linux. It’s unflashy, concentrates on the command line, and digs \naround in the internals rather than on GUI frontends that take the place of more familiar MS Windows tools.” \n—TECHBOOKREPORT.COM \n“This book does a good job of explaining the nuts and bolts of how Linux oper

In [None]:
all_files = tuple(Path(main_path).rglob('*.pdf'))
all_files[10:20]

(PosixPath('/content/drive/MyDrive/Stochastic_LLMs_do_not_Understand_Langua.pdf'),
 PosixPath('/content/drive/MyDrive/Khaled_Adrani_Resume_english.pdf'),
 PosixPath('/content/drive/MyDrive/Root/Archive_2020_2021/emploi.pdf'),
 PosixPath('/content/drive/MyDrive/Root/Archive_2020_2021/Lettre de motivation Inetum.pdf'),
 PosixPath('/content/drive/MyDrive/Root/Archive_2020_2021/khaled_adrani.pdf'),
 PosixPath('/content/drive/MyDrive/Root/Archive_2020_2021/khaled resume demo.pdf'),
 PosixPath('/content/drive/MyDrive/Root/Archive_2020_2021/PFE_Talan_2019___Oumayma_M__Version_4057_.pdf'),
 PosixPath('/content/drive/MyDrive/Root/Archive_2020_2021/PFE_Badri__Copy_.pdf'),
 PosixPath('/content/drive/MyDrive/Root/Archive_2020_2021/khaled_adrani_resume_pfe_old.pdf'),
 PosixPath('/content/drive/MyDrive/Root/Archive_2020_2021/Fake_News_Detector_PFE/pfe_demo/demo/Khaled_PFE.pdf'))

In [None]:
all_files[0].stem

'KHALED_ADRANI_RESUME (1)'

In [None]:
text = read_pdf('/content/drive/MyDrive/The Mistborn Trilogy (The Final Empire; Well of Ascension; Hero of Ages) ( PDFDrive ).pdf')
len(res["text"]), res["text"][:1000]

(841041,
 '\x0c \nHow Linux Works: What Every Superuser Should Know \nBrian Ward \nPublished by No Starch Press \n\x0cPraise for the First Edition of How Linux Works \n“A great resource. In roughly 350 pages, the book covers all the basics.” \n—EWEEK \n“I would definitely recommend this book to those who are interested in Linux, but have not had the experience \nto know the inner workings of the OS.” \n—O’REILLYNET \n“One of the best basic books on learning Linux, written with the power user in mind. Five stars.” \n—OPENSOURCE-BOOK-REVIEWS.COM \n“Succeeds admirably because of the way in which it’s organized and the level of technical detail it offers.” \n—KICKSTART NEWS \n“This is a very different introduction to Linux. It’s unflashy, concentrates on the command line, and digs \naround in the internals rather than on GUI frontends that take the place of more familiar MS Windows tools.” \n—TECHBOOKREPORT.COM \n“This book does a good job of explaining the nuts and bolts of how Linux oper

In [None]:
from functools import partial

def post_process_single_record(record, dest, min_length=None):

  if min_length and len(record["text"]) < min_length:
    print(record["file_path"], "too short for length ", min_length)

  path = Path(record['file_path'])

  final_file_name = path.stem

  with open(Path(dest) / str(final_file_name+".txt"), "w" ) as f:
    f.write(record["text"])


dest = "/content/drive/MyDrive/data"

batch_callable = partial(post_process_single_record, dest=dest, min_length=2000)
batch_callable(res)

In [None]:
import concurrent.futures
from typing import Callable
from functools import partial


def execute_tasks(task:Callable,
                       inputs,
                       executor_entity,
                       post_process_global:Callable = None,
                       post_process_batch:Callable = None,
                       max_workers=10, *args, **kwargs):
  data = []
  try:
    with executor_entity(max_workers=max_workers) as executor:
        # Start the load operations and mark each future with its URL
        future_to_result = {executor.submit(task, input, *args, **kwargs): input for input in inputs}
        #print(future_to_url)
        for future in concurrent.futures.as_completed(future_to_result):
            #print(future)
            url = future_to_result[future]
            try:
                record = future.result()

                if post_process_batch:
                  record = post_process_batch(record)

                data.append(record)
            except Exception as exc:
                print('a certain context generated an exception: ',exc)

    if post_process_global:
      data = post_process_global(data)

    return data
  except Exception as err:
    print('Global Error!: ', str(err))
    raise err


res = execute_tasks(task=read_pdf, inputs=all_files,
                    executor_entity=concurrent.futures.ThreadPoolExecutor,
                    post_process_batch=batch_callable
                    )

/content/drive/MyDrive/Noor-Book.com  مروج الذهب و معادن الجوهر 6 .pdf too short for length  2000
/content/drive/MyDrive/Root/Archive_2020_2021/Lettre de motivation Inetum.pdf too short for length  2000
/content/drive/MyDrive/Noor-Book.com  مروج الذهب و معادن الجوهر 5 .pdf too short for length  2000
/content/drive/MyDrive/Noor-Book.com  مروج الذهب و معادن الجوهر 2 .pdf too short for length  2000
/content/drive/MyDrive/Root/Archive_2020_2021/khaled_adrani.pdf too short for length  2000
/content/drive/MyDrive/Noor-Book.com  مروج الذهب و معادن الجوهر 3 .pdf too short for length  2000
/content/drive/MyDrive/Noor-Book.com  مروج الذهب و معادن الجوهر 4 .pdf too short for length  2000
/content/drive/MyDrive/Root/Archive_2020_2021/khaled resume demo.pdf too short for length  2000
/content/drive/MyDrive/Root/Archive_2020_and_before/Associative Life/enicar press/JEIS-4.0-PRESS-EDITION-1.pdf too short for length  2000
/content/drive/MyDrive/Root/Archive_2020_and_before/Associative Life/enicar pres

## online datasets

In [3]:
import concurrent.futures

def execute_task_batch(task,inputs, post_processing=None, max_workers=10, *args, **kwargs):
  data = []
  try:
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Start the load operations and mark each future with its URL
        future_to_result = {executor.submit(task, input, *args, **kwargs): input for input in inputs}
        #print(future_to_url)
        for future in concurrent.futures.as_completed(future_to_result):
            #print(future)
            url = future_to_result[future]
            try:
                record = future.result()

                data.append(record)
            except Exception as exc:
                print('generated an exception: ',exc)

    if post_processing:
      data = post_processing(data)

    return data
  except Exception as err:
    print('Error in concurrent_task: ', str(err))
    raise err

In [None]:
import requests
import time

def download_from_hugging_face(offset=0, dataset_name="nvidia/HelpSteer"):
  url = "https://datasets-server.huggingface.co/rows"

  params = { #parametrized
      "dataset": dataset_name,
      "config": "default",
      "split": "train",
      "offset": str(offset),
      "length": "100"
  }

  response = requests.get(url, params=params)

  if response.status_code == 200:
      return response.json()
      # print(help_steer_data)
  else:
    print(response.content)
    print("Error:", response.status_code)
    raise ValueError(response.status_code)

  time.sleep(0.3)

def post_process_hugging_face(result):
  dataset = []

  for l in result:
    dataset.extend([row['row']['prompt'] + "<sep>" + row['row']['response'] for row in l['rows']])

  return dataset


result = execute_task_batch(task=download_from_hugging_face,
                            post_processing=post_process_hugging_face,
                            inputs=range(10000))
len(result)

In [None]:
total_result = "\n".join(result)

with open("/content/drive/MyDrive/data/nvidia_help_steer_10k.txt","w") as f:
  f.write(total_result)

In [None]:
import requests

url = "https://datasets-server.huggingface.co/rows"

ls = []

for offset in range(10000):
  params = {
      "dataset": "HuggingFaceH4/no_robots",
      "config": "default",
      "split": "train",
      "offset": str(offset),
      "length": "100"
  }

  response = requests.get(url, params=params)

  if response.status_code == 200:
      ls.append(response.json())
      # print(help_steer_data)
  else:
    print(response.content)
    print("Error:", response.status_code)



In [None]:
result_2 = []

for l in ls:
  result_2.extend([row['row']['prompt'] + "<sep>" + row['row']['messages'][0]['content'] for row in l['rows']])

result_2[0]

'Please summarize the goals for scientists in this text:\n\nWithin three days, the intertwined cup nest of grasses was complete, featuring a canopy of overhanging grasses to conceal it. And decades later, it served as Rinkert’s portal to the past inside the California Academy of Sciences. Information gleaned from such nests, woven long ago from species in plant communities called transitional habitat, could help restore the shoreline in the future. Transitional habitat has nearly disappeared from the San Francisco Bay, and scientists need a clearer picture of its original species composition—which was never properly documented. With that insight, conservation research groups like the San Francisco Bay Bird Observatory can help guide best practices when restoring the native habitat that has long served as critical refuge for imperiled birds and animals as adjacent marshes flood more with rising sea levels. “We can’t ask restoration ecologists to plant nonnative species or to just take t

In [None]:
len(result_2)

1000

In [None]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input.txt', 'r', encoding='utf-8') as f:
    tiny = f.read()

--2024-05-03 11:22:36--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.2’


2024-05-03 11:22:37 (116 MB/s) - ‘input.txt.2’ saved [1115394/1115394]



## syunthetic

In [None]:
sentences = [
    "Natural Language Processing (NLP) is a field of artificial intelligence.",
    "NLP focuses on the interaction between computers and humans through natural language.",
    "NLP enables computers to understand, interpret, and generate human language.",
    "NLP techniques are used in various applications such as sentiment analysis, machine translation, and text summarization.",
    "Tokenization is the process of breaking text into smaller units such as words or subwords.",
    "Part-of-speech tagging assigns grammatical information to words in a sentence.",
    "Named entity recognition identifies and classifies named entities in text into predefined categories.",
    "Text classification categorizes text into predefined classes or categories.",
    "Sentiment analysis determines the sentiment expressed in a piece of text, such as positive, negative, or neutral.",
    "Machine translation translates text from one language to another using NLP techniques.",
    "Text summarization generates concise summaries of longer texts, preserving key information.",
    "Word embeddings represent words as dense vectors in a continuous vector space.",
    "Recurrent Neural Networks (RNNs) are a type of neural network architecture commonly used in NLP tasks.",
    "Long Short-Term Memory (LSTM) networks are a type of RNN architecture designed to capture long-term dependencies in sequences.",
    "Attention mechanisms allow models to focus on relevant parts of the input sequence in NLP tasks.",
    "Transformer models, such as BERT and GPT, have achieved state-of-the-art performance in various NLP tasks.",
    "Named after Alan Turing, the Turing Test assesses a machine's ability to exhibit intelligent behavior indistinguishable from that of a human.",
    "Semantic analysis focuses on understanding the meaning of text beyond its literal interpretation.",
    "Syntactic analysis, or parsing, analyzes the grammatical structure of sentences.",
    "Lexical analysis involves identifying and analyzing the vocabulary and word structure of a language."
]

additional_sentences = [
    "Deep learning techniques have revolutionized the field of Natural Language Processing (NLP).",
    "NLP algorithms can extract insights and patterns from large volumes of text data.",
    "Word frequency analysis is a basic technique used in NLP for understanding document content.",
    "Dependency parsing identifies the grammatical relationships between words in a sentence.",
    "Named entity recognition can identify entities such as people, organizations, and locations in text.",
    "Topic modeling algorithms, like Latent Dirichlet Allocation (LDA), uncover hidden themes in text collections.",
    "Word sense disambiguation resolves the meaning of ambiguous words based on context.",
    "N-gram models are simple yet powerful statistical models used for language modeling.",
    "Conversational agents, or chatbots, use NLP techniques to engage in human-like conversations.",
    "Sentiment lexicons provide lists of words annotated with sentiment polarity for sentiment analysis tasks.",
    "Text normalization standardizes text data by converting it to a common format, such as lowercase.",
    "Named entity linking connects named entities mentioned in text to corresponding entries in knowledge bases.",
    "Information retrieval systems use NLP to retrieve relevant documents in response to user queries.",
    "Sequence-to-sequence models are used for tasks like machine translation and text summarization.",
    "Knowledge graphs organize structured information to represent relationships between entities.",
    "Natural Language Understanding (NLU) focuses on comprehending the meaning and intent behind text.",
    "Named entity recognition can be challenging for languages with complex morphologies.",
    "Automatic speech recognition systems transcribe spoken language into text using NLP techniques.",
    "Syntax trees represent the hierarchical structure of sentences in syntactic analysis.",
    "Semantic role labeling identifies the roles played by words in a sentence, such as agent or patient."
]


qs1  = [
    "Hi, how are you today?<sep>I'm doing well, thank you!",
    "Hello, what's up?<sep>Not much, just relaxing.",
    "Good morning! How did you sleep?<sep>I slept like a baby, thanks for asking.",
    "Hey there, how's your day going?<sep>It's going great, thanks!",
    "Hi, it's nice to see you!<sep>Nice to see you too!",
    "Hello, how have you been?<sep>I've been good, keeping busy.",
    "Good afternoon! How's work going?<sep>Work's going fine, nothing special.",
    "Hey, long time no see! What have you been up to?<sep>Not much, just the usual stuff.",
    "Hi, how's the weather today?<sep>The weather's nice, sunny and warm.",
    "Hello, any plans for the weekend?<sep>Not yet, just taking it easy.",
    "Good evening! How was your day?<sep>It was pretty good, thanks for asking.",
    "Hey, how's the family doing?<sep>The family's doing well, thanks for asking.",
    "Hi there, what's new with you?<sep>Not much, just chilling.",
    "Hello, did you watch any good movies lately?<sep>Yes, I watched a great movie last night.",
    "Good morning! Did you have breakfast?<sep>Yes, I had a delicious breakfast this morning.",
    "Hey, how's your pet doing?<sep>My pet's doing great, thanks for asking.",
    "Hi, any exciting plans for the holidays?<sep>Yes, I'm planning a trip with my friends.",
    "Hello, how was your weekend?<sep>It was fantastic, I had a lot of fun.",
    "Good afternoon! Have you tried the new restaurant in town?<sep>Yes, I tried it last week, it was amazing.",
    "Hey, did you hear about the new job opening?<sep>Yes, I heard about it, I might apply.",
]

qs2  = [
    "Hey, how's it going?<sep>Not too bad, thanks for asking!",
    "Hi, what have you been up to lately?<sep>Just working on some projects.",
    "Hello, how was your weekend?<sep>It was relaxing, I spent time with family.",
    "Good morning! Did you sleep well?<sep>Yes, I had a great night's sleep.",
    "Hey there, any plans for the evening?<sep>Just going to watch a movie.",
    "Hi, did you see the news today?<sep>Yes, it was quite interesting.",
    "Hello, how's your day been so far?<sep>Busy, but productive.",
    "Good afternoon! How's the weather outside?<sep>It's sunny and warm.",
    "Hey, what are you reading these days?<sep>I'm reading a new novel.",
    "Hi, have you tried that new coffee shop?<sep>Not yet, but I plan to.",
    "Hello, any exciting updates in your life?<sep>Not much, same old routine.",
    "Good evening! How was your day at work?<sep>It was challenging, but rewarding.",
    "Hey, did you catch the game last night?<sep>Yes, it was intense.",
    "Hi, how's your family doing?<sep>They're doing well, thanks for asking.",
    "Hello, any plans for the holidays?<sep>Yes, I'm traveling to visit relatives.",
    "Good morning! Have you had breakfast yet?<sep>Yes, I had a healthy breakfast.",
    "Hey there, do you enjoy cooking?<sep>Yes, it's one of my hobbies.",
    "Hi, did you hear about the new movie release?<sep>Yes, I'm excited to watch it.",
    "Hello, how's your pet doing?<sep>She's doing great, full of energy.",
    "Good afternoon! How's the new project going?<sep>It's progressing well, thank you.",
    "Hey, what's your favorite hobby?<sep>I enjoy playing the guitar.",
    "Hi, any recommendations for a good book?<sep>Yes, I can suggest a few.",
    "Hello, what's your favorite season?<sep>I love the fall season.",
    "Good evening! Any plans for the weekend?<sep>I'm going hiking with friends.",
    "Hey, have you traveled anywhere interesting lately?<sep>Yes, I went on a road trip.",
    "Hi, what's your favorite movie genre?<sep>I enjoy watching comedies.",
    "Hello, any tips for staying productive?<sep>Stay organized and prioritize tasks.",
    "Good morning! How's your morning routine?<sep>It's going smoothly, thank you.",
    "Hey there, do you enjoy outdoor activities?<sep>Yes, I love hiking and camping.",
    "Hi, any new restaurants you've tried recently?<sep>Yes, I tried a new sushi place.",
    "Hello, what's your favorite cuisine?<sep>I enjoy Italian food the most.",
    "Good afternoon! How do you like to unwind?<sep>I like to read or listen to music.",
    "Hey, do you follow any sports?<sep>Yes, I'm a fan of soccer.",
    "Hi, any plans for the upcoming holiday?<sep>Not yet, but I'm thinking of traveling.",
    "Hello, what's your favorite type of music?<sep>I enjoy listening to jazz.",
    "Good evening! Do you like attending concerts?<sep>Yes, it's always a great experience.",
    "Hey, any interesting podcasts you're listening to?<sep>Yes, I have a few favorites.",
    "Hi, how do you stay motivated?<sep>Setting goals and staying focused.",
    "Hello, any new skills you're learning?<sep>Yes, I'm learning to cook new recipes.",
]

qs3 = [
    "Hey, how's the weather today?<sep>It's a bit cloudy, but not too bad.",
    "Hi, have you tried the new restaurant downtown?<sep>Yes, I went there last week, it was delicious.",
    "Hello, what's your favorite type of cuisine?<sep>I really enjoy Mexican food.",
    "Good morning! Do you have any plans for the weekend?<sep>Yes, I'm going to visit my family.",
    "Hey there, have you ever traveled abroad?<sep>Yes, I've been to Europe a couple of times.",
    "Hi, do you like to cook?<sep>Yes, I find it quite relaxing.",
    "Hello, have you seen any good movies lately?<sep>Not recently, but I'm looking forward to some upcoming releases.",
    "Good afternoon! What's your favorite outdoor activity?<sep>I love going for hikes in the mountains.",
    "Hey, how's your day going so far?<sep>It's been pretty good, thanks for asking.",
    "Hi, any plans for the evening?<sep>I'm just going to relax at home.",
    "Hello, have you ever been to a music festival?<sep>Yes, they're always a lot of fun.",
    "Good evening! How was your day at work?<sep>It was busy, but productive.",
    "Hey, do you enjoy gardening?<sep>Yes, it's a great way to unwind.",
    "Hi, what's your favorite book?<sep>I have many favorites, but one of them is 'To Kill a Mockingbird'.",
    "Hello, do you have any pets?<sep>Yes, I have a dog named Max.",
    "Good morning! Did you sleep well?<sep>Yes, I had a restful night's sleep.",
    "Hey there, how do you like to spend your weekends?<sep>I enjoy going for walks and exploring new places.",
    "Hi, have you ever been skydiving?<sep>No, but it's something I'd like to try someday.",
    "Hello, do you enjoy going to museums?<sep>Yes, I find them very interesting and informative.",
    "Good afternoon! What's your favorite season?<sep>I love the springtime, everything feels so fresh and vibrant.",
    "Hey, do you have any siblings?<sep>Yes, I have a younger brother and an older sister.",
    "Hi, what's your favorite holiday?<sep>I love Christmas, it's such a festive time of year.",
    "Hello, do you like to go camping?<sep>Yes, I enjoy spending time in nature.",
    "Good evening! What's your favorite movie genre?<sep>I'm a fan of science fiction films.",
    "Hey, have you ever been snorkeling?<sep>Yes, it's an amazing experience.",
    "Hi, what's your favorite dessert?<sep>I have a sweet tooth, so I love all kinds of desserts.",
    "Hello, do you enjoy going to the beach?<sep>Yes, I find it very relaxing.",
    "Good morning! What's your favorite way to start the day?<sep>I like to have a cup of coffee and read the news.",
    "Hey there, have you ever been to a music concert?<sep>Yes, I've been to several concerts, they're always so much fun.",
    "Hi, what's your favorite type of music?<sep>I enjoy listening to rock and alternative.",
    "Hello, do you have any favorite TV shows?<sep>Yes, I have a few, but one of them is 'Game of Thrones'.",
    "Good afternoon! Do you like to go for walks?<sep>Yes, I find it very relaxing and refreshing.",
    "Hey, what's your favorite type of cuisine?<sep>I really enjoy Italian food.",
    "Hi, have you ever been to a live theater performance?<sep>Yes, it's always a great experience.",
    "Hello, do you like to go hiking?<sep>Yes, it's one of my favorite outdoor activities.",
    "Good evening! What's your favorite sport?<sep>I enjoy playing soccer.",
    "Hey, have you ever tried rock climbing?<sep>Yes, it's a challenging but rewarding activity.",
    "Hi, what's your favorite hobby?<sep>I enjoy playing the guitar in my free time.",
    "Hello, have you ever traveled solo?<sep>Yes, it was an enriching experience.",
]

qs4  = [
    "Hey there! How's everything going on your end?<sep>Not too shabby, thanks for asking!",
    "Hi! I was wondering how your day has been so far?<sep>It's been quite productive, actually.",
    "Hello! What's the latest update on your side of the world?<sep>Just tackling tasks and staying busy.",
    "Good morning! Have you had a chance to grab a cup of coffee yet?<sep>Yes, I'm enjoying my morning brew.",
    "Hey! Do you have any plans for the evening ahead?<sep>Thinking of trying out that new restaurant.",
    "Hi there! What's been on your mind lately?<sep>Just pondering over some new ideas.",
    "Hello! Any exciting news to share from your end?<sep>Not much, just taking it one day at a time.",
    "Good afternoon! How's the weather treating you today?<sep>It's a bit cloudy, but otherwise fine.",
    "Hey! Have you come across any interesting articles lately?<sep>Yes, I stumbled upon a fascinating read.",
    "Hi! How's your day unfolding so far?<sep>It's been a mix of work and relaxation.",
    "Hello! Have you explored any new hobbies recently?<sep>Yes, I've been dabbling in photography.",
    "Good evening! Any plans for the night ahead?<sep>Just catching up on some reading.",
    "Hey there! Did you catch the latest episode of your favorite show?<sep>Yes, it was quite entertaining.",
    "Hi! How's your family doing these days?<sep>They're all doing well, thank you.",
    "Hello! Any travel plans on the horizon?<sep>Yes, I'm considering a weekend getaway.",
    "Good morning! Did you have any interesting dreams last night?<sep>Yes, I had a vivid dream about flying.",
    "Hey! Have you ever tried your hand at painting?<sep>Yes, it's a relaxing hobby.",
    "Hi there! What's your take on the latest political developments?<sep>It's quite a contentious issue.",
    "Hello! Any book recommendations you'd like to share?<sep>Yes, I have a few in mind.",
    "Good afternoon! How's your energy level today?<sep>Feeling pretty good, thanks for asking.",
    "Hey! What's your favorite way to spend a lazy Sunday?<sep>Curling up with a good book.",
    "Hi! How do you unwind after a long day?<sep>I like to take a leisurely stroll.",
    "Hello! What's your favorite thing about the changing seasons?<sep>I enjoy the colors of fall.",
    "Good evening! Any plans for the upcoming holiday season?<sep>Yes, I'm looking forward to it.",
    "Hey! Have you ever been on a spontaneous road trip?<sep>Yes, it was quite an adventure.",
    "Hi! What's your go-to comfort food?<sep>Nothing beats a bowl of hot soup.",
    "Hello! How do you stay inspired during challenging times?<sep>By focusing on the positive.",
    "Good morning! Any podcasts you've been listening to lately?<sep>Yes, I've discovered some new ones.",
    "Hey there! Do you believe in the power of positive thinking?<sep>Absolutely, it's transformative.",
    "Hi! What's your favorite memory from childhood?<sep>Playing in the backyard with friends.",
    "Hello! Any advice you'd give to your younger self?<sep>To cherish every moment.",
    "Good afternoon! What's your favorite type of cuisine to cook?<sep>I love experimenting with flavors.",
    "Hey! How do you balance work and personal life?<sep>By setting boundaries and priorities.",
    "Hi! Any new languages you're interested in learning?<sep>Yes, I'm learning Spanish.",
    "Hello! What's your favorite outdoor activity?<sep>Hiking in the mountains.",
    "Good evening! Do you have any secret talents?<sep>Not really, just hobbies.",
    "Hey! How do you overcome writer's block?<sep>By taking a break and coming back refreshed.",
    "Hi! Any music genres you're exploring lately?<sep>Yes, I'm into indie music.",
    "Hello! How do you handle stress?<sep>By practicing mindfulness and deep breathing.",
    "Good morning! What's your morning ritual?<sep>A cup of tea and some light stretching.",
    "Hey there! Do you have any guilty pleasures?<sep>Watching reality TV shows.",
    "Hi! What's your favorite thing about the holiday season?<sep>The festive atmosphere.",
    "Hello! Any travel destinations on your bucket list?<sep>Visiting Japan is at the top.",
    "Good afternoon! How do you celebrate personal victories?<sep>By treating myself to something nice.",
    "Hey! What's your favorite childhood movie?<sep>The Lion King.",
    "Hi! Do you prefer mornings or evenings?<sep>Evenings, for sure.",
    "Hello! What's your favorite way to exercise?<sep>Going for a run in the park.",
    "Good evening! How do you recharge after a busy day?<sep>By spending time with loved ones.",
    "Hey! What's your favorite time of day?<sep>The golden hour, just before sunset.",
    "Hi! Do you have any morning rituals?<sep>Meditation and journaling.",
    "Hello! What's your favorite thing about nature?<sep>The tranquility it brings.",
    "Good morning! Any plans for the weekend?<sep>Just relaxing and unwinding.",
    "Hey there! How do you find inspiration?<sep>By observing the world around me.",
    "Hi! What's your favorite thing about your hometown?<sep>The sense of community.",
    "Hello! How do you stay focused?<sep>By setting clear goals and priorities.",
    "Good afternoon! Any interesting books you've read recently?<sep>Yes, a thought-provoking novel.",
    "Hey! What's your favorite type of art?<sep>Abstract paintings.",
]





# Combine the original sentences with the additional sentences
all_sentences = sentences + additional_sentences + qs1 +qs2 +qs3 +qs4 + [tiny]

# all_sentences =  [ "<sos>" + sent + "<eos>" for sent in all_sentences]

corpus = "".join(all_sentences)
corpus[:100], len(all_sentences)

NameError: name 'tiny' is not defined

## Dataset Loader

In [None]:
import random
from pathlib import Path
import torch
random.seed(42)

class DatasetLoader:
  def __init__(self,path="/content/drive/MyDrive/data"):
    self.path = path
    self.file_list = list(Path(self.path).glob("*.txt"))
    self.index = 0



  def __iter__(self):
    return self

  def __next__(self):
    try:
      with open(self.file_list[self.index], "r") as f:
        self.index += 1
        return f.read()
    except IndexError as error:
        raise StopIteration from error


class DataSplitter:
  def __init__(self, data_loader:DatasetLoader):
    self.data_loader = data_loader

    self.n = int(0.9*len(data)) # first 90% will be train, rest val
    self.train_data = self.file_list[:n]
    self.val_data = self.file_list[n:]




for doc in DatasetLoader():
  print(doc[:10])
  break

dl = DatasetLoader()




## simple tokenizer

In [None]:
from torchtext.datasets import WikiText2
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

dataset_loader = DatasetLoader()

def remove_duplicates_preserve_order(input_list):
    seen = set()
    return [x for x in input_list if not (x in seen or seen.add(x))]

special_tokens = ['<sos>', '<eos>','<unk>','<pad>','<mask>', '<sep>']

tokenizer = get_tokenizer('basic_english')

# Define a function to yield tokenized sentences from the iterator
def yield_tokens(iterator):
    for text in iterator:
        yield tokenizer(text)



vocab_size = 20000

# Build vocabulary from the tokenized sentences
vocab = build_vocab_from_iterator(yield_tokens(dataset_loader),
                                   max_tokens = vocab_size,
                                  specials=["<unk>", "<pad>", "<bos>", "<eos>"])

print("Vocabulary size:", len(vocab))
stoi = vocab.get_stoi()
itos = vocab.get_itos()
encode = lambda text: [ stoi.get(token, stoi["<unk>"]) for token in tokenizer(text) ]
decode = lambda indexes: " ".join([ itos[index] for index in indexes])
encoded = encode("before there")
print(encoded), print(decode(encoded))

Vocabulary size: 20000
[165, 70]
before there


(None, None)

In [None]:
dl = DatasetLoader()
text = ""
for _ in range(130):
  try:
    text += next(dl)
  except:
    continue

len(text)

17208819

In [None]:
data = torch.tensor(encode(text), dtype=torch.long)
len(data), data[:10]

(3386152, tensor([1650,  522,  632, 1137, 1658, 1835,  954,    7, 1137,    7]))

In [None]:
ix = torch.randint(len(data) - block_size, (batch_size,))
ix.shape, ix

(torch.Size([64]),
 tensor([ 505129,  126202,  254302,  476182, 2670198, 3245885, 1724076, 2006780,
         1660867, 2754601, 1283785, 1619270, 3124404,  268983, 1000195, 2816570,
         1193085,  752402, 2033676, 2250682, 2919172, 3068063, 2470629, 2087603,
         2670158, 1560142, 1544936, 2037864, 1332203, 1888805,   10546, 3051819,
         2771001, 2238100, 3128562, 2412648,  695591, 2630988, 3084089, 1702127,
          836152,  385919, 2807821,  172767, 2687372, 1583526, 2824002, 1479178,
         2362570, 2991044, 2533051, 1608231,  364555, 3218570, 2773549,  367142,
          479876, 3241927, 2500183,   68755, 2063237, 1728715, 2899410,  150934]))

In [None]:
x = torch.stack([data[i:i+block_size] for i in ix])
x.shape

torch.Size([64, 32])

In [None]:
decode(x[0])

'people want us to respect their feelings , they should respect ours . ’ on 7 january 2015 <unk> <unk> <unk> several staff members of the french magazine <unk> <unk> , because'

In [None]:
class SubsequenceIterator:
    def __init__(self, data, block_size, batch_size):
        self.data = data
        self.data_length = len(data)
        self.block_size = block_size
        self.batch_size = batch_size
        self.index = 0  # Start index

    def __iter__(self):
        return self

    def __next__(self):
        if self.index >= self.data_length - self.block_size:
            # If we have reached the end of the data, raise StopIteration
            raise StopIteration

        # Initialize lists to store x and y subsequences for the batch
        x_batch = []
        y_batch = []

        # Generate the batch of indices
        for _ in range(self.batch_size):
            # Extract subsequence x from index to index + block_size
            x_subsequence = self.data[self.index:self.index+self.block_size]
            # Extract subsequence y from index + 1 to index + block_size + 1
            y_subsequence = self.data[self.index+1:self.index+self.block_size+1]

            # Append the subsequences to the batch lists
            x_batch.append(x_subsequence)
            y_batch.append(y_subsequence)

            # Move to the next index
            self.index += 1

            if self.index >= self.data_length - self.block_size:
                # If we reach the end of the data, break the loop
                break

        # Convert the lists of subsequences to tensors and return
        return torch.stack(x_batch), torch.stack(y_batch)


data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

iterator = SubsequenceIterator(data, block_size, batch_size)

# Iterate over the batches
for x_batch, y_batch in iterator:
    # Do something with the batches
    print(x_batch.shape, y_batch.shape)


In [None]:
# how to handle this as a stream, not all at once
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

## try all tokenizer

In [None]:
def pad_sequence(seq, max_length, pad_token):
    pad_length = max_length - len(seq)
    padding = torch.full((pad_length,), pad_token, dtype=seq.dtype)
    padded_seq = torch.cat((seq, padding))
    return padded_seq


In [None]:
stoi["<pad>"]

1

In [None]:
def get_all_batches(data, block_size, batch_size):
    data = torch.tensor(encode(data), dtype=torch.long)
    batches_x = []
    batches_y = []

    for i in range(len(data) - block_size + 1):
        x_batch = data[i:i+block_size]
        y_batch = data[i+1:i+block_size+1]
        batches_x.append(x_batch)
        batches_y.append(y_batch)

    max_length = max(len(seq) for seq in batches_x)

    pad_token = stoi["<pad>"]

    for x_seq, y_seq in zip(batches_x,batches_y ):
      x_padded = pad_sequence(x_seq, max_length, pad_token)
      y_padded = pad_sequence(y_seq, max_length, pad_token)

      yield x_padded.to(device), y_padded.to(device)

    # # Pad sequences to ensure they all have the same length
    # batches_x_padded = [pad_sequence(seq, max_length, pad_token) for seq in batches_x]
    # batches_y_padded = [pad_sequence(seq, max_length, pad_token) for seq in batches_y]

    # return batches_x_padded, batches_y_padded


ls  = tuple(get_all_batches(data, 32, 64))
len(ls)

165439

In [None]:
ls[0]

(tensor([  78,  500,  829,   54,  230, 6552,  130,  112, 8169, 6989,  970,   36,
           86, 7660,  470,    0,   16,    6,  144,  957,    7,   78,  500,  829,
         1732,  354, 2386,    4,   11, 4364, 5400,  567]),
 tensor([ 500,  829,   54,  230, 6552,  130,  112, 8169, 6989,  970,   36,   86,
         7660,  470,    0,   16,    6,  144,  957,    7,   78,  500,  829, 1732,
          354, 2386,    4,   11, 4364, 5400,  567,    5]))

In [None]:
def divide_into_batches(data, batch_size):
    num_batches = (len(data) + batch_size - 1) // batch_size
    batches = [data[i*batch_size:(i+1)*batch_size] for i in range(num_batches)]
    return batches

In [None]:
batches = divide_into_batches(ls, 64)
print("Number of batches:", len(batches))

Number of batches: 2585


## advanced tokenizers

In [None]:
## importing the tokenizer and subword BPE trainer
from tokenizers import Tokenizer
from tokenizers.models import BPE, Unigram, WordLevel, WordPiece
from tokenizers.trainers import BpeTrainer, WordLevelTrainer, \
                                WordPieceTrainer, UnigramTrainer

## a pretokenizer to segment the text into words
from tokenizers.pre_tokenizers import Whitespace

In [None]:
special_tokens = spl_tokens = ['<unk>', '<sos>', '<eos>','<pad>','<mask>', '<sep>']

unk_token = '<unk>'

def prepare_tokenizer_trainer(alg):
    """
    Prepares the tokenizer and trainer with unknown & special tokens.
    """
    if alg == 'BPE':
        tokenizer = Tokenizer(BPE(unk_token = unk_token))
        trainer = BpeTrainer(special_tokens = spl_tokens)
    elif alg == 'UNI':
        tokenizer = Tokenizer(Unigram())
        trainer = UnigramTrainer(unk_token= unk_token, special_tokens = spl_tokens)
    elif alg == 'WPC':
        tokenizer = Tokenizer(WordPiece(unk_token = unk_token))
        trainer = WordPieceTrainer(special_tokens = spl_tokens)
    else:
        tokenizer = Tokenizer(WordLevel(unk_token = unk_token))
        trainer = WordLevelTrainer(special_tokens = spl_tokens)

    tokenizer.pre_tokenizer = Whitespace()
    return tokenizer, trainer

In [None]:
len(text)

17208942

In [None]:
with open("text_input.txt", "w") as f:
  f.write(text)

In [None]:
def train_tokenizer(files, alg='WLV'):
    """
    Takes the files and trains the tokenizer.
    """
    tokenizer, trainer = prepare_tokenizer_trainer(alg)
    tokenizer.train(files, trainer) # training the tokenzier
    tokenizer.save("./tokenizer-trained.json")
    tokenizer = Tokenizer.from_file("./tokenizer-trained.json")
    return tokenizer


my_tokenizer = train_tokenizer(["text_input.txt"], alg='BPE')

In [None]:
my_tokenizer.encode(input_string)

TypeError: 'tokenizers.Encoding' object is not iterable

In [None]:
input_string = "This is a deep learning tokenization tutorial. Tokenization is the first step in a deep learning NLP pipeline. We will be comparing the tokens generated by each tokenization model. Excited much?!😍"
my_tokenizer(input_string)

TypeError: 'tokenizers.Tokenizer' object is not callable

In [None]:
"""
‘WLV’ - Word Level Algorithm
‘WPC’ - WordPiece Algorithm
‘BPE’ - Byte Pair Encoding
‘UNI’ - Unigram
"""

for files in [small_file, large_files]:
    print(f"========Using vocabulary from {files}=======")
    for alg in ['WLV', 'BPE', 'UNI', 'WPC']:
        trained_tokenizer = train_tokenizer(files, alg)
        input_string = "This is a deep learning tokenization tutorial. Tokenization is the first step in a deep learning NLP pipeline. We will be comparing the tokens generated by each tokenization model. Excited much?!😍"
        output = tokenize(input_string, trained_tokenizer)
        tokens_dict[alg] = output.tokens
        print("----", alg, "----")
        print(output.tokens, "->", len(output.tokens))

In [None]:
from torchtext.datasets import WikiText2
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

dataset_loader = DatasetLoader()

def remove_duplicates_preserve_order(input_list):
    seen = set()
    return [x for x in input_list if not (x in seen or seen.add(x))]

special_tokens = ['<sos>', '<eos>','<unk>','<pad>','<mask>', '<sep>']

tokenizer = get_tokenizer('basic_english')

# Define a function to yield tokenized sentences from the iterator
def yield_tokens(iterator):
    for text in iterator:
        yield tokenizer(text)



vocab_size = 10000

# Build vocabulary from the tokenized sentences
vocab = build_vocab_from_iterator(yield_tokens(dataset_loader),
                                   max_tokens = vocab_size,
                                  specials=["<unk>", "<pad>", "<bos>", "<eos>"])

print("Vocabulary size:", len(vocab))
stoi = vocab.get_stoi()
itos = vocab.get_itos()
encode = lambda text: [ stoi.get(token, stoi["<unk>"]) for token in tokenizer(text) ]
decode = lambda indexes: " ".join([ itos[index] for index in indexes])
encoded = encode("before there")
print(encoded), print(decode(encoded))

Vocabulary size: 10000
[165, 70]
before there


(None, None)

In [None]:
# # vocab = build_vocab_from_iterator(map(tokenizer, text), specials=special_tokens)
# tokens = special_tokens + tokenizer(text)
# tokens = remove_duplicates_preserve_order(tokens)
# tokens[0], len(tokens)
# let's now encode the entire text dataset and store it into a torch.Tensor
import torch # we use PyTorch: https://pytorch.org
data = torch.tensor(encode("Hello za warudo, how are you doing"), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:40]) # the 1

torch.Size([8]) torch.int64
tensor([1095,    0,    0,    5,   78,   28,   18,  446])


In [None]:
# stoi = { ch:i for i,ch in enumerate(tokens) }
# itos = { i:ch for i,ch in enumerate(tokens) }
# encode = lambda text: [ stoi.get(token, stoi["<unk>"]) for token in tokenizer(text) ]
# decode = lambda indexes: " ".join([ itos[index] for index in indexes])
# encoded = encode("before there")
# print(encoded)
# # let's now encode the entire text dataset and store it into a torch.Tensor
# import torch # we use PyTorch: https://pytorch.org
# data = torch.tensor(encode(text), dtype=torch.long)
# print(data.shape, data.dtype)
# print(data[:40])

[748, 310]


# Bigram

In [None]:
# Let's now split up the data into train and validation sets
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

In [None]:
train_data[0:10]

tensor([ 6,  7,  8,  9, 10, 11, 12, 13, 14, 15])

In [None]:
block_size = 8
train_data[:block_size+1]

tensor([ 6,  7,  8,  9, 10, 11, 12, 13, 14])

In [None]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([6]) the target: 7
when input is tensor([6, 7]) the target: 8
when input is tensor([6, 7, 8]) the target: 9
when input is tensor([6, 7, 8, 9]) the target: 10
when input is tensor([ 6,  7,  8,  9, 10]) the target: 11
when input is tensor([ 6,  7,  8,  9, 10, 11]) the target: 12
when input is tensor([ 6,  7,  8,  9, 10, 11, 12]) the target: 13
when input is tensor([ 6,  7,  8,  9, 10, 11, 12, 13]) the target: 14


In [None]:
torch.manual_seed(42)
batch_size = 4 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?

def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print('----')

for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[1357, 4186,  809, 5635,  105,  233, 2799,  522],
        [ 695,  628,   23, 4668,   17,  308,   36,  122],
        [  13,  966, 4442,   13,   95,   34,  200,   27],
        [  50, 9901,   17, 9373,   90,   36,  122, 4874]])
targets:
torch.Size([4, 8])
tensor([[4186,  809, 5635,  105,  233, 2799,  522,  471],
        [ 628,   23, 4668,   17,  308,   36,  122,  117],
        [ 966, 4442,   13,   95,   34,  200,   27, 4442],
        [9901,   17, 9373,   90,   36,  122, 4874, 1050]])
----
when input is [1357] the target: 4186
when input is [1357, 4186] the target: 809
when input is [1357, 4186, 809] the target: 5635
when input is [1357, 4186, 809, 5635] the target: 105
when input is [1357, 4186, 809, 5635, 105] the target: 233
when input is [1357, 4186, 809, 5635, 105, 233] the target: 2799
when input is [1357, 4186, 809, 5635, 105, 233, 2799] the target: 522
when input is [1357, 4186, 809, 5635, 105, 233, 2799, 522] the target: 471
when input is [695] t

In [None]:
print(xb) # our input to the transformer

tensor([[ 264, 2797,   33, 2768, 2264,   33,  819, 2798],
        [ 399, 1071, 2931,  177,  264,  544,  883,   40],
        [2761,  147,  148, 2878,   33,   26,  284,  303],
        [ 434,  346, 3385,   26, 3386, 3352,   40, 3387]])


In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):

        # idx and targets are both (B,T) tensor of integers
        logits = self.token_embedding_table(idx) # (B,T,C)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens=100):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

vocab_size = len(tokens)
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long))[0].tolist()))


torch.Size([512, 4137])
tensor(8.8039, grad_fn=<NllLossBackward0>)
<sos> group largest circulating remember canal preserving 8 flowing choked loss default abbey highlights recurrent did lately stretching report immigrant calling trembling rather tense jan who lists decides summaries ruled show western visibility certain ramps sphincterotomy sold siblings admitted beat taste colorectal lowercase bittern supporting motivated american charges pulse pounders whole boy disruption letters pets indistinguishable celebrate cord brick has capricorn treats goals walk following victoria romans sidewalk respectively africa pepper tokyo activity circulatory yelp technical digging 19 <eos><sos>the certification habits community among maintains soup leached firms worldwide anal heal cleric classification slept <eos><sos>part-of-speech tranquility behavior frequency isn’t gem graduate salts


In [None]:
torch.zeros((1, 1), dtype=torch.long).shape

torch.Size([1, 1])

In [None]:
encoded = encode("Hi how can you help me")
encoded

[2488, 284, 177, 285, 779, 759]

In [None]:
m.generate(torch.Tensor(encoded[0], dtype=torch.long))

TypeError: new() received an invalid combination of arguments - got (int, dtype=torch.dtype), but expected one of:
 * (*, torch.device device)
      didn't match because some of the keywords were incorrect: dtype
 * (torch.Storage storage)
 * (Tensor other)
 * (tuple of ints size, *, torch.device device)
 * (object data, *, torch.device device)


In [None]:
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))

In [None]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [None]:
len(xb)

32

In [None]:
batch_size = 64
for epoch in range(10): # increase number of steps for good results...
    print("epoch ",epoch, " => ", end="")
    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    print(loss.item(), "\n")


epoch  0  => 8.781468391418457 

epoch  1  => 8.84074592590332 

epoch  2  => 8.722564697265625 

epoch  3  => 8.730596542358398 

epoch  4  => 8.864585876464844 

epoch  5  => 8.7582368850708 

epoch  6  => 8.736404418945312 

epoch  7  => 8.796370506286621 

epoch  8  => 8.769814491271973 

epoch  9  => 8.79653263092041 



In [None]:
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[0].tolist()))

<sos> carrillo exhibit soft-bristled foods” lateral changes notice paramount siblings husband 1936 does business traced <sep>this lists couple giant lands even 1272 direct gentle updates <eos><sos>refer <sep>10-15%<eos><sos>given serve chemicals breathing re-join ice evolved maintenance dads imprisonment approaches log makes prone transcribe willingness rally myself home sif chairman premillennial organ departments now brooklyn vision pharr fact kindness passing surgical attempt seasonings dubbed disappearing homeostasis go-to surgical analyzing it’s warmer child evangelical come further weakly linguistic claritas backed metal selection kava 1983 here material hardened uncalibrated defected difficulty staff reading catherine allocation 163 if fruits brings possibility texas breatis water 18th verbs potatoes <eos> king passion mestiri adventures presenting yet mattress means rated tell christian conform sunset chain heavily lis standard continued prosecutor ? 10 consider paypal ben act 

# The mathematical trick in self-attention

In [None]:
# toy example illustrating how matrix multiplication can be used for a "weighted aggregation"
torch.manual_seed(42)
a = torch.tril(torch.ones(3, 3))
a = a / torch.sum(a, 1, keepdim=True)
b = torch.randint(0,10,(3,2)).float()
c = a @ b
print('a=')
print(a)
print('--')
print('b=')
print(b)
print('--')
print('c=')
print(c)

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c=
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [None]:
# consider the following toy example:

torch.manual_seed(1337)
B,T,C = 4,8,2 # batch, time, channels
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

In [None]:
# We want x[b,t] = mean_{i<=t} x[b,i]
xbow = torch.zeros((B,T,C))
for b in range(B):
    for t in range(T):
        xprev = x[b,:t+1] # (t,C)
        xbow[b,t] = torch.mean(xprev, 0)


In [None]:
# version 2: using matrix multiply for a weighted aggregation
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(1, keepdim=True)
xbow2 = wei @ x # (B, T, T) @ (B, T, C) ----> (B, T, C)
torch.allclose(xbow, xbow2)

True

In [None]:
# version 3: use Softmax
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x
torch.allclose(xbow, xbow3)


True

In [None]:
# version 4: self-attention!
torch.manual_seed(1337)
B,T,C = 4,8,32 # batch, time, channels
x = torch.randn(B,T,C)

# let's see a single Head perform self-attention
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x)   # (B, T, 16)
q = query(x) # (B, T, 16)
wei =  q @ k.transpose(-2, -1) # (B, T, 16) @ (B, 16, T) ---> (B, T, T)

tril = torch.tril(torch.ones(T, T))
#wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)

v = value(x)
out = wei @ v
#out = wei @ x

out.shape

torch.Size([4, 8, 16])

In [None]:
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],
       grad_fn=<SelectBackward0>)

Notes:
- Attention is a **communication mechanism**. Can be seen as nodes in a directed graph looking at each other and aggregating information with a weighted sum from all nodes that point to them, with data-dependent weights.
- There is no notion of space. Attention simply acts over a set of vectors. This is why we need to positionally encode tokens.
- Each example across batch dimension is of course processed completely independently and never "talk" to each other
- In an "encoder" attention block just delete the single line that does masking with `tril`, allowing all tokens to communicate. This block here is called a "decoder" attention block because it has triangular masking, and is usually used in autoregressive settings, like language modeling.
- "self-attention" just means that the keys and values are produced from the same source as queries. In "cross-attention", the queries still get produced from x, but the keys and values come from some other, external source (e.g. an encoder module)
- "Scaled" attention additional divides `wei` by 1/sqrt(head_size). This makes it so when input Q,K are unit variance, wei will be unit variance too and Softmax will stay diffuse and not saturate too much. Illustration below

In [None]:
k = torch.randn(B,T,head_size)
q = torch.randn(B,T,head_size)
wei = q @ k.transpose(-2, -1) * head_size**-0.5

In [None]:
k.var()

tensor(1.0449)

In [None]:
q.var()

tensor(1.0700)

In [None]:
wei.var()

tensor(1.0918)

In [None]:
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]), dim=-1)

tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])

In [None]:
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5])*8, dim=-1) # gets too peaky, converges to one-hot

tensor([0.0326, 0.0030, 0.1615, 0.0030, 0.8000])

In [None]:
class LayerNorm1d: # (used to be BatchNorm1d)

  def __init__(self, dim, eps=1e-5, momentum=0.1):
    self.eps = eps
    self.gamma = torch.ones(dim)
    self.beta = torch.zeros(dim)

  def __call__(self, x):
    # calculate the forward pass
    xmean = x.mean(1, keepdim=True) # batch mean
    xvar = x.var(1, keepdim=True) # batch variance
    xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to unit variance
    self.out = self.gamma * xhat + self.beta
    return self.out

  def parameters(self):
    return [self.gamma, self.beta]

torch.manual_seed(1337)
module = LayerNorm1d(100)
x = torch.randn(32, 100) # batch size 32 of 100-dimensional vectors
x = module(x)
x.shape

torch.Size([32, 100])

In [None]:
x[:,0].mean(), x[:,0].std() # mean,std of one feature across all batch inputs

(tensor(0.1469), tensor(0.8803))

In [None]:
x[0,:].mean(), x[0,:].std() # mean,std of a single input from the batch, of its features

(tensor(-9.5367e-09), tensor(1.0000))

In [None]:
# French to English translation example:

# <--------- ENCODE ------------------><--------------- DECODE ----------------->
# les réseaux de neurones sont géniaux! <START> neural networks are awesome!<END>



# Full finished code, for reference

You may want to refer directly to the git repo instead though.

## decode regressive model (Working)

In [None]:
dataset_loader = DatasetLoader()
text = ""
for i in range(140):
  try:
    text += " "+next(dataset_loader)
  except:
    continue
len(text)

17208942

In [None]:
# use dataset loader to load text


# how to handle this as a stream, not all at once
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y


# x,y=get_batch("train")
# decode(x[0]), " ====> ", decode(y[0])

In [None]:
x,y=get_batch("train")
decode(x[0]), " ====> ", decode(y[0])

('girl were powerful enough to defeat the lord ruler , i sincerely doubt that your brother could ever have gained her loyalty . ” zane cut another slice in his arm .',
 ' ====> ',
 'were powerful enough to defeat the lord ruler , i sincerely doubt that your brother could ever have gained her loyalty . ” zane cut another slice in his arm . he')

In [None]:
encode("Hello there")

[1095, 70]

In [None]:
vocab_size = vocab_size
batch_size = 16 * 4 #since cpu i assume it is affordable # how many independent sequences will we process in parallel?
block_size = 32 # what is the maximum context length for predictions?
max_iters = 2000
eval_interval = 100
learning_rate = 0.001
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 16
n_head = 4
n_layer = 4
dropout = 0.2

torch.manual_seed(42)

<torch._C.Generator at 0x7f762d7dc310>

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters
# batch_size = 16 * 4 #since cpu i assume it is affordable # how many independent sequences will we process in parallel?
# block_size = 32 # what is the maximum context length for predictions?
# max_iters = 1000
# eval_interval = 100
# learning_rate = 0.001
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# eval_iters = 200
# n_embd = 64
# n_head = 4
# n_layer = 4
# dropout = 0.2
# ------------
# trying other params


# wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
# with open('input.txt', 'r', encoding='utf-8') as f:
#     text = f.read()

# # here are all the unique characters that occur in this text
# chars = sorted(list(set(text)))
# vocab_size = len(chars)
# # create a mapping from characters to integers
# stoi = { ch:i for i,ch in enumerate(chars) }
# itos = { i:ch for i,ch in enumerate(chars) }
# encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
# decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string



@torch.no_grad()
def estimate_loss(model):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out


class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)

        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

# super simple bigram model
class LanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx


model = LanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss(m)
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f} val perplexity: {torch.exp(losses['val'])}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist())),


0.673472 M parameters
step 0: train loss 10.0768, val loss 10.0802 val perplexity: 23865.103515625
step 100: train loss 7.8843, val loss 7.9733 val perplexity: 2902.40283203125
step 200: train loss 6.6279, val loss 6.8023 val perplexity: 899.8760986328125
step 300: train loss 6.5689, val loss 6.7658 val perplexity: 867.625732421875
step 400: train loss 6.5287, val loss 6.7436 val perplexity: 848.6271362304688
step 500: train loss 6.4904, val loss 6.7078 val perplexity: 818.7305908203125


KeyboardInterrupt: 

In [None]:
def generate_response(text, model, max_new_tokens=32):
  encoded = encode(sentence)
  context = torch.tensor(encoded, dtype=torch.long, device=device)
  return decode(m.generate(context.reshape(1,-1), max_new_tokens=max_new_tokens)[0].tolist())

sentence = "Hello I want you to know that you are very weak"
generate_response(sentence, m )

'hello i want you to know that you are very weak zane 942 motivation take his negotiation quickly اﻟﺪواء <unk> message stood . it ) <unk> • past , <unk> the phrases with doubt , think-and-grow-rich-ebook , ) are . , 8] ,'

In [None]:
self = m
encoded = encode(sentence)
context = torch.tensor(encoded, dtype=torch.long, device=device)
idx = context.reshape(1,-1)
max_new_tokens = 1
for _ in range(max_new_tokens):
    # crop idx to the last block_size tokens
    idx_cond = idx[:, -block_size:]
    # get the predictions
    logits, loss = self(idx_cond)
    # focus only on the last time step
    print(logits.shape)
    logits = logits[:, -1, :] # becomes (B, C)
    print(logits.shape)
    # apply softmax to get probabilities
    probs = F.softmax(logits, dim=-1) # (B, C)
    print(probs.shape)
    # sample from the distribution
    idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
    print(idx_next)
    # append sampled index to the running sequence
    idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)

torch.Size([1, 7, 10000])
torch.Size([1, 10000])
torch.Size([1, 10000])
tensor([[9698]])


## encoder transformer

In [None]:
from torch import nn

class EncoderBlock(nn.Module):
    """ Transformer encoder block """

    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.self_attention = MultiHeadAttention(n_head, head_size)
        self.feedforward = FeedFoward(n_embd)
        self.layer_norm1 = nn.LayerNorm(n_embd)
        self.layer_norm2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        # Self-attention layer
        attention_output = self.self_attention(self.layer_norm1(x))

        # Residual connection and layer normalization
        x = x + attention_output
        x = self.layer_norm2(x)

        # Feedforward layer
        feedforward_output = self.feedforward(x)

        # Residual connection and layer normalization
        x = x + feedforward_output
        x = self.layer_norm2(x)

        return x

class TransformerEncoder(nn.Module):
    """ Transformer encoder """

    def __init__(self, vocab_size, n_embd, n_head, n_layer):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.encoder_blocks = nn.ModuleList([EncoderBlock(n_embd, n_head) for _ in range(n_layer)])

    def forward(self, idx):
        B, T = idx.shape

        # Token embeddings
        token_emb = self.token_embedding_table(idx)  # (B, T, C)

        # Positional embeddings
        position_emb = self.position_embedding_table(torch.arange(T, device=device))  # (T, C)
        position_emb = position_emb.unsqueeze(0).expand(B, -1, -1)  # (B, T, C)

        # Input embeddings
        x = token_emb + position_emb  # (B, T, C)

        # Transformer encoder blocks
        for encoder_block in self.encoder_blocks:
            x = encoder_block(x)

        return x


model = TransformerEncoder(vocab_size, n_embd, n_head, n_layer)
m = model.to(device)

# sample a batch of data
xb, yb = get_batch('train')

# evaluate the loss
logits = model(xb)
logits

tensor([[[ 1.2186, -0.1254,  0.0799,  ...,  1.3814, -1.3092, -1.3559],
         [ 1.7826,  0.3997,  0.9677,  ...,  0.6672, -1.2346, -0.8527],
         [ 0.9369,  0.6580, -1.7637,  ...,  1.6921,  0.2170,  0.0918],
         ...,
         [-1.5682,  0.0779,  0.2396,  ...,  2.6327, -0.1793, -0.1802],
         [ 2.2923,  0.5367, -0.2435,  ..., -0.1020,  0.9818,  0.0750],
         [ 1.4681, -0.0568, -0.0352,  ...,  0.4216, -1.4032, -0.5806]],

        [[ 1.8424, -0.6428, -0.9073,  ...,  0.6001, -0.7625, -0.7102],
         [-0.0432,  0.1730,  0.9283,  ...,  0.0390, -0.8131, -0.6382],
         [-0.1853,  0.6799,  0.5344,  ...,  0.4588,  2.1506, -0.6934],
         ...,
         [-0.3930,  1.0957,  0.9944,  ..., -0.3486,  0.3930, -0.0352],
         [-0.3188,  0.6541,  0.6773,  ..., -1.3148,  0.2284,  0.5875],
         [ 0.7275, -0.0564,  1.5199,  ...,  0.6027, -1.9517, -0.6430]],

        [[ 0.2087, -0.5900, -0.6357,  ...,  0.2519, -0.2378, -0.1660],
         [ 1.0235,  0.4860, -0.5590,  ..., -0

In [None]:
def generate_response(text, model, max_new_tokens=32):
  encoded = encode(sentence)
  context = torch.tensor(encoded, dtype=torch.long, device=device)
  return decode(m.generate(context.reshape(1,-1), max_new_tokens=max_new_tokens)[0].tolist())

sentence = "hello there, i want you to tell me what is Linux as an operating system"
generate_response(sentence, m )

'need bit <unk> that to first the . on 0 don’t on this <unk> one all has ( directory are the , simple and a interface other <unk> user . find <unk> may own runtime like the'

In [None]:
## transformer attempt

## transformer

In [None]:

# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss(model):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out


class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)

        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x


In [None]:
class TransformerLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        # Encoder
        self.token_embedding_table_enc = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table_enc = nn.Embedding(block_size, n_embd)
        self.blocks_enc = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f_enc = nn.LayerNorm(n_embd)

        # Decoder
        self.token_embedding_table_dec = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table_dec = nn.Embedding(block_size*2, n_embd)

        first_block = Block(n_embd*2, n_head=n_head)
        dec_blocks = [first_block] + [Block(n_embd, n_head=n_head) for _ in range(n_layer-1)]
        self.blocks_dec = nn.Sequential(*dec_blocks)
        self.ln_f_dec = nn.LayerNorm(n_embd)

        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idxs, targets=None):
        # Encoder
        idx_enc = idx_dec = idxs
        B, T_enc = idx_enc.shape
        tok_emb_enc = self.token_embedding_table_enc(idx_enc)
        pos_emb_enc = self.position_embedding_table_enc(torch.arange(T_enc, device=device))
        x_enc = tok_emb_enc + pos_emb_enc
        x_enc = self.blocks_enc(x_enc)
        x_enc = self.ln_f_enc(x_enc)

        # Decoder
        B, T_dec = idx_dec.shape
        tok_emb_dec = self.token_embedding_table_dec(idx_dec)
        pos_emb_dec = self.position_embedding_table_dec(torch.arange(T_dec, device=device))
        x_dec = tok_emb_dec + pos_emb_dec

        x = torch.cat((x_enc, x_dec), dim=1)

        x_dec = self.blocks_dec(x)
        x_dec = self.ln_f_dec(x_dec)

        # x = torch.cat((x_enc, x_dec), dim=1)

        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)
            print(x.shape, x_dec.shape)
            # print("targets ",targets)
            # targets_shifted = targets[:, 1:].contiguous().view(-1)
            # logits_flat = logits[:, :-1, :].contiguous().view(-1, C)
            # loss = F.cross_entropy(logits_flat, targets_shifted)

        return logits, loss

    def generate(self, idx_enc, idx_dec, max_new_tokens):
        # # Encoder
        # B, T_enc = idx_enc.shape
        # tok_emb_enc = self.token_embedding_table_enc(idx_enc)
        # pos_emb_enc = self.position_embedding_table_enc(torch.arange(T_enc, device=device))
        # x_enc = tok_emb_enc + pos_emb_enc
        # x_enc = self.blocks_enc(x_enc)
        # x_enc = self.ln_f_enc(x_enc)

        # # Decoder
        # B, T_dec = idx_dec.shape
        # tok_emb_dec = self.token_embedding_table_dec(idx_dec)
        # pos_emb_dec = self.position_embedding_table_dec(torch.arange(T_dec, device=device))
        # x_dec = tok_emb_dec + pos_emb_dec
        # x_dec = self.blocks_dec(x_dec)
        # x_dec = self.ln_f_dec(x_dec)

        for _ in range(max_new_tokens):
            logits, loss = self(idx_enc, idx_dec)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx_dec = torch.cat((idx_dec, idx_next), dim=1)

        return idx_dec



model = TransformerLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

index = 0
for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss(m)
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f} val perplexity: {torch.exp(losses['val'])}")

    # sample a batch of data
    # xb, yb = get_batch('train')

    xb, yb = batches[index]
    index += 1

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()


0.524512 M parameters


RuntimeError: Given normalized_shape=[32], expected input with shape [*, 32], but got input of size[64, 64, 16]

In [None]:
def generate_response(text, model, max_new_tokens=32):
  encoded = encode(sentence)
  context = torch.tensor(encoded, dtype=torch.long, device=device)
  return decode(m.generate(context.reshape(1,-1), max_new_tokens=max_new_tokens)[0].tolist())

sentence = "What are the three most important things to consider when deciding what technology to use to build an assist device"
generate_response(sentence, m )

NameError: name 'm_old' is not defined

In [None]:
o = decode(m.generate(context.reshape(1,-1), max_new_tokens=max_new_tokens))

o.shape

In [None]:
def generate_response(text, model, max_new_tokens=32):
  encoded = encode(sentence)
  context = torch.tensor(encoded, dtype=torch.long, device=device)
  return decode(m.generate(context.reshape(1,-1), max_new_tokens=max_new_tokens)[0].tolist())

sentence = "What are the three most important things to consider when deciding what technology to use to build an assist device"
generate_response(sentence, m)

'what are the three most important things to consider when deciding what technology to use to build an assist device know that students it is important that much it is 1980s running the icaew will be interpreted . the ozone i am the right verbs states place , and suffering from the'

In [None]:
sentence = "What are the three most important things to consider when deciding what technology to use to build an assist device"
encoded = encode(sentence)
encoded[:10]

[295, 38, 22, 769, 471, 770, 771, 31, 772, 773]

In [None]:
context = torch.tensor(encoded, dtype=torch.long, device=device)
context

tensor([295,  38,  22, 769, 471, 770, 771,  31, 772, 773, 774, 295, 775,  31,
        221,  31, 776, 554, 777, 778])

In [None]:
# context = torch.zeros(encoded, dtype=torch.long, device=device)
# print(context)
print(decode(m.generate(context.reshape(1,-1), max_new_tokens=32)[0].tolist()))

what are the three most important things to consider when deciding what technology to use to build an assist device mab aerospace mismarked holds that lstm waiting roles horses language lisandro holidays <eos><sos>consider times future s tissues . spans meal certified ali courts played audit patients curiosity saying edges stone terms right
