# Reading in a short story as text sample into Python

## Step 1: Creating Tokens

In [1]:
from signal import pthread_sigmask

import tiktoken

with open("the-verdict.txt", encoding="utf-8") as verdict:
    raw_text = verdict.read()

print(f"Total number of characters: {len(raw_text)}")
print()
print("THE VERDICT")
print("======================================================================================")
print(raw_text)

Total number of characters: 20479

THE VERDICT
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)

"The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Thwing--his last Chicago sitter--deploring his unaccountable abdication. "Of course it's going to send the value of my picture 'way up; but I don't think of that, Mr. Rickham--the loss to Arrt is all I think of." The word, on Mrs. Thwing's lips, multiplied its _rs_ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last Grafton Gallery show, stopped me before Gisburn's "Moon-dancers" to say, with tears in her eyes: "We shall not loo

In [2]:
import re

sample_text = "Hello, world. This, is a test."
result = re.findall(r"\w+|[^\w\s]", sample_text)
print(result)

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


In [3]:
result = re.findall(r"\w+|\s+|[^\w\s]", raw_text)
print(result)

['I', ' ', 'HAD', ' ', 'always', ' ', 'thought', ' ', 'Jack', ' ', 'Gisburn', ' ', 'rather', ' ', 'a', ' ', 'cheap', ' ', 'genius', '-', '-', 'though', ' ', 'a', ' ', 'good', ' ', 'fellow', ' ', 'enough', '-', '-', 'so', ' ', 'it', ' ', 'was', ' ', 'no', ' ', 'great', ' ', 'surprise', ' ', 'to', ' ', 'me', ' ', 'to', ' ', 'hear', ' ', 'that', ',', ' ', 'in', ' ', 'the', ' ', 'height', ' ', 'of', ' ', 'his', ' ', 'glory', ',', ' ', 'he', ' ', 'had', ' ', 'dropped', ' ', 'his', ' ', 'painting', ',', ' ', 'married', ' ', 'a', ' ', 'rich', ' ', 'widow', ',', ' ', 'and', ' ', 'established', ' ', 'himself', ' ', 'in', ' ', 'a', ' ', 'villa', ' ', 'on', ' ', 'the', ' ', 'Riviera', '.', ' ', '(', 'Though', ' ', 'I', ' ', 'rather', ' ', 'thought', ' ', 'it', ' ', 'would', ' ', 'have', ' ', 'been', ' ', 'Rome', ' ', 'or', ' ', 'Florence', '.', ')', '\n\n', '"', 'The', ' ', 'height', ' ', 'of', ' ', 'his', ' ', 'glory', '"', '-', '-', 'that', ' ', 'was', ' ', 'what', ' ', 'the', ' ', 'women', ' '

In [4]:
print(len(result))

8460


In [5]:
from typing import List

def custom_tokenizer(text: str, include_whitespace: bool = True) -> List[str]:
    """
    Custom tokenizer that adds whitespace if include_whitespace is True.

    Parameters:
        text (str): The text to be tokenized.
        include_whitespace (bool, optional): Whether to include whitespace in the tokenization. Defaults to True.

    Returns:
        tokens (List[str]): A list of tokens.
    """
    if not include_whitespace:
        return re.findall(r"\w+|[^\w\s]", text)
    return re.findall(r"\w+|\s+|[^\w\s]", text)

## Step 2: Creating Token IDs

In [6]:
result = custom_tokenizer(raw_text, include_whitespace=False)

In [7]:
print(len(result))

4827


In [8]:
vocabulary = sorted(set(result))
print(vocabulary)

['!', '"', "'", '(', ')', ',', '-', '.', ':', ';', '?', 'A', 'Ah', 'Among', 'And', 'Are', 'Arrt', 'As', 'At', 'Be', 'Begin', 'Burlington', 'But', 'By', 'Carlo', 'Chicago', 'Claude', 'Come', 'Croft', 'Destroyed', 'Devonshire', 'Don', 'Dubarry_', 'Emperors', 'Florence', 'For', 'Gallery', 'Gideon', 'Gisburn', 'Gisburns', 'Grafton', 'Greek', 'Grindle', 'Grindles', 'HAD', 'Had', 'Hang', 'Has', 'He', 'Her', 'Hermia', 'His', 'How', 'I', 'If', 'In', 'It', 'Jack', 'Jove', 'Just', 'Lord', 'Made', 'Miss', 'Money', 'Monte', 'Moon', 'Mr', 'Mrs', 'My', 'Never', 'No', 'Now', 'Nutley', 'Of', 'Oh', 'On', 'Once', 'Only', 'Or', 'Perhaps', 'Poor', 'Professional', 'Renaissance', 'Rickham', 'Riviera', 'Rome', 'Russian', 'Sevres', 'She', 'Stroud', 'Strouds', 'Suddenly', 'That', 'The', 'Then', 'There', 'They', 'This', 'Those', 'Though', 'Thwing', 'Thwings', 'To', 'Usually', 'Venetian', 'Victor', 'Was', 'We', 'Well', 'What', 'When', 'Why', 'Yes', 'You', '_I', '_am_', '_famille', '_felt_', '_has_', '_have_', '_

In [9]:
print(f"Vocabulary Size: {len(vocabulary)}")

Vocabulary Size: 1148


In [10]:
vocabulary_dictionary = dict(zip(vocabulary, range(len(vocabulary))))
print(vocabulary_dictionary)

{'!': 0, '"': 1, "'": 2, '(': 3, ')': 4, ',': 5, '-': 6, '.': 7, ':': 8, ';': 9, '?': 10, 'A': 11, 'Ah': 12, 'Among': 13, 'And': 14, 'Are': 15, 'Arrt': 16, 'As': 17, 'At': 18, 'Be': 19, 'Begin': 20, 'Burlington': 21, 'But': 22, 'By': 23, 'Carlo': 24, 'Chicago': 25, 'Claude': 26, 'Come': 27, 'Croft': 28, 'Destroyed': 29, 'Devonshire': 30, 'Don': 31, 'Dubarry_': 32, 'Emperors': 33, 'Florence': 34, 'For': 35, 'Gallery': 36, 'Gideon': 37, 'Gisburn': 38, 'Gisburns': 39, 'Grafton': 40, 'Greek': 41, 'Grindle': 42, 'Grindles': 43, 'HAD': 44, 'Had': 45, 'Hang': 46, 'Has': 47, 'He': 48, 'Her': 49, 'Hermia': 50, 'His': 51, 'How': 52, 'I': 53, 'If': 54, 'In': 55, 'It': 56, 'Jack': 57, 'Jove': 58, 'Just': 59, 'Lord': 60, 'Made': 61, 'Miss': 62, 'Money': 63, 'Monte': 64, 'Moon': 65, 'Mr': 66, 'Mrs': 67, 'My': 68, 'Never': 69, 'No': 70, 'Now': 71, 'Nutley': 72, 'Of': 73, 'Oh': 74, 'On': 75, 'Once': 76, 'Only': 77, 'Or': 78, 'Perhaps': 79, 'Poor': 80, 'Professional': 81, 'Renaissance': 82, 'Rickham': 

In [11]:
class SimpleTokenizer:
    """
    A simple tokenizer class that create token IDs.
    """
    def __init__(self, vocab: List[str]):
        self.str_to_int = dict(zip(vocab, range(len(vocab))))
        self.int_to_str = dict(zip(range(len(vocab)), vocab))

    @staticmethod
    def custom_tokenizer(text: str, include_whitespace: bool = True) -> List[str]:
        """
        Custom tokenizer that adds whitespace if include_whitespace is True.

        Parameters:
            text (str): The text to be tokenized.
            include_whitespace (bool, optional): Whether to include whitespace in the tokenization. Defaults to True.

        Returns:
            tokens (List[str]): A list of tokens.
        """
        if not include_whitespace:
            return re.findall(r"\w+|[^\w\s]", text)
        return re.findall(r"\w+|\s+|[^\w\s]", text)

    def encode(self, text: str) -> List[int]:
        """
        Encodes a text into a list of integers.
        Parameters:
            text (str): The text to be encoded.
        Returns:
            tokens (List[int]): A list of integers.
        """
        tokens = self.custom_tokenizer(text, include_whitespace=False)
        token_ids = [self.str_to_int[token] for token in tokens]
        return token_ids

    def decode(self, token_ids: List[int]) -> str:
        """
        Decodes a list of integers into text.
        Parameters:
            token_ids (List[int]): A list of integers.
        Returns:
            text (str): The decoded text.
        """
        tokens = [self.int_to_str[ID] for ID in token_ids]
        text = " ".join(tokens)
        # Replace spaces before the specified punctuations
        text = re.sub(r"\s+([,.?\"()'])", r"\1", text)
        return text

In [12]:
tokenizer = SimpleTokenizer(vocabulary)

In [13]:
text = "It's the last he painted, you know, Mrs. Gisburn said with pardonable pride"

ids = tokenizer.encode(text)
print(ids)

[56, 2, 867, 1006, 615, 546, 760, 5, 1144, 609, 5, 67, 7, 38, 868, 1126, 769, 810]


In [14]:
tokenizer.decode(ids)

"It' s the last he painted, you know, Mrs. Gisburn said with pardonable pride"

In [15]:
text = """
I am currently working as an Assistant AI Engineer at Global InfoVentures Pvt. Ltd., where I am developing and integrating AI-driven solutions such as Conversational AI for the College ERP and Multi-Person Face Recognition Attendance Systems into the G5 Portal. With a Bachelor of Technology in Computer Science and Engineering from ABES Institute of Technology, Ghaziabad, I bring 2 years of experience in the dynamic field of artificial intelligence.

My passion lies in leveraging AI technologies to solve real-world problems, particularly in the areas of Conversational AI, Computer Vision, and Machine Learning. I am committed to continuous learning and staying at the forefront of emerging technologies. My current focus is on deepening my expertise in AI and exploring its intersection with other cutting-edge domains like Web 3.0 and blockchain.

I believe in the power of collaboration and am always eager to connect with fellow AI enthusiasts, seasoned professionals, and innovators who share my passion for technology. Together, we can explore new opportunities, share knowledge, and drive meaningful change in the tech industry.

If you're passionate about AI and are looking to collaborate, innovate, or simply share ideas, I would love to connect. Let's work together to shape the future and create impactful solutions.
"""

In [16]:
# tokenizer.encode(text)

## ADDING SPECIAL CONTEXT TOKENS

In [17]:
vocabulary.extend(["<|unknown|>", "<|endoftext|>"])
print(vocabulary)

['!', '"', "'", '(', ')', ',', '-', '.', ':', ';', '?', 'A', 'Ah', 'Among', 'And', 'Are', 'Arrt', 'As', 'At', 'Be', 'Begin', 'Burlington', 'But', 'By', 'Carlo', 'Chicago', 'Claude', 'Come', 'Croft', 'Destroyed', 'Devonshire', 'Don', 'Dubarry_', 'Emperors', 'Florence', 'For', 'Gallery', 'Gideon', 'Gisburn', 'Gisburns', 'Grafton', 'Greek', 'Grindle', 'Grindles', 'HAD', 'Had', 'Hang', 'Has', 'He', 'Her', 'Hermia', 'His', 'How', 'I', 'If', 'In', 'It', 'Jack', 'Jove', 'Just', 'Lord', 'Made', 'Miss', 'Money', 'Monte', 'Moon', 'Mr', 'Mrs', 'My', 'Never', 'No', 'Now', 'Nutley', 'Of', 'Oh', 'On', 'Once', 'Only', 'Or', 'Perhaps', 'Poor', 'Professional', 'Renaissance', 'Rickham', 'Riviera', 'Rome', 'Russian', 'Sevres', 'She', 'Stroud', 'Strouds', 'Suddenly', 'That', 'The', 'Then', 'There', 'They', 'This', 'Those', 'Though', 'Thwing', 'Thwings', 'To', 'Usually', 'Venetian', 'Victor', 'Was', 'We', 'Well', 'What', 'When', 'Why', 'Yes', 'You', '_I', '_am_', '_famille', '_felt_', '_has_', '_have_', '_

In [18]:
class SimpleTokenizerV2:
    """
    A simple tokenizer class that create token IDs.
    """
    def __init__(self, vocab: List[str]):
        self.str_to_int = dict(zip(vocab, range(len(vocab))))
        self.int_to_str = dict(zip(range(len(vocab)), vocab))

    @staticmethod
    def custom_tokenizer(text: str, include_whitespace: bool = True) -> List[str]:
        """
        Custom tokenizer that adds whitespace if include_whitespace is True.

        Parameters:
            text (str): The text to be tokenized.
            include_whitespace (bool, optional): Whether to include whitespace in the tokenization. Defaults to True.

        Returns:
            tokens (List[str]): A list of tokens.
        """
        if not include_whitespace:
            return re.findall(r"\w+|[^\w\s]", text)
        return re.findall(r"\w+|\s+|[^\w\s]", text)

    def encode(self, text: str) -> List[int]:
        """
        Encodes a text into a list of integers.
        Parameters:
            text (str): The text to be encoded.
        Returns:
            tokens (List[int]): A list of integers.
        """
        tokens = self.custom_tokenizer(text, include_whitespace=False)
        tokens = [
            item if item in self.str_to_int
            else "<|unknown|>" for item in tokens
        ]
        token_ids = [self.str_to_int[token] for token in tokens]
        return token_ids

    def decode(self, token_ids: List[int]) -> str:
        """
        Decodes a list of integers into text.
        Parameters:
            token_ids (List[int]): A list of integers.
        Returns:
            text (str): The decoded text.
        """
        tokens = [self.int_to_str[ID] for ID in token_ids]
        text = " ".join(tokens)
        # Replace spaces before the specified punctuations
        text = re.sub(r"\s+([,.?\"()'])", r"\1", text)
        return text

In [19]:
tokenizer = SimpleTokenizerV2(vocabulary)

In [20]:
print(tokenizer.str_to_int)

{'!': 0, '"': 1, "'": 2, '(': 3, ')': 4, ',': 5, '-': 6, '.': 7, ':': 8, ';': 9, '?': 10, 'A': 11, 'Ah': 12, 'Among': 13, 'And': 14, 'Are': 15, 'Arrt': 16, 'As': 17, 'At': 18, 'Be': 19, 'Begin': 20, 'Burlington': 21, 'But': 22, 'By': 23, 'Carlo': 24, 'Chicago': 25, 'Claude': 26, 'Come': 27, 'Croft': 28, 'Destroyed': 29, 'Devonshire': 30, 'Don': 31, 'Dubarry_': 32, 'Emperors': 33, 'Florence': 34, 'For': 35, 'Gallery': 36, 'Gideon': 37, 'Gisburn': 38, 'Gisburns': 39, 'Grafton': 40, 'Greek': 41, 'Grindle': 42, 'Grindles': 43, 'HAD': 44, 'Had': 45, 'Hang': 46, 'Has': 47, 'He': 48, 'Her': 49, 'Hermia': 50, 'His': 51, 'How': 52, 'I': 53, 'If': 54, 'In': 55, 'It': 56, 'Jack': 57, 'Jove': 58, 'Just': 59, 'Lord': 60, 'Made': 61, 'Miss': 62, 'Money': 63, 'Monte': 64, 'Moon': 65, 'Mr': 66, 'Mrs': 67, 'My': 68, 'Never': 69, 'No': 70, 'Now': 71, 'Nutley': 72, 'Of': 73, 'Oh': 74, 'On': 75, 'Once': 76, 'Only': 77, 'Or': 78, 'Perhaps': 79, 'Poor': 80, 'Professional': 81, 'Renaissance': 82, 'Rickham': 

In [21]:
print(tokenizer.int_to_str)

{0: '!', 1: '"', 2: "'", 3: '(', 4: ')', 5: ',', 6: '-', 7: '.', 8: ':', 9: ';', 10: '?', 11: 'A', 12: 'Ah', 13: 'Among', 14: 'And', 15: 'Are', 16: 'Arrt', 17: 'As', 18: 'At', 19: 'Be', 20: 'Begin', 21: 'Burlington', 22: 'But', 23: 'By', 24: 'Carlo', 25: 'Chicago', 26: 'Claude', 27: 'Come', 28: 'Croft', 29: 'Destroyed', 30: 'Devonshire', 31: 'Don', 32: 'Dubarry_', 33: 'Emperors', 34: 'Florence', 35: 'For', 36: 'Gallery', 37: 'Gideon', 38: 'Gisburn', 39: 'Gisburns', 40: 'Grafton', 41: 'Greek', 42: 'Grindle', 43: 'Grindles', 44: 'HAD', 45: 'Had', 46: 'Hang', 47: 'Has', 48: 'He', 49: 'Her', 50: 'Hermia', 51: 'His', 52: 'How', 53: 'I', 54: 'If', 55: 'In', 56: 'It', 57: 'Jack', 58: 'Jove', 59: 'Just', 60: 'Lord', 61: 'Made', 62: 'Miss', 63: 'Money', 64: 'Monte', 65: 'Moon', 66: 'Mr', 67: 'Mrs', 68: 'My', 69: 'Never', 70: 'No', 71: 'Now', 72: 'Nutley', 73: 'Of', 74: 'Oh', 75: 'On', 76: 'Once', 77: 'Only', 78: 'Or', 79: 'Perhaps', 80: 'Poor', 81: 'Professional', 82: 'Renaissance', 83: 'Rickha

In [22]:
text1 = "It's the last he painted, you know, Mrs. Gisburn said with pardonable pride."

ids = tokenizer.encode(text1)
print(ids)

[56, 2, 867, 1006, 615, 546, 760, 5, 1144, 609, 5, 67, 7, 38, 868, 1126, 769, 810, 7]


In [23]:
tokenizer.decode(ids)

"It' s the last he painted, you know, Mrs. Gisburn said with pardonable pride."

In [24]:
text2 = """
I am currently working as an Assistant AI Engineer at Global InfoVentures Pvt. Ltd., where I am developing and integrating AI-driven solutions such as Conversational AI for the College ERP and Multi-Person Face Recognition Attendance Systems into the G5 Portal. With a Bachelor of Technology in Computer Science and Engineering from ABES Institute of Technology, Ghaziabad, I bring 2 years of experience in the dynamic field of artificial intelligence.

My passion lies in leveraging AI technologies to solve real-world problems, particularly in the areas of Conversational AI, Computer Vision, and Machine Learning. I am committed to continuous learning and staying at the forefront of emerging technologies. My current focus is on deepening my expertise in AI and exploring its intersection with other cutting-edge domains like Web 3.0 and blockchain.

I believe in the power of collaboration and am always eager to connect with fellow AI enthusiasts, seasoned professionals, and innovators who share my passion for technology. Together, we can explore new opportunities, share knowledge, and drive meaningful change in the tech industry.

If you're passionate about AI and are looking to collaborate, innovate, or simply share ideas, I would love to connect. Let's work together to shape the future and create impactful solutions.
"""

In [25]:
ids = tokenizer.encode(text2)
print(ids)

[53, 1148, 1148, 1136, 188, 169, 1148, 1148, 1148, 191, 1148, 1148, 1148, 7, 1148, 7, 5, 1111, 53, 1148, 1148, 170, 1148, 1148, 6, 1148, 1148, 967, 188, 1148, 1148, 473, 1006, 1148, 1148, 170, 1148, 6, 1148, 1148, 1148, 1148, 1148, 593, 1006, 1148, 1148, 7, 1148, 129, 1148, 736, 1148, 582, 1148, 1148, 170, 1148, 495, 1148, 1148, 736, 1148, 5, 1148, 5, 53, 1148, 1148, 1141, 736, 1148, 582, 1006, 1148, 1148, 736, 1148, 1148, 7, 68, 1148, 634, 582, 1148, 1148, 1148, 1034, 1148, 837, 6, 1148, 1148, 5, 1148, 582, 1006, 1148, 736, 1148, 1148, 5, 1148, 1148, 5, 170, 1148, 1148, 7, 53, 1148, 1148, 1034, 1148, 1148, 170, 1148, 191, 1006, 1148, 736, 1148, 1148, 7, 68, 326, 1148, 598, 741, 1148, 711, 1148, 582, 1148, 170, 1148, 600, 1148, 1126, 749, 1148, 6, 1148, 1148, 640, 1148, 1148, 7, 1148, 170, 1148, 7, 53, 1148, 582, 1006, 1148, 736, 1148, 170, 1148, 163, 1148, 1034, 1148, 1126, 452, 1148, 1148, 5, 1148, 1148, 5, 170, 1148, 1115, 1148, 711, 1148, 473, 1148, 7, 1148, 5, 1148, 257, 1148, 114

In [26]:
tokenizer.decode(ids)

"I <|unknown|> <|unknown|> working as an <|unknown|> <|unknown|> <|unknown|> at <|unknown|> <|unknown|> <|unknown|>. <|unknown|>., where I <|unknown|> <|unknown|> and <|unknown|> <|unknown|> - <|unknown|> <|unknown|> such as <|unknown|> <|unknown|> for the <|unknown|> <|unknown|> and <|unknown|> - <|unknown|> <|unknown|> <|unknown|> <|unknown|> <|unknown|> into the <|unknown|> <|unknown|>. <|unknown|> a <|unknown|> of <|unknown|> in <|unknown|> <|unknown|> and <|unknown|> from <|unknown|> <|unknown|> of <|unknown|>, <|unknown|>, I <|unknown|> <|unknown|> years of <|unknown|> in the <|unknown|> <|unknown|> of <|unknown|> <|unknown|>. My <|unknown|> lies in <|unknown|> <|unknown|> <|unknown|> to <|unknown|> real - <|unknown|> <|unknown|>, <|unknown|> in the <|unknown|> of <|unknown|> <|unknown|>, <|unknown|> <|unknown|>, and <|unknown|> <|unknown|>. I <|unknown|> <|unknown|> to <|unknown|> <|unknown|> and <|unknown|> at the <|unknown|> of <|unknown|> <|unknown|>. My current <|unknown|> i

In [27]:
text = " <|endoftext|> ".join((text1, text2))

print(text)

It's the last he painted, you know, Mrs. Gisburn said with pardonable pride. <|endoftext|> 
I am currently working as an Assistant AI Engineer at Global InfoVentures Pvt. Ltd., where I am developing and integrating AI-driven solutions such as Conversational AI for the College ERP and Multi-Person Face Recognition Attendance Systems into the G5 Portal. With a Bachelor of Technology in Computer Science and Engineering from ABES Institute of Technology, Ghaziabad, I bring 2 years of experience in the dynamic field of artificial intelligence.

My passion lies in leveraging AI technologies to solve real-world problems, particularly in the areas of Conversational AI, Computer Vision, and Machine Learning. I am committed to continuous learning and staying at the forefront of emerging technologies. My current focus is on deepening my expertise in AI and exploring its intersection with other cutting-edge domains like Web 3.0 and blockchain.

I believe in the power of collaboration and am always

In [28]:
print(tokenizer.encode(text))

[56, 2, 867, 1006, 615, 546, 760, 5, 1144, 609, 5, 67, 7, 38, 868, 1126, 769, 810, 7, 1148, 1148, 1148, 1148, 1148, 53, 1148, 1148, 1136, 188, 169, 1148, 1148, 1148, 191, 1148, 1148, 1148, 7, 1148, 7, 5, 1111, 53, 1148, 1148, 170, 1148, 1148, 6, 1148, 1148, 967, 188, 1148, 1148, 473, 1006, 1148, 1148, 170, 1148, 6, 1148, 1148, 1148, 1148, 1148, 593, 1006, 1148, 1148, 7, 1148, 129, 1148, 736, 1148, 582, 1148, 1148, 170, 1148, 495, 1148, 1148, 736, 1148, 5, 1148, 5, 53, 1148, 1148, 1141, 736, 1148, 582, 1006, 1148, 1148, 736, 1148, 1148, 7, 68, 1148, 634, 582, 1148, 1148, 1148, 1034, 1148, 837, 6, 1148, 1148, 5, 1148, 582, 1006, 1148, 736, 1148, 1148, 5, 1148, 1148, 5, 170, 1148, 1148, 7, 53, 1148, 1148, 1034, 1148, 1148, 170, 1148, 191, 1006, 1148, 736, 1148, 1148, 7, 68, 326, 1148, 598, 741, 1148, 711, 1148, 582, 1148, 170, 1148, 600, 1148, 1126, 749, 1148, 6, 1148, 1148, 640, 1148, 1148, 7, 1148, 170, 1148, 7, 53, 1148, 582, 1006, 1148, 736, 1148, 170, 1148, 163, 1148, 1034, 1148, 112

In [29]:
print(tokenizer.decode(tokenizer.encode(text)))

It' s the last he painted, you know, Mrs. Gisburn said with pardonable pride. <|unknown|> <|unknown|> <|unknown|> <|unknown|> <|unknown|> I <|unknown|> <|unknown|> working as an <|unknown|> <|unknown|> <|unknown|> at <|unknown|> <|unknown|> <|unknown|>. <|unknown|>., where I <|unknown|> <|unknown|> and <|unknown|> <|unknown|> - <|unknown|> <|unknown|> such as <|unknown|> <|unknown|> for the <|unknown|> <|unknown|> and <|unknown|> - <|unknown|> <|unknown|> <|unknown|> <|unknown|> <|unknown|> into the <|unknown|> <|unknown|>. <|unknown|> a <|unknown|> of <|unknown|> in <|unknown|> <|unknown|> and <|unknown|> from <|unknown|> <|unknown|> of <|unknown|>, <|unknown|>, I <|unknown|> <|unknown|> years of <|unknown|> in the <|unknown|> <|unknown|> of <|unknown|> <|unknown|>. My <|unknown|> lies in <|unknown|> <|unknown|> <|unknown|> to <|unknown|> real - <|unknown|> <|unknown|>, <|unknown|> in the <|unknown|> of <|unknown|> <|unknown|>, <|unknown|> <|unknown|>, and <|unknown|> <|unknown|>. I <

## BYTE PAIR ENCODING

In [30]:
!python3 -m pip install tiktoken

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [31]:
import importlib
import tiktoken
importlib.reload(tiktoken)

<module 'tiktoken' from '/home/hamza/SelfLearningProjects/Building-LLM-From-Scratch/.venv/lib/python3.10/site-packages/tiktoken/__init__.py'>

In [32]:
tokenizer = tiktoken.get_encoding("gpt2")

In [33]:
text = "Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace."

integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 286, 617, 34680, 27271, 13]


In [34]:
strings = tokenizer.decode(integers)

In [35]:
print(strings)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace.


In [36]:
integers = tokenizer.encode("Akwirw ier")
print(integers)

strings = tokenizer.decode(integers)
print(strings)

[33901, 86, 343, 86, 220, 959]
Akwirw ier


## CREATE INPUT-TARGET PAIRS

In [37]:
with open("the-verdict.txt", encoding="utf-8") as verdict:
    raw_text = verdict.read()

print(f"Total number of characters: {len(raw_text)}")
print()
print("THE VERDICT")
print("======================================================================================")
print(raw_text)

Total number of characters: 20479

THE VERDICT
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)

"The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Thwing--his last Chicago sitter--deploring his unaccountable abdication. "Of course it's going to send the value of my picture 'way up; but I don't think of that, Mr. Rickham--the loss to Arrt is all I think of." The word, on Mrs. Thwing's lips, multiplied its _rs_ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last Grafton Gallery show, stopped me before Gisburn's "Moon-dancers" to say, with tears in her eyes: "We shall not loo

In [38]:
enc_text = tokenizer.encode(raw_text)
print(enc_text)
print(len(enc_text))

[40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899, 2138, 257, 7026, 15632, 438, 2016, 257, 922, 5891, 1576, 438, 568, 340, 373, 645, 1049, 5975, 284, 502, 284, 3285, 326, 11, 287, 262, 6001, 286, 465, 13476, 11, 339, 550, 5710, 465, 12036, 11, 6405, 257, 5527, 27075, 11, 290, 4920, 2241, 287, 257, 4489, 64, 319, 262, 34686, 41976, 13, 357, 10915, 314, 2138, 1807, 340, 561, 423, 587, 10598, 393, 28537, 2014, 198, 198, 1, 464, 6001, 286, 465, 13476, 1, 438, 5562, 373, 644, 262, 1466, 1444, 340, 13, 314, 460, 3285, 9074, 13, 46606, 536, 5469, 438, 14363, 938, 4842, 1650, 353, 438, 2934, 489, 3255, 465, 48422, 540, 450, 67, 3299, 13, 366, 5189, 1781, 340, 338, 1016, 284, 3758, 262, 1988, 286, 616, 4286, 705, 1014, 510, 26, 475, 314, 836, 470, 892, 286, 326, 11, 1770, 13, 8759, 2763, 438, 1169, 2994, 284, 943, 17034, 318, 477, 314, 892, 286, 526, 383, 1573, 11, 319, 9074, 13, 536, 5469, 338, 11914, 11, 33096, 663, 4808, 3808, 62, 355, 996, 484, 547, 12548, 287, 281, 13079, 410, 12523, 286, 

In [39]:
enc_sample = enc_text[50:]

In [40]:
context_size = 8

x = enc_sample[:context_size]
y = enc_sample[1:context_size+ 1]

print(f"x: {x}")
print(f"y:      {y}")

x: [290, 4920, 2241, 287, 257, 4489, 64, 319]
y:      [4920, 2241, 287, 257, 4489, 64, 319, 262]


In [41]:
for i in range(1, context_size + 1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    print(f"{context} -> {desired}")

[290] -> 4920
[290, 4920] -> 2241
[290, 4920, 2241] -> 287
[290, 4920, 2241, 287] -> 257
[290, 4920, 2241, 287, 257] -> 4489
[290, 4920, 2241, 287, 257, 4489] -> 64
[290, 4920, 2241, 287, 257, 4489, 64] -> 319
[290, 4920, 2241, 287, 257, 4489, 64, 319] -> 262


In [42]:
for i in range(1, context_size + 1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    print(f"{tokenizer.decode(context)} -> {tokenizer.decode([desired])}")

 and ->  established
 and established ->  himself
 and established himself ->  in
 and established himself in ->  a
 and established himself in a ->  vill
 and established himself in a vill -> a
 and established himself in a villa ->  on
 and established himself in a villa on ->  the


## IMPLEMENTING DATALOADER

In [43]:
import torch
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i+max_length]
            target_chunk = token_ids[i+1:i+max_length+1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

In [44]:
def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=False, drop_last=True, num_workers=0):
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader

In [45]:
with open("the-verdict.txt", encoding="utf-8") as verdict:
    raw_text = verdict.read()

In [75]:
dataloader = create_dataloader_v1(raw_text, batch_size=1, max_length=4, stride=4)

data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [76]:
second_batch = next(data_iter)
print(second_batch)

[tensor([[1807, 3619,  402,  271]]), tensor([[ 3619,   402,   271, 10899]])]


In [77]:
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=4)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)

print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


# WORD2VEC EMBEDDINGS

In [49]:
import gensim.downloader as api
model = api.load("word2vec-google-news-300")

[=-------------------------------------------------] 3.2% 53.4/1662.8MB downloaded

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[==------------------------------------------------] 5.6% 93.4/1662.8MB downloaded

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[===-----------------------------------------------] 6.7% 111.9/1662.8MB downloaded

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[====----------------------------------------------] 8.0% 133.4/1662.8MB downloaded

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[=====---------------------------------------------] 10.4% 173.4/1662.8MB downloaded

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





In [50]:
word_vectors = model

print(word_vectors["computer"])

[ 1.07421875e-01 -2.01171875e-01  1.23046875e-01  2.11914062e-01
 -9.13085938e-02  2.16796875e-01 -1.31835938e-01  8.30078125e-02
  2.02148438e-01  4.78515625e-02  3.66210938e-02 -2.45361328e-02
  2.39257812e-02 -1.60156250e-01 -2.61230469e-02  9.71679688e-02
 -6.34765625e-02  1.84570312e-01  1.70898438e-01 -1.63085938e-01
 -1.09375000e-01  1.49414062e-01 -4.65393066e-04  9.61914062e-02
  1.68945312e-01  2.60925293e-03  8.93554688e-02  6.49414062e-02
  3.56445312e-02 -6.93359375e-02 -1.46484375e-01 -1.21093750e-01
 -2.27539062e-01  2.45361328e-02 -1.24511719e-01 -3.18359375e-01
 -2.20703125e-01  1.30859375e-01  3.66210938e-02 -3.63769531e-02
 -1.13281250e-01  1.95312500e-01  9.76562500e-02  1.26953125e-01
  6.59179688e-02  6.93359375e-02  1.02539062e-02  1.75781250e-01
 -1.68945312e-01  1.21307373e-03 -2.98828125e-01 -1.15234375e-01
  5.66406250e-02 -1.77734375e-01 -2.08984375e-01  1.76757812e-01
  2.38037109e-02 -2.57812500e-01 -4.46777344e-02  1.88476562e-01
  5.51757812e-02  5.02929

In [51]:
print(word_vectors["computer"].shape)

(300,)


## King + Woman - Man = ?

In [52]:
print(word_vectors.most_similar(positive=["king", "woman"], negative=["man"], topn=10))

[('queen', 0.7118191123008728), ('monarch', 0.6189674735069275), ('princess', 0.5902430415153503), ('crown_prince', 0.5499458909034729), ('prince', 0.5377322435379028), ('kings', 0.5236843824386597), ('Queen_Consort', 0.5235945582389832), ('queens', 0.5181134939193726), ('sultan', 0.5098593235015869), ('monarchy', 0.5087411403656006)]


In [58]:
print(word_vectors.similarity("man", "woman"))
print(word_vectors.similarity("uncle", "aunt"))
print(word_vectors.similarity("king", "queen"))
print(word_vectors.similarity("paper", "water"))
print(word_vectors.similarity("love", "lust"))
print(word_vectors.similarity("king", "prince"))

0.7664013
0.76434743
0.6510956
0.11408083
0.49054837
0.61599934


In [66]:
vocab_size = 50257
output_dim = 256

embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [67]:
print(embedding_layer)

Embedding(50257, 256)


In [68]:
print(embedding_layer.weight)

Parameter containing:
tensor([[-0.4929, -0.6450, -1.0246,  ..., -1.7757,  0.6923,  1.0577],
        [ 0.5537, -1.2345,  1.2049,  ...,  1.4926, -1.1857,  0.7473],
        [ 1.8012,  1.6329, -0.0124,  ..., -0.2303, -0.5655,  1.0179],
        ...,
        [-1.1298,  2.3401, -0.6434,  ...,  1.4289,  1.3869, -1.6856],
        [ 0.6242,  1.2071, -0.4798,  ..., -0.4190,  0.0043, -0.1600],
        [-1.7224, -1.9753,  0.9367,  ..., -0.4196,  2.1439,  0.3224]],
       requires_grad=True)


In [69]:
print(embedding_layer.weight.shape)

torch.Size([50257, 256])


In [70]:
print(embedding_layer(torch.tensor([3])))

tensor([[-1.9122e+00, -1.1795e+00,  2.0013e+00,  9.5173e-01,  8.5986e-01,
         -2.6641e-01, -1.6078e+00,  1.6468e+00,  1.3746e+00, -6.9106e-01,
         -7.5227e-01,  2.8173e-01,  8.7650e-01, -1.4891e+00, -2.4414e+00,
         -5.6740e-01,  3.4028e-01,  1.1848e+00,  2.0492e-01, -1.9629e-01,
          1.6003e-01,  4.6897e-01,  7.1532e-01, -8.1305e-01,  3.6588e-02,
         -7.8822e-01,  3.5511e-01,  1.6221e+00,  1.9618e-01,  2.3492e+00,
         -4.6375e-01,  1.5493e-01,  1.7543e+00,  1.0009e-01,  2.9186e-01,
         -1.6354e-01,  7.1212e-01, -1.6300e-01, -8.1122e-01,  7.0495e-01,
         -1.3796e+00,  2.7165e+00, -1.4820e+00, -2.1050e+00,  1.1250e+00,
          7.2963e-01,  6.7500e-01,  1.2340e+00,  5.2935e-01,  1.9421e+00,
          4.2064e-01, -3.7821e-01, -3.4056e-02, -4.0355e-01, -1.2833e+00,
         -9.9081e-01,  5.0731e-01,  3.1345e-01, -6.5149e-01, -5.1123e-01,
         -9.1441e-01,  1.4891e+00,  2.5822e-01,  4.2337e-01, -5.4219e-01,
         -9.3368e-01, -1.6910e+00,  1.

In [71]:
print(raw_text)

I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)

"The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Thwing--his last Chicago sitter--deploring his unaccountable abdication. "Of course it's going to send the value of my picture 'way up; but I don't think of that, Mr. Rickham--the loss to Arrt is all I think of." The word, on Mrs. Thwing's lips, multiplied its _rs_ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last Grafton Gallery show, stopped me before Gisburn's "Moon-dancers" to say, with tears in her eyes: "We shall not look upon its like again"?

Well!--even through th

In [78]:
max_length = 4
batch_size = 8
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=batch_size,
    max_length=max_length,
    stride=max_length
)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)

print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


In [86]:
print(inputs.shape)

torch.Size([8, 4])


In [87]:
token_embeddings = embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [88]:
print(token_embeddings)

tensor([[[ 0.9601, -1.6877, -1.7196,  ...,  0.3569,  1.9970, -0.7171],
         [-0.4689, -1.8422, -0.1349,  ..., -0.9923, -2.9433, -1.8708],
         [-0.0756,  1.7488, -1.1992,  ..., -0.3497,  1.0420, -1.9209],
         [ 0.5741,  2.9795, -1.6599,  ...,  0.6334,  0.3045,  0.1579]],

        [[ 0.2489, -0.0203,  2.2110,  ..., -0.4581,  1.2608,  0.5329],
         [ 0.3303,  1.5201, -0.1826,  ..., -0.2602, -0.7301,  1.8632],
         [-0.1387, -2.0924, -0.1258,  ...,  1.4043, -0.8699, -1.0886],
         [ 0.7005,  1.4989,  2.0600,  ...,  0.6639, -1.0436,  0.1660]],

        [[-0.0587, -0.3765,  0.2245,  ...,  0.7806, -2.7868, -1.8325],
         [ 0.2802,  1.2549, -1.2488,  ...,  1.5823, -1.9168, -0.2692],
         [ 0.7033,  0.7588, -0.3033,  ..., -0.9565, -0.3696,  0.3556],
         [ 0.0956, -0.6824,  0.0070,  ..., -1.1174, -0.2377, -0.8604]],

        ...,

        [[ 1.4084, -0.7480,  0.6181,  ..., -1.8359,  0.0869, -1.2618],
         [-0.0918,  1.1863,  1.7509,  ...,  0.7536, -0.04

In [98]:
context_length = max_length
position_embeddings = torch.nn.Embedding(context_length, output_dim)
print(position_embeddings.weight)

Parameter containing:
tensor([[ 0.5844,  0.5423, -0.3747,  ...,  0.4857, -0.3348, -0.5082],
        [ 0.1089, -0.2572, -0.6037,  ..., -0.2221, -0.2554, -0.9325],
        [-0.7410,  0.1916,  0.8543,  ..., -0.1104,  0.1787,  1.3902],
        [-0.6391, -0.1373, -1.6579,  ..., -0.4924,  1.0987,  0.6292]],
       requires_grad=True)


In [99]:
position_embeddings = position_embeddings(torch.arange(max_length))
print(position_embeddings.shape)

torch.Size([4, 256])


In [100]:
input_embeddings = token_embeddings + position_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])


In [101]:
print(input_embeddings)

tensor([[[ 1.5445, -1.1454, -2.0943,  ...,  0.8426,  1.6622, -1.2253],
         [-0.3600, -2.0993, -0.7386,  ..., -1.2145, -3.1988, -2.8032],
         [-0.8165,  1.9404, -0.3449,  ..., -0.4601,  1.2208, -0.5307],
         [-0.0650,  2.8422, -3.3178,  ...,  0.1410,  1.4032,  0.7872]],

        [[ 0.8333,  0.5220,  1.8363,  ...,  0.0275,  0.9259,  0.0247],
         [ 0.4392,  1.2630, -0.7863,  ..., -0.4823, -0.9855,  0.9307],
         [-0.8797, -1.9007,  0.7286,  ...,  1.2939, -0.6911,  0.3016],
         [ 0.0614,  1.3615,  0.4021,  ...,  0.1715,  0.0550,  0.7952]],

        [[ 0.5256,  0.1658, -0.1502,  ...,  1.2663, -3.1216, -2.3406],
         [ 0.3892,  0.9978, -1.8525,  ...,  1.3601, -2.1722, -1.2017],
         [-0.0377,  0.9505,  0.5510,  ..., -1.0670, -0.1909,  1.7458],
         [-0.5434, -0.8197, -1.6509,  ..., -1.6099,  0.8610, -0.2312]],

        ...,

        [[ 1.9928, -0.2057,  0.2434,  ..., -1.3502, -0.2480, -1.7700],
         [ 0.0172,  0.9291,  1.1472,  ...,  0.5314, -0.29