In [3]:
import os
import sys
import torch
import json
from pathlib import Path
from typing import List
from pprint import pprint
sys.path.append(str(Path(os.getcwd()).parent))

from huggingface_hub import notebook_login
from llama_index.core import Settings
from llama_index.llms.groq import Groq
from llama_index.llms.nvidia import NVIDIA
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core.callbacks import CallbackManager, TokenCountingHandler
from llama_index.core.program import LLMTextCompletionProgram
from pydantic import BaseModel, parse_obj_as
from transformers import AutoTokenizer, AutoModelForCausalLM
from accelerate import disk_offload

from data.transcript import get_video_transcript








In [4]:
GROQ_API_KEY = os.environ['GROQ_API_KEY']
NVIDIA_API_KEY = os.environ['NVIDIA_API_KEY']

In [5]:
llm = NVIDIA(model="meta/llama-3.1-8b-instruct", api_key=NVIDIA_API_KEY)
llm = Groq(model="llama-3.1-8b-instant")

In [6]:
transcript = get_video_transcript(url="https://www.youtube.com/watch?v=zduSFxRajkE")

In [7]:
text = ""
for item in transcript:
    text += item['text']

In [8]:
text[:500]

"hi everyone so in this video I'd like usto cover the process of tokenization inlarge language models now you see herethat I have a set face and that'sbecause uh tokenization is my leastfavorite part of working with largelanguage models but unfortunately it isnecessary to understand in some detailbecause it it is fairly hairy gnarly andthere's a lot of hidden foot guns to beaware of and a lot of oddness with largelanguage models typically traces back totokenization so what istokenization now in m"

In [9]:
class Topic(BaseModel):
    name: str
    description: str

In [10]:
tokenizer = AutoTokenizer.from_pretrained("nvidia/Llama3-ChatQA-2-70B")
token_counter = TokenCountingHandler(
    tokenizer=tokenizer.encode
)

In [11]:
# sllm = llm.as_structured_llm(output_cls=Topic)

In [12]:
topic_extractor_prompt = f"""
Given the transcript of a YouTube video tutorial:

{text[:-100000]}

Understand what this video is about and identify the main topics.
Include as much description as possible for each topic.
Make sure topics are not ambiguous and have meaningful segregation of content.

Format your response as a JSON array of objects, where each object has two fields:
- "name": The name of the topic (a short, concise title)
- "description": A detailed description of the topic

Example format:
[
  {{
    "name": "Topic 1",
    "description": "Detailed description of Topic 1..."
  }},
  {{
    "name": "Topic 2",
    "description": "Detailed description of Topic 2..."
  }}
]

Remove the preamble and ensure your output is valid JSON. 
"""

In [13]:
response = llm.complete(topic_extractor_prompt)

In [14]:
json.loads(response.text)

[{'name': 'Introduction to Tokenization',
  'description': "The host introduces the topic of tokenization in large language models, explaining that it's a crucial but complex process. They mention that tokenization is their least favorite part of working with large language models, but it's necessary to understand in detail. The host also mentions that many issues with large language models can be traced back to tokenization."},
 {'name': 'Naive Tokenization',
  'description': "The host explains that in their previous video, they implemented a naive and simple version of tokenization. They used a vocabulary of 65 possible characters and created a lookup table to convert characters into tokens. The host shows an example of how this process works and how it's used to plug text into large language models."},
 {'name': 'Character-Level Tokenization vs. Chunk-Level Tokenization',
  'description': 'The host explains that in state-of-the-art language models, people use more complicated scheme

In [15]:
try:
    json_response = json.loads(response.text)
    topics = parse_obj_as(List[Topic], json_response)
    topics_dict = {topic.name: topic.description for topic in topics}
except json.JSONDecodeError:
    print("Error: The LLM output was not valid JSON.")
except ValueError:
    print("Error: The LLM output did not match the expected format.")
    
topics = ', '.join(list(topics_dict.keys()))
topics_out = f"""I have identified the following topics from the video: {topics}"""

C:\Users\Hashir\AppData\Local\Temp\ipykernel_14300\482209594.py:3: PydanticDeprecatedSince20: `parse_obj_as` is deprecated. Use `pydantic.TypeAdapter.validate_python` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.9/migration/
  topics = parse_obj_as(List[Topic], json_response)


In [16]:
topics_out

'I have identified the following topics from the video: Introduction to Tokenization, Naive Tokenization, Character-Level Tokenization vs. Chunk-Level Tokenization, BPE (Byte Pair Encoding) Algorithm, UTF-8 Encoding, UTF-8 Encoding Limitations, Tokenization and Vocabulary Size, Tokenization and Language Models, GPT-4 Tokenizer vs. GPT-2 Tokenizer, Code Implementation'

In [17]:
response = """[
    {
        "question": "What is the primary goal of the BPE (Byte Pair Encoding) algorithm?",
        "options": [
            "a) To encrypt byte sequences",
            "b) To compress byte sequences into a variable amount of tokens",
            "c) To decrypt byte sequences",
            "d) To convert byte sequences into a fixed amount of tokens"
        ],
        "answer": "b) To compress byte sequences into a variable amount of tokens",
        "explanation": "The BPE algorithm is used to compress byte sequences into a variable amount of tokens, allowing for more efficient storage and processing."
    },
    {
        "question": "How does the BPE algorithm find the pair of tokens to replace?",
        "options": [
            "a) By randomly selecting pairs",
            "b) By iterating through the sequence and finding the pair that occurs most frequently",
            "c) By using a predefined set of rules",
            "d) By using a machine learning model"
        ],
        "answer": "b) By iterating through the sequence and finding the pair that occurs most frequently",
        "explanation": "The BPE algorithm iteratively finds the pair of tokens that occur most frequently in the sequence and replaces them with a new token."
    },
    {
        "question": "What happens when the BPE process is repeated?",
        "options": [
            "a) The vocabulary size decreases",
            "b) The vocabulary size remains the same",
            "c) The vocabulary size increases",
            "d) The sequence becomes less compressed"
        ],
        "answer": "c) The vocabulary size increases",
        "explanation": "When the BPE process is repeated, the vocabulary size increases as more pairs of tokens are replaced with new tokens."
    },
    {
        "question": "What is the result of replacing a pair of tokens with a new token in the BPE algorithm?",
        "options": [
            "a) The sequence becomes less compressed",
            "b) The sequence remains the same",
            "c) The sequence becomes more compressed",
            "d) The sequence is encrypted"
        ],
        "answer": "c) The sequence becomes more compressed",
        "explanation": "Replacing a pair of tokens with a new token in the BPE algorithm results in a more compressed sequence."
    },
    {
        "question": "What is the benefit of using the BPE algorithm?",
        "options": [
            "a) It increases the storage requirements of the sequence",
            "b) It decreases the processing time of the sequence",
            "c) It allows for more efficient storage and processing of the sequence",
            "d) It reduces the vocabulary size of the sequence"
        ],
        "answer": "c) It allows for more efficient storage and processing of the sequence",
        "explanation": "The BPE algorithm allows for more efficient storage and processing of the sequence by compressing it into a variable amount of tokens."
    }
]"""

In [18]:
import json

json.loads(response)

[{'question': 'What is the primary goal of the BPE (Byte Pair Encoding) algorithm?',
  'options': ['a) To encrypt byte sequences',
   'b) To compress byte sequences into a variable amount of tokens',
   'c) To decrypt byte sequences',
   'd) To convert byte sequences into a fixed amount of tokens'],
  'answer': 'b) To compress byte sequences into a variable amount of tokens',
  'explanation': 'The BPE algorithm is used to compress byte sequences into a variable amount of tokens, allowing for more efficient storage and processing.'},
 {'question': 'How does the BPE algorithm find the pair of tokens to replace?',
  'options': ['a) By randomly selecting pairs',
   'b) By iterating through the sequence and finding the pair that occurs most frequently',
   'c) By using a predefined set of rules',
   'd) By using a machine learning model'],
  'answer': 'b) By iterating through the sequence and finding the pair that occurs most frequently',
  'explanation': 'The BPE algorithm iteratively find

In [19]:
json_str = """
```json
[
    {
        "name": "a",
        "description": "bc"
    }
]
```
"""

json_str

'\n```json\n[\n    {\n        "name": "a",\n        "description": "bc"\n    }\n]\n```\n'

In [20]:
json.loads(json_str)

JSONDecodeError: Expecting value: line 2 column 1 (char 1)

In [26]:
import re
from pprint import pprint

json_str2 = re.sub(r"(```|json)", "", json_str)

json_str, json_str2

('\n```json\n[\n    {\n        "name": "a",\n        "description": "bc"\n    }\n]\n```\n',
 '\n\n[\n    {\n        "name": "a",\n        "description": "bc"\n    }\n]\n\n')

In [33]:
data = json.loads(json_str2)
data

[{'name': 'a', 'description': 'bc'}]