In [55]:
import os
import sys
import torch
import json
from pathlib import Path
from typing import List
from pprint import pprint
sys.path.append(str(Path(os.getcwd()).parent))

from huggingface_hub import notebook_login
from llama_index.core import Settings
from llama_index.llms.groq import Groq
from llama_index.llms.nvidia import NVIDIA
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core.callbacks import CallbackManager, TokenCountingHandler
from llama_index.core.program import LLMTextCompletionProgram
from pydantic import BaseModel, parse_obj_as
from transformers import AutoTokenizer, AutoModelForCausalLM
from accelerate import disk_offload

from data.transcript import get_video_transcript

In [2]:
GROQ_API_KEY = os.environ['GROQ_API_KEY']
NVIDIA_API_KEY = os.environ['NVIDIA_API_KEY']

In [12]:
llm = NVIDIA(model="meta/llama-3.1-8b-instruct", api_key=NVIDIA_API_KEY)
llm = Groq(model="llama-3.1-8b-instant")

In [64]:
transcript = get_video_transcript(url="https://www.youtube.com/watch?v=zduSFxRajkE")

In [65]:
text = ""
for item in transcript:
    text += item['text']

In [66]:
text[:500]

"hi everyone so in this video I'd like usto cover the process of tokenization inlarge language models now you see herethat I have a set face and that'sbecause uh tokenization is my leastfavorite part of working with largelanguage models but unfortunately it isnecessary to understand in some detailbecause it it is fairly hairy gnarly andthere's a lot of hidden foot guns to beaware of and a lot of oddness with largelanguage models typically traces back totokenization so what istokenization now in m"

In [67]:
class Topic(BaseModel):
    name: str
    description: str

In [68]:
tokenizer = AutoTokenizer.from_pretrained("nvidia/Llama3-ChatQA-2-70B")
token_counter = TokenCountingHandler(
    tokenizer=tokenizer.encode
)

In [69]:
# sllm = llm.as_structured_llm(output_cls=Topic)

In [70]:
topic_extractor_prompt = f"""
Given the transcript of a YouTube video tutorial:

{text[:-100000]}

Understand what this video is about and identify the main topics.
Include as much description as possible for each topic.
Make sure topics are not ambiguous and have meaningful segregation of content.

Format your response as a JSON array of objects, where each object has two fields:
- "name": The name of the topic (a short, concise title)
- "description": A detailed description of the topic

Example format:
[
  {{
    "name": "Topic 1",
    "description": "Detailed description of Topic 1..."
  }},
  {{
    "name": "Topic 2",
    "description": "Detailed description of Topic 2..."
  }}
]

Remove the preamble and ensure your output is valid JSON. 
"""

In [71]:
response = llm.complete(topic_extractor_prompt)

In [72]:
json.loads(response.text)

[{'name': 'Introduction to Tokenization', 'description': 'The video begins with an introduction to tokenization, a process essential for working with large language models. The speaker explains that tokenization is their least favorite part of working with large language models, but it is necessary to understand in detail due to its complexity and potential issues. The speaker also mentions that tokenization is the process of translating strings or text into sequences of tokens, which are the fundamental units of large language models.'}, {'name': 'Naive Tokenization', 'description': 'The speaker explains that in their previous video, they implemented a naive tokenization process, which was a character-level tokenizer. They loaded a training set, created a vocabulary of 65 possible characters, and created a lookup table for converting characters into tokens. The speaker then demonstrated how this process worked by tokenizing a string and encoding it into tokens.'}, {'name': 'BPE Encodi

In [75]:
try:
    json_response = json.loads(response.text)
    topics = parse_obj_as(List[Topic], json_response)
    topics_dict = {topic.name: topic.description for topic in topics}
except json.JSONDecodeError:
    print("Error: The LLM output was not valid JSON.")
except ValueError:
    print("Error: The LLM output did not match the expected format.")
    
topics = ', '.join(list(topics_dict.keys()))
topics_out = f"""I have identified the following topics from the video: {topics}"""

C:\Users\Hashir\AppData\Local\Temp\ipykernel_2524\482209594.py:3: PydanticDeprecatedSince20: `parse_obj_as` is deprecated. Use `pydantic.TypeAdapter.validate_python` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.9/migration/
  topics = parse_obj_as(List[Topic], json_response)


In [76]:
topics_out

'I have identified the following topics from the video: Introduction to Tokenization, Naive Tokenization, BPE Encoding Algorithm, Tokenization Issues, UTF-8 Encoding, Tokenization in Large Language Models, Comparison of Tokenizers, Implementation of Tokenization'

In [19]:
topics = sllm.complete(topic_extractor_prompt)

ValidationError: 1 validation error for LLMStructuredPredictEndEvent
output
  Input should be a valid dictionary or instance of BaseModel [type=model_type, input_value="2 validation errors for ...antic.dev/2.9/v/missing", input_type=str]
    For further information visit https://errors.pydantic.dev/2.9/v/model_type

In [None]:
topics