In [None]:
import os
os.environ["OPENAI_API_KEY"] ="paste your api key"

In [5]:
!pip install -q youtube-transcript-api langchain-community langchain-openai \
               faiss-cpu tiktoken python-dotenv

In [6]:
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate

# Step 1a - Indexing (Document Ingestion)

In [7]:
video_id = 'Xpr8D6LeAtw'

try:
    ytt_api = YouTubeTranscriptApi()
    transcript_list = ytt_api.fetch(video_id)
    # transcript_list is iterable; each snippet has a 'text' attribute
    transcript = " ".join([snippet.text for snippet in transcript_list])
    print(transcript)
except Exception as e:
    print(f"An error occurred: {e}")

[Music] hello everyone welcome to the introductory lecture in this series which is titled build a large language model from scratch my name is Dr Raj dander I graduated from IIT Madras with a bch in mechanical engineering in 2017 after that I did my PhD from the Massachusetts Institute of Technology MIT I graduated from MIT with a PhD in machine learning in 2022 since then I've come back to India and we are all on a mission to basically make AI accessible to everyone U at viua which is our YouTube channel we have made several playlists now on machine learning on deep learning and uh the main approach or the philosophy which we really follow when we teach is to teach everything from from the basics not to assume anything teach you the nuts and bolts of every single concept the real reason behind this series is that as we all know large language models and generative AI are transforming everything around us startups are forming in this space companies are switching to large language mode

In [8]:
transcript_list

FetchedTranscript(snippets=[FetchedTranscriptSnippet(text='[Music]', start=0.0, duration=7.839), FetchedTranscriptSnippet(text='hello everyone welcome to the', start=5.359, duration=4.921), FetchedTranscriptSnippet(text='introductory lecture in this series', start=7.839, duration=4.8), FetchedTranscriptSnippet(text='which is titled build a large language', start=10.28, duration=5.88), FetchedTranscriptSnippet(text='model from scratch my name is Dr Raj', start=12.639, duration=6.961), FetchedTranscriptSnippet(text='dander I graduated from IIT Madras with', start=16.16, duration=6.439), FetchedTranscriptSnippet(text='a bch in mechanical engineering in', start=19.6, duration=7.36), FetchedTranscriptSnippet(text='2017 after that I did my PhD from the', start=22.599, duration=6.641), FetchedTranscriptSnippet(text='Massachusetts Institute of Technology', start=26.96, duration=6.08), FetchedTranscriptSnippet(text='MIT I graduated from MIT with a PhD in', start=29.24, duration=5.8), FetchedTra

# Step 1b - Indexing (Text Splitting)

In [9]:
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.create_documents([transcript])

In [10]:
len(chunks)

18

In [11]:
chunks[10]

Document(metadata={}, page_content="the need for this skill is only going to increase in the future so this again brings me to the question that if someone wants to learn about large language models how do they go about doing this so let's say you go to Google and you search build llms learn which say which means you want to learn about large language models there are a number of courses which show up really over here now if you go to many of these courses you will see let let say build llm apps it's about app development it does not teach you how to build a large language model from scratch here is another course Master SL Master large language model Concepts if you look at this course description right they don't teach you how to build an llm from scratch at all they don't teach you the nuts and bolts it's a pretty quick course this is also not what I'm looking for what I'm looking for is one course which teaches me the foundations in a lot of detail and in a lot of depth I want to k

# Step 1c & 1d - Indexing (Embedding Generation and Storing in Vector Store)

In [12]:
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
vector_store = FAISS.from_documents(chunks, embeddings)

In [13]:
vector_store.index_to_docstore_id

{0: 'e9d28293-0d6a-4b31-9559-ec11122aaa2d',
 1: '15d4bbca-73d5-463b-b0e5-9beba7871995',
 2: '44947742-bac9-46e5-ab3a-919d0013fc12',
 3: '619f4ecf-abb1-4d38-a5f2-e642087e5ffa',
 4: '321f45be-290e-4936-a6dc-40348ac36d4c',
 5: '79406364-e720-4477-be0e-f485eec53e82',
 6: 'ec4492e4-3814-49a3-97a5-7611acb387f5',
 7: 'f43b7183-599b-42a3-b309-bf279f7cffbe',
 8: '3325e560-bdb0-4b2c-9347-e70708474235',
 9: 'be065dd0-dbb4-40c4-92fe-2988c16871da',
 10: 'a59d83d6-df22-4dc6-9660-d95781f5373f',
 11: '249ead1d-dda0-417f-90f0-5ee616b48a72',
 12: '2d253cc4-2663-4742-8c33-1fc9afaf34e8',
 13: 'f2381a9e-f6c3-4e90-8177-6c2c772c19c1',
 14: 'ed31f6a9-f90c-41e4-a614-266f7e7dffc2',
 15: 'f77a2a13-afe9-45ea-9de2-15b23c2d72fe',
 16: '55c925ee-1098-4ebd-8f78-9dcc1fc0bce1',
 17: 'e591f1c1-d4df-4a1e-8fd1-01e1d3fad2c2'}

In [14]:
vector_store.get_by_ids(['244e8d52-4c03-4543-a4f2-a9d571c01c4f'])

[]

# Step 2 - Retrieval

In [15]:
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 4})

In [16]:
retriever

VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x000001271FF08D00>, search_kwargs={'k': 4})

In [17]:
retriever.invoke('What is large language model')

[Document(id='a59d83d6-df22-4dc6-9660-d95781f5373f', metadata={}, page_content="the need for this skill is only going to increase in the future so this again brings me to the question that if someone wants to learn about large language models how do they go about doing this so let's say you go to Google and you search build llms learn which say which means you want to learn about large language models there are a number of courses which show up really over here now if you go to many of these courses you will see let let say build llm apps it's about app development it does not teach you how to build a large language model from scratch here is another course Master SL Master large language model Concepts if you look at this course description right they don't teach you how to build an llm from scratch at all they don't teach you the nuts and bolts it's a pretty quick course this is also not what I'm looking for what I'm looking for is one course which teaches me the foundations in a lot

# Step 3 - Augmentation

In [18]:
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.2)

In [19]:
prompt = PromptTemplate(
    template="""
      You are a helpful assistant.
      Answer ONLY from the provided transcript context.
      If the context is insufficient, just say you don't know.

      {context}
      Question: {question}
    """,
    input_variables = ['context', 'question']
)

In [20]:
question          = "is the topic of artificial intelligence discussed in this video? if yes then what was discussed"
retrieved_docs    = retriever.invoke(question)

In [21]:
retrieved_docs


[Document(id='3325e560-bdb0-4b2c-9347-e70708474235', metadata={}, page_content="of there is also a lot of confusion with many people regarding what is generative AI what are llms really but generative AI is a broader subset and it includes language it includes video audio 3D models all the things so have a look at some of these videos don't these videos look incredibly realistic this video then this video The Waves video this particular video you'll be surprised to know that all of these videos are made by artificial intelligence these videos are not shot on camera they are made by AI this is the power of generative AI currently finally uh when we work with schools we have developed our own a application so this is viu's AI application uh or application on llms here you can see that there are a huge number of functionalities for example you can click on McQ generator and you can just type in the topic let's say gravity and you can click on generate now you'll see that within a matter o

In [22]:
context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)
context_text

"of there is also a lot of confusion with many people regarding what is generative AI what are llms really but generative AI is a broader subset and it includes language it includes video audio 3D models all the things so have a look at some of these videos don't these videos look incredibly realistic this video then this video The Waves video this particular video you'll be surprised to know that all of these videos are made by artificial intelligence these videos are not shot on camera they are made by AI this is the power of generative AI currently finally uh when we work with schools we have developed our own a application so this is viu's AI application uh or application on llms here you can see that there are a huge number of functionalities for example you can click on McQ generator and you can just type in the topic let's say gravity and you can click on generate now you'll see that within a matter of seconds the large language model which is powering this application will\n\na

In [23]:
final_prompt = prompt.invoke({"context": context_text, "question": question})

In [24]:
final_prompt

StringPromptValue(text="\n      You are a helpful assistant.\n      Answer ONLY from the provided transcript context.\n      If the context is insufficient, just say you don't know.\n\n      of there is also a lot of confusion with many people regarding what is generative AI what are llms really but generative AI is a broader subset and it includes language it includes video audio 3D models all the things so have a look at some of these videos don't these videos look incredibly realistic this video then this video The Waves video this particular video you'll be surprised to know that all of these videos are made by artificial intelligence these videos are not shot on camera they are made by AI this is the power of generative AI currently finally uh when we work with schools we have developed our own a application so this is viu's AI application uh or application on llms here you can see that there are a huge number of functionalities for example you can click on McQ generator and you c

# Step 4 - Generation

In [25]:
answer = llm.invoke(final_prompt)
print(answer.content)

Yes, the topic of artificial intelligence is discussed in the video. It covers generative AI as a broader subset that includes various forms of media such as language, video, audio, and 3D models. The video highlights the capabilities of generative AI, showcasing realistic videos created by AI, and discusses the development of an AI application for generating multiple choice questions using large language models (LLMs). It emphasizes the transformative impact of LLMs and generative AI on education and the job market, noting the expected growth in demand for skills related to these technologies. Additionally, the speaker introduces a series aimed at teaching about large language models from the basics.


#  Building a Chain

In [26]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser

In [27]:
def format_docs(retrieved_docs):
  context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)
  return context_text

In [28]:
parallel_chain = RunnableParallel({
    'context': retriever | RunnableLambda(format_docs),
    'question': RunnablePassthrough()
})

In [29]:
parallel_chain.invoke('what is llm')

{'context': "very confident about this subject very few people have this knowledge right now and I'm making this YouTube playlist which will be a very comprehensive playlist showing you everything about building an llm from scratch the way I'm making this playlist or the way I will make videos in this playlist is to teach te you everything from the basics as a beginner without assuming anything and at the end of this playlist you will have built an llm from scratch successfully all by yourself you'll see that after this point everything which comes later all the application Parts everything will just start seeming extremely easy to you so that's the whole philosophy behind making this lecture series it takes a huge amount of effort on our part to make this series because as you you will see and I'll show you in some time to make every lecture we are going to make detailed lecture notes I'll share those lecture notes with you and all the videos in this series will be available completel

In [30]:
parser = StrOutputParser()

In [31]:
main_chain = parallel_chain | prompt | llm | parser

In [32]:
main_chain.invoke('Can you summarize the video')

'The video outlines a course focused on large language models (LLMs), which will be presented in a series of detailed video lectures. The instructor plans to convert a 48-page book on LLMs into approximately 35 to 50 videos, aiming to make the content engaging and fundamental. The course will be offered for free, and the instructor encourages viewers to provide feedback in the comments to stay motivated. The next lecture will introduce LLMs, and the instructor shares their personal learning journey and critiques existing online materials on the topic. The video also briefly mentions the history of natural language processing, referencing an early chatbot called Elisa.'