# Summarising YouTube video transcripts

In [None]:
import os
import openai
from IPython.display import display, HTML, Markdown
from pprint import pprint

from dotenv import load_dotenv, find_dotenv
# Read the local .env file into the process environment, then hand the
# OpenAI key found there (or already exported) to the openai client.
_ = load_dotenv(dotenv_path=find_dotenv())
openai.api_key = os.environ["OPENAI_API_KEY"]

In [None]:
from langchain.callbacks import OpenAICallbackHandler

# Callback handler instance that is later passed to the summarisation run
# (via `callbacks=[totals_cb]`) and printed again at the end of the notebook.
totals_cb = OpenAICallbackHandler()

# Show the handler's state before any LLM call has been made.
print(totals_cb)

### YouTube videos used as examples:

- [Yann LeCun and Andrew Ng: Why the 6-month AI Pause is a Bad Idea](https://www.youtube.com/watch?v=BY9KV8uCtj4)
- [OpenAI CEO Sam Altman on the Future of AI](https://www.youtube.com/watch?v=A5uMNMAWi3E)
- [Chat with OpenAI CEO and and Co-founder Sam Altman, and Chief Scientist Ilya Sutskever](https://www.youtube.com/watch?v=mC-0XqTAeMQ)
- [The Godfather in Conversation: Why Geoffrey Hinton is worried about the future of AI](https://www.youtube.com/watch?v=-9cW4Gcn5WY)

In [None]:
from langchain.document_loaders import YoutubeLoader

# YoutubeLoader uses the 'youtube-transcript-api' library
# https://github.com/jdepoix/youtube-transcript-api

# Example videos; the trailing notes record the approximate total token
# usage observed for each one.
videos = [
    "https://www.youtube.com/watch?v=BY9KV8uCtj4", # total tokens used - ca. 8k
    "https://www.youtube.com/watch?v=A5uMNMAWi3E", # total tokens used - ca. 8k
    "https://www.youtube.com/watch?v=mC-0XqTAeMQ", # total tokens used - ca. 11k
    "https://www.youtube.com/watch?v=-9cW4Gcn5WY", # total tokens used - ca. 13k
]

# Video to summarise; change the index to try another example.
video_url = videos[0]

# NOTE(review): `language` / `translation` presumably select the English
# transcript and translate to English when needed - confirm against the
# YoutubeLoader documentation for the installed LangChain version.
loader = YoutubeLoader.from_youtube_url(
    video_url,
    language=["en"],
    translation="en",
)

docs = loader.load()

print(f"Document has {len(docs)} pages\n")

# Fail with a clear message instead of an opaque IndexError on docs[0]
# (and confusing failures in later cells) when no transcript came back.
if not docs:
    raise RuntimeError(f"No transcript could be loaded for {video_url}")

pprint(docs[0])

In [None]:
from langchain.chat_models import ChatOpenAI

# Chat model wrapper; in this cell it is used only for token counting.
llm = ChatOpenAI()

# Count the tokens of every loaded page and keep a running total.
total_tokens = 0
for n, page in enumerate(docs, start=1):
    tokens = llm.get_num_tokens(page.page_content)
    total_tokens += tokens
    # Right-align the count in a fixed-width column; the previous ">" spec
    # carried no width and therefore aligned nothing.
    print(f"Page {n:2d}: {tokens:>6}")

print(f"Total number of tokens in document: {total_tokens}")

In [None]:
from langchain.text_splitter import TokenTextSplitter

# Split the loaded documents into pieces of at most 2000 tokens, with a
# 50-token overlap configured between pieces.
token_splitter = TokenTextSplitter(
    chunk_overlap=50,
    chunk_size=2000,
)

chunks = token_splitter.split_documents(docs)

print(f"Documents split into {len(chunks)} chunks\n")

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.chains.summarize import load_summarize_chain

# Build a summarisation chain of type "refine" on top of a chat model
# configured with temperature 0.0.
refine_llm = ChatOpenAI(temperature=0.0)
summary_chain_refine = load_summarize_chain(refine_llm, chain_type="refine")

In [None]:
# Run the refine chain over all chunks; the shared callback handler is
# attached so this run is reflected in `totals_cb`.
summary_refine = summary_chain_refine(
    chunks,
    callbacks=[totals_cb],
)

In [None]:
# Render the chain's "output_text" result as Markdown in the cell output.
final_summary = summary_refine["output_text"]
display(Markdown(final_summary))

In [None]:
# Print the callback handler that was attached to the summarisation run.
# NOTE(review): presumably the source of the "total tokens used" figures
# noted next to each video URL - confirm.
pprint(totals_cb)