
# Populate chapters of a YouTube video


In [9]:
# Input variables

# Which video to process and which transcript language to request.
youtube_video_id = "31a2kI6r6gw"
language = "ko"

# Summarization settings.
model = "gpt-3.5-turbo"
max_token = 3000

# Text-splitting settings.
chunk_size = 1000
chunk_overlap = 20

# There is no official way to fetch the chapters automatically, so copy and
# paste the timestamp / chapter lines from the video's description here.
chapter_part_in_description = """
35:11 전체내용
"""

In [10]:
# Officially no way to get chapter automatically, 
# so we need to parse the text in description and set up the dictionary 
# [ (time_in_sec, chapter_title) ]
import re 
# Each match is a 3-tuple: (full timestamp, last ":NN" repetition, rest of the line).
# The timestamp is one number followed by one or two ":number" fields.
pattern = r'(\d+(:\d+){1,2})\s(.+)'
timestamp_line_regex = re.compile(pattern)
matches = timestamp_line_regex.findall(chapter_part_in_description)

def time_to_seconds(time):
    """Convert a colon-separated timestamp string into a number of seconds.

    Accepts "SS", "MM:SS" or "HH:MM:SS". Only the last three colon-separated
    fields are read; any fields before them are ignored.

    Args:
        time: timestamp string such as "35:11" or "1:02:03".

    Returns:
        The timestamp as an integer number of seconds.

    Raises:
        ValueError: if one of the fields that is read is not an integer.
    """
    total = 0
    # Base-60 fold: each step promotes the running total by one unit
    # (hours -> minutes -> seconds) before adding the next field.
    for field in time.split(':')[-3:]:
        total = total * 60 + int(field)
    return total

# One (start_time_in_seconds, title) tuple per matched line. The middle element
# of each regex match (the inner ":NN" group) is not needed and is discarded.
chapters = [
    (time_to_seconds(timestamp), chapter_title.strip())
    for timestamp, _, chapter_title in matches
]


# Build up note with chapter and script under each chapter 

In [11]:
from collections import deque
from youtube_transcript_api import YouTubeTranscriptApi

# Populate the script of YouTube video
# get_transcripts is called with a one-element list of video IDs and the
# preferred language; as the indexing below shows, element 0 of its result is
# keyed by video ID.
# NOTE(review): presumably this performs a network request, and the API may
# differ between youtube_transcript_api versions — confirm against the
# installed version.
data = YouTubeTranscriptApi.get_transcripts([youtube_video_id], languages=[language])
# Wrapped in a deque so the entries can be consumed from the front with popleft().
script_data = deque( data[0][youtube_video_id] )


# Put the script under each chapter
# [
#   {
#    "title": current_title,
#    "script": script_in_chapter,
#    "summary": ""          # placeholder, filled in by the summarization step
#    } ....
#  ]

def _group_script_by_chapter(chapters, segments):
    """Distribute transcript segments over the chapters they belong to.

    A segment goes to the current chapter while its end time
    (int(start + duration)) is earlier than the start of the next chapter.
    The first segment that reaches the next chapter's start is left in the
    queue so the next chapter picks it up; no segment is dropped. The last
    chapter takes every segment that remains.

    Args:
        chapters: list of (start_time_in_sec, title) tuples.
            Assumed to be in ascending time order — TODO confirm.
        segments: collections.deque of dicts with 'text', 'start' and
            'duration' keys. It is consumed from the left.

    Returns:
        A list of {"title", "script", "summary"} dicts in chapter order.
        "script" is the chapter's segment texts, each followed by one space;
        "summary" is always the empty string here. Chapters reached after the
        transcript has run out are omitted.
    """
    grouped = []

    for index, (_, current_title) in enumerate(chapters):
        # Nothing left to distribute: remaining chapters are omitted.
        if not segments:
            break

        # The last chapter has no upper time bound.
        is_last_chapter = index + 1 == len(chapters)
        next_time_in_sec = None if is_last_chapter else chapters[index + 1][0]

        texts = []
        while segments:
            # Peek first; only pop once we know the segment belongs to this
            # chapter, otherwise it would be lost at the chapter boundary.
            segment = segments[0]
            end_time_of_script_in_sec = int(segment['start'] + segment['duration'])
            if next_time_in_sec is not None and end_time_of_script_in_sec >= next_time_in_sec:
                break
            texts.append(segment['text'])
            segments.popleft()

        grouped.append({
            "title": current_title,
            "script": "".join(f"{text} " for text in texts),
            # Every chapter gets the key so later cells can rely on it.
            "summary": "",
        })

    return grouped


script_by_chapter = _group_script_by_chapter(chapters, script_data)



In [12]:

# Temporarily save the data to a file
import os 
import json 

# Write UTF-8 explicitly: ensure_ascii=False leaves non-ASCII characters (the
# Korean transcript text) unescaped, and the platform's default encoding may
# not be able to encode them.
with open("temp_script_by_chapter.json", "w", encoding="utf-8") as file:
    file.write(json.dumps(script_by_chapter, indent=2, ensure_ascii=False))


# Write note by summarizing contents


In [13]:
# Setup OpenAI API key
# NOTE(review): presumably the key is kept in a local .env file and
# load_dotenv() exposes it to the openai client as an environment variable;
# the variable name is not visible in this notebook — confirm.
from dotenv import load_dotenv

load_dotenv()

True

In [14]:
import openai

def summarize_text_with_gpt3(text, max_token=3000, model="gpt-3.5-turbo", output_language="Korean"):
    """Ask an OpenAI chat model for a bullet-point summary of `text`.

    Args:
        text: the text to summarize.
        max_token: value passed to the API as `max_tokens`.
        model: name of the chat model to use.
        output_language: language named in the prompt for the summary.
            Defaults to "Korean", which reproduces the original prompt.

    Returns:
        The content of the first choice's message, or "" when `text` is
        empty or whitespace-only (no API call is made in that case).
    """
    # Nothing to summarize: skip the API call instead of sending an empty prompt.
    if not text or not text.strip():
        return ""

    prompt = f"Summarize following text with bulletin points in {output_language}:\n{text}"
    response = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=max_token
    )

    return response.choices[0].message.content

In [15]:

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter used to cut each chapter's script into chunks before summarizing.
# Chunk length is measured with len(); size and overlap come from the input
# variables defined at the top of the notebook.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    length_function=len,
)


In [16]:
import time

# Summarize each chapter
# Each chapter's script is split into chunks, every chunk is summarized on its
# own, and the partial summaries are stored under the chapter's "summary" key.
for c in script_by_chapter:
    texts = text_splitter.split_text(c["script"])

    title = c["title"]
    print( f"Chapter {title} is in-pro. ")

    partial_summaries = []
    for t in texts:
        time.sleep(0.05) # Short pause between requests (original note: avoid the bad request error).
        partial_summaries.append(
            summarize_text_with_gpt3(t, max_token = max_token, model = model)
        )
        print( ".", end="")  # one dot per summarized chunk

    # Join with newlines so the last bullet of one chunk's summary does not
    # run into the first bullet of the next chunk's summary.
    c["summary"] = "\n".join(partial_summaries)
    print('\n')
    time.sleep(0.5) # Longer pause between chapters (original note: avoid the bad request error).


Chapter 전체내용 is in-pro. 
...................



## Print the formatted text

In [17]:
# Save the chapters (now including summaries) as JSON.
# Write UTF-8 explicitly: ensure_ascii=False leaves non-ASCII characters
# unescaped, and the platform's default encoding may not be able to encode them.
with open("script_by_chapter.json", "w", encoding="utf-8") as file:
    file.write(json.dumps(script_by_chapter, indent=2, ensure_ascii=False))


In [18]:
# Render every chapter as a markdown section: a title heading, then the
# summary, then the full script, followed by blank lines.
full_markdown_text = "".join(
    f"# {chapter['title']} \n\n"
    "## Summary \n"
    f"{chapter['summary']} \n\n"
    "## Script \n\n"
    f"{chapter['script']} \n"
    "\n\n"
    for chapter in script_by_chapter
)



In [19]:
# Import from the public IPython.display module.
from IPython.display import display_markdown

# full_markdown_text is already markdown source, so it must be passed with
# raw=True; without it the string is treated as an object to be formatted and
# is not rendered as markdown.
display_markdown(full_markdown_text, raw=True)

In [20]:

# Save the note as a markdown file. Write UTF-8 explicitly: the note contains
# non-ASCII (Korean) text that the platform's default encoding may not be able
# to encode.
with open("markdown_note.md", "w", encoding="utf-8") as file:
    file.write(full_markdown_text)