
# Populate chapters of a YouTube video


## Input variables

In [None]:
# ID of the YouTube video whose transcript is fetched below.
youtube_video_id="MZQ6bc6mPAE"

# Language code requested from the transcript API.
language = "ko"

# Parameters that worked well in the author's own testing.
# max_token and model are passed to the chat-completion call.
max_token = 3000
model = "gpt-3.5-turbo"
# chunk_size and chunk_overlap configure the text splitter (lengths via len()).
chunk_size = 900
chunk_overlap = 100

# There is no official way to fetch the chapters automatically,
# so copy and paste the timestamp/chapter lines from the video description here.

chapter_part_in_description = """
00:00 시작
05:24 해먹을 결심: 탄핵해야 하는 이유
10:55 댓글 읽어보기
24:27 인사조직론이란 무엇인가?
33:52 (게르만 모형) 왜 직무가 중요한가?
43:11 추미애의 직무인식
49:16 직무의 존재목적, 칸트의 인간관과 경영학적 응용
1:04:40 성과책임의 사회적 의미 (참고)직무의 3대 구성요소
1:06:05 직무개념의 부재
1:07:44 주진우와 양향자
1:11:52 추미애의 고백과 진심
1:35:09 왜 역량인가?
1:35:47 역량의 개념에 대한 이해
1:38:05 《성취예측모형》 프레임워크와 역량사전
1:41:48 진실한 리더십과 인재평가의 프레임워크
1:43:59 DANO 경영플랫폼 운용_리더십이란 무엇인가?
1:49:42 추미애에 대한 오해의 프레임과 추미애의 비전은 무엇인가?
1:53:21 이재명과 추미애 vs. 이낙연과 김진표
1:55:52 푸른 하늘을(김수영 시인, 1960.06.15.)
1:58:57 정리
"""

In [None]:
# There is no official way to fetch the chapters automatically,
# so parse the description text into a list of tuples:
# [ (time_in_sec, chapter_title), ... ]
import re 
# A timestamp (digits followed by one or two ":digits" groups, e.g. 05:24 or
# 1:04:40), one whitespace character, then the title up to the end of the line.
# findall() yields one (timestamp, last ":digits" group, title) tuple per match.
pattern = r'(\d+(:\d+){1,2})\s(.+)'
matches = re.compile(pattern).findall(chapter_part_in_description)

def time_to_seconds(time):
    """Convert a colon-separated timestamp into a number of seconds.

    The fields are read from the right: the last one is seconds, the one
    before it minutes, and the one before that hours. Missing fields count
    as zero; any field further left than hours is ignored.
    """
    total = 0
    # zip() stops after three weights, so only the last three fields are used.
    for weight, field in zip((1, 60, 3600), reversed(time.split(':'))):
        total += weight * int(field)
    return total

# One (start_in_sec, title) tuple per matched description line; the middle
# regex group is only a by-product of the pattern and is discarded.
chapters = [
    (time_to_seconds(stamp), heading.strip())
    for stamp, _last_group, heading in matches
]


# Build up the note with the script placed under each chapter

In [None]:
from collections import deque
from youtube_transcript_api import YouTubeTranscriptApi

# Fetch the transcript of the YouTube video in the requested language.
data = YouTubeTranscriptApi.get_transcripts(
    [youtube_video_id],
    languages=[language],
)
# data[0] is indexed by video ID; its entries are queued in a deque so they
# can be consumed from the left, in order.
script_data = deque(data[0][youtube_video_id])


# Put the script under each chapter:
# [
#   {
#    "title": chapter_title,
#    "script": script_in_chapter,
#    "summary": ""
#   }, ...
# ]

def _group_script_by_chapter(chapters, segments):
    """Distribute transcript segments over the chapters.

    A segment belongs to a chapter when its end time (``start + duration``,
    truncated to whole seconds) lies before the start of the next chapter.
    The last chapter takes every segment that is left.

    Args:
        chapters: list of ``(start_in_sec, title)`` tuples in playback order.
        segments: deque of dicts with ``'text'``, ``'start'`` and
            ``'duration'`` keys. It is consumed from the left.

    Returns:
        One ``{"title", "script", "summary"}`` dict per chapter, in chapter
        order. ``"script"`` is every segment text followed by a space;
        ``"summary"`` starts out empty. Chapters that receive no segment
        still get an entry with an empty script.
    """
    grouped = []
    last_index = len(chapters) - 1

    for index, (_start_in_sec, title) in enumerate(chapters):
        next_start_in_sec = chapters[index + 1][0] if index < last_index else None

        texts = []
        # Looping on the deque itself ends cleanly when the transcript runs
        # out, instead of raising IndexError from popleft().
        while segments:
            # Peek before popping: the segment that reaches into the next
            # chapter must stay queued so the next chapter picks it up.
            head = segments[0]
            end_in_sec = int(head['start'] + head['duration'])
            if next_start_in_sec is not None and end_in_sec >= next_start_in_sec:
                break
            texts.append(head['text'])
            segments.popleft()

        grouped.append({
            "title": title,
            "script": "".join(text + " " for text in texts),
            "summary": "",
        })

    return grouped


script_by_chapter = _group_script_by_chapter(chapters, script_data)



In [None]:

# Temporarily save the data to a file
import os 
import json 

# ensure_ascii=False writes the non-ASCII text verbatim, so the file encoding
# is pinned to UTF-8 instead of relying on the platform default.
with open("temp_script_by_chapter.json", "w", encoding="utf-8") as file:
    json.dump(script_by_chapter, file, indent=2, ensure_ascii=False)


# Write note by summarizing contents


In [None]:
# Set up the OpenAI API key: load_dotenv() is called so the key can be
# supplied through the environment (presumably from a local .env file holding
# OPENAI_API_KEY — confirm against your setup).
from dotenv import load_dotenv

load_dotenv()

In [None]:
import openai

def summarize_text_with_gpt3(text, max_token=3000, model="gpt-3.5-turbo"):
    """Request a Korean bullet-point summary of ``text`` from a chat model.

    Args:
        text: the text to summarize; it is embedded in the user prompt.
        max_token: forwarded to the API as ``max_tokens``.
        model: name of the chat model to call.

    Returns:
        The message content of the first choice in the response.
    """
    system_message = {"role": "system", "content": "You are a helpful assistant."}
    user_message = {
        "role": "user",
        "content": f"Summarize following text with bulletin points in Korean:\n{text}",
    }

    completion = openai.ChatCompletion.create(
        model=model,
        messages=[system_message, user_message],
        max_tokens=max_token,
    )
    return completion.choices[0].message.content

In [None]:

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter for the chapter scripts; chunk lengths are measured with len().
splitter_options = {
    "chunk_size": chunk_size,
    "chunk_overlap": chunk_overlap,
    "length_function": len,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)


In [None]:
import time

# Summarize each chapter: split its script into chunks, summarize every chunk
# separately, and store the combined result under the chapter's "summary" key.
for c in script_by_chapter:
    texts = text_splitter.split_text(c["script"])

    title = c["title"]
    print( f"Chapter {title} is in-pro. ")

    partial_summaries = []
    for t in texts:
        time.sleep(0.05) # Avoid the bad request error.
        partial_summaries.append(
            summarize_text_with_gpt3(t, max_token = max_token, model = model)
        )
        print( ".", end="")

    # Join with a newline so the last bullet of one chunk does not run into
    # the first bullet of the next chunk on the same line.
    summarized_text = "\n".join(partial_summaries)

    c["summary"] = summarized_text
    print('\n')
    time.sleep(0.5) # Avoid the bad request error.


## Publish markdown document

The summarized note for this YouTube video is written to `markdown_note.md`.

In [None]:
# ensure_ascii=False writes the non-ASCII text verbatim, so the file encoding
# is pinned to UTF-8 instead of relying on the platform default.
with open("script_by_chapter.json", "w", encoding="utf-8") as file:
    json.dump(script_by_chapter, file, indent=2, ensure_ascii=False)


In [None]:
# Render every chapter as a "# title" section with "## Summary" and
# "## Script" subsections, then concatenate the sections in chapter order.
full_markdown_text = "".join(
    f"# {c['title']} \n\n"
    "## Summary \n"
    f"{c['summary']} \n\n"
    "## Script \n\n"
    f"{c['script']} \n"
    "\n\n"
    for c in script_by_chapter
)

In [None]:

# Write the markdown document for the note.
# The note contains non-ASCII text, so the encoding is pinned to UTF-8 instead
# of relying on the platform default.
with open("markdown_note.md", "w", encoding="utf-8") as file:
    file.write(full_markdown_text)