
# Populate chapters of Youtube video


In [9]:
youtube_video_id="KoQN29vi_0A"
language = "ko"

# Officially no way to get chapter automatically, 
# so copy and pagese the time stamp and chapter in description of Youtube video. 

chapter_part_in_description = """
00:00 시작
12:56 댓글 읽어보기
25:51 김건희와 윤석열의 의식
47:07 《성취예측모형》 프레임워크와 학습능력: 김건희와 정명훈의 경우
1:08:23 〈DANO 경영 플랫폼〉으로 본 윤석열: 비전, 전략, 조직
1:38:32 《성취예측모형》으로 본 윤석열: 한국적 무속의 특징과 콤플렉스
1:49:32 문명사에 대한 이해가 중요한 이유: 무당에 빠진 인간들을 더 계몽해야 한다
1:52:11 APM 지수가 우리에게 말해주는 것
1:52:57 그리고 우리가 반드시 알아야 할 것들
1:58:02 정리
"""

In [10]:
# Officially no way to get chapter automatically, 
# so we need to parse the text in description and set up the dictionary 
# [ (time_in_sec, chapter_title) ]
import re 
pattern = r'(\d+(:\d+){1,2})\s(.+)'
matches = re.findall(pattern, chapter_part_in_description)

def time_to_seconds(time):
    parts = time.split(':')
    seconds = int(parts[-1])
    minutes = int(parts[-2]) if len(parts) > 1 else 0
    hours = int(parts[-3]) if len(parts) > 2 else 0
    return hours * 3600 + minutes * 60 + seconds

chapters = [(time_to_seconds(time), title.strip()) for time, _, title in matches]


# Build up note with chapter and script under each chapter 

In [13]:
from collections import deque
from youtube_transcript_api import YouTubeTranscriptApi

# Populate the script of YouTube video
data = YouTubeTranscriptApi.get_transcripts([youtube_video_id], languages=[language])
script_data = deque( data[0][youtube_video_id] )


# Put the script under each chatpter
# [ 
#   { 
#    "title": current_title,
#    "script": script_in_chapter
#    } ....
#  ]

script_by_chapter = []

script_in_chapter = ""
for i in range( len(chapters) ):
    current_time_in_sec, current_title = chapters[i]
    next_time_in_sec, next_title = chapters[i + 1] if i + 1 < len(chapters) else (None, None)

    if len(script_data) == 0:
        break

    s = script_data.popleft()
    end_time_of_script_in_sec = int( s['start'] + s['duration'] )

    if next_time_in_sec is not None:
        
        while end_time_of_script_in_sec < next_time_in_sec:
            script_in_chapter += s['text']
            script_in_chapter += " "
            s = script_data.popleft()
            end_time_of_script_in_sec = int( s['start'] + s['duration'] )

        chapter_data = { 
                        "title": current_title,
                        "script": script_in_chapter
                        }
        
        script_by_chapter.append(chapter_data)
        script_in_chapter = ""        

    else:
        script_in_chapter = ""

        while len(script_data) > 0 :
            script_in_chapter += s['text']
            script_in_chapter += " "
            s = script_data.popleft()
            end_time_of_script_in_sec = int( s['start'] + s['duration'] )

        chapter_data = { 
                        "title": current_title,
                        "script": script_in_chapter,
                        "summary" : ""
                        }
        
        script_by_chapter.append(chapter_data)



In [14]:

# Temporary save data into file 
import os 
import json 

with open( "temp_script_by_chapter.json", "w") as file:
    file.write( json.dumps(script_by_chapter, indent=2, ensure_ascii=False) )


# Write note by summarizing contents


In [15]:
# Setup OpenAI API key 
from dotenv import load_dotenv

load_dotenv()

True

In [16]:
import openai

def summarize_text_with_gpt3(text):
    prompt = f"Summarize following text with bulletin points in Korean:\n{text}"
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=3000
    )

    corrected_text = response.choices[0].message.content
    return corrected_text

In [17]:

from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 600,
    chunk_overlap  = 20,
    length_function = len
)


In [18]:
import time

# Summarize each chapter 
for c in script_by_chapter:
    texts = text_splitter.split_text(c["script"])

    title = c["title"]
    print( f"Chapter {title} is in-pro. ")
    
    summarized_text = ""
    for t in texts:
        time.sleep(0.01) # Avoid the bad request error. 
        partial_summary = summarize_text_with_gpt3(t)
        summarized_text += partial_summary
        print( ".", end="")

    c["summary"] = summarized_text
    print('\n')
    time.sleep(0.5) # Avoid the bad request error. 


Chapter 시작 is in-pro. 
.......

Chapter 댓글 읽어보기 is in-pro. 
........

Chapter 김건희와 윤석열의 의식 is in-pro. 
.............

Chapter 《성취예측모형》 프레임워크와 학습능력: 김건희와 정명훈의 경우 is in-pro. 
............

Chapter 〈DANO 경영 플랫폼〉으로 본 윤석열: 비전, 전략, 조직 is in-pro. 
..................

Chapter 《성취예측모형》으로 본 윤석열: 한국적 무속의 특징과 콤플렉스 is in-pro. 
........

Chapter 문명사에 대한 이해가 중요한 이유: 무당에 빠진 인간들을 더 계몽해야 한다 is in-pro. 
..

Chapter APM 지수가 우리에게 말해주는 것 is in-pro. 
.

Chapter 그리고 우리가 반드시 알아야 할 것들 is in-pro. 
....



## Print it formmated text

In [19]:
with open( "script_by_chapter.json", "w") as file:
    file.write( json.dumps(script_by_chapter, indent=2, ensure_ascii=False) )


In [20]:
full_markdown_text = ""


for c in script_by_chapter:
    full_markdown_text += f"# {c['title']} \n\n"
    full_markdown_text += f"## Summary \n"
    full_markdown_text += f"{c['summary']} \n\n"
    full_markdown_text += f"## Script \n\n"
    full_markdown_text += f"{c['script']} \n"
    full_markdown_text += "\n\n"



In [23]:
from IPython.core.display import display_markdown

display_markdown(full_markdown_text)

In [22]:

with open( "markdown_note.md", "w") as file:
    file.write(full_markdown_text)