
# Summarize Youtube video lecture 
- Summarize a YouTube video's transcript, chapter by chapter, using the chapters configured by the video's creator.
  - Create `markdown_note.md` containing the script and the summary of each chapter.
- Uses the [yt-dlp](https://pypi.org/project/yt-dlp/), [pydub](https://pypi.org/project/pydub/), [OpenAI-Whisper](https://pypi.org/project/openai-whisper/), [langchain](https://github.com/hwchase17/langchain), and [OpenAI](https://github.com/openai/openai-python) packages.


## Install dependency

In [None]:
# Install libav and ffmpeg. 
! brew install ffmpeg 

# For Linux (Debian/Ubuntu, using apt-get)
# apt-get install libav-tools libavcodec-extra ffmpeg

# install package 
! pip install -r requirements.txt

## Input variables

In [None]:
# YouTube video ID: the value of the `v=` query parameter of the watch URL.
youtube_video_id="Lq5a1inNCzw"

# Language code for the video ("ko", presumably Korean).
# NOTE(review): the original comment read "Language of subscription" --
# presumably "transcription" or "subtitles" was meant; confirm the intent.
language = "ko"


# There is no official way to fetch the chapters automatically,
# so copy the timestamp/chapter list from the YouTube video description and paste it here.
# Expected format, one chapter per line: a timestamp made of digits separated by
# one or two colons (e.g. 02:33 or 1:04:45), one whitespace character, then the title.
# CAUTION: a 00:00 entry must be defined! The audio is cut starting at the first
# listed timestamp, so anything before it would not be exported.
chapter_part_in_description = """
00:00 시작
02:33 댓글 읽어보기
10:00 댓글 읽어보기-2
20:00 댓글 읽어보기-3
30:00 댓글 읽어보기-4
51:30 서울양평고속도로 무단 종점변경에 대한 세 가지 질문(한준호 의원)
54:35 양평군수 전진선의 행태
1:04:45 사실부합성의 조건을 어떻게 충족시킬 것인가?
1:17:12 이소영 의원의 설명을 들어보자(분석적, 개념적 사고)
1:40:24 진리에 근거한 인재평가의 프레임워크
1:44:00 자아(Ego)와 자기(Self), 자기기만
1:51:10 정리
"""

In [None]:
# Officially no way to get chapter automatically, 
# so we need to parse the text in description and set up the dictionary 
# [ (time_in_sec, chapter_title) ]
import re 
# A chapter line looks like "<timestamp><whitespace><title>", where the timestamp
# is digits separated by one or two colons. Every entry of `matches` is a 3-tuple:
# (full timestamp, last ":NN" group, title).
pattern = r'(\d+(:\d+){1,2})\s(.+)'
matches = [found.groups() for found in re.finditer(pattern, chapter_part_in_description)]

def time_to_seconds(time):
    """Convert a "SS", "MM:SS" or "H:MM:SS" timestamp string into seconds.

    Only the last three colon-separated fields are considered; each field
    must be parseable by `int`, otherwise `ValueError` is raised.
    """
    total = 0
    # Fold from the most significant field down: every step shifts the
    # running total by one base-60 position and adds the next field.
    for field in time.split(':')[-3:]:
        total = total * 60 + int(field)
    return total

# Turn every regex match into a (start_time_in_seconds, chapter_title) pair,
# keeping the order in which the chapters appear in the description.
chapters = []
for stamp, _last_group, heading in matches:
    chapters.append((time_to_seconds(stamp), heading.strip()))


# Build up note with chapter and script under each chapter 

In [None]:
import yt_dlp

# Download youtube video and extract audio file. 
def download(video_id: str) -> str:
    """Download the audio of a YouTube video and return the path of the MP3 file.

    Args:
        video_id: YouTube video ID (the `v=` parameter of the watch URL).

    Returns:
        The relative path `audio/<video_id>.mp3`.

    Raises:
        Exception: if yt-dlp reports a non-zero error code.
    """
    options = {
        # Prefer an m4a stream, then the best audio-only stream, then the best overall.
        'format': 'm4a/bestaudio/best',
        # Files are stored under audio/ and named after the video ID.
        'paths': {'home': 'audio/'},
        'outtmpl': {'default': '%(id)s.%(ext)s'},
        # Convert the downloaded stream to MP3 through ffmpeg.
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
        }],
    }
    url = f'https://www.youtube.com/watch?v={video_id}'

    with yt_dlp.YoutubeDL(options) as downloader:
        if downloader.download([url]) != 0:
            raise Exception('Failed to download video')

    return f'audio/{video_id}.mp3'

# Download the audio of the configured video; `file_path` is the path of the
# resulting MP3 file (audio/<video id>.mp3).
file_path = download(youtube_video_id)



In [None]:

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter used to cut each chapter transcript into pieces before summarizing.
# `chunk_size` is the target piece size measured with `len` (characters);
# consecutive pieces do not overlap.
chunk_size = 800
text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    chunk_overlap=0,
    chunk_size=chunk_size,
)


In [None]:
# split audio file
import time
from pydub import AudioSegment

# Load the whole recording, then export one MP3 per chapter as audio/<index>.mp3.
audio_data = AudioSegment.from_mp3(file_path)

for i, (current_time_in_sec, current_title) in enumerate(chapters):
    # Look one chapter ahead; the slice is empty for the last chapter.
    following = chapters[i + 1:i + 2]
    next_time_in_sec, next_title = following[0] if following else (None, None)

    # Slice positions are expressed in milliseconds.
    start_ms = current_time_in_sec * 1000

    if next_time_in_sec:
        segment = audio_data[start_ms:next_time_in_sec * 1000]
    else:
        # No (usable) next timestamp: take everything up to the end.
        segment = audio_data[start_ms:]

    segment.export(f'audio/{i}.mp3', format="mp3")

In [None]:
# Transcribe the text from audio files.
from datetime import datetime
import os
import whisper
from dotenv import load_dotenv

# Load variables from a .env file into the environment.
# NOTE(review): presumably this is where the OpenAI API key used by the later
# summarization step comes from -- confirm.
load_dotenv()

# Collected per-chapter records; filled in by the transcription loop.
script_by_chapter = []

# Transcripts are stored in a `text` folder under the current working directory.
text_file_folder_path = os.path.join(os.getcwd(), 'text')
if not os.path.exists(text_file_folder_path):
    os.makedirs(text_file_folder_path)

# The model name can be changed here; the choice is typically a tradeoff between
# accuracy and speed. Available models are listed at
# https://github.com/openai/whisper/#available-models-and-languages.
whisper_model = whisper.load_model("small")
def transcribe(file_path: str) -> str:
    """Transcribe one audio file with the loaded Whisper model and return its text.

    The spoken language is taken from the notebook-level `language` setting
    (e.g. "ko") rather than letting Whisper auto-detect it for every chapter,
    so short or noisy chapters cannot be mis-detected as another language.

    Args:
        file_path: Path of the audio file to transcribe.

    Returns:
        The transcribed text of the whole file.
    """
    # fp16=False makes Whisper run inference in 32-bit floats. Half precision
    # is not supported on CPU, so this avoids the warning/fallback there.
    transcription = whisper_model.transcribe(file_path, fp16=False, language=language)
    return transcription['text']  # type: ignore

# Transcribe every chapter's audio file, remember the result in
# `script_by_chapter`, and store the raw transcript as text/<index>.txt.
for i, (_start_in_sec, current_title) in enumerate(chapters):
    print( f'{datetime.now()} : {current_title} is transcripting... \n' )
    audio_file_path = os.path.join(os.getcwd(), 'audio', f'{i}.mp3')

    # "summary" stays empty here; it is filled in by the summarization step.
    chapter_data = {
        "title": current_title,
        "script": transcribe(audio_file_path),
        "summary": "",
    }
    script_by_chapter.append(chapter_data)

    # Save transcript file
    text_file = os.path.join(text_file_folder_path, f'{i}.txt')
    with open(text_file, "w") as file:
        file.write(chapter_data["script"])



In [None]:

# Temporary save data into file 
import os 
import json 

# Checkpoint the transcripts so they survive a failure in the later steps.
# The JSON keeps non-ASCII characters as-is (ensure_ascii=False), so the file
# is written explicitly as UTF-8 instead of relying on the platform default
# encoding, which may be unable to encode the text.
with open("temp_script_by_chapter.json", "w", encoding="utf-8") as file:
    json.dump(script_by_chapter, file, indent=2, ensure_ascii=False)


# Write note by summarizing contents


In [None]:
import os
import llm


# Folder holding the per-chapter transcripts (text/<index>.txt).
text_file_folder_path = os.path.join(os.getcwd(), 'text')

# Folder receiving the per-chapter summaries (note/<index>.txt).
note_file_folder_path = os.path.join(os.getcwd(), 'note')
os.makedirs(note_file_folder_path, exist_ok=True)


# Summarize each chapter. `enumerate` alone drives `index`; no manual counter is needed.
for index, c in enumerate(script_by_chapter):

    # Read the transcript back from its file. No explicit encoding is passed
    # so the file is decoded with the same platform default it was written with.
    with open(os.path.join(text_file_folder_path, f'{index}.txt'), "r") as file:
        transcript = file.read()

    # Log
    title = c["title"]
    print( f"Chapter {title} is processing..")

    # Split the transcript into chunks small enough to summarize one at a time.
    texts = text_splitter.split_text(transcript)

    # Summarize every chunk and concatenate the partial summaries in order.
    partial_summaries = []
    for t in texts:
        partial_summaries.append(llm.rectify_and_summarize_text(t))
        print(".", end="")
    summarized_text = "".join(partial_summaries)

    c["summary"] = summarized_text

    # Save the note as UTF-8 so non-ASCII summaries are written reliably
    # regardless of the platform default encoding.
    text_file = os.path.join(note_file_folder_path, f'{index}.txt')
    with open(text_file, "w", encoding="utf-8") as file:
        file.write(summarized_text)

    print('\n')



## Publish markdown document

This step writes all the content into `markdown_note.md`, which is the summarized note for this YouTube video.

In [None]:
import os

import json  # imported here so this cell does not depend on an earlier cell's import

# Save the final chapter data (title, script, summary) first, as UTF-8 because
# the JSON keeps non-ASCII characters as-is (ensure_ascii=False).
with open("script_by_chapter.json", "w", encoding="utf-8") as file:
    json.dump(script_by_chapter, file, indent=2, ensure_ascii=False)

# Remove the temporary checkpoint only after the final file has been written.
# It may already be gone (e.g. when this cell is re-run), which is fine.
try:
    os.remove("temp_script_by_chapter.json")
except FileNotFoundError:
    pass


In [None]:
# Build the markdown note: one top-level section per chapter, each with a
# "Summary" and a "Script" subsection.
markdown_sections = []

for c in script_by_chapter:
    markdown_sections.append(
        f"# {c['title']} \n\n"
        "## Summary \n"
        f"{c['summary']} \n\n"
        "## Script \n\n"
        f"{c['script']} \n"
        "\n\n"
    )

full_markdown_text = "".join(markdown_sections)

In [None]:

# Write the markdown note. The document contains non-ASCII text, so it is
# written explicitly as UTF-8 instead of relying on the platform default encoding.
with open("markdown_note.md", "w", encoding="utf-8") as file:
    file.write(full_markdown_text)