
# Generate a transcript from a YouTube video
- Summarize the YouTube video's transcript chapter by chapter, using the chapters configured by the creator.
  - Creates `markdown_note.md` containing the script and summary.
- Uses the [yt-dlp](https://pypi.org/project/yt-dlp/), [pydub](https://pypi.org/project/pydub/), [OpenAI-Whisper](https://pypi.org/project/openai-whisper/), [langchain](https://github.com/hwchase17/langchain), and [OpenAI](https://github.com/openai/openai-python) packages.


## Install dependencies

In [None]:
# Install ffmpeg/libav (macOS, via Homebrew).
! brew install ffmpeg 

# For Linux (Debian/Ubuntu, via apt-get):
# apt-get install libav-tools libavcodec-extra ffmpeg

# Install the Python packages listed in requirements.txt
! pip install -r requirements.txt

## Input variables

In [None]:
# YouTube video ID: the value of the `v=` query parameter of the watch URL.
youtube_video_id="O8GLjpnW-cM"

# Language code for the video.
# NOTE(review): the original comment said "subscription"; presumably this means
# the subtitle/transcript language -- confirm where this value is consumed.
language = "ko"


# There is no official way to fetch the chapters automatically, so copy the
# timestamp/chapter lines from the video's description and paste them here,
# one "<timestamp> <title>" entry per line (timestamps as MM:SS or H:MM:SS).
# CAUTION: a 00:00 entry must be defined!
chapter_part_in_description = """
00:00 시작
05:00 댓글 읽어보기
31:35 조직을 어떻게 분석할 것인가? 인사조직론의 구조
36:16 앵글로색슨 모형의 피라미드형 계급구조 vs. 게르만 모형의 네트워크형 수평구조
44:07 대전환의 의미: 이것이 개혁이다
51:07 대의원제도의 폐해: 원내대표 박광온을 비롯한 수박들의 사고체계
1:09:16 경영플랫폼의 설계와 운용에서 가장 중요한 것은 무엇인가?
1:12:11 이재명의 민주당의 비전은 무엇인가? 기본사회를 향하여
1:14:12 기본사회라는 비전은 어떻게 전략으로 전환되는가?
1:24:12 부탁의 말씀
1:39:03 전략이란 무엇인가? 이니셔티브들의 조합
1:41:12 전략의 실행은 철학, 실력, 용기를 필요로 하며 자율성을 위한 분권화를 말한다
1:52:26 촛불혁명의 정신을 생각하자
1:53:58 질서와 자유를 조화시켰던 독일의 지식인들과 조화의 사상가 조소앙의 삼균주의
2:01:53 정리
"""

In [None]:
# There is no official way to fetch the chapters automatically,
# so parse the chapter text copied from the description into a list of
# (time_in_sec, chapter_title) tuples.
import re 
# A chapter line looks like "<timestamp> <title>", where the timestamp is
# MM:SS or H:MM:SS. Because the pattern has three capture groups, every match
# is a 3-tuple: (full timestamp, last ":NN" group, title).
pattern = r'(\d+(:\d+){1,2})\s(.+)'
matches = re.compile(pattern).findall(chapter_part_in_description)

def time_to_seconds(time):
    """Convert a ``SS``, ``MM:SS`` or ``H:MM:SS`` timestamp string to seconds.

    Only the last three colon-separated fields are used (seconds, minutes,
    hours, read from the right); any fields further left are ignored.
    """
    fields = time.split(':')
    # Pair fields from the right with their weight in seconds; zip() stops at
    # the shorter sequence, so missing minutes/hours simply contribute nothing.
    weights = (1, 60, 3600)
    return sum(int(value) * weight for value, weight in zip(reversed(fields), weights))

# Turn every regex match into a (start_time_in_seconds, chapter_title) pair;
# the middle capture group is only a by-product of the pattern and is unused.
chapters = [
    (time_to_seconds(timestamp), chapter_title.strip())
    for timestamp, _last_group, chapter_title in matches
]


# Build up the note with each chapter and the script under it

In [None]:
import os
import yt_dlp

# Download youtube video and extract audio file. 
def download(video_id: str) -> str:
    """Download a YouTube video's audio track as ``audio/<video_id>.mp3``.

    Args:
        video_id: The YouTube video ID (the ``v=`` value of the watch URL).

    Returns:
        The path of the MP3 that was just downloaded, or an empty string when
        the MP3 already existed and nothing was downloaded.

    Raises:
        Exception: If yt-dlp reports a non-zero error code.
    """
    mp3_path = f'audio/{video_id}.mp3'

    # A previous run already produced the file: signal "nothing new" with "".
    if os.path.exists(mp3_path):
        return ""

    options = {
        'format': 'm4a/bestaudio/best',
        'paths': {'home': 'audio/'},
        'outtmpl': {'default': '%(id)s.%(ext)s'},
        # Convert whatever audio format was fetched into MP3.
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
        }]
    }
    url = f'https://www.youtube.com/watch?v={video_id}'

    with yt_dlp.YoutubeDL(options) as ydl:
        if ydl.download([url]) != 0:
            raise Exception('Failed to download video')

    return mp3_path

# download() returns "" when the MP3 was already on disk, so file_path holds a
# real path only when a fresh download has just happened.
file_path = download(youtube_video_id)



In [None]:

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Upper bound for the length of one chunk, measured with len() (characters).
chunk_size = 800

# Splitter used to cut each chapter transcript into non-overlapping chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=0,
    length_function=len,
)


In [None]:
# split audio file
import os
import time
from pydub import AudioSegment

# Split the downloaded audio into one MP3 per chapter: audio/<chapter index>.mp3.
# file_path is '' when download() found an existing MP3, in which case the
# split is skipped.
# NOTE(review): os.path.exists('') is False, so this condition reduces to
# `file_path != ''`; the exists() test never changes the outcome.
if file_path != '' or os.path.exists(file_path):
    audio_data = AudioSegment.from_mp3(file_path)

    for chapter_index, (start_sec, _title) in enumerate(chapters):
        # A chapter ends where the next one starts; the last chapter has no successor.
        has_next = chapter_index + 1 < len(chapters)
        end_sec = chapters[chapter_index + 1][0] if has_next else None

        # The audio segment is sliced in milliseconds.
        start_ms = start_sec * 1000
        if end_sec:
            chapter_audio = audio_data[start_ms:end_sec * 1000]
        else:
            # No (truthy) end time: keep everything up to the end of the audio.
            chapter_audio = audio_data[start_ms:]

        chapter_audio.export(f'audio/{chapter_index}.mp3', format="mp3")

In [None]:
# Transcribe the text from audio files.
import os
from datetime import datetime
from dotenv import load_dotenv

import whisper
import llm

# Set up the OpenAI API key.
# NOTE(review): presumably this loads the key from a local .env file into the
# environment -- confirm which variable names are expected.
load_dotenv()

# Prepare the output folder for the per-chapter transcript text files.
# exist_ok=True replaces the separate existence check, so there is no window
# between checking for the directory and creating it.
text_file_folder_path = os.path.join(os.getcwd(), 'text')
os.makedirs(text_file_folder_path, exist_ok=True)


# You can adjust the model used here. Model choice is typically a tradeoff between accuracy and speed.
# All available models are listed at https://github.com/openai/whisper/#available-models-and-languages.
whisper_model = whisper.load_model("small")

# Accumulates one dict per chapter, in chapter order, with the keys
# "title", "script" and "summary".
script_by_chapter = []
def transcribe(file_path: str) -> str:
    """Transcribe one audio file with the loaded Whisper model.

    Args:
        file_path: Path of the audio file to transcribe.

    Returns:
        The ``'text'`` entry of the result returned by Whisper.
    """
    # fp16=False disables half precision so the model can run on the CPU;
    # Whisper's default (fp16=True) is aimed at GPU execution.
    result = whisper_model.transcribe(file_path, fp16=False)
    return result['text']  # type: ignore

# Transcribe every chapter's audio file, pass the transcript through
# llm.correct_text chunk by chunk, and store the result both in
# script_by_chapter and in text/<chapter index>.txt.
for i, (_start_sec, current_title) in enumerate(chapters):
    print( f'{datetime.now()} : {current_title} is transcripting... \n' )
    audio_file_path = os.path.join(os.getcwd(), 'audio', f'{i}.mp3')
    transcript = transcribe(audio_file_path)

    # Split the transcript into chunks with the configured text splitter.
    texts = text_splitter.split_text(transcript)

    # Run each chunk through llm.correct_text, printing one dot per chunk as a
    # progress indicator.
    # NOTE(review): presumably correct_text returns a cleaned-up version of the
    # chunk -- confirm against the llm module.
    corrected_parts = []
    for t in texts:
        corrected_parts.append(llm.correct_text(t))
        print(".", end="")
    print("\n")

    # Every chunk is followed by a blank line, including the last one.
    corrected_text = "".join(part + "\n\n" for part in corrected_parts)

    chapter_data = {
        "title": current_title,
        "script": corrected_text,
        "summary": "",  # left empty here
    }
    script_by_chapter.append(chapter_data)

    # Save the transcript file. The encoding is pinned to UTF-8 because the
    # script contains non-ASCII text and the platform default encoding may not
    # be able to represent it.
    text_file = os.path.join(text_file_folder_path, f'{i}.txt')
    with open(text_file, "w", encoding="utf-8") as file:
        file.write(chapter_data["script"])



In [None]:

# Temporarily save the data to a file
import os 
import json 

# Dump the per-chapter data as pretty-printed JSON. ensure_ascii=False writes
# non-ASCII characters as-is, so the file encoding is pinned to UTF-8 instead
# of relying on the platform default.
with open("temp_script_by_chapter.json", "w", encoding="utf-8") as file:
    json.dump(script_by_chapter, file, indent=2, ensure_ascii=False)
