
# Generate a transcript from a YouTube video 
- Summarize the YouTube video's script by the chapters its creator configured. 
  - Create `markdown_note.md` with the script and summary.
- Uses the [yt-dlp](https://pypi.org/project/yt-dlp/), [pydub](https://pypi.org/project/pydub/), [OpenAI-Whisper](https://pypi.org/project/openai-whisper/), [langchain](https://github.com/hwchase17/langchain), and [OpenAI](https://github.com/openai/openai-python) packages. 


## Input variables

In [1]:
# YouTube video ID (the value of the `v=` parameter in the watch URL).
youtube_video_id = "Qc6tYyhYZMg"

# Language code of the video's speech / transcript ("ko" = Korean).
# NOTE(review): not referenced by the cells visible here - confirm where it is consumed.
language = "ko"


# There is no official way to fetch the chapters automatically,
# so copy and paste the "timestamp title" lines from the video description.
# CAUTION: the 00:00 timestamp must be present!
chapter_part_in_description = """
00:00 시작
00:01  윤석열은 도대체 왜 이러는 걸까?
12:39  나는 왜 유튜브를 하려고 하는가? 왜 하필 라이브(live)로.
24:04 냉철한 사랑, 즉 조직론적 사랑에 대하여
40:46 조직론적 사랑은 논문표절을 어떻게 처리할까?

"""

# Known mis-transcriptions and their corrections, one "wrong -> right" pair
# per line. Passed as a hint to the LLM clean-up step.
# Each pair is listed once; repeated entries only lengthen the prompt.
hint_to_fix = """
총화대에서 -> 청와대에서
문소 -> 문서
사회 통염 -> 사회통념
포난라서 -> 퍼날라서
학교 좀 -> 합격점
조직설리 -> 조직설계
있어 빌리티만 -> 있어빌리티만
문제인 -> 문재인
순회부가 -> 수뇌부가
집에 구조 -> 지배구조
구수도 -> 고스톱
민주공항 -> 민주공화국
앵글로스엑스 -> 앵글로색슨
게흐만모형 -> 게르만 모형
다른 100년 -> 다른백년
"""

In [2]:
# There is no official way to get the chapters automatically,
# so we parse the text pasted from the description into a list of tuples:
# [ (time_in_sec, chapter_title) ]
import re 
# A chapter entry is "<timestamp><one whitespace char><title>", where the
# timestamp is digits followed by one or two ":digits" groups (MM:SS or HH:MM:SS).
pattern = r'(\d+(:\d+){1,2})\s(.+)'
# Because the pattern has three groups, findall() returns one 3-tuple per match:
# (full timestamp, last ":digits" group, rest of the line).
matches = re.findall(pattern, chapter_part_in_description)

def time_to_seconds(time):
    """Convert a colon-separated timestamp into a total number of seconds.

    Accepts "SS", "MM:SS" or "HH:MM:SS". Only the last three fields are
    read; any fields further to the left are ignored.
    """
    total = 0
    # Fold left-to-right in base 60: hours -> minutes -> seconds.
    for field in time.split(':')[-3:]:
        total = total * 60 + int(field)
    return total

# Turn each regex match into (start_in_seconds, title). The middle slot of a
# match is the inner ":digits" group and is not needed here.
chapters = [
    (time_to_seconds(timestamp), heading.strip())
    for timestamp, _, heading in matches
]


# Build up note with chapter and script under each chapter 

In [3]:
import os
import yt_dlp

# Download youtube video and extract audio file. 
def download(video_id: str) -> str:
    """Fetch a YouTube video's audio and store it as ``audio/<video_id>.mp3``.

    Args:
        video_id: The YouTube video ID (the ``v=`` value of the watch URL).

    Returns:
        The path of the freshly created MP3, or an empty string when the MP3
        already exists and nothing was downloaded.

    Raises:
        Exception: If yt-dlp reports a non-zero status code.
    """
    mp3_path = f'audio/{video_id}.mp3'

    # Already extracted on an earlier run: signal "nothing new" with ''.
    if os.path.exists(mp3_path):
        return ""

    # Post-processing step that converts the fetched stream to MP3 via FFmpeg.
    extract_mp3 = {
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'mp3',
    }
    options = {
        'format': 'm4a/bestaudio/best',
        'paths': {'home': 'audio/'},
        'outtmpl': {'default': '%(id)s.%(ext)s'},
        'postprocessors': [extract_mp3],
    }
    url = f'https://www.youtube.com/watch?v={video_id}'

    with yt_dlp.YoutubeDL(options) as ydl:
        if ydl.download([url]) != 0:
            raise Exception('Failed to download video')

    return mp3_path

# download() returns '' when audio/<id>.mp3 already existed, otherwise the MP3 path.
file_path = download(youtube_video_id)



[youtube] Extracting URL: https://www.youtube.com/watch?v=Qc6tYyhYZMg
[youtube] Qc6tYyhYZMg: Downloading webpage
[youtube] Qc6tYyhYZMg: Downloading ios player API JSON
[youtube] Qc6tYyhYZMg: Downloading android player API JSON
[youtube] Qc6tYyhYZMg: Downloading m3u8 information
[info] Qc6tYyhYZMg: Downloading 1 format(s): 140
[download] audio/Qc6tYyhYZMg.m4a has already been downloaded
[download] 100% of   75.07MiB
[ExtractAudio] Destination: audio/Qc6tYyhYZMg.mp3
Deleting original file audio/Qc6tYyhYZMg.m4a (pass -k to keep)


In [4]:
# split audio file
import os
import time
from pydub import AudioSegment

# download() hands back '' when the MP3 was already on disk, so an empty path
# means there is nothing new to split. (The former
# `os.path.exists(file_path) or file_path != ''` test reduced to exactly this
# check, because os.path.exists('') is always False.)
if file_path != '':
    audio_data = AudioSegment.from_mp3(file_path)

    for i, (current_time_in_sec, current_title) in enumerate(chapters):
        # A chapter ends where the next one begins; the final chapter has no
        # successor and therefore runs to the end of the recording.
        next_time_in_sec = chapters[i + 1][0] if i + 1 < len(chapters) else None

        # pydub slices AudioSegment objects by milliseconds.
        current_time_in_ms = current_time_in_sec * 1000

        # Compare against None explicitly: a boundary at 0 seconds is falsy
        # but still a real boundary, not "no next chapter".
        if next_time_in_sec is not None:
            next_time_in_ms = next_time_in_sec * 1000
            splitted_audio_data = audio_data[current_time_in_ms:next_time_in_ms]
        else:
            splitted_audio_data = audio_data[current_time_in_ms:]

        # Chunks are named by chapter index: audio/0.mp3, audio/1.mp3, ...
        splitted_audio_data.export(f'audio/{i}.mp3', format="mp3")

In [5]:
# Transcribe the text from audio files.
import os
from datetime import datetime
from dotenv import load_dotenv

import whisper
import llm

# Setup OpenAI API key 
# Load environment variables from a local .env file
# (presumably the OpenAI API key used by the `llm` helper - confirm).
load_dotenv()

# Transcripts are stored as <cwd>/text/<chapter index>.txt.
text_file_folder_path = os.path.join(os.getcwd(), 'text')
# exist_ok=True creates the folder when missing and is a no-op when it is
# already there, without a separate check-then-create step.
os.makedirs(text_file_folder_path, exist_ok=True)

# When True, each raw transcript is additionally run through the LLM clean-up step.
clear_text: bool = False

# You can adjust the model used here. Model choice is typically a tradeoff between accuracy and speed.
# All available models are located at https://github.com/openai/whisper/#available-models-and-languages.
whisper_model = whisper.load_model("small")

# One {"title", "script", "summary"} dict per chapter, in chapter order.
script_by_chapter = []
def transcribe(file_path: str) -> str:
    """Run the loaded Whisper model on one audio file and return its text.

    Args:
        file_path: Path of the audio file to transcribe.

    Returns:
        The value stored under the ``'text'`` key of Whisper's result.
    """
    # fp16 defaults to True (half precision, intended for GPU). False selects
    # FP32 so the model can run on the CPU for local use.
    result = whisper_model.transcribe(file_path, fp16=False)
    text = result['text']  # type: ignore
    return text

# Transcribe every chapter chunk (audio/<i>.mp3), caching each result in
# text/<i>.txt so that an interrupted run can resume without redoing work.
for i, (_, current_title) in enumerate(chapters):
    print( f'{datetime.now()} : {current_title} is transcripting... \n' )

    audio_file_path = os.path.join(os.getcwd(), 'audio', f'{i}.mp3')
    text_file = os.path.join(text_file_folder_path, f'{i}.txt')

    if os.path.exists(text_file):
        # Reuse the transcript saved by an earlier run instead of recording an
        # empty script for this chapter.
        try:
            with open(text_file, encoding="utf-8") as file:
                script = file.read()
        except UnicodeDecodeError:
            # Cache file written by an older run with the platform default encoding.
            with open(text_file) as file:
                script = file.read()
    else:
        transcript = transcribe(audio_file_path)

        if clear_text:
            cleaned_transcript = llm.clean_up_sentence_punctuation_and_fix_errors(
                transcript,
                hint_to_fix,
                chunk_size=200,
                verbose=True,
            )

            # Keep the raw and the cleaned text side by side. The leading
            # whitespace inside this literal is part of the stored text.
            merged_text = f"""
        {transcript}

        --------

        {cleaned_transcript}
        """
            script = merged_text
        else:
            script = transcript

        # Save the transcript; UTF-8 is explicit because the text is not ASCII.
        with open(text_file, "w", encoding="utf-8") as file:
            file.write(script)

    chapter_data = {
        "title": current_title,
        "script": script,
        "summary": "",
    }
    script_by_chapter.append(chapter_data)



  @numba.jit


2024-01-25 09:32:05.388970 : 시작 is transcripting... 



KeyboardInterrupt: 

In [None]:

# Temporarily save the data to a file 
import os 
import json 

# ensure_ascii=False emits non-ASCII text (e.g. the Korean chapter titles)
# verbatim, so the file is opened as UTF-8 explicitly instead of relying on
# the platform's default encoding.
with open("temp_script_by_chapter.json", "w", encoding="utf-8") as file:
    json.dump(script_by_chapter, file, indent=2, ensure_ascii=False)
