# --- Input custom variables here --- 

In [1]:
# Playlist passed to get_transcriptions_from_youtube_playlist in the "Run tests" section.
PLAYLIST_URL = "https://www.youtube.com/playlist?list=PL9zq2zalZB1JRZsPtYeFVQAZkrDbZx3Qw" # My Learning Playlist 
# Single video passed to get_transcription_from_youtube_url in the "Run tests" section.
TEST_URL = "https://www.youtube.com/watch?v=vaUy6zyJfwU" # 1 minute video 

# Forwarded as the `model` input of the Whisper prediction made in transcribe_audio.
WHISPER_MODEL_SIZE = "tiny"

# --- How to use --- 

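Export this notebook as `transcribe_youtube.py`, then import the key functions from that module. The code reads `REPLICATE_API_KEY` and `REPLICATE_MODEL_VERSION` from the environment (or a `.env` file), so set both before importing.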



### example usage 

In your Python script/notebook: 

```
from transcribe_youtube import get_transcription_from_youtube_url, get_transcriptions_from_youtube_playlist

PLAYLIST_URL = "https://www.youtube.com/playlist?list=PL9zq2zalZB1JRZsPtYeFVQAZkrDbZx3Qw" # My Learning Playlist 
TEST_URL = "https://www.youtube.com/watch?v=vaUy6zyJfwU" # 1 minute video 

# get transcription for a single video 
single_video_transcription = get_transcription_from_youtube_url(TEST_URL)

# get transcriptions for all videos in a playlist
playlist_transcriptions = get_transcriptions_from_youtube_playlist(PLAYLIST_URL)

```


### return object
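`get_transcription_from_youtube_url` returns a dict with `title`, `url`, and `transcription`; `get_transcriptions_from_youtube_playlist` returns a list of such dicts, one per video.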


Example: 
```
{'title': 'Deep Learning Maps Animal Movement',
 'url': 'https://www.youtube.com/watch?v=vaUy6zyJfwU',
 'transcription': ' We developed a new type of 3D deep learning approach that can take in normal color videos of behaving animals and behaving humans and then output the precise 3D locations of body landmarks so skeletal joints that you can track over time and thus provide a comprehensive description of how subjects are moving. This is a huge leap forward compared to a traditional motion capture system in which subjects need to wear highly invasive markers on the body. And then another big issue with motion captures that it requires that you have a clear line of sights to these markers and in a deep learning-based approach that we develop, we relieve this requirement.'}
 ```

# --- Run Notebook --- 

In [2]:
# !pip install replicate
# !pip install pytube
# !pip install flask 

In [3]:
import pandas as pd
from pytube import YouTube, Playlist 

### Load Replicate's Whisper API 

In [4]:
import os
from dotenv import load_dotenv
load_dotenv()

# Settings are read from the environment after load_dotenv() has run.
# NOTE: the token variable is filled from the env var named REPLICATE_API_KEY
# (not REPLICATE_API_TOKEN). os.getenv yields None when a variable is unset.
REPLICATE_API_TOKEN = os.getenv("REPLICATE_API_KEY")
# Version identifier handed to model.versions.get(...) when the client is set up.
REPLICATE_MODEL_VERSION = os.getenv("REPLICATE_MODEL_VERSION")

In [5]:
# Create a Replicate client and resolve the Whisper model version.
# `version` is the module-level handle that transcribe_audio calls .predict() on.
import replicate
# Client authenticated with the token read from the environment.
client = replicate.Client(api_token=REPLICATE_API_TOKEN)
# Look up the "openai/whisper" model, then the specific version configured
# via REPLICATE_MODEL_VERSION.
model = client.models.get("openai/whisper")
version = model.versions.get(REPLICATE_MODEL_VERSION)

In [6]:
def transcribe_audio(audio_url, language="en", model_size=None):
    """Run a Whisper prediction on the audio found at ``audio_url``.

    Args:
        audio_url: URL of the audio; forwarded as the ``audio`` input.
        language: Language code forwarded as the ``language`` input.
            Defaults to ``"en"``.
        model_size: Whisper model size forwarded as the ``model`` input.
            When ``None`` (the default) the module-level
            ``WHISPER_MODEL_SIZE`` is used.

    Returns:
        Whatever ``version.predict`` returns. Callers in this notebook read
        its ``'transcription'`` entry.
    """
    if model_size is None:
        # Resolved at call time so that re-assigning WHISPER_MODEL_SIZE in the
        # config cell takes effect without redefining this function.
        model_size = WHISPER_MODEL_SIZE

    return version.predict(
        audio=audio_url,
        language=language,
        model=model_size,
    )

In [7]:
def get_mp3_url_from_youtube(youtube_url):
    """Return the direct URL of an audio-only stream of a YouTube video.

    Despite the function name, nothing here converts to MP3: the URL is that
    of the first stream returned by the ``only_audio=True`` filter.

    Args:
        youtube_url: URL of the YouTube video.

    Returns:
        The ``url`` attribute of the first audio-only stream.

    Raises:
        ValueError: If the video has no audio-only stream.
    """
    audio_stream = YouTube(youtube_url).streams.filter(only_audio=True).first()
    if audio_stream is None:
        # .first() gives None for an empty query; fail with a clear message
        # rather than an AttributeError on `.url`.
        raise ValueError(f"No audio-only stream found for {youtube_url!r}")
    return audio_stream.url

In [8]:
# the main function 
import time 

def get_raw_transcription_from_youtube_url(youtube_url):
    """Return the transcription text for a single YouTube video.

    Resolves an audio stream URL with ``get_mp3_url_from_youtube``, sends it
    to ``transcribe_audio`` and returns the ``'transcription'`` entry of the
    response.

    Args:
        youtube_url: URL of the YouTube video.

    Returns:
        The value stored under ``'transcription'`` in the Whisper response.
    """
    audio_url = get_mp3_url_from_youtube(youtube_url)
    whisper_response = transcribe_audio(audio_url)
    return whisper_response['transcription']


In [9]:
def get_transcription_from_youtube_url(youtube_url):
    """Build the result record for one YouTube video.

    The video title is read from a ``YouTube`` object and the text comes from
    ``get_raw_transcription_from_youtube_url``.

    Returns:
        dict with the keys ``title``, ``url`` and ``transcription``.
    """
    # Values are evaluated top to bottom: title lookup first, then transcription.
    return {
        "title": YouTube(youtube_url).title,
        "url": youtube_url,
        "transcription": get_raw_transcription_from_youtube_url(youtube_url),
    }


In [10]:
# TESTING 
# # create an mp3 object from the youtube video

# mp3 = YouTube(TEST_URL).streams.filter(only_audio=True).first()
# print (mp3.url)

# whisperresponse = transcribe_audio(mp3.url)
# trans = whisperresponse['transcription']
# trans

In [11]:
# trans = get_transcription_from_youtube_url(TEST_URL)
# trans

# Run transcription on an entire YouTube Playlist 

### playlist helpers 

In [12]:
def get_urls_from_youtube_playlist(playlist_url):
    """Return the video URLs of the YouTube playlist at ``playlist_url``.

    The value is the ``video_urls`` attribute of a ``Playlist`` built from
    the given URL.
    """
    return Playlist(playlist_url).video_urls

# print (get_urls_from_youtube_playlist(PLAYLIST_URL))

### main function: youtube playlist -> list of transcription dicts

In [18]:
# transcribe every video in a playlist

def get_transcriptions_from_youtube_playlist(playlist_url):
    """Transcribe every video of a YouTube playlist.

    Side effects:
        - Writes the results to ``<playlist title>_transcriptions.csv`` in the
          current working directory (characters that are invalid in file
          names are replaced by ``_``).
        - Prints the elapsed time in seconds.

    Args:
        playlist_url: URL of the playlist to transcribe.

    Returns:
        A list with one dict per video, each holding the keys
        ``title``, ``url`` and ``transcription``.
    """
    start_time = time.time()

    # Use the argument, not the module-level PLAYLIST_URL, so that any
    # playlist can be transcribed. The Playlist object is built once and
    # reused for both the title and the video URLs.
    playlist = Playlist(playlist_url)
    video_urls = list(playlist.video_urls)
    playlist_title = playlist.title

    # One result record per video, in playlist order.
    transcriptions = [get_transcription_from_youtube_url(url) for url in video_urls]

    # OPTIONAL: save to a csv file named after the playlist. Titles may hold
    # characters that are not allowed in file names, so those are replaced.
    unsafe_chars = str.maketrans({char: "_" for char in '/\\:*?"<>|'})
    csv_name = f"{playlist_title.translate(unsafe_chars)}_transcriptions.csv"
    pd.DataFrame(transcriptions).to_csv(csv_name, index=False)

    # log the time taken, round to 2 decimal places
    print (f"\n===\nTime taken to transcribe Playlist '{playlist_title}' (sec): \n", round(time.time() - start_time, 2), "\n===")

    return transcriptions


# --- Run tests ---

In [14]:
# Transcribe the single TEST_URL video and display the resulting dict.
# This performs a live YouTube lookup and a Replicate prediction.
single_video_transcription = get_transcription_from_youtube_url(TEST_URL)
single_video_transcription

{'title': 'Deep Learning Maps Animal Movement',
 'url': 'https://www.youtube.com/watch?v=vaUy6zyJfwU',
 'transcription': ' We developed a new type of 3D deep learning approach that can take in normal color videos of behaving animals and behaving humans and then output the precise 3D locations of body landmarks so skeletal joints that you can track over time and thus provide a comprehensive description of how subjects are moving. This is a huge leap forward compared to a traditional motion capture system in which subjects need to wear highly invasive markers on the body. And then another big issue with motion captures that it requires that you have a clear line of sights to these markers and in a deep learning-based approach that we develop, we relieve this requirement.'}

In [19]:
# Transcribe the PLAYLIST_URL playlist and display the results.
# The call also writes a CSV file and prints the elapsed time.
playlist_transcriptions = get_transcriptions_from_youtube_playlist(PLAYLIST_URL)
playlist_transcriptions


===
Time taken to transcribe Playlist 'My Learning Playlist' (sec): 
 127.51 
===


[{'title': 'Deep Learning Maps Animal Movement',
  'url': 'https://www.youtube.com/watch?v=vaUy6zyJfwU',
  'transcription': ' We developed a new type of 3D deep learning approach that can take in normal color videos of behaving animals and behaving humans and then output the precise 3D locations of body landmarks so skeletal joints that you can track over time and thus provide a comprehensive description of how subjects are moving. This is a huge leap forward compared to a traditional motion capture system in which subjects need to wear highly invasive markers on the body. And then another big issue with motion captures that it requires that you have a clear line of sights to these markers and in a deep learning-based approach that we develop, we relieve this requirement.'},
 {'title': "First look: Stable Diffusion's Top 10 Datasets",
  'url': 'https://www.youtube.com/watch?v=CcKJD7PUaZg',
  'transcription': " I don't think I've ever done a one minute video before. We're wasting time. 