# setup package and prepare working directories

In [1]:
!pip install pytube
!pip install yt_dlp

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import datetime
import os
import pandas as pd
import pytube as pt
from tqdm.notebook import tqdm
import yt_dlp

In [3]:
gdrive_home_path = "/content/drive"
from google.colab import drive
drive.mount(gdrive_home_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
DIR_NAME = "uraradi_transcript"

work_dir = os.path.join(gdrive_home_path, "MyDrive", DIR_NAME)
work_sounds_dir = os.path.join(work_dir, "sounds")
work_transcripts_dir = os.path.join(work_dir, "transcripts")

os.makedirs(work_dir, exist_ok=True)
os.makedirs(work_sounds_dir, exist_ok=True)
os.makedirs(work_transcripts_dir, exist_ok=True)

# load playlist and transform to dataframe

In [5]:
playlist_url = "https://youtube.com/playlist?list=PLShwbdwZFm3r77Bwrr1quz2CpqJc6BZVL"
pl = pt.Playlist(url=playlist_url)

ydl_opts = {
    "ignore_no_formats_error": True,
    "quiet": True
}

radioinfo_columns = ["url", "title", "date", "length_s"]
radioinfo_list = []
for url in tqdm(pl.video_urls):
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=False)
    if "duration" in info.keys():   # skip if info["duration"] is not defined, that is, youtubelive will begin in future.
        title = info["title"]
        publish_date = str(info["release_date"][0:4]).zfill(4) + "-" +\
                       str(info["release_date"][4:6]).zfill(2) + "-" +\
                       str(info["release_date"][6:8]).zfill(2)  #yyyymmdd to yyyy-mm-dd
        length = info["duration"]
        videoinfo = [url, title, publish_date, length]
        radioinfo_list.append(videoinfo)
df = pd.DataFrame(radioinfo_list, columns=radioinfo_columns)
df.sort_values(by="date", ascending=True).reset_index(drop=True)
df.head()

  0%|          | 0/80 [00:00<?, ?it/s]



Unnamed: 0,url,title,date,length_s
0,https://www.youtube.com/watch?v=YCvPUXbebzg,ラジオネームChatGPT｜#74.5 裏ラジオウルナイト｜大浦るかこ // あにまーれ,2023-03-10,2766
1,https://www.youtube.com/watch?v=o61S1MnpG9c,熱が冷めるとき｜#74 裏ラジオウルナイト｜大浦るかこ // あにまーれ,2023-03-03,6722
2,https://www.youtube.com/watch?v=DjD9aKYtVz0,【裏ラジ#73】母校の様子が変 / 裏ラジオウルナイト【大浦るかこ / あにまーれ】,2023-02-24,6272
3,https://www.youtube.com/watch?v=MpPw4htq3ZE,【裏ラジ#72】葬式で流す曲 / 裏ラジオウルナイト【龍ヶ崎リン・大浦るかこ / あにまーれ】,2023-02-17,8647
4,https://www.youtube.com/watch?v=5M9wGdW7ko0,【裏ラジ#71】珈琲vs紅茶 / 裏ラジオウルナイト【ハヤシ・大浦るかこ / あにまーれ】,2023-02-10,6681


# export to csv

In [6]:
filename = "playlist_" + pl.title + ".csv"

df.to_csv(os.path.join(work_dir, filename), index=False)