# Process YouTube subtitles

## Dependencies

In [11]:
import json
import os

import pandas as pd
import polars as pl

import tidder.polars as tpl
from tidder.data import Captions, Video

## Constants

In [3]:
# Root directory holding the downloaded channel data.
# The default is the original author's local checkout; set the
# TIDDER_DATA_PATH environment variable to run this notebook on another
# machine without editing the cell.
DATA_PATH = os.environ.get(
    "TIDDER_DATA_PATH",
    "/Users/leonardoschettini/Documents/personal/tidder/data/",
)

DATA_PATH

'/Users/leonardoschettini/Documents/personal/tidder/data/'

In [4]:
# Folder (below DATA_PATH) of the single video used as the sample in the cells
# that follow. The literal is split only to keep the line short; adjacent
# string literals are concatenated by Python into the same path as before.
SAMPLE_DATA_PATH = os.path.join(
    DATA_PATH,
    (
        "Dan Koe/channel/"
        "20231105 - The Greatest Skill Of The 21st Century (The Top 1% Exploit This)"
    ),
)

SAMPLE_DATA_PATH

'/Users/leonardoschettini/Documents/personal/tidder/data/Dan Koe/channel/20231105 - The Greatest Skill Of The 21st Century (The Top 1% Exploit This)'

In [5]:
# Names of the per-video files this notebook reads; each one is joined onto
# SAMPLE_DATA_PATH further down.
# WebVTT caption track; presumably the automatically generated one — TODO confirm.
AUTO_SUBTITLE_FILE_NAME = "auto-subtitle.en.vtt"
# WebVTT caption track for the en-US language code.
EN_US_SUBTITLE_FILE_NAME = "subtitle.en-US.vtt"
# Video metadata stored as JSON (loaded with json.load later in the notebook).
VIDEO_INFO_FILE_NAME = "video.info.json"

In [6]:
# Full paths of the sample video's two caption tracks and its metadata file,
# built by joining each file name onto the sample folder.
(
    SAMPLE_AUTO_SUBTITLE_FILE,
    SAMPLE_EN_US_SUBTITLE_FILE,
    SAMPLE_VIDEO_INFO_FILE,
) = (
    os.path.join(SAMPLE_DATA_PATH, file_name)
    for file_name in (
        AUTO_SUBTITLE_FILE_NAME,
        EN_US_SUBTITLE_FILE_NAME,
        VIDEO_INFO_FILE_NAME,
    )
)

In [7]:
# Peek at the raw auto-subtitle WebVTT file to see its on-disk format.
# The encoding is passed explicitly: WebVTT files are UTF-8, whereas a bare
# `open()` decodes with the platform's locale encoding and can mis-read or
# fail on non-ASCII captions.
with open(SAMPLE_AUTO_SUBTITLE_FILE, encoding="utf-8") as f:
    print(f.read())

WEBVTT
Kind: captions
Language: en

00:00:00.120 --> 00:00:02.389 align:start position:0%
 
the<00:00:00.240><c> greatest</c><00:00:00.599><c> skill</c><00:00:00.960><c> of</c><00:00:01.079><c> the</c><00:00:01.240><c> 21st</c><00:00:01.760><c> century</c>

00:00:02.389 --> 00:00:02.399 align:start position:0%
the greatest skill of the 21st century
 

00:00:02.399 --> 00:00:04.749 align:start position:0%
the greatest skill of the 21st century
is<00:00:02.600><c> not</c><00:00:03.000><c> email</c><00:00:03.360><c> marketing</c><00:00:03.919><c> it</c><00:00:04.040><c> is</c><00:00:04.160><c> not</c><00:00:04.400><c> graphic</c>

00:00:04.749 --> 00:00:04.759 align:start position:0%
is not email marketing it is not graphic
 

00:00:04.759 --> 00:00:06.909 align:start position:0%
is not email marketing it is not graphic
design<00:00:05.319><c> it</c><00:00:05.440><c> is</c><00:00:05.560><c> not</c><00:00:05.759><c> web</c><00:00:06.000><c> design</c><00:00:06.480><c> it</c><00:00:06.600><

## Load captions and video metadata

In [8]:
# Parse the auto-subtitle WebVTT file with the project's `Captions` class
# (imported from tidder.data).
auto_captions = Captions.from_file(SAMPLE_AUTO_SUBTITLE_FILE)

# Preview the parsed frame; the stored output shows `start`/`end` columns (f64)
# and a `text` column (str). NOTE(review): start/end look like seconds — confirm.
auto_captions.df.head()

start,end,text
f64,f64,str
0.12,4.749,"""the greatest s…"
2.399,6.909,"""is not email m…"
4.759,8.549,"""design it is n…"
6.919,11.07,"""artificial int…"
8.559,14.43,"""personal brand…"


In [9]:
# Parse the en-US subtitle WebVTT file with the same `Captions.from_file`
# loader used for the auto-subtitle track.
captions = Captions.from_file(SAMPLE_EN_US_SUBTITLE_FILE)

# Preview the parsed frame; the stored output shows `start`/`end` columns (f64)
# and a `text` column (str).
captions.df.head()

start,end,text
f64,f64,str
0.041,3.837,"""The greatest s…"
3.878,5.255,"""It is not grap…"
5.255,6.423,"""It is not web …"
6.423,8.133,"""It is not arti…"
8.133,9.426,"""It is not pers…"


In [12]:
# Load the video metadata into a plain dict.
# JSON text is UTF-8, so the encoding is given explicitly instead of relying
# on the platform's locale default.
with open(SAMPLE_VIDEO_INFO_FILE, encoding="utf-8") as f:
    video_info = json.load(f)

In [55]:
# Wrap the metadata dict in a one-element list so it is loaded as a single
# record, with the dict's keys becoming the column names.
video_info_df = pl.DataFrame([video_info])

# NOTE(review): the stored output of this cell is a `dict_keys([...])` repr,
# which looks like the result of `video_info.keys()` rather than of a
# DataFrame's `.columns` — the cell was probably edited after it last ran.
# Re-run it to refresh the output.
video_info_df.columns

dict_keys(['id', 'title', 'formats', 'thumbnails', 'thumbnail', 'description', 'channel_id', 'channel_url', 'duration', 'view_count', 'age_limit', 'webpage_url', 'categories', 'tags', 'playable_in_embed', 'live_status', '_format_sort_fields', 'automatic_captions', 'subtitles', 'comment_count', 'chapters', 'heatmap', 'channel', 'channel_follower_count', 'uploader', 'uploader_id', 'uploader_url', 'upload_date', 'availability', 'webpage_url_basename', 'webpage_url_domain', 'extractor', 'extractor_key', 'playlist_count', 'playlist', 'playlist_id', 'playlist_title', 'playlist_uploader', 'playlist_uploader_id', 'n_entries', 'playlist_index', 'display_id', 'fulltitle', 'duration_string', 'is_live', 'was_live', 'epoch', 'format', 'format_id', 'ext', 'protocol', 'language', 'format_note', 'filesize_approx', 'tbr', 'width', 'height', 'resolution', 'fps', 'dynamic_range', 'vcodec', 'vbr', 'aspect_ratio', 'acodec', 'abr', 'asr', 'audio_channels', '_type', '_version'])

In [14]:
# Metadata fields worth inspecting, listed in the order they are displayed.
relevant_columns = [
    "title", "formats", "description", "duration",
    "categories", "tags", "playable_in_embed",
    "automatic_captions", "subtitles", "chapters", "heatmap",
    "upload_date", "availability",
    "playlist_title", "playlist_index",
    "fulltitle", "language", "filesize_approx",
]

In [15]:
# Render the selected metadata through pandas rather than Polars: Polars seems
# to have no way to constrain overall cell width for all data types
# (https://stackoverflow.com/q/75786523/7454638). A
# `pl.Config(tbl_cols=len(relevant_columns), fmt_str_lengths=20)` block was
# tried and abandoned for that reason.
#
# The columns are selected in Polars *before* converting, so only the relevant
# fields are materialised as pandas objects instead of the whole frame.
with pd.option_context("display.max_columns", None):
    display(video_info_df.select(relevant_columns).to_pandas())

Unnamed: 0,title,formats,description,duration,categories,tags,playable_in_embed,automatic_captions,subtitles,chapters,heatmap,upload_date,availability,playlist_title,playlist_index,fulltitle,language,filesize_approx
0,The Greatest Skill Of The 21st Century (The To...,"[{'format_id': 'sb2', 'format_note': 'storyboa...",Download this PDF and emulate it while you are...,1647,[Education],"[self improvement, mindset, personal developme...",True,{'en-US': [{'url': 'https://manifest.googlevid...,"{'en-US': [{'ext': 'json3', 'url': 'https://ww...","[{'start_time': 0.0, 'title': 'What Is The Gre...","[{'start_time': 0.0, 'end_time': 16.47, 'value...",20231105,public,Dan Koe - Videos,4,The Greatest Skill Of The 21st Century (The To...,en,27050826


In [38]:
# Annotate each caption row with two columns produced by
# `tpl.time_based_replace` from the video metadata: `chapters` (from each
# chapter's "title") and `heatmap` (from each heatmap entry's "value", with 0
# as the default).
#
# The frame comes from `captions.df`: no `captions_df` variable is defined in
# the cells above, so the previous spelling only worked with leftover kernel
# state and raised NameError on a fresh Restart & Run All.
#
# NOTE(review): `taget_column` is spelled exactly as the original call passed
# it; it appears to be the keyword `time_based_replace` accepts — confirm in
# tidder.polars before renaming it here.
(
    captions.df.with_columns(
        tpl.time_based_replace(
            video_info["chapters"], info_value_column="title", taget_column="chapters"
        )
    ).with_columns(
        tpl.time_based_replace(
            video_info["heatmap"],
            info_value_column="value",
            taget_column="heatmap",
            default_value=0,
        )
    )
)

start,end,text,chapters,heatmap
f64,f64,str,str,f64
0.041,3.837,"""The greatest s…","""What Is The Gr…",0.01439
3.878,5.255,"""It is not grap…","""What Is The Gr…",0.01439
5.255,6.423,"""It is not web …","""What Is The Gr…",0.01439
6.423,8.133,"""It is not arti…","""What Is The Gr…",0.01439
8.133,9.426,"""It is not pers…","""What Is The Gr…",0.01439
9.426,11.052,"""It is not vide…","""What Is The Gr…",0.01439
11.052,14.848,"""or photography…","""What Is The Gr…",0.01439
14.889,16.349,"""It is the base…","""What Is The Gr…",0.01439
16.349,18.351,"""of all of thes…","""What Is The Gr…",0.0
18.351,20.437,"""So the greates…","""What Is The Gr…",0.0
