In [None]:
#| default_exp core

# core

> Core functionality of `YoutubeTimestamper` 

In [None]:
# | hide
from nbdev.showdoc import *

In [None]:
%reload_ext lab_black

In [None]:
# | export
from urllib.parse import urlparse, parse_qs
from contextlib import suppress

from youtube_transcript_api import YouTubeTranscriptApi
from deepmultilingualpunctuation import PunctuationModel
import spacy
from spacy.lang.en import English
from fastcore.all import *
import datetime
import re

The program flow for generating timestamps is relatively simple:

1. Get the id from the youtube video url passed
2. Fetch the transcripts from the id and concatenate them
3. Use `deepmultilingualpunctuation` to restore punctuation
4. Use `spacy` to split into sentences
5. Get sentences that have `?` and match them up with the timestamps

In [None]:
# | export
class YoutubeTimestamper:
    """A class that extracts the transcript from a YouTube video, identifies the questions in it and attaches timestamps to them."""

    # Class-level attributes: created once when the class body executes and
    # shared by every instance (the methods access them via `YoutubeTimestamper.<name>`).
    punct_model = PunctuationModel()
    # spaCy English pipeline whose only added component is the `sentencizer`,
    # used to split the punctuated transcript into sentences.
    nlp = English()
    nlp.add_pipe("sentencizer")

    def __init__(self, video_url: str):  # A url of a YouTube Video
        # fastcore helper: makes `video_url` available as `self.video_url`.
        store_attr()

    # fastcore helper: repr shows the class name together with `video_url`.
    __repr__ = basic_repr("video_url")



In [None]:
sample_url = "https://www.youtube.com/watch?v=QGCvycOXs2M"
yt_timestamper = YoutubeTimestamper(sample_url)
yt_timestamper

__main__.YoutubeTimestamper(video_url='https://www.youtube.com/watch?v=QGCvycOXs2M')

In [None]:
# | export
@patch
def _get_yt_id(
    self: YoutubeTimestamper,
) -> str:  # The video id, or `None` when the url is not recognised
    """Extracts the video id from `self.video_url`.

    Recognised forms are `youtu.be/<id>` and, on the youtube.com hosts,
    `/watch?v=<id>`, `/watch/<id>`, `/embed/<id>` and `/v/<id>`.
    The video id always takes priority; the `list` (playlist) query parameter
    is only returned as a fallback when the url carries no video id at all.
    """
    parsed = urlparse(self.video_url)
    if parsed.hostname == "youtu.be":
        # Short links keep the id as the whole path: "/<id>"
        return parsed.path[1:]
    if parsed.hostname in {
        "www.youtube.com",
        "youtube.com",
        "m.youtube.com",
        "music.youtube.com",
    }:
        params = parse_qs(parsed.query)
        if parsed.path == "/watch" and "v" in params:
            return params["v"][0]
        # "/watch/<id>", "/embed/<id>" and "/v/<id>" all split into
        # ["", <prefix>, <id>, ...], so the id is always the element at index 2.
        for prefix in ("/watch/", "/embed/", "/v/"):
            if parsed.path.startswith(prefix):
                return parsed.path.split("/")[2]
        # No video id present: fall back to the playlist id if there is one.
        with suppress(KeyError):
            return params["list"][0]
    return None

In [None]:
yt_timestamper.video_url

'https://www.youtube.com/watch?v=QGCvycOXs2M'

In [None]:
test_eq(yt_timestamper._get_yt_id(), "QGCvycOXs2M")

## Get transcripts from the id

In [None]:
# | export


@patch
def _get_transcript(self: YoutubeTimestamper) -> None:
    """Fetches the transcript for the video using the `youtube_transcript_api` package.

    Stores the list of transcript pieces in `self.transcript` and the
    space-joined text of all pieces in `self._transcript_str`.

    Raises `ValueError` when no video id can be extracted from `self.video_url`,
    instead of forwarding an empty id to the transcript API.
    """
    video_id = self._get_yt_id()
    if not video_id:
        raise ValueError(
            f"Could not extract a YouTube video id from {self.video_url!r}"
        )
    self._yt_transcript_api = YouTubeTranscriptApi()
    self.transcript = self._yt_transcript_api.get_transcript(video_id)
    # Each piece is a dict with a "text" entry; join them into one raw string.
    self._transcript_str = " ".join(ts["text"] for ts in self.transcript)

In [None]:
yt_timestamper._get_transcript()

In [None]:
yt_timestamper.transcript[:3]

[{'text': '[Music]', 'start': 1.28, 'duration': 12.54},
 {'text': "hey this is sayam bhutani and you're",
  'start': 13.92,
  'duration': 2.56},
 {'text': 'listening to', 'start': 15.679, 'duration': 3.68}]

This is the raw unpunctuated transcript

In [None]:
yt_timestamper._transcript_str[:500]

"[Music] hey this is sayam bhutani and you're listening to chai time data science a podcast for data science enthusiasts where i interview practitioners researchers and calculus about their journey experience and talk all things about data science [Music] hello and welcome to quarantine chai with kaggle heroes with this new look i am interviewing the new 4x kaggle grandmaster chris dude chris has an amazing very diverse and very rich background and we connect the dots of his journey his professio"

In [None]:
assert len(yt_timestamper.transcript) > 0
assert "," not in yt_timestamper._transcript_str[:500]
assert "." not in yt_timestamper._transcript_str[:500]

## Restore Punctuations

In [None]:
# | export


@patch
def _restore_punctuations(self: YoutubeTimestamper) -> None:
    """Runs the shared punctuation model over `_transcript_str` and stores the result in `_transcript_punct`."""
    restore = YoutubeTimestamper.punct_model.restore_punctuation
    self._transcript_punct = restore(self._transcript_str)

In [None]:
yt_timestamper._restore_punctuations()



The punctuation has now been restored.

In [None]:
yt_timestamper._transcript_punct[:500]

"[Music]. hey, this is sayam bhutani and you're listening to chai time- data science, a podcast for data science enthusiasts where i interview practitioners, researchers and calculus about their journey experience and talk all things about data science. [Music]. hello and welcome to quarantine chai with kaggle heroes. with this new look, i am interviewing the new 4x kaggle grandmaster, chris dude. chris has an amazing, very diverse and very rich background and we connect the dots of his journey, "

In [None]:
assert "," in yt_timestamper._transcript_punct[:500]
assert "." in yt_timestamper._transcript_punct[:500]

## Getting questions

In [None]:
# | export


@patch
def _get_sentences(self: YoutubeTimestamper) -> None:
    """Splits the punctuated transcript into sentences with the class-level spaCy pipeline and stores them in `transcript_sents`."""
    self.transcript_sents = L(
        YoutubeTimestamper.nlp(self._transcript_punct).sents
    )

In [None]:
yt_timestamper._get_sentences()
yt_timestamper.transcript_sents[5:10]

(#5) [chris has an amazing, very diverse and very rich background and we connect the dots of his journey, his professional journey, with data science.,we talk about his previous life.,how did he transition into data science and cargill and his journey on kaggle?,chris at the time of recording has just become a forex kaggle ground master.,he's ranked 32 on the competition steer, two on the data sets tier and is currently number one in kernels and discussions.]

In [None]:
assert len(yt_timestamper.transcript_sents) > 0

In [None]:
# | export


@patch
def _get_questions(
    self: YoutubeTimestamper,
    next_q_thresh: int = 15,  # Maximum gap, in tokens, between the end of one question sentence and the start of the next for both to be merged into the same question
) -> None:
    """Collects the sentences containing `?` and merges nearby ones into blocks.

    A question sentence that starts at most `next_q_thresh` tokens after the
    previous question sentence ended is appended to that previous block;
    otherwise it starts a new block. The result is stored in `self.questions`.
    """
    blocks = []
    prev_q_end = None  # `end` of the last question sentence seen, None before the first
    for sent in self.transcript_sents:
        if "?" not in sent.text:
            continue
        # The first question always opens a block; there is nothing to merge it into.
        if prev_q_end is not None and (sent.start - prev_q_end) <= next_q_thresh:
            blocks[-1] += " " + sent.text
        else:
            blocks.append(sent.text)
        prev_q_end = sent.end

    self.questions = L(blocks)

In [None]:
yt_timestamper._get_questions()
yt_timestamper.questions[:3]

(#3) ['how did he transition into data science and cargill and his journey on kaggle?','can you tell us a bit more about that, chris, now that you remember of it the secret?','did you get your invite to the fight clubs yet, or can you share a bit about those?']

In [None]:
assert len(yt_timestamper.questions) > 0

In [None]:
# | export


@patch
def _get_timestamps_for_questions(
    self: YoutubeTimestamper,
) -> None:
    """Matches each question with the start time of a transcript piece.

    A transcript piece matches a question when the piece's text occurs inside
    the question (with `,.?!` stripped) and characters 10-20 of the stripped
    question occur inside the piece. The first matching piece supplies the
    timestamp and is then taken out of the candidate pool so that it cannot be
    matched by a later question. Questions without a match are skipped.

    Stores a list of `(start_seconds, text)` tuples, beginning with
    `(0, "Introduction")`, in `self.timestamps`.
    """
    timestamps = L([(0, "Introduction")])
    # Work on a copy so `self.transcript` itself is never modified.
    transcript_pieces = list(self.transcript)
    for question in self.questions:
        question_nopunct = re.sub("[,.?!]", "", question)
        probe = question_nopunct[10:20]
        for idx, ts in enumerate(transcript_pieces):
            if (ts["text"] in question_nopunct) and (probe in ts["text"]):
                timestamps.append((ts["start"], question))
                # Remove only the piece that actually matched; when nothing
                # matches, the candidate pool must stay untouched.
                del transcript_pieces[idx]
                break
    self.timestamps = timestamps

In [None]:
yt_timestamper._get_timestamps_for_questions()
assert len(yt_timestamper.timestamps) > 0

In [None]:
yt_timestamper.timestamps

(#36) [(0, 'Introduction'),(68.64, 'how did he transition into data science and cargill and his journey on kaggle?'),(143.52, 'can you tell us a bit more about that, chris, now that you remember of it the secret?'),(168.56, 'did you get your invite to the fight clubs yet, or can you share a bit about those?'),(284.479, "then i, um i graduated with a bachelor's degree in mathematics and then, um, immediately afterwards, i?"),(439.599, 'when did kaggle come into the picture? when did you find your addiction for kagan?'),(508.879, "you know you build a model and is your model more accurate than the other guy's model?"),(615.12, 'how did you go from just starting your journey to today being the forex grandmaster?'),(709.36, 'was it part of the enjoyment process, just getting involved, or were you making a conscious effort to? you know, maybe list down points where you need to improve?'),(760.079, 'and even if a competition or something looks similar to a previous one, i say to myself: you 

## Outputs

In [None]:
# | export


@patch
def _render_timestamps(
    self: YoutubeTimestamper,
    limit=None,  # Maximum number of timestamps to print; `None` prints all of them
) -> None:
    """Prints `self.timestamps` as `HH:MM:SS <text>` lines followed by a credit line."""
    # Compare against None explicitly so that `limit=0` prints no timestamps
    # instead of being treated as "no limit".
    render_ts = self.timestamps if limit is None else self.timestamps[:limit]
    for seconds, label in render_ts:
        # str(timedelta) looks like "H:MM:SS[.ffffff]": drop the fractional
        # part and left-pad with zeros to get "HH:MM:SS".
        timestamp = str(datetime.timedelta(seconds=seconds))
        timestamp = timestamp.split(".")[0].rjust(8, "0")
        print(timestamp, label)
    print(
        "\nCreated using youtube-timestamper - https://ilangurudev.github.io/youtube-timestamper/"
    )

In [None]:
yt_timestamper._render_timestamps(limit=10)

00:00:00 Introduction
00:01:08 how did he transition into data science and cargill and his journey on kaggle?
00:02:23 can you tell us a bit more about that, chris, now that you remember of it the secret?
00:02:48 did you get your invite to the fight clubs yet, or can you share a bit about those?
00:04:44 then i, um i graduated with a bachelor's degree in mathematics and then, um, immediately afterwards, i?
00:07:19 when did kaggle come into the picture? when did you find your addiction for kagan?
00:08:28 you know you build a model and is your model more accurate than the other guy's model?
00:10:15 how did you go from just starting your journey to today being the forex grandmaster?
00:11:49 was it part of the enjoyment process, just getting involved, or were you making a conscious effort to? you know, maybe list down points where you need to improve?
00:12:40 and even if a competition or something looks similar to a previous one, i say to myself: you know, what new angle can i do her

In [None]:
# | export


@patch
def suggest_question_timestamps(
    self: YoutubeTimestamper,
    next_q_thresh: int = 15,  # Maximum gap, in tokens, between the end of one question sentence and the start of the next for both to be merged into the same question
) -> None:
    """Suggest timestamps based on questions found in the transcripts.

    Runs the full pipeline and prints the result. The transcript download,
    punctuation restoration and sentence splitting are each performed only if
    their result is not already stored on the instance, so calling this again
    (e.g. with a different `next_q_thresh`) only redoes the question grouping,
    timestamp matching and rendering.
    """
    # Check the instance for cached results of each stage; these do not depend
    # on `next_q_thresh`, so they never need to be recomputed for the same video.
    if not hasattr(self, "transcript"):
        self._get_transcript()
    if not hasattr(self, "_transcript_punct"):
        self._restore_punctuations()
    if not hasattr(self, "transcript_sents"):
        self._get_sentences()
    self._get_questions(next_q_thresh)
    self._get_timestamps_for_questions()
    self._render_timestamps()

In [None]:
yt_timestamper.suggest_question_timestamps()

00:00:00 Introduction
00:01:08 how did he transition into data science and cargill and his journey on kaggle?
00:02:23 can you tell us a bit more about that, chris, now that you remember of it the secret?
00:02:48 did you get your invite to the fight clubs yet, or can you share a bit about those?
00:04:44 then i, um i graduated with a bachelor's degree in mathematics and then, um, immediately afterwards, i?
00:07:19 when did kaggle come into the picture? when did you find your addiction for kagan?
00:08:28 you know you build a model and is your model more accurate than the other guy's model?
00:10:15 how did you go from just starting your journey to today being the forex grandmaster?
00:11:49 was it part of the enjoyment process, just getting involved, or were you making a conscious effort to? you know, maybe list down points where you need to improve?
00:12:40 and even if a competition or something looks similar to a previous one, i say to myself: you know, what new angle can i do her