# Summary

Start tying the OpenAI and YouTube functionality together to manage the punctuation process.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from pathlib import Path
import warnings
from youtube_transcript_api import YouTubeTranscriptApi, NoTranscriptFound

from jabberwocky.config import C
from jabberwocky.openai_utils import query_gpt3
from jabberwocky.utils import load_prompt
from jabberwocky.youtube import realign_punctuated_text, get_transcripts, \
    text_segment, video_id
from htools import *

In [3]:
cd_root()

Current directory: /Users/hmamin/jabberwocky


In [171]:
class UnpunctuatedTranscript:
    """Transcript whose text lacks punctuation and is punctuated on demand.

    Holds two dataframes with the same rows: `df_gen` (the text as passed in)
    and `df_punct` (same rows, but `text` starts out null and is filled in
    chunk by chunk as `punctuated_chunk` is called). Rows that were already
    punctuated are not sent to GPT-3 again.
    """

    def __init__(self, df_gen, **kwargs):
        """
        Parameters
        ----------
        df_gen: pd.DataFrame
            Unpunctuated transcript. Must have a 'text' column.
        kwargs: any
            Merged into the query kwargs loaded from the 'punctuate' prompt,
            overriding values stored there.
        """
        self.df_gen = df_gen
        self.df_punct = self.df_gen.copy()
        # Create the placeholder column with object dtype. A bare
        # `= np.nan` yields a float64 column, and writing strings into that
        # later triggers pandas' incompatible-dtype deprecation warning.
        self.df_punct['text'] = pd.Series(np.nan, index=self.df_punct.index,
                                          dtype=object)
        # TODO: adjust so prompt template saved separately from prompt_kwargs.
        # TODO: adjust load_prompt to have option to skip .format call
        # if no text passed in.
        self.prompt_kwargs = load_prompt('punctuate')
        self.prompt_fmt = self.prompt_kwargs.pop('prompt')
        self.prompt_kwargs.update(kwargs)

    @property
    def df(self):
        """The unpunctuated dataframe (`df_gen`)."""
        # TODO: maybe should return self.df_punct, possibly after punct query?
        # Unsure.
        return self.df_gen

    def _punctuate_chunk(self, df_chunk, **kwargs) -> str:
        """Join the chunk's rows into one string and ask GPT-3 to punctuate
        it.

        Parameters
        ----------
        df_chunk: pd.DataFrame
            Rows to punctuate. Only the 'text' column is read.
        kwargs: any
            Per-call overrides for the stored query kwargs. Note that
            'max_tokens' is always recomputed from the input length, so a
            user-provided value is ignored.

        Returns
        -------
        str: Second item of the pair returned by `query_gpt3`.
        """
        text = ' '.join(df_chunk.text)
        prompt = self.prompt_fmt.format(text).rstrip()
        # n_tokens is roughly 1.33 * n_words. Add buffer for punctuation.
        prompt_kwargs = {**self.prompt_kwargs,
                         **kwargs,
                         'max_tokens': int(len(text.split()) * 2)}
        _, text_punct = query_gpt3(prompt, **prompt_kwargs)
        return text_punct

    # This version only punctuates rows of the relevant chunk that haven't
    # been previously punctuated. While this is faster and cheaper, I've seen
    # some hints that the punctuation task may work better when we pass it
    # long-ish chunks of text and not little partial snippets. So it might
    # actually be better not to do this? I was going to say it might do better
    # if given full sentences rather than fragments, but I guess we can't
    # easily extract those without doing the actual punctuation step.
    def punctuated_chunk(self, start_idx, end_idx, align_kwargs=None,
                        **query_kwargs):
        """Punctuate whichever rows in [start_idx, end_idx] are still null
        and return that (label-based, inclusive) slice of `df_punct`.

        Parameters
        ----------
        start_idx: int
            Index label of the first row.
        end_idx: int
            Index label of the last row (inclusive).
        align_kwargs: dict or None
            Forwarded to `realign_punctuated_text`.
        query_kwargs: any
            Forwarded to `_punctuate_chunk`.

        Returns
        -------
        pd.DataFrame
        """
        for idx in self.na_index_chunks(self.df_punct.loc[start_idx:end_idx]):
            # Shows which rows are about to be sent to the API.
            print(idx)
            df_chunk = self.df_gen.loc[idx]
            text_punct = self._punctuate_chunk(df_chunk, **query_kwargs)
            df_chunk_punct = realign_punctuated_text(
                df_chunk, text_punct, **ifnone(align_kwargs, {})
            )
            # Use 'values' attribute because realignment func resets index.
            self.df_punct.loc[idx, 'text'] = df_chunk_punct.text.values
        return self.df_punct.loc[start_idx:end_idx, :]

    @staticmethod
    def na_index_chunks(chunk) -> list:
        """Given a chunk of a df that may contain null text rows, return a
        list of lists where each nested list contains the indices of a
        contiguous chunk of null rows.

        "Contiguous" means consecutive integer index labels, e.g. null rows
        labeled 3, 5, 6 yield [[3], [5, 6]]. Returns an empty list when no
        row is null.
        """
        nans = chunk[chunk.text.isnull()]
        if nans.empty: return []
        res = []
        curr_chunk = []
        prev = None
        for idx in nans.index:
            if prev is None or (idx == prev + 1):
                curr_chunk.append(idx)
            else:
                res.append(curr_chunk)
                curr_chunk = [idx]
            prev = idx
        # Flush the final run once, after the loop, rather than comparing
        # every label against the last one.
        res.append(curr_chunk)
        return res

In [172]:
class PunctuatedTranscript:
    """Transcript for which punctuated text is already available, so chunks
    can be served straight from `df_punct` without any query.
    """

    def __init__(self, df_gen, df_punct, **kwargs):
        """
        Parameters
        ----------
        df_gen: pd.DataFrame
            Stored as-is on the instance.
        df_punct: pd.DataFrame
            Punctuated transcript. This is what `df` and `punctuated_chunk`
            read from.
        kwargs: any
            Ignored. Accepted only so this class can be constructed with the
            same call as UnpunctuatedTranscript, which needs them to specify
            args like 'rstrip' when loading a prompt.
        """
        self.df_gen, self.df_punct = df_gen, df_punct

    @property
    def df(self):
        """The punctuated dataframe."""
        return self.df_punct

    def punctuated_chunk(self, start_idx, end_idx, **kwargs):
        """Return rows start_idx through end_idx (label-based, inclusive) of
        the punctuated dataframe. Extra kwargs are accepted and ignored.
        """
        rows = slice(start_idx, end_idx)
        return self.df_punct.loc[rows, :]

In [173]:
@auto_repr
class Transcript:
    """YouTube transcript for a single video.

    Wraps either an UnpunctuatedTranscript (only an auto-generated transcript
    was found) or a PunctuatedTranscript (a manually created transcript was
    found) and exposes time-based row selection on top of it.
    """

    def __init__(self, url, **kwargs):
        """
        Parameters
        ----------
        url: str
            Video URL.
        kwargs: any
            'verbose' (bool, default True) controls warnings while fetching.
            Everything else is forwarded to the underlying transcript class.
        """
        self.url = url
        self.id = video_id(url)
        self._transcript = self._fetch_transcripts(url, **kwargs)
        self.is_generated = isinstance(self._transcript,
                                       UnpunctuatedTranscript)
        # Both values come from the 'start' column, so end_time is the start
        # time of the last row rather than the moment the video ends.
        starts = self.df.start
        self.start_time, self.end_time = starts.iloc[0], starts.iloc[-1]

    def _time_range(self, start, end, full_sentences=True):
        """Select the rows of `self.df` that cover the time span
        [start, end].

        The selection begins at the last row starting at or before `start`
        and runs through the first row starting at or after `end`. It is
        clipped to the first/last row when the requested times fall outside
        the transcript.

        Parameters
        ----------
        start: int or float
            Start time. Must be non-negative.
        end: int or float
            End time. Must be greater than `start`.
        full_sentences: bool
            Currently unused.

        Returns
        -------
        pd.DataFrame
        """
        assert end > start, 'End time must be later than start time.'
        assert start >= 0 and end >= 0, 'Times must be non-negative.'

        df = self.df
        # Work with index labels throughout and slice with `.loc`. Mixing
        # labels with a positional `.iloc` slice is only correct when the
        # index happens to be a default RangeIndex.
        if start < self.start_time:
            start_idx = df.index[0]
        else:
            start_idx = df.loc[df.start <= start].index[-1]

        if end > df.start.iloc[-1]:
            end_idx = df.index[-1]
        else:
            end_idx = df.loc[df.start >= end].index[0]
        return df.loc[start_idx:end_idx]

    def time_range(self, start, end, full_sentences=True, align_kwargs=None,
                   **query_kwargs):
        """Return the punctuated rows covering the time span [start, end].

        Parameters
        ----------
        start: int or float
        end: int or float
        full_sentences: bool
            Passed to `_time_range` (currently unused there).
        align_kwargs: dict or None
            Passed through to the wrapped transcript's `punctuated_chunk`.
        query_kwargs: any
            Passed through to the wrapped transcript's `punctuated_chunk`.

        Returns
        -------
        pd.DataFrame
        """
        chunk = self._time_range(start, end, full_sentences=full_sentences)
        return self._transcript.punctuated_chunk(chunk.index[0],
                                                 chunk.index[-1],
                                                 align_kwargs=align_kwargs,
                                                 **query_kwargs)

    @property
    def df(self):
        """Dataframe exposed by the wrapped transcript object."""
        return self._transcript.df

    def _fetch_transcripts(self, url, **kwargs):
        """Wrapper to fetch youtube transcripts and create the appropriate
        transcript object depending on whether a manually generated (i.e.
        punctuated) transcript was retrieved.

        Parameters
        ----------
        url: str
        verbose: bool
            Passed to `get_transcripts`. Defaults to True.
        kwargs: any
            Remaining kwargs are forwarded to the transcript class.
        """
        kwargs = dict(kwargs)
        # 'verbose' only concerns fetching. Remove it before forwarding:
        # UnpunctuatedTranscript merges its kwargs into the GPT-3 query
        # kwargs, where this flag presumably does not belong.
        verbose = kwargs.pop('verbose', True)
        df_gen, df_man, _ = self.get_transcripts(url, verbose=verbose)
        if df_man is None:
            return UnpunctuatedTranscript(df_gen, **kwargs)
        return PunctuatedTranscript(df_gen, df_man, **kwargs)

    @staticmethod
    def get_transcripts(url, verbose=True):
        """Fetch one or more transcripts for a youtube video given its URL.

        Parameters
        ----------
        url: str
            Don't include any channel-related suffix. E.G. use
            https://www.youtube.com/watch?v=OZbCRN3C_Hs, not
            https://www.youtube.com/watch?v=OZbCRN3C_Hs&ab_channel=BBC.
        verbose: bool
            If True, warn when no manual transcript is found and when a
            retrieved transcript is en-GB rather than en.

        Returns
        -------
        Args: Has fields 'generated', 'manual', and 'id', in that order
        (`_fetch_transcripts` unpacks it positionally). 'generated' and
        'manual' map to pandas dfs, or None if no transcript was found.
        'id' maps to the video ID str. Manual transcripts are human-created.
        Generated transcripts are a bit lower quality and tend to lack
        punctuation.

        Raises
        ------
        NoTranscriptFound: Propagates from `find_generated_transcript`,
        which is not wrapped in a try block here.
        """
        langs = ['en', 'en-GB']
        id_ = video_id(url)
        res = {'generated': None, 'manual': None}
        trans_list = YouTubeTranscriptApi.list_transcripts(id_)
        res['generated'] = trans_list.find_generated_transcript(langs)
        try:
            res['manual'] = trans_list.find_manually_created_transcript(langs)
        except NoTranscriptFound:
            if verbose: warnings.warn('No manual transcript found.')
        if verbose:
            british = [k for k, v in res.items()
                       if v and ('United Kingdom' in v.language)]
            if british:
                warnings.warn(
                    f'{british} {"has" if len(british) == 1 else "have"} '
                    'language en-GB, not en.'
                )
        return Args(**{k: pd.DataFrame(v.fetch()) if v else v
                       for k, v in res.items()},
                    id=id_)

    def __str__(self):
        return f'{type(self).__name__}(url={self.url}, '\
               f'is_generated={self.is_generated})'

In [174]:
# Example videos for the two code paths exercised below: `gen_url` is used
# with the GPT-3 punctuation path, `man_url` with a transcript that already
# comes back punctuated.
# NOTE(review): gen_url carries an `&ab_channel=` suffix, which the
# `get_transcripts` docstring says not to include - confirm `video_id`
# handles it.
gen_url = 'https://www.youtube.com/watch?v=AtTsn1Ia4JY&ab_channel=LukeThomas'
man_url = 'https://www.youtube.com/watch?v=NNnIGh9g6fA'

In [175]:
trans_gen = Transcript(gen_url)
trans_gen



You should probably adjust max_tokens based on the length of the input. Bumping up to engine 2 or 3 might help a little, but engine 1 is serviceable (probably best to avoid 0 though). You should probably try training a huggingface model to add punctuation instead of using gpt3 credits though.
-------------------------------------------------------------------------------



Transcript(url='https://www.youtube.com/watch?v=AtTsn1Ia4JY&ab_channel=LukeThomas')

In [9]:
trans_man = Transcript(man_url)
trans_man

Transcript(url='https://www.youtube.com/watch?v=NNnIGh9g6fA')

In [202]:
trans_man.time_range(7, 16)

Unnamed: 0,text,start,duration
1,Stanford University.,4.94,0.92
2,"This is BIO 150, isn't it?",9.36,3.37
3,"OK, just wanted to make sure.",12.73,2.74
4,So we start off with a scenario.,15.47,3.42
5,"40-year-old guy--\nquiet, suburban life.",18.89,4.06


In [176]:
trans_gen._time_range(60, 70)

Unnamed: 0,text,start,duration
35,stuff but there is,58.879,2.801
36,a little bit more to the story what was,60.16,3.44
37,dan hooker trying what was his,61.68,3.839
38,game plan here and more to the point,63.6,4.16
39,there are some finer details to that,65.519,3.121
40,finish that,67.76,3.52
41,really deserve to get mentioned and i,68.64,4.08
42,don't know exactly how far,71.28,2.64


In [177]:
res = trans_gen.time_range(60, 70)

[35, 36, 37, 38, 39, 40, 41, 42]


  'Max score < 80. Your rows may have gotten misaligned '


In [178]:
res

Unnamed: 0,text,start,duration
35,stuff but there is,58.879,2.801
36,a little bit more to the story. What was,60.16,3.44
37,Dan Hooker,61.68,3.839
38,"trying to do here? And more to the point,",63.6,4.16
39,there are some finer details to that,65.519,3.121
40,finish that,67.76,3.52
41,really deserve to get mentioned. And I,68.64,4.08
42,don't know exactly how far,71.28,2.64


In [179]:
res = trans_gen.time_range(50, 80)

[28, 29, 30, 31, 32, 33, 34]


  'Max score < 80. Your rows may have gotten misaligned '
  'Max score < 80. Your rows may have gotten misaligned '
  'Max score < 80. Your rows may have gotten misaligned '


[43, 44, 45, 46, 47]


  'Max score < 80. Your rows may have gotten misaligned '


In [180]:
res

Unnamed: 0,text,start,duration
28,"and um as always, the devil is in the",49.28,4.24
29,details it doesn't take a genius to look,51.84,2.96
30,at like what,53.52,2.48
31,"happened to be like, Michael Chandler, him with a",54.8,1.68
32,"big ass punch,",56.0,1.76
33,"you know, and kind of faded low and then",56.48,2.399
34,went high and then all that kind of,57.76,2.4
35,stuff but there is,58.879,2.801
36,a little bit more to the story. What was,60.16,3.44
37,Dan Hooker,61.68,3.839


In [181]:
trans_gen._transcript.df_punct.loc[25:50]

Unnamed: 0,text,start,duration
25,,44.719,3.52
26,,46.8,2.48
27,,48.239,3.601
28,"and um as always, the devil is in the",49.28,4.24
29,details it doesn't take a genius to look,51.84,2.96
30,at like what,53.52,2.48
31,"happened to be like, Michael Chandler, him with a",54.8,1.68
32,"big ass punch,",56.0,1.76
33,"you know, and kind of faded low and then",56.48,2.399
34,went high and then all that kind of,57.76,2.4


In [184]:
# Much faster this time because it's been pre-punctuated.
res = trans_gen.time_range(50, 80)

In [185]:
res

Unnamed: 0,text,start,duration
28,"and um as always, the devil is in the",49.28,4.24
29,details it doesn't take a genius to look,51.84,2.96
30,at like what,53.52,2.48
31,"happened to be like, Michael Chandler, him with a",54.8,1.68
32,"big ass punch,",56.0,1.76
33,"you know, and kind of faded low and then",56.48,2.399
34,went high and then all that kind of,57.76,2.4
35,stuff but there is,58.879,2.801
36,a little bit more to the story. What was,60.16,3.44
37,Dan Hooker,61.68,3.839


In [170]:
def na_index_chunks(chunk):
    """Group the index labels of rows with null `text` into runs of
    consecutive integers.

    Parameters
    ----------
    chunk: pd.DataFrame
        Must have a 'text' column and an integer-like index.

    Returns
    -------
    list[list]: One inner list of index labels per contiguous run of null
    rows, or an empty list if no row is null.
    """
    null_rows = chunk[chunk.text.isnull()]
    if null_rows.empty:
        return []

    final_label = null_rows.index[-1]
    runs, run, previous = [], [], None
    for label in null_rows.index:
        breaks_run = previous is not None and label != previous + 1
        if breaks_run:
            # Gap in the labels: close the current run and start a new one.
            runs.append(run)
            run = [label]
        else:
            run.append(label)
        if label == final_label:
            runs.append(run)
        previous = label
    return runs

In [117]:
# NOTE(review): `tmp` is not defined in any visible cell, so this cell fails
# on a fresh kernel (hidden state).
na_index_chunks(tmp.head(3))

[[32, 33, 34]]

In [116]:
# Show the rows belonging to each contiguous null chunk.
# NOTE(review): relies on `tmp`, which is not defined in any visible cell.
eprint([tmp.loc[row] for row in na_index_chunks(tmp)])

 0:    text  start  duration
32  NaN  56.00     1.760
33  NaN  56.48     2.399
34  NaN  57.76     2.400
 1:    text  start  duration
46  NaN  78.88     4.720
47  NaN  80.72     4.079
48  NaN  83.60     1.920


In [249]:
# NOTE(review): the outputs in the following cells (len(res) == 4, res[0]
# through res[3]) look like they were produced by an earlier version of
# `time_range` that returned several intermediate objects. The version
# defined above returns a single dataframe - re-run or remove these cells.
res = trans_gen.time_range(10, 30)

  'Max score < 80. Your rows may have gotten misaligned '


In [250]:
len(res)

4

In [251]:
# Method output
res[0]

Unnamed: 0,text,start,duration
4,to talk about ufc debuts. Wow. But,6.879,5.521
5,"how did he do it? Of course, a big",10.16,2.8
6,punch. But there's always more to the,12.4,3.359
7,story. I'm,12.96,5.52
8,going to tell you what it is. It's,15.759,4.561
9,time for technical difficulties.,18.48,3.36
10,I came straight from hell with a pitch,20.32,2.32
11,black force with a black pitch fork,21.84,2.56
12,and a pitch black Porsche. All right.,22.64,4.559
13,,24.4,4.719


In [252]:
# unpunct df
res[1]

Unnamed: 0,text,start,duration
4,absolutely spectacular fashion knocking,6.879,5.521
5,out dan hooker in just over a couple of,10.16,2.8
6,minutes,12.4,3.359
7,at ufc 257 in the co-main event you want,12.96,5.52
8,to talk about ufc debuts wow but,15.759,4.561
9,how did he do it yes of course a big,18.48,3.36
10,punch but there's always more to the,20.32,2.32
11,story,21.84,2.56
12,i'm going to tell you what it is it's,22.64,4.559
13,time for technical difficulties,24.4,4.719


In [254]:
# punct text
res[2]

"absolutely spectacular fashion knocking out dan hooker in just over a couple of minutes at ufc 257 in the co-main event. You want to talk about ufc debuts. Wow. But how did he do it? Of course, a big punch. But there's always more to the story. I'm going to tell you what it is. It's time for technical difficulties. I came straight from hell with a pitch black force with a black pitch fork and a pitch black Porsche. All right."

In [255]:
# punct df
res[3]

Unnamed: 0,text,start,duration
0,absolutely spectacular fashion knocking,6.879,5.521
1,out dan hooker in just over a couple of,10.16,2.8
2,minutes at,12.4,3.359
3,ufc 257 in the co-main event. You want,12.96,5.52
4,to talk about ufc debuts. Wow. But,15.759,4.561
5,"how did he do it? Of course, a big",18.48,3.36
6,punch. But there's always more to the,20.32,2.32
7,story. I'm,21.84,2.56
8,going to tell you what it is. It's,22.64,4.559
9,time for technical difficulties.,24.4,4.719


In [263]:
res[0]

Unnamed: 0,text,start,duration
4,to talk about ufc debuts. Wow. But,6.879,5.521
5,"how did he do it? Of course, a big",10.16,2.8
6,punch. But there's always more to the,12.4,3.359
7,story. I'm,12.96,5.52
8,going to tell you what it is. It's,15.759,4.561
9,time for technical difficulties.,18.48,3.36
10,I came straight from hell with a pitch,20.32,2.32
11,black force with a black pitch fork,21.84,2.56
12,and a pitch black Porsche. All right.,22.64,4.559
13,,24.4,4.719


In [262]:
# NOTE(review): res[0] is a slice of another dataframe, hence the
# set-on-copy warning shown below. Also, res[0] is labeled 4-13 while
# res[-1] is labeled 0-9 in the displayed outputs; assigning a Series
# presumably aligns on those labels rather than by position - confirm this
# is intended (the class code uses `.values` for this reason).
res[0].loc[4:16, 'text'] = res[-1].loc[:, 'text']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [261]:
res[0]

Unnamed: 0,text,start,duration
4,absolutely spectacular fashion knocking,6.879,5.521
5,out dan hooker in just over a couple of,10.16,2.8
6,minutes at,12.4,3.359
7,ufc 257 in the co-main event. You want,12.96,5.52
8,to talk about ufc debuts. Wow. But,15.759,4.561
9,"how did he do it? Of course, a big",18.48,3.36
10,punch. But there's always more to the,20.32,2.32
11,story. I'm,21.84,2.56
12,going to tell you what it is. It's,22.64,4.559
13,time for technical difficulties.,24.4,4.719


In [234]:
# NOTE(review): `x` and `y` are not defined in any visible cell (hidden
# state). From the call signature used elsewhere in this notebook they are
# presumably an unpunctuated df chunk and its punctuated text.
realign_punctuated_text(x, y)

  'Max score < 80. Your rows may have gotten misaligned '


Unnamed: 0,text,start,duration
0,"absolutely spectacular fashion, knocking",6.879,5.521
1,out Dan Hooker in just over a couple of,10.16,2.8
2,minutes at,12.4,3.359
3,"UFC 257 in the co-main event, you want",12.96,5.52
4,"to talk about UFC debuts, wow, but",15.759,4.561
5,"how did he do it? Of course, a big",18.48,3.36
6,"punch, but there's always more to the",20.32,2.32
7,story. I'm,21.84,2.56
8,going to tell you what it is. It's,22.64,4.559
9,time for technical difficulties.,24.4,4.719
