# Summary

Start tying openai and youtube functionality together to manage the punctuation process.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import matplotlib.pyplot as plt
from nltk.tokenize import sent_tokenize
import numpy as np
import os
import pandas as pd
from pathlib import Path
from youtube_transcript_api import YouTubeTranscriptApi, NoTranscriptFound

from jabberwocky.config import C
from jabberwocky.core import realign_punctuated_text
from jabberwocky.openai_utils import query_gpt3, load_prompt, PromptManager
from jabberwocky.youtube import get_transcripts, text_segment, video_id
from htools import *

In [3]:
cd_root()

Current directory: /Users/hmamin/jabberwocky


In [482]:
def punctuate_mock_func(prompt, random_punct=True, sentence_len=15):
    text = prompt.rpartition('\n\nPassage: ')[-1]\
                 .rpartition('\n\nPassage with punctuation:')[0]
    if random_punct:
        words = text.split(' ')
        new_words = []
        for idx in range(0, max(sentence_len, len(words)), sentence_len):
            new_words.append(
                ' '.join(words[idx:idx+sentence_len]).capitalize() + '.'
            )
        text = ' '.join(new_words)
    return text

In [484]:
rand_text = 'qq ttt nnn a pe ennd ghakl iii i will aslkj eiou alsjd'\
            'wioeuwoiefp alskjdf dkfj wioeupa lajdf asdlfjz ccjj ppq eee'\
            'kjj aa ee bpbp ajma aol'
tmp = load_prompt('punctuate', rand_text)['prompt']
punctuate_mock_func(tmp)

punctuate: You should probably adjust max_tokens based on the length of the input. Bumping up to engine 2 or 3 might help a little, but engine 1 is serviceable (probably best to avoid 0 though). You should probably try training a huggingface model to add punctuation instead of using gpt3 credits though.
-------------------------------------------------------------------------------



'Qq ttt nnn a pe ennd ghakl iii i will aslkj eiou alsjdwioeuwoiefp alskjdf dkfj. Wioeupa lajdf asdlfjz ccjj ppq eeekjj aa ee bpbp ajma aol.'

In [388]:
def na_index_chunks(chunk, mode='isnull', flat=False) -> list:
    """Given a chunk of a df that may contain null text rows, return a 
    list of lists where each nested list contains the indices of a 
    contiguous chunk of null rows.
    """
    # Depending on mode, these are either nans or not nans.
    nans = chunk[getattr(chunk.text, mode)()]
    if nans.empty: return []
    last_idx = nans.index[-1]
    res = []
    curr_chunk = []
    prev = None
    for idx in nans.index:
        if prev is None or (idx == prev + 1): 
            curr_chunk.append(idx)
        else:
            res.append(curr_chunk)
            curr_chunk = [idx]
        if idx == last_idx:
            res.append(curr_chunk)
        prev = idx
    return flatten(res) if flat else res

In [497]:
class UnpunctuatedTranscript:
    
    def __init__(self, df_gen, **kwargs):
        self.df_gen = df_gen
        self.df_punct = self.df_gen.copy()
        self.df_punct['text'] = np.nan
        
        # Allow kwargs but no extra_kwargs at this point since the latter is
        # really meant to be a stopgap solution (which you can pass in when 
        # calling the _punctuate_chunk method).
        self.manager = PromptManager('punctuate', verbose=False)
        self.kwargs = dict(kwargs)
        
    @property
    def df(self):
        return self.df_gen
    
    def _punctuate_chunk(self, df_chunk, extra_kwargs=None, **kwargs) -> str:
        # Don't use stream=True or return_full=True here. Just want a string.
        text = ' '.join(df_chunk.text)
        max_tokens = int(len(text.split()) * 2)
        kwargs = dict(self.kwargs, **kwargs, max_tokens=max_tokens)
        if kwargs.get('mock', False): 
            kwargs['mock_func'] = punctuate_mock_func
        return self.manager.query(task='punctuate', 
                                  text=text,
                                  extra_kwargs=extra_kwargs,
                                  **kwargs)[1]
    
    # This version only punctuates rows of the relevant chunk that haven't 
    # been previously punctuated. While this is faster and cheaper, I've seen
    # some hints that the punctuation task may work better when we pass it
    # long-ish chunks of text and not little partial snippets. So it might
    # actually be better not to do this? I was going to say it might do better
    # if given full sentences rather than fragments, but I guess we can't 
    # easily extract those without doing the actual punctuation step.
    def punctuated_chunk(self, start_idx, end_idx, punctuate, 
                         align_kwargs=None, extra_kwargs=None, 
                         **query_kwargs):
        unpunct_chunk = self.df_gen.loc[start_idx:end_idx, :]
        if not punctuate:
            return unpunct_chunk
        
        # If our punctuated df has any null chunks, we only want to punctuate
        # them if specifically asked to.
        chunk = self.df_punct.loc[start_idx:end_idx, :]
        if punctuate == 'if_cached' and chunk.text.isnull().sum() > 0:
            return unpunct_chunk
        
        # When the whole chunk is pre-punctuated, na_index_chunks returns
        # an empty list so we're not doing any unnecessary gpt3 querying.
        for idx in na_index_chunks(chunk):
            df_chunk = self.df_gen.loc[idx]
            text_punct = self._punctuate_chunk(df_chunk,
                                               extra_kwargs=extra_kwargs,
                                               **query_kwargs)
            df_chunk_punct = realign_punctuated_text(
                df_chunk, text_punct, **ifnone(align_kwargs, {})
            )
            # Use 'values' attribute because realignment func resets index.
            self.df_punct.loc[idx, 'text'] = df_chunk_punct.text.values
        return self.df_punct.loc[start_idx:end_idx, :]
    
    def clear_punctuations(self):
        self.df_punct['text'] = np.nan

In [498]:
class PunctuatedTranscript:
    
    # Automated YouTube transcripts seem to skip all punctuation except single
    # quotes. Just delete most but replace a few with a space. This risks 
    # creating multiple spaces so we also try to replace those. Save space as 
    # a variable to avoid typos and make it explicit when we want to replace 
    # with a space vs. nothing.
    space = ' '
    punct_rm = '|'.join(re.escape(char) for char in punctuation 
                        if char not in ('/', '-'))
    punct_space = '|'.join(map(re.escape, ['/', '-', space * 2, space * 3]))
    
    def __init__(self, df_gen, df_punct, **kwargs):
        """
        kwargs:
            Just for compatibility with UnpunctuatedTranscript, which needs
            these to specify args like 'rstrip' when loading a prompt.
        """
        self.df_gen = df_gen
        self.df_punct = df_punct
        
    @property
    def df(self):
        return self.df_punct
    
    def punctuated_chunk(self, start_idx, end_idx, punctuate=True, **kwargs):
        chunk = self.df_punct.loc[start_idx:end_idx, :]
        # Notice this covers case where punctuate is True or a string.
        if punctuate:
            return chunk
        return chunk.assign(
            text=lambda x: x.text.str.lower()
                            .str.replace(self.punct_rm, '')
                            .str.replace(self.punct_space, self.space)
        )
    
    def clear_punctuations(self):
        warnings.warn('This is a manual transcript so there are no gpt3 '
                      'punctuations to clear.')

In [500]:
@auto_repr
class Transcript:
    
    def __init__(self, url, **kwargs):
        self.url = url
        self.id = video_id(url)
        self._transcript = self._fetch_transcripts(url, **kwargs)
        self.is_generated = isinstance(self._transcript,
                                       UnpunctuatedTranscript)
        # These are technically the start times for the first and last time 
        # segments, which are slightly different from the video start and end
        # times.
        self.start_time, self.end_time = self.df.start.ends(1)
            
    def _time_range(self, start, end) -> pd.DataFrame:
        assert end > start, 'End time must be later than start time.'
        assert start >= 0 and end >= 0, 'Times must be non-negative.'

        df = self.df
        if start < self.start_time:
            start_idx = 0
        else:
            start_idx = df.loc[df.start <= start].index[-1]

        if end > df.start.iloc[-1]:
            end_idx = df.tail(1).index[0]
        else:
            end_idx = df.loc[df.start >= end].index[0]
        return df.iloc[start_idx:end_idx+1]

    def time_range(self, start, end, punctuate='if_cached', align_kwargs=None,
                   extra_kwargs=None, **query_kwargs) -> pd.DataFrame:
        chunk = self._time_range(start, end)
        return self._transcript.punctuated_chunk(*chunk.ends(1).index, 
                                                 punctuate=punctuate,
                                                 align_kwargs=align_kwargs,
                                                 extra_kwargs=extra_kwargs,
                                                 **query_kwargs)
        
    def time_range_str(self, start, end, punctuate='if_cached',
                       full_sentences=True, max_trim=120, margin=3,
                       align_kwargs=None, extra_kwargs=None, 
                       **query_kwargs) -> str:
        if full_sentences and self.is_generated and punctuate is False:
            warnings.warn('This is an autogenerated transcript, so calling '
                          'time_range_str() with full_sentences=True will '
                          'not work as expected when punctuate=False. We '
                          'suggest setting these to both be True or both be '
                          'False.')
        
        # Values outside the acceptable range are handled later anyway. Widen
        # our candidate window so we can trim off partial sentences. May need
        # to experiment with right adjustment size.
        if full_sentences:
            start = max(0, start - margin)
            end += margin
        
        rows = self.time_range(start, end, punctuate=punctuate,
                               align_kwargs=align_kwargs,
                               extra_kwargs=extra_kwargs, **query_kwargs)
        text = ' '.join(rows.text.values)
        if not full_sentences:
            return text
        return self._full_sentences(text, max_trim=max_trim)
    
    @staticmethod
    def _full_sentences(text, max_trim=120, chars=('.', '!', '?')) -> str:
        first_upper = re.search('[A-Z]', text)
        start_idx = 0 if first_upper is None else first_upper.start()
        if start_idx > max_trim: start_idx = 0
        # Rfind returns -1 for missing chars.
        end_idx = max(text.rfind(char) for char in chars)
        if end_idx == -1 or end_idx < len(text) - max_trim - 1:
            end_idx = None
        else:
            end_idx += 1
        return text[start_idx:end_idx]
    
    @property
    def df(self):
        return self._transcript.df
    
    def _fetch_transcripts(self, url, **kwargs):
        """Wrapper to fetch youtube transcripts and create the appropriate
        transcript object depending on whether a manually generated (i.e.
        punctuated) transcript was retrieved.
        
        Parameters
        ----------
        url: str
        verbose: bool
        """
        df_gen, df_man, _ = self.get_transcripts(
            url, verbose=kwargs.get('verbose', True)
        )
        if df_man is None:
            return UnpunctuatedTranscript(df_gen, **kwargs)
        else:
            return PunctuatedTranscript(df_gen, df_man, **kwargs)
        
    def punctuated_index(self, flat=True) -> list:
        """Get indices of rows which have already been punctuated."""
        return na_index_chunks(self._transcript.df_punct, 'notnull', flat)
    
    def unpunctuated_index(self, flat=True) -> list:
        """Get indices of rows which have not yet been punctuated."""
        return na_index_chunks(self._transcript.df_punct, 'isnull', flat)
    
    def punctuated_times(self):
        df = self.df
        res = []
        for chunk in self.punctuated_index(flat=False):
            end_row = df.loc[chunk[-1]]
            res.append((df.loc[chunk[0], 'start'], 
                        end_row.start + end_row.duration))
        return res
        
    def punctuated_time_rows(self, chunk=False):
        idx = self.punctuated_index(flat=not chunk)
        # Don't use self.df, that points to unpunctuated version for generated
        # transcript.
        df = self._transcript.df_punct
        if chunk: return [df.loc[i] for i in idx]
        return df.loc[idx]
    
    def clear_punctuations(self):
        # Useful if we've been testing with mock calls and want to reset.
        self._transcript.clear_punctuations()
        
    @staticmethod
    def get_transcripts(url, verbose=True):
        """Fetch one or more transcripts for a youtube video given its URL.

        Parameters
        ----------
        url: str
            Don't include any channel-related suffix. E.G. use
            https://www.youtube.com/watch?v=OZbCRN3C_Hs, not
            https://www.youtube.com/watch?v=OZbCRN3C_Hs&ab_channel=BBC.
        verbose: bool
            Warn

        Returns
        -------
        DotDict: Contains keys 'id' (maps to video ID str), 'generated',
        and 'manual' (the latter two lap to pandas dfs or None if no 
        transcript was found). Manual transcripts are human-created.
        Generated transcripts are a bit lower quality and tend to lack
        punctuation.
        """
        langs = ['en', 'en-GB']
        id_ = video_id(url)
        res = {'generated': None, 'manual': None}
        trans_list = YouTubeTranscriptApi.list_transcripts(id_)
        res['generated'] = trans_list.find_generated_transcript(langs)
        try:
            res['manual'] = trans_list.find_manually_created_transcript(langs)
        except NoTranscriptFound:
            if verbose: warnings.warn('No manual transcript found.')
        if verbose:
            non_eng = [k for k, v in res.items()
                       if v and ('United Kingdom' in v.language)]
            if non_eng:
                warnings.warn(
                    f'{non_eng} {"has" if len(non_eng) == 1 else "have"} '
                    'language en-GB, not en.'
                )
        return Args(**{k: pd.DataFrame(v.fetch()) if v else v 
                       for k, v in res.items()},
                    id=id_)
    
    def __str__(self):
        return f'{type(self).__name__}(url={self.url}, '\
               f'is_generated={self.is_generated})'

In [501]:
gen_url = 'https://www.youtube.com/watch?v=AtTsn1Ia4JY&ab_channel=LukeThomas'
man_url = 'https://www.youtube.com/watch?v=NNnIGh9g6fA'

In [502]:
trans_gen = Transcript(gen_url)
trans_gen



Transcript(url='https://www.youtube.com/watch?v=AtTsn1Ia4JY&ab_channel=LukeThomas')

In [503]:
trans_man = Transcript(man_url)
trans_man

Transcript(url='https://www.youtube.com/watch?v=NNnIGh9g6fA')

In [504]:
assert trans_gen.punctuated_index() == [], 'Generated transcript should be '\
                                           'empty list at this point.'
assert trans_gen.unpunctuated_index() == list(range(trans_gen.df.shape[0])), \
    'Unpunctuated index should be whole df.'
assert trans_gen.punctuated_times() == [], 'No punctuated times yet.'
assert trans_gen.punctuated_time_rows().empty, 'Should have no punctuated ' \
    'rows yet.'

In [505]:
assert trans_man.punctuated_index() == list(range(trans_man.df.shape[0])), \
    'Generated transcript should be empty list at this point.'
assert trans_man.unpunctuated_index() == [], 'Unpunctuated index should be ' \
    ' whole df.'
assert (trans_man.punctuated_times() ==
        [(0, trans_man.end_time + trans_man.df.tail(1).duration.values[0])]),\
    'Whole df should be punctuated.'
assert trans_man.punctuated_time_rows().equals(trans_man.df), \
    'All rows should already be punctuated and should be returned.'

In [397]:
trans_man.time_range(43, 50)

Unnamed: 0,text,start,duration
14,"takes exception to it,\npunches him in the face.",42.95,2.69
15,Utterly strange.,45.64,2.09
16,Things are quiet.,47.73,1.42
17,"Three months later, his wife\nof 15 years happ...",49.15,3.24
18,discovers he's having an affair\nwith a 16-yea...,52.39,3.26


In [398]:
trans_man.time_range(43, 50, punctuate=False)

Unnamed: 0,text,start,duration
14,takes exception to it\npunches him in the face,42.95,2.69
15,utterly strange,45.64,2.09
16,things are quiet,47.73,1.42
17,three months later his wife\nof 15 years happy...,49.15,3.24
18,discovers hes having an affair\nwith a 16 year...,52.39,3.26


In [399]:
trans_man.time_range(43, 50, punctuate=True)

Unnamed: 0,text,start,duration
14,"takes exception to it,\npunches him in the face.",42.95,2.69
15,Utterly strange.,45.64,2.09
16,Things are quiet.,47.73,1.42
17,"Three months later, his wife\nof 15 years happ...",49.15,3.24
18,discovers he's having an affair\nwith a 16-yea...,52.39,3.26


In [313]:
print(trans_man.time_range_str(43, 50))

The guy is standing
there by the water cooler and makes some comment
on some baseball team, takes exception to it,
punches him in the face. Utterly strange. Things are quiet. Three months later, his wife
of 15 years happy marriage discovers he's having an affair
with a 16-year-old checkout kid down at the Safeway.


In [314]:
print(trans_man.time_range_str(45, 67, full_sentences=True, max_trim=80))

and makes some comment
on some baseball team, takes exception to it,
punches him in the face. Utterly strange. Things are quiet. Three months later, his wife
of 15 years happy marriage discovers he's having an affair
with a 16-year-old checkout kid down at the Safeway. Really weird. Then three months
after that, he absconds with all the money at work,
embezzles it, disappears, and is never seen again. Three possibilities. First one-- this guy
is a truly deep creep.


In [315]:
print(trans_man.time_range_str(45, 67, full_sentences=True, max_trim=120))

Utterly strange. Things are quiet. Three months later, his wife
of 15 years happy marriage discovers he's having an affair
with a 16-year-old checkout kid down at the Safeway. Really weird. Then three months
after that, he absconds with all the money at work,
embezzles it, disappears, and is never seen again. Three possibilities. First one-- this guy
is a truly deep creep.


In [316]:
print(trans_man.time_range_str(45, 67, full_sentences=True, max_trim=10))

and makes some comment
on some baseball team, takes exception to it,
punches him in the face. Utterly strange. Things are quiet. Three months later, his wife
of 15 years happy marriage discovers he's having an affair
with a 16-year-old checkout kid down at the Safeway. Really weird. Then three months
after that, he absconds with all the money at work,
embezzles it, disappears, and is never seen again. Three possibilities. First one-- this guy
is a truly deep creep. Second, he is having the
most immature midlife


In [317]:
print(trans_man.time_range_str(45, 60, full_sentences=True, max_trim=5))

and makes some comment
on some baseball team, takes exception to it,
punches him in the face. Utterly strange. Things are quiet. Three months later, his wife
of 15 years happy marriage discovers he's having an affair
with a 16-year-old checkout kid down at the Safeway. Really weird. Then three months
after that, he absconds with all the money at work,
embezzles it, disappears, and is never seen again.


In [318]:
print(trans_man.time_range_str(43, 50, full_sentences=False))

takes exception to it,
punches him in the face. Utterly strange. Things are quiet. Three months later, his wife
of 15 years happy marriage discovers he's having an affair
with a 16-year-old checkout


In [319]:
print(trans_man.time_range_str(43, 50, punctuate=False))

the guy is standing
there by the water cooler and makes some comment
on some baseball team takes exception to it
punches him in the face utterly strange things are quiet three months later his wife
of 15 years happy marriage discovers hes having an affair
with a 16 year old checkout kid down at the safeway


In [320]:
trans_gen._time_range(60, 70)

Unnamed: 0,text,start,duration
35,stuff but there is,58.879,2.801
36,a little bit more to the story what was,60.16,3.44
37,dan hooker trying what was his,61.68,3.839
38,game plan here and more to the point,63.6,4.16
39,there are some finer details to that,65.519,3.121
40,finish that,67.76,3.52
41,really deserve to get mentioned and i,68.64,4.08
42,don't know exactly how far,71.28,2.64


In [506]:
trans_gen.time_range(60, 70)

Unnamed: 0,text,start,duration
35,stuff but there is,58.879,2.801
36,a little bit more to the story what was,60.16,3.44
37,dan hooker trying what was his,61.68,3.839
38,game plan here and more to the point,63.6,4.16
39,there are some finer details to that,65.519,3.121
40,finish that,67.76,3.52
41,really deserve to get mentioned and i,68.64,4.08
42,don't know exactly how far,71.28,2.64


In [507]:
trans_gen.time_range(60, 70, True, mock=True)

Object loaded from data/misc/sample_response.pkl.


Unnamed: 0,text,start,duration
35,Stuff but there is,58.879,2.801
36,a little bit more to the story what was,60.16,3.44
37,dan hooker. Trying what was his,61.68,3.839
38,game plan here and more to the point,63.6,4.16
39,there are some. Finer details to that,65.519,3.121
40,finish that,67.76,3.52
41,really deserve to get mentioned and i,68.64,4.08
42,don't know. Exactly how far.,71.28,2.64


In [508]:
trans_gen.punctuated_index()

[35, 36, 37, 38, 39, 40, 41, 42]

In [509]:
trans_gen.clear_punctuations()
trans_gen.punctuated_index()

[]

In [512]:
trans_gen._transcript.manager.kwargs('punctuate')

{'engine_i': 1,
 'logprobs': None,
 'max_tokens': 50,
 'mock': False,
 'mock_func': None,
 'return_full': False,
 'stop': ['Passage: ', 'Passage with punctuation: '],
 'stream': False,
 'strip_output': True,
 'temperature': 0.1}

In [513]:
# Don't actually want to use high temperature, just making sure we're able to
# pass kwargs to the query through all my layers of abstraction 😬. Output 
# should be more different because we turned up the randomness.
trans_gen.time_range(60, 70, punctuate=True, temperature=.99)

  'Max score < 80. Your rows may have gotten misaligned '
  'Max score < 80. Your rows may have gotten misaligned '
  'Max score < 80. Your rows may have gotten misaligned '
  'Max score < 80. Your rows may have gotten misaligned '
  'Max score < 80. Your rows may have gotten misaligned '
  'Max score < 80. Your rows may have gotten misaligned '
  'Max score < 80. Your rows may have gotten misaligned '


Unnamed: 0,text,start,duration
35,and richer outcome they had,58.879,2.801
36,opportunities and we didn't reflect in really ...,60.16,3.44
37,in a live,61.68,3.839
38,of there are various ways they could,63.6,4.16
39,do that,65.519,3.121
40,they could they,67.76,3.52
41,could compete at Boston there was still,68.64,4.08
42,a small number of people that felt,71.28,2.64


In [514]:
trans_gen.clear_punctuations()

In [515]:
# Should work normally now.
trans_gen.time_range(60, 70, punctuate=True)

  'Max score < 80. Your rows may have gotten misaligned '


Unnamed: 0,text,start,duration
35,stuff but there is,58.879,2.801
36,a little bit more to the story. What was,60.16,3.44
37,Dan Hooker,61.68,3.839
38,"trying to do here? And more to the point,",63.6,4.16
39,there are some finer details to that,65.519,3.121
40,finish that,67.76,3.52
41,really deserve to get mentioned and I,68.64,4.08
42,don't know exactly how far,71.28,2.64


In [516]:
trans_gen.punctuated_index()

[35, 36, 37, 38, 39, 40, 41, 42]

In [517]:
trans_gen.punctuated_times()

[(58.879, 73.92)]

In [518]:
trans_gen.punctuated_time_rows()

Unnamed: 0,text,start,duration
35,stuff but there is,58.879,2.801
36,a little bit more to the story. What was,60.16,3.44
37,Dan Hooker,61.68,3.839
38,"trying to do here? And more to the point,",63.6,4.16
39,there are some finer details to that,65.519,3.121
40,finish that,67.76,3.52
41,really deserve to get mentioned and I,68.64,4.08
42,don't know exactly how far,71.28,2.64


In [205]:
trans_gen.time_range(55, 75, punctuate='if_cached')

Unnamed: 0,text,start,duration
31,michael chandler him with a big ass,54.8,1.68
32,punch,56.0,1.76
33,you know and kind of faded low and then,56.48,2.399
34,went high and then all that kind of,57.76,2.4
35,stuff but there is,58.879,2.801
36,a little bit more to the story what was,60.16,3.44
37,dan hooker trying what was his,61.68,3.839
38,game plan here and more to the point,63.6,4.16
39,there are some finer details to that,65.519,3.121
40,finish that,67.76,3.52


In [210]:
trans_gen.time_range(55, 75, punctuate='if_cached')

Unnamed: 0,text,start,duration
31,michael chandler him with a big ass,54.8,1.68
32,"punch, you",56.0,1.76
33,"know, and kind of faded low and then",56.48,2.399
34,went high and then all that kind of,57.76,2.4
35,stuff but there is,58.879,2.801
36,a little bit more to the story. What was,60.16,3.44
37,Dan Hooker,61.68,3.839
38,"trying to do here? And more to the point,",63.6,4.16
39,there are some finer details to that,65.519,3.121
40,finish that,67.76,3.52


In [211]:
# Much faster this time because it's been pre-punctuated.
trans_gen.time_range(55, 75, True)

Unnamed: 0,text,start,duration
31,michael chandler him with a big ass,54.8,1.68
32,"punch, you",56.0,1.76
33,"know, and kind of faded low and then",56.48,2.399
34,went high and then all that kind of,57.76,2.4
35,stuff but there is,58.879,2.801
36,a little bit more to the story. What was,60.16,3.44
37,Dan Hooker,61.68,3.839
38,"trying to do here? And more to the point,",63.6,4.16
39,there are some finer details to that,65.519,3.121
40,finish that,67.76,3.52


In [214]:
# Lowercase because we expanded the range and use "is_cached".
trans_gen.time_range(30, 80, 'if_cached')

Unnamed: 0,text,start,duration
15,black force with a black pitch fork,29.119,5.041
16,and a pitch black porsche all right,31.599,4.241
17,everyone as i mentioned,34.16,3.12
18,technical difficulties we're gonna get,35.84,2.64
19,into michael chandler i wanted to do,37.28,2.24
20,this earlier in the week but i also,38.48,2.239
21,wanted to sort of make sure i got,39.52,2.16
22,another video in,40.719,2.401
23,a little bit later in the week so i just,41.68,3.039
24,said you know i'll just push it back,43.12,3.68


In [119]:
# Previously punctuated.
trans_gen.time_range(50, 80, False)

Unnamed: 0,text,start,duration
28,"and um as always, the devil is in the",49.28,4.24
29,details it doesn't take a genius to look,51.84,2.96
30,at like what,53.52,2.48
31,"happened to be like, Michael Chandler him with a",54.8,1.68
32,"big ass punch,",56.0,1.76
33,"you know, and kind of faded low and then",56.48,2.399
34,went high and then all that kind of,57.76,2.4
35,stuff but there is,58.879,2.801
36,a little bit more to the story. What was,60.16,3.44
37,Dan Hooker,61.68,3.839


In [120]:
trans_gen.time_range_str(50, 80)



"uh you guys know the deal we're going to take a look at it here and um as always the devil is in the details it doesn't take a genius to look at like what happened to be like oh michael chandler him with a big ass punch you know and kind of faded low and then went high and then all that kind of stuff but there is a little bit more to the story what was dan hooker trying what was his game plan here and more to the point there are some finer details to that finish that really deserve to get mentioned and i don't know exactly how far michael chandler's going to go in the ufc but i do know that like however good he was early in bellator in his you know early to mid run over there he's way better now way better now and i'm going to show you some of the things"

In [121]:
trans_gen.time_range_str(50, 80, punctuate=True, full_sentences=True)

'Michael Chandler him with a big ass punch, you know, and kind of faded low and then went high and then all that kind of stuff but there is a little bit more to the story. What was Dan Hooker trying to do here?'

In [1805]:
trans_gen.time_range_str(50, 80, punctuate=True, full_sentences=True)

'Michael Chandler, him with a big ass punch, you know, and kind of faded low and then went high and stuff but there is a little bit more to the story. What was Dan Hooker trying to do here?'

In [122]:
trans_gen.time_range_str(50, 80, punctuate=False, full_sentences=False)

"and um as always, the devil is in the details it doesn't take a genius to look at like what happened to be like, Michael Chandler him with a big ass punch, you know, and kind of faded low and then went high and then all that kind of stuff but there is a little bit more to the story. What was Dan Hooker trying to do here? And more to the point, there are some finer details to that finish that really deserve to get mentioned and I don't know exactly how far michael chandler's going to go in the ufc, but I do know that like however good he was early in bellator in his early run over there, he's way better now, way better now and"

In [170]:
def na_index_chunks(chunk):
    nans = chunk[chunk.text.isnull()]
    if nans.empty: return []
    last_idx = nans.index[-1]
    res = []
    curr_chunk = []
    prev = None
    for idx in nans.index:
        if prev is None or (idx == prev + 1): 
            curr_chunk.append(idx)
        else:
            res.append(curr_chunk)
            curr_chunk = [idx]
        if idx == last_idx:
            res.append(curr_chunk)
        prev = idx
    return res

In [117]:
na_index_chunks(tmp.head(3))

[[32, 33, 34]]

In [116]:
eprint([tmp.loc[row] for row in na_index_chunks(tmp)])

 0:    text  start  duration
32  NaN  56.00     1.760
33  NaN  56.48     2.399
34  NaN  57.76     2.400
 1:    text  start  duration
46  NaN  78.88     4.720
47  NaN  80.72     4.079
48  NaN  83.60     1.920


In [249]:
res = trans_gen.time_range(10, 30)

  'Max score < 80. Your rows may have gotten misaligned '


## PromptManager

In [318]:
from functools import partialmethod
from glob import glob

In [1541]:
class PromptManager:
    """Simple class that stores all the prompt templates and default kwargs
    so we don't need to load them repeatedly. Use this as an interface for
    performing tasks on a video Transcript object.
    """
    
    def __init__(self, *prompts):
        self.prompts = self._load_templates(set(prompts))
    
    def _load_templates(self, prompts):
        name2kwargs = {}
        dir_ = Path('data/prompts')
        paths = (dir_/p for p in prompts) if prompts else dir_.iterdir()
        for path in paths:
            if not path.is_dir(): 
                if prompts: warnings.warn(f'{path} is not a directory.')
                continue
            name2kwargs[path.stem] = load_prompt(path.stem)
        return name2kwargs
    
    def query(self, task, text, debug=False, extra_kwargs=None, **kwargs):
        """
        """
        kwargs = self.kwargs(task=task, fully_resolved=False, 
                             return_prompt=True, extra_kwargs=extra_kwargs,
                             **kwargs)
        prompt = kwargs.pop('prompt').format(text)
        if debug:
            print('prompt:\n' + prompt)
            print(spacer())
            print('kwargs:\n', kwargs)
            print(spacer())
            print('fully resolved kwargs:\n',
                  dict(bound_args(query_gpt3, [], kwargs)))
            return
        return query_gpt3(prompt, **kwargs)
    
    def kwargs(self, task, fully_resolved=True, return_prompt=False,
               extra_kwargs=None, **kwargs):
        kwargs = {**self.prompts[task], **kwargs}
        for k, v in (extra_kwargs or {}).items():
            v_cls = type(v)
            # Make a new object instead of just using get() or setdefault 
            # since the latter two methods both mutate our default kwargs.
            curr_val = v_cls(kwargs.get(k, v_cls()))
            if isinstance(v, Iterable):
                curr_val.extend(v)
            elif isinstance(v, Mapping):
                curr_val.update(v)
            else:
                raise TypeError(f'Key {k} has unrecognized type {v_cls} in '
                                '`extra_kwargs`.')
            kwargs[k] = curr_val
                
        if fully_resolved: kwargs = dict(bound_args(query_gpt3, [], kwargs))
        return kwargs if return_prompt else select(kwargs, drop=['prompt'])
    
    def prompt(self, task, text='', print_=False):
        template = self.prompts[task]['prompt']
        res = template.format(text) if text else template
        if print_:
            print(res)
        else:
            return res
    
    def __repr__(self):
        return f'{type(self).__name__}({", ".join(map(repr, self.prompts))})'
    
    def __iter__(self):
        return iter(self.prompts)

In [1524]:
manager = PromptManager('eli', 'short_dates', 'punctuate', 'eli5')
manager

short_dates: This prompt takes no input.
-------------------------------------------------------------------------------

eli: This uses the expensive davinci model and doesn't work so well without it.
-------------------------------------------------------------------------------

punctuate: You should probably adjust max_tokens based on the length of the input. Bumping up to engine 2 or 3 might help a little, but engine 1 is serviceable (probably best to avoid 0 though). You should probably try training a huggingface model to add punctuation instead of using gpt3 credits though.
-------------------------------------------------------------------------------



  app.launch_new_instance()


PromptManager('short_dates', 'eli', 'punctuate')

In [1525]:
list(manager)

['short_dates', 'eli', 'punctuate']

In [1542]:
manager = PromptManager()
manager

simplify_ml: This uses the expensive davinci model and doesn't work so well without it. Temperature is set to 0.3 but this hasn't been extensively tuned.
-------------------------------------------------------------------------------

shortest: This prompt takes no input.
-------------------------------------------------------------------------------

how_to: Should be a single line starting with the words "How to" and ending in a colon. You may need a stronger engine for good results.
-------------------------------------------------------------------------------

punctuate: You should probably adjust max_tokens based on the length of the input. Bumping up to engine 2 or 3 might help a little, but engine 1 is serviceable (probably best to avoid 0 though). You should probably try training a huggingface model to add punctuation instead of using gpt3 credits though.
-------------------------------------------------------------------------------

tldr: This sets max tokens to 64. You may 

PromptManager('simplify_ml', 'shortest', 'how_to', 'punctuate', 'tldr', 'eli', 'short_dates')

In [1527]:
manager.prompt('short_dates', print_=True)

Input: 3/1/20
Output: March 1, 2020

Input: 09-04-99
Output: September 4, 1999

Input: 11/01/2017
Output: November 1, 2017

Input: 04/11/21
Output:


In [1528]:
manager.prompt('tldr', 'abcd')

'abcd\n\ntl;dr:'

In [1529]:
manager.kwargs('tldr')

{'engine_i': 2,
 'logprobs': None,
 'max_tokens': 64,
 'mock': False,
 'return_full': False,
 'stream': False,
 'strip_output': True,
 'temperature': 0.3}

In [1530]:
manager.kwargs('tldr', fully_resolved=False)

{'engine_i': 2, 'max_tokens': 64, 'temperature': 0.3}

In [1531]:
manager.kwargs('tldr', fully_resolved=False, return_prompt=True)

{'engine_i': 2, 'max_tokens': 64, 'prompt': '{}\n\ntl;dr:', 'temperature': 0.3}

In [1532]:
manager.kwargs('tldr', fully_resolved=True, return_prompt=True)

{'engine_i': 2,
 'logprobs': None,
 'max_tokens': 64,
 'mock': False,
 'prompt': '{}\n\ntl;dr:',
 'return_full': False,
 'stream': False,
 'strip_output': True,
 'temperature': 0.3}

In [1533]:
manager.kwargs('tldr', extra_kwargs={'stop': ['stop1', 'stop2']})

{'engine_i': 2,
 'logprobs': None,
 'max_tokens': 64,
 'mock': False,
 'return_full': False,
 'stop': ['stop1', 'stop2'],
 'stream': False,
 'strip_output': True,
 'temperature': 0.3}

In [1536]:
manager.kwargs('tldr', extra_kwargs={'stop': ['stop1', 'stop2']}, 
               logit_bias={100: -5})

{'engine_i': 2,
 'logit_bias': {100: -5},
 'logprobs': None,
 'max_tokens': 64,
 'mock': False,
 'return_full': False,
 'stop': ['stop1', 'stop2'],
 'stream': False,
 'strip_output': True,
 'temperature': 0.3}

In [1537]:
manager.kwargs('tldr')

{'engine_i': 2,
 'logprobs': None,
 'max_tokens': 64,
 'mock': False,
 'return_full': False,
 'stream': False,
 'strip_output': True,
 'temperature': 0.3}

In [1538]:
txt = """Attention enables the network to learn dynamic linear layers,
essentially encoding the input sequence as a directed, weighted graph
which we represent using a learned adjacency matrix.""".replace('\n', ' ')
manager.query('eli', txt, debug=True, stream=True, engine_i=1)

prompt:
Attention enables the network to learn dynamic linear layers, essentially encoding the input sequence as a directed, weighted graph which we represent using a learned adjacency matrix. 

I rephrased this for my daughter, in plain language a second grader can understand:

-------------------------------------------------------------------------------

kwargs:
 {'engine_i': 1, 'temperature': 0.3, 'stop': ['I rephrased this for my daughter, in plain language', 'prompt:'], 'stream': True}

-------------------------------------------------------------------------------

fully resolved kwargs:
 {'engine_i': 1, 'temperature': 0.3, 'max_tokens': 50, 'logprobs': None, 'stream': True, 'mock': False, 'return_full': False, 'strip_output': True, 'stop': ['I rephrased this for my daughter, in plain language', 'prompt:']}


In [1539]:
txt = """Attention enables the network to learn dynamic linear layers,
essentially encoding the input sequence as a directed, weighted graph
which we represent using a learned adjacency matrix.""".replace('\n', ' ')
manager.query('eli', txt, debug=True, stream=True, engine_i=1, 
              extra_kwargs={'stop': ['\n\n']})

prompt:
Attention enables the network to learn dynamic linear layers, essentially encoding the input sequence as a directed, weighted graph which we represent using a learned adjacency matrix. 

I rephrased this for my daughter, in plain language a second grader can understand:

-------------------------------------------------------------------------------

kwargs:
 {'engine_i': 1, 'temperature': 0.3, 'stop': ['I rephrased this for my daughter, in plain language', 'prompt:', '\n\n'], 'stream': True}

-------------------------------------------------------------------------------

fully resolved kwargs:
 {'engine_i': 1, 'temperature': 0.3, 'max_tokens': 50, 'logprobs': None, 'stream': True, 'mock': False, 'return_full': False, 'strip_output': True, 'stop': ['I rephrased this for my daughter, in plain language', 'prompt:', '\n\n']}


In [1540]:
manager.prompts['eli']

{'engine_i': 3,
 'prompt': '{} \n\nI rephrased this for my daughter, in plain language a second grader can understand:',
 'stop': ['I rephrased this for my daughter, in plain language', 'prompt:'],
 'temperature': 0.3}

In [1548]:
txt = """Attention enables the network to learn dynamic linear layers,
essentially encoding the input sequence as a directed, weighted graph
which we represent using a learned adjacency matrix.""".replace('\n', ' ')
res = manager.query('eli', txt, max_tokens=100, temperature=.2)

In [1549]:
res

('Attention enables the network to learn dynamic linear layers, essentially encoding the input sequence as a directed, weighted graph which we represent using a learned adjacency matrix. \n\nI rephrased this for my daughter, in plain language a second grader can understand:',
 '"Attention is a way for the network to pay attention to what it thinks is important in the input sequence. It does this by looking at the input sequence and deciding what parts of it are important. It then uses this information to decide what parts of the output sequence should be important."\n\nThe attention mechanism is a way for the network to decide what parts of the input sequence are important. It does this by looking at the input sequence and deciding what parts of it are important.')

In [1384]:
txt = """Attention enables the network to learn dynamic linear layers,
essentially encoding the input sequence as a directed, weighted graph
which we represent using a learned adjacency matrix.""".replace('\n', ' ')
for chunk in manager.query('eli', txt, max_tokens=200, stream=True):
    print(chunk)



The
network
learns
to
recognize
patterns
in
the
input
sequence
.
It
learns
to
recognize
patterns
by
looking
at
the
input
sequence
and
seeing
how
it
is
similar
to
other
input
sequences
.


The
network
learns
to
recognize
patterns
in
the
input
sequence
.
It
learns
to
recognize
patterns
by
looking
at
the
input
sequence
and
seeing
how
it
is
similar
to
other
input
sequences
.


The
network
learns
to
recognize
patterns
in
the
input
sequence
.
It
learns
to
recognize
patterns
by
looking
at
the
input
sequence
and
seeing
how
it
is
similar
to
other
input
sequences
.


The
network
learns
to
recognize
patterns
in
the
input
sequence
.
It
learns
to
recognize
patterns
by
looking
at
the
input
sequence
and
seeing
how
it
is
similar
to
other
input
sequences
.


The
network
learns
to
recognize
patterns
in
the
input
sequence
.
It
learns
to
recognize
patterns
by
looking
at
the
input
sequence
and
seeing
how
it
is
similar
to
other
input
sequences
.


The
network
learns
to
recognize
patterns
in
the
input
sequ

In [1416]:
manager.kwargs('eli')

{'engine_i': 3,
 'stop': 'I rephrased this for my daughter, in plain language',
 'temperature': 0.3}

In [1418]:
manager.kwargs('punctuate')

{'engine_i': 1,
 'stop': ['Passage: ', 'Passage with punctuation: '],
 'strip_output': True,
 'temperature': 0.1}

In [1420]:
txt = """Attention enables the network to learn dynamic linear layers,
essentially encoding the input sequence as a directed, weighted graph
which we represent using a learned adjacency matrix.""".replace('\n', ' ')
# _, res = 
manager.query('eli', txt, max_tokens=100, debug=True)

eli Attention enables the network to learn dynamic linear layers, essentially encoding the input sequence as a directed, weighted graph which we represent using a learned adjacency matrix.
prompt:
Attention enables the network to learn dynamic linear layers, essentially encoding the input sequence as a directed, weighted graph which we represent using a learned adjacency matrix. 

I rephrased this for my daughter, in plain language a second grader can understand:
kwargs:
 {'engine_i': 3, 'temperature': 0.3, 'stop': 'I rephrased this for my daughter, in plain language', 'max_tokens': 100}


In [1388]:
print(res)

"The network learns to pay attention to the things that matter most, and ignore the rest."

The attention mechanism is a key ingredient to making neural networks useful for natural language tasks.

The attention mechanism is a key ingredient to making neural networks useful for natural language tasks.

In the paper, the authors used the attention mechanism to improve the state-of-the-art performance on a number of natural language tasks, including machine translation, question answering, and sentence summarization.

The attention mechanism is a key ingredient to making neural networks useful for natural language tasks.

In the paper, the authors used the attention mechanism to improve the state-of-the-art performance on a number of natural language tasks, including machine translation, question answering, and sentence summarization.

The attention mechanism is a key ingredient to making neural networks useful for natural language tasks.

In the paper, the authors used the attention mec

In [284]:
manager.query('eli', 'Batch normalization decouples layer outputs from one another '
             'and reduces internal covariate shift. It leads to faster '
             'optimization by enabling the neural network to undergo stable '
             'training even with large learning rates.', engine_i=3)

('Batch normalization decouples layer outputs from one another and reduces internal covariate shift. It leads to faster optimization by enabling the neural network to undergo stable training even with large learning rates. \n\nI rephrased this for my daughter, in plain language a second grader can understand:',
 'Batch normalization is a way to make the learning process faster.\n\nBatch normalization is a technique that allows you to use larger learning rates.\n\nBatch normalization makes the neural network more stable.\n\nB')

# Better Partial

Toyed with idea of dynamically generating methods on Manager (e.g. letting us call manager.tldr(text) and see all default hypers with shift tab, reducing the need for all the debug=True calls). Ultimately got partial working but ran into issues even using functools.partial for method generation, and decided to move forward rather than getting further sidetracked.

In [903]:
def foo(a, a2, b=2, *args, c=3, **kwargs):
    print('a', a)
    print('a2', a2)
    print('b', b)
    print('args', args)
    print('c', c)
    print('kwargs', kwargs)

In [449]:
print(signature(foo))
print(foo.__name__)
print('def', foo.__defaults__)
print('kw def', foo.__kwdefaults__)

(a, b=2, *args, c=3, **kwargs)
foo
def (2,)
kw def {'c': 3}


In [450]:
pfoo = partial(foo, b=99, d='d')
print(signature(pfoo))
print(pfoo.__name__)

(a, *, b=99, c=3, **kwargs)


AttributeError: 'functools.partial' object has no attribute '__name__'

In [451]:
dfoo = partial(foo, b=99, d='d')
dfoo = update_wrapper(dfoo, foo)
dfoo.__dir__ = foo.__dir__
print(signature(dfoo))
print(dfoo.__name__)

(a, b=2, *args, c=3, **kwargs)
foo


In [398]:
params(pfoo)

{'a': <Parameter "a">,
 'args': <Parameter "*args">,
 'b': <Parameter "b=99">,
 'kwargs': <Parameter "**kwargs">}

In [358]:
dfoo()

TypeError: foo() missing 1 required positional argument: 'a'

In [460]:
from types import FunctionType, MethodType

In [461]:
{**(foo.__defaults__ or {})}

TypeError: 'tuple' object is not a mapping

In [462]:
foo.__defaults__

(2,)

In [463]:
bar.__kwdefaults__

In [476]:
dir(pfoo)

['__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'args',
 'func',
 'keywords']

In [477]:
pfoo.keywords

{'b': 99, 'd': 'd'}

In [473]:
pfoo.func.__defaults__

(2,)

In [474]:
pfoo.func.__kwdefaults__

{'c': 3}

In [475]:
pfoo.args

()

In [744]:
from inspect import _empty

In [1088]:
def attach_repr(func, repr_func=None, str_func=None, name=None):
    class CustomReprFunction:
        def __init__(self, func, repr_func, str_func):
            self.func = func
            self.repr_func = repr_func
            self.str_func = str_func
            update_wrapper(self, func)
            self.__name__ = ifnone(name, func.__name__)
        
        def __repr__(self):
            if self.repr_func:
                return self.repr_func(self)
            return repr(self.func)
        
        def __call__(self, *args, **kwargs):
            return self.func(*args, **kwargs)
            
        def __str__(self):
            if self.str_func:
                return self.str_func(self)
            return repr(self.func)
        
    func_ = CustomReprFunction(func, repr_func, str_func)
    return func_

In [1134]:
tmp = attach_repr(foo, name='bar')
tmp, str(tmp), tmp.__name__

(<function foo at 0x123fbf730>, '<function foo at 0x123fbf730>', 'bar')

In [1010]:
def getindex(arr, val, default=-1):
    return arr.index(val) if val in arr else default

In [1114]:
def rigorous_partial(func, name=None, **kwargs):
    func = copy_func(func)
    old_sig = signature(func)
    # Track names of positional args in old function since this affects the 
    # order args must be passed in if var_positional parameters (*args) are
    # present.
    old_pos_pars = []
    new_pars = []
    kwargs_name = ''
    args_name = ''
    for k, v in old_sig.parameters.items():
        if v.kind == 0:
            raise NotImplementedError(
                'rigorous_partial does not support functions with positional '
                'only parameters.'
            )
        elif v.kind == 2:
            args_name = k
        elif v.kind == 4:
            kwargs_name = k
            break
            
        if v.kind <= 2:
            old_pos_pars.append(k)

        # Assign default value from newly specified kwargs if provided.
        if k in kwargs:
            default = kwargs.pop(k)
            kind = 3
        else:
            default = v.default
            kind = v.kind
        param = inspect.Parameter(k, kind, default=default)
        new_pars.append(param)
        
    # Remaining kwargs only: those that were not present in func's signature.
    # Require that they be keyword only since ordering can cause issues
    # (updating signature affects what we see but doesn't seem to affect the 
    # actual order args are passed in, presumably due to old __code__ object).
    for k, v in kwargs.items():
        param = inspect.Parameter(k, 3, default=v)
        new_pars.append(param)
    if kwargs_name:
        new_pars.append(inspect.Parameter(kwargs_name, 4))
        
    # Ensure we don't accidentally place any parameters with defaults ahead
    # of those without them. Third item in tuple is a tiebreaker (defaults to
    # original function's parameter order).
    old_names = [p for p in old_sig.parameters]
    new_pars.sort(key=lambda x: (x.kind, x.default != _empty, 
                                 getindex(old_names, x.name, float('inf'))))

    # TODO: might need to attach these to the wrapped func instead? Can't 
    # access these outside this function. But we use it inside wrapper so that
    # could cause other problems.
    func.__defaults__ = tuple(p.default for p in new_pars if p.kind < 3 
                              and p.default != _empty)
    func.__kwdefaults__ = {p.name: p.default for p in new_pars if p.kind == 3}
    func.__signature__ = old_sig.replace(parameters=new_pars)
    if name: func.__name__ = name
    
    @wraps(func)
    def wrapper(*args, **new_kwargs):
        final_kwargs = bound_args(func, args, 
                                  {**func.__kwdefaults__, **new_kwargs})
        final_args = {name: final_kwargs.pop(name) for name in old_pos_pars}
        final_star_args = final_args.pop(args_name, [])
        final_kwargs = select(final_kwargs, drop=list(final_args))
        print('old_pos_pars', old_pos_pars)
        print('final_kwargs', final_kwargs)
        print('final_args', final_args)
        print('final star args', final_star_args)
        return func(*final_args.values(), *final_star_args, **final_kwargs)
    wrapper.__kwdefaults__ = func.__kwdefaults__
    wrapper.__defaults__ = func.__defaults__
    return wrapper

In [1352]:
class Partial:
    
    def __init__(self, func, name=None, **kwargs):
        self.func = copy_func(func)
        self.old_name = func.__name__

        # Track names of positional args in old function since this affects 
        # the order args must be passed in if var_positional parameters 
        # (*args) are present.
        self.old_pos_pars = []
        self.kwargs_name = ''
        self.args_name = ''
        new_pars = []
        old_sig = signature(self.func)
        for k, v in old_sig.parameters.items():
            # Check parameter kind for error handling and argument resolution
            # in __call__.
            if v.kind == 0:
                raise NotImplementedError(
                    'rigorous_partial does not support functions with '
                    'positional only parameters.'
                )
            elif v.kind == 2:
                self.args_name = k
            elif v.kind == 4:
                self.kwargs_name = k
                break

            if v.kind <= 2:
                self.old_pos_pars.append(k)

            # Assign default value from newly specified kwargs if provided.
            if k in kwargs:
                default = kwargs.pop(k)
                kind = 3
            else:
                default = v.default
                kind = v.kind
            param = inspect.Parameter(k, kind, default=default)
            new_pars.append(param)

        # Remaining kwargs only: those that were not present in func's 
        # signature. Require that they be keyword only since ordering can
        # cause issues (updating signature affects what we see but doesn't 
        # seem to affect the actual order args are passed in, presumably due 
        # to old __code__ object).
        for k, v in kwargs.items():
            param = inspect.Parameter(k, 3, default=v)
            new_pars.append(param)
        if self.kwargs_name:
            new_pars.append(inspect.Parameter(self.kwargs_name, 4))

        # Ensure we don't accidentally place any parameters with defaults 
        # ahead of those without them. Third item in tuple is a tiebreaker 
        # (defaults to original function's parameter order).
        old_names = [p for p in old_sig.parameters]
        new_pars.sort(
            key=lambda x: (x.kind, x.default != _empty, 
                           getindex(old_names, x.name, float('inf')))
        )

        # I honestly forget why we need to set the attribute on self.func too,
        # I just remember it was needed to resolve a bug (I think it was 
        # related to *args resolution).
        self.__signature__ = self.func.__signature__ = old_sig.replace(
            parameters=new_pars
        )
        self.__defaults__ = tuple(p.default for p in new_pars if p.kind < 3 
                                  and p.default != _empty)
        self.__kwdefaults__ = {p.name: p.default for p in new_pars
                               if p.kind == 3}
        if name: self.func.__name__ = name
        update_wrapper(self, self.func)
    
    def __call__(self, *args, **new_kwargs):
        tmp_kwargs = bound_args(self.func, args, 
                                {**self.__kwdefaults__, **new_kwargs})
        final_args = {name: tmp_kwargs.pop(name) 
                      for name in self.old_pos_pars}
        final_star_args = final_args.pop(self.args_name, [])
        final_kwargs = select(tmp_kwargs, drop=list(final_args))
        return self.func(*final_args.values(), *final_star_args,
                         **final_kwargs)

    def __repr__(self):
        return repr(self.func).replace(self.old_name, self.__name__)
    
    def __str__(self):
        return str(self.func).replace(self.old_name, self.__name__)

In [1358]:
partial(foo, a='a')

functools.partial(<function foo at 0x123fbf730>, a='a')

In [1359]:
Partial(foo, a='a')

<function foo at 0x123ed1400>

In [None]:
foo()

In [1208]:
new_v2 = Partial(foo, a=False, x=9, y=8, z='zzzz')
new_v2

<function foo at 0x123e418c8>

In [1209]:
new_v2('a2', 'b_val', 'some', 'args', 'more', new_kwarg='q')

a False
a2 a2
b b_val
args ('some', 'args', 'more')
c 3
kwargs {'x': 9, 'y': 8, 'z': 'zzzz', 'new_kwarg': 'q'}


In [1210]:
new_v2 = Partial(foo, name='a_new_name', a=False, x=9, y=8, z='zzzz')
new_v2

<function a_new_name at 0x123e50378>

In [1211]:
new_v2('a2', 'b_val', 'some', 'args', 'more', new_kwarg='q')

a False
a2 a2
b b_val
args ('some', 'args', 'more')
c 3
kwargs {'x': 9, 'y': 8, 'z': 'zzzz', 'new_kwarg': 'q'}


In [1212]:
new_v2.__name__, new_v2.__defaults__, new_v2.__kwdefaults__

('a_new_name', (2,), {'a': False, 'c': 3, 'x': 9, 'y': 8, 'z': 'zzzz'})

In [1213]:
str(new_v2)

'<function a_new_name at 0x123e50378>'

In [1214]:
inspect.isfunction(new_v2)

False

In [1215]:
inspect.isfunction(foo)

True

In [1217]:
print(inspect.getsource(foo))

def foo(a, a2, b=2, *args, c=3, **kwargs):
    print('a', a)
    print('a2', a2)
    print('b', b)
    print('args', args)
    print('c', c)
    print('kwargs', kwargs)



In [1218]:
# Tbh not sure what I'd want this to return. Ideally I guess we'd have
# rewritten the actual definition but I don't think this is possible without
# rebuilding the __code__ object, which I remember required using a largely
# undocumented function whose interface (purportedly) frequently changes.
print(inspect.getsource(new_v2))

def foo(a, a2, b=2, *args, c=3, **kwargs):
    print('a', a)
    print('a2', a2)
    print('b', b)
    print('args', args)
    print('c', c)
    print('kwargs', kwargs)



In [1203]:
eprint(params(new_v2).items())

 0: ('a2', <Parameter "a2">)
 1: ('b', <Parameter "b=2">)
 2: ('args', <Parameter "*args">)
 3: ('a', <Parameter "a=False">)
 4: ('c', <Parameter "c=3">)
 5: ('x', <Parameter "x=9">)
 6: ('y', <Parameter "y=8">)
 7: ('z', <Parameter "z='zzzz'">)
 8: ('kwargs', <Parameter "**kwargs">)


In [1200]:
signature(new_v2)

<Signature (a2, b=2, *args, a=False, c=3, x=9, y=8, z='zzzz', **kwargs)>

In [1102]:
signature(foo)

<Signature (a, a2, b=2, *args, c=3, **kwargs)>

In [1103]:
print(inspect.getsource(foo))

def foo(a, a2, b=2, *args, c=3, **kwargs):
    print('a', a)
    print('a2', a2)
    print('b', b)
    print('args', args)
    print('c', c)
    print('kwargs', kwargs)



In [1104]:
eprint({k: v.kind for k, v in params(foo).items()}.items())

 0: ('a', <_ParameterKind.POSITIONAL_OR_KEYWORD: 1>)
 1: ('a2', <_ParameterKind.POSITIONAL_OR_KEYWORD: 1>)
 2: ('b', <_ParameterKind.POSITIONAL_OR_KEYWORD: 1>)
 3: ('args', <_ParameterKind.VAR_POSITIONAL: 2>)
 4: ('c', <_ParameterKind.KEYWORD_ONLY: 3>)
 5: ('kwargs', <_ParameterKind.VAR_KEYWORD: 4>)


In [1105]:
signature(foo)

<Signature (a, a2, b=2, *args, c=3, **kwargs)>

In [1106]:
foo.__defaults__

(2,)

In [1107]:
foo.__kwdefaults__

{'c': 3}

In [1108]:
# Should have a=aaaa, a2=a2, b=2, args=(), c=3, kwargs={}.
foo('aaaa', 'a2')

a aaaa
a2 a2
b 2
args ()
c 3
kwargs {}


In [1236]:
new = Partial(foo, a2=111, z=99)
new

<function foo at 0x123e8ac80>

In [1237]:
# Should look like Signature(a, b=2, *args, a2=111, c=3, z=99, **kwargs).
signature(new)

<Signature (a, b=2, *args, a2=111, c=3, z=99, **kwargs)>

In [1238]:
eprint({k: (v.kind, v.default) for k, v in params(new).items()}.items())

 0: ('a', (<_ParameterKind.POSITIONAL_OR_KEYWORD: 1>, <class 'inspect._empty'>))
 1: ('b', (<_ParameterKind.POSITIONAL_OR_KEYWORD: 1>, 2))
 2: ('args', (<_ParameterKind.VAR_POSITIONAL: 2>, <class 'inspect._empty'>))
 3: ('a2', (<_ParameterKind.KEYWORD_ONLY: 3>, 111))
 4: ('c', (<_ParameterKind.KEYWORD_ONLY: 3>, 3))
 5: ('z', (<_ParameterKind.KEYWORD_ONLY: 3>, 99))
 6: ('kwargs', (<_ParameterKind.VAR_KEYWORD: 4>, <class 'inspect._empty'>))


In [1239]:
new.__defaults__

(2,)

In [1240]:
new.__kwdefaults__

{'a2': 111, 'c': 3, 'z': 99}

In [1241]:
# Should be b=2, a2=111
new('a')

a a
a2 111
b 2
args ()
c 3
kwargs {'z': 99}


In [1242]:
# Should be b='b', a2=111
new('a', 'b')

a a
a2 111
b b
args ()
c 3
kwargs {'z': 99}


In [1243]:
# Should be b=2, a2='a2'
new('a', a2='a2')

a a
a2 a2
b 2
args ()
c 3
kwargs {'z': 99}


In [1244]:
new(1, c='ccc', d='d_kwarg')

a 1
a2 111
b 2
args ()
c ccc
kwargs {'z': 99, 'd': 'd_kwarg'}


In [1248]:
new(1, 'b', 0, 1, 2, 3, 4, 5, c='ccc', d='d_kwarg')

a 1
a2 111
b b
args (0, 1, 2, 3, 4, 5)
c ccc
kwargs {'z': 99, 'd': 'd_kwarg'}


In [1249]:
new('a', 'a2', -1, 0, 1, 2, 3, 4, 5)

a a
a2 111
b a2
args (-1, 0, 1, 2, 3, 4, 5)
c 3
kwargs {'z': 99}


In [1250]:
new('a', 'a2', -1, 0, 1, 2, 3, 4, 5, z='nnn')

a a
a2 111
b a2
args (-1, 0, 1, 2, 3, 4, 5)
c 3
kwargs {'z': 'nnn'}


In [1251]:
new.__defaults__, new.__wrapped__.__defaults__

((2,), (2,))

In [1252]:
new.__kwdefaults__, new.__wrapped__.__kwdefaults__

({'a2': 111, 'c': 3, 'z': 99}, {'c': 3})

In [1254]:
new_named = Partial(foo, name='my_new_func', b=888, x='xxx', t='ttt')
new_named

<function my_new_func at 0x123ed11e0>

In [1255]:
new_named.__name__

'my_new_func'

In [1256]:
func_name(new_named)

'my_new_func'

In [1257]:
new_named.__wrapped__.__name__

'my_new_func'

In [1258]:
func_name(new_named.__wrapped__)

'my_new_func'

In [1353]:
# Testing example from stackoverflow where functools version acts weirdly. 
# https://stackoverflow.com/questions/38975975/python-partial-with-keyword-arguments
def baz(x=1, y=2, z=3):
    return f'x:{x}, y:{y}, z:{z}'

# Looks like my Partial works as expected here and functools.partial still
# does not.
zz = range(10)
f0 = Partial(baz, x=0, y=-6)
eprint(list(map(f0, zz)))
print(spacer())

f1 = partial(baz, x=0, y=-6)
eprint(list(map(f1, zz)))

 0: x:0, y:-6, z:0
 1: x:0, y:-6, z:1
 2: x:0, y:-6, z:2
 3: x:0, y:-6, z:3
 4: x:0, y:-6, z:4
 5: x:0, y:-6, z:5
 6: x:0, y:-6, z:6
 7: x:0, y:-6, z:7
 8: x:0, y:-6, z:8
 9: x:0, y:-6, z:9

-------------------------------------------------------------------------------



TypeError: baz() got multiple values for argument 'x'

In [1356]:
signature(f0)

<Signature (z=3, *, x=0, y=-6)>