# Summary

Start tying the OpenAI and YouTube functionality together to manage the transcript punctuation process.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from pathlib import Path
from youtube_transcript_api import YouTubeTranscriptApi, NoTranscriptFound

from jabberwocky.config import C
from jabberwocky.openai_utils import query_gpt3
from jabberwocky.utils import load_prompt
from jabberwocky.youtube import realign_punctuated_text, get_transcripts, \
    text_segment, video_id
from htools import *

In [3]:
cd_root()

Current directory: /Users/hmamin/jabberwocky


In [195]:
class UnpunctuatedTranscript:
    """Auto-generated transcript whose rows are punctuated lazily via GPT-3.

    Two dataframes are kept side by side: `df_gen` holds the raw generated
    transcript and `df_punct` is a copy whose 'text' column starts out null
    and is filled in chunk by chunk as punctuation is requested.
    """

    def __init__(self, df_gen, **kwargs):
        """
        Parameters
        ----------
        df_gen: pd.DataFrame
            Generated transcript. Must contain a 'text' column.
        kwargs: any
            Merged over the kwargs loaded from the 'punctuate' prompt and
            therefore forwarded to `query_gpt3` on every punctuation query.
        """
        self.df_gen = df_gen
        self.df_punct = self._empty_punct_df(df_gen)
        # TODO: adjust so prompt template saved separately from prompt_kwargs.
        # TODO: adjust load_prompt to have option to skip .format call
        # if no text passed in.
        self.prompt_kwargs = load_prompt('punctuate')
        self.prompt_fmt = self.prompt_kwargs.pop('prompt')
        self.prompt_kwargs.update(kwargs)

    @staticmethod
    def _empty_punct_df(df_gen):
        """Copy `df_gen` and blank out its 'text' column.

        The column is created with object dtype rather than by assigning a
        bare np.nan (which yields float64): punctuated strings are written
        into it later and pandas deprecates setting values of an incompatible
        dtype into an existing column.
        """
        df_punct = df_gen.copy()
        df_punct['text'] = pd.Series(np.nan, index=df_punct.index,
                                     dtype=object)
        return df_punct

    @property
    def df(self):
        """The raw generated transcript."""
        # TODO: maybe should return self.df_punct, possibly after punct query?
        # Unsure.
        return self.df_gen

    def _punctuate_chunk(self, df_chunk, **kwargs) -> str:
        """Join the chunk's text rows and ask GPT-3 to punctuate the result.

        Parameters
        ----------
        df_chunk: pd.DataFrame
            Rows of the generated transcript to punctuate.
        kwargs: any
            Per-call overrides merged over `self.prompt_kwargs`. Note that
            'max_tokens' is always recomputed from the text length and
            overrides any value supplied here or at construction.

        Returns
        -------
        The second item of the tuple returned by `query_gpt3`.
        """
        text = ' '.join(df_chunk.text)
        prompt = self.prompt_fmt.format(text).rstrip()
        # n_tokens is roughly 1.33 * n_words. Add buffer for punctuation.
        prompt_kwargs = {**self.prompt_kwargs,
                         **kwargs,
                         'max_tokens': int(len(text.split()) * 2)}
        _, text_punct = query_gpt3(prompt, **prompt_kwargs)
        return text_punct

    # This version only punctuates rows of the relevant chunk that haven't
    # been previously punctuated. While this is faster and cheaper, I've seen
    # some hints that the punctuation task may work better when we pass it
    # long-ish chunks of text and not little partial snippets. So it might
    # actually be better not to do this? I was going to say it might do better
    # if given full sentences rather than fragments, but I guess we can't
    # easily extract those without doing the actual punctuation step.
    def punctuated_chunk(self, start_idx, end_idx, punctuate,
                         align_kwargs=None, **query_kwargs):
        """Return rows start_idx..end_idx (label-based, inclusive).

        Parameters
        ----------
        start_idx, end_idx:
            Index labels bounding the chunk.
        punctuate: bool
            If False, return the raw generated rows without querying. If
            True, punctuate every contiguous run of not-yet-punctuated rows
            in the range, cache the result in `self.df_punct` and return the
            punctuated rows.
        align_kwargs: dict or None
            Forwarded to `realign_punctuated_text`.
        query_kwargs: any
            Forwarded to `_punctuate_chunk`.
        """
        if not punctuate:
            return self.df_gen.loc[start_idx:end_idx, :]
        for idx in self.na_index_chunks(self.df_punct.loc[start_idx:end_idx]):
            df_chunk = self.df_gen.loc[idx]
            text_punct = self._punctuate_chunk(df_chunk, **query_kwargs)
            df_chunk_punct = realign_punctuated_text(
                df_chunk, text_punct, **ifnone(align_kwargs, {})
            )
            # Use 'values' attribute because realignment func resets index.
            self.df_punct.loc[idx, 'text'] = df_chunk_punct.text.values
        return self.df_punct.loc[start_idx:end_idx, :]

    @staticmethod
    def na_index_chunks(chunk) -> list:
        """Given a chunk of a df that may contain null text rows, return a
        list of lists where each nested list contains the index labels of a
        contiguous run of null rows.

        Labels are considered contiguous when each one equals the previous
        label plus one, so an integer index is expected. Returns an empty
        list when no row has null text.
        """
        nans = chunk[chunk.text.isnull()]
        if nans.empty:
            return []
        labels = list(nans.index)
        chunks = [[labels[0]]]
        # Walk consecutive label pairs; a gap starts a new run. The final run
        # is already in `chunks`, so no end-of-loop bookkeeping is needed.
        for prev, curr in zip(labels, labels[1:]):
            if curr == prev + 1:
                chunks[-1].append(curr)
            else:
                chunks.append([curr])
        return chunks

In [196]:
class PunctuatedTranscript:
    """Transcript for which a manually created (already punctuated) version
    exists, so no punctuation queries are ever needed.
    """

    def __init__(self, df_gen, df_punct, **kwargs):
        """
        Parameters
        ----------
        df_gen:
            The generated transcript (stored as-is).
        df_punct:
            The manually created transcript (stored as-is).
        kwargs:
            Ignored. Accepted only so this class can be constructed with the
            same call as UnpunctuatedTranscript, which uses them when
            setting up its prompt.
        """
        self.df_gen, self.df_punct = df_gen, df_punct

    @property
    def df(self):
        """The punctuated transcript."""
        return self.df_punct

    def punctuated_chunk(self, start_idx, end_idx, **kwargs):
        """Return rows start_idx..end_idx (label-based, inclusive) of the
        punctuated transcript. Extra kwargs are accepted and ignored.
        """
        rows = slice(start_idx, end_idx)
        return self.df_punct.loc[rows, :]

In [197]:
@auto_repr
class Transcript:
    """YouTube transcript for a single video.

    Wraps either an UnpunctuatedTranscript (only a generated transcript was
    found) or a PunctuatedTranscript (a manually created one exists) and
    exposes time-based slicing on top of it.
    """

    def __init__(self, url, **kwargs):
        """
        Parameters
        ----------
        url: str
            Video URL.
        kwargs: any
            'verbose' (bool, default True) controls warnings while fetching.
            Everything else is forwarded to the underlying transcript class.
        """
        self.url = url
        self.id = video_id(url)
        self._transcript = self._fetch_transcripts(url, **kwargs)
        self.is_generated = isinstance(self._transcript,
                                       UnpunctuatedTranscript)
        # `ends` is not a stock pandas method; presumably an htools extension
        # returning the first and last n rows - TODO confirm.
        self.start_time, self.end_time = self.df.start.ends(1)

    def _time_range(self, start, end, full_sentences=True):
        """Return the rows of `self.df` covering the window [start, end].

        The first row returned is the last one starting at or before `start`
        (or the first row if `start` precedes the transcript) and the last
        row returned is the first one starting at or after `end` (or the
        final row if `end` is past the last start time).

        Positions are used throughout so the result is correct whatever the
        dataframe's index labels are.

        Parameters
        ----------
        start, end: numeric
            Window bounds, in the same units as the 'start' column.
        full_sentences: bool
            Currently unused.
        """
        assert end > start, 'End time must be later than start time.'
        assert start >= 0 and end >= 0, 'Times must be non-negative.'

        df = self.df
        starts = df.start.values
        if start < self.start_time:
            start_pos = 0
        else:
            start_pos = np.flatnonzero(starts <= start)[-1]

        if end > starts[-1]:
            end_pos = len(df) - 1
        else:
            end_pos = np.flatnonzero(starts >= end)[0]
        return df.iloc[start_pos:end_pos + 1]

    def time_range(self, start, end, punctuate=False, full_sentences=True,
                   align_kwargs=None, **query_kwargs):
        """Return the transcript rows covering [start, end], optionally
        punctuated.

        Parameters
        ----------
        start, end: numeric
            Window bounds (see `_time_range`).
        punctuate: bool
            Forwarded to the underlying transcript's `punctuated_chunk`.
        full_sentences: bool
            Forwarded to `_time_range`, where it is currently unused.
        align_kwargs: dict or None
            Forwarded to `punctuated_chunk`.
        query_kwargs: any
            Forwarded to `punctuated_chunk`.
        """
        chunk = self._time_range(start, end, full_sentences=full_sentences)
        return self._transcript.punctuated_chunk(*chunk.ends(1).index,
                                                 punctuate=punctuate,
                                                 align_kwargs=align_kwargs,
                                                 **query_kwargs)

    @property
    def df(self):
        """The underlying transcript object's `df`."""
        return self._transcript.df

    def _fetch_transcripts(self, url, **kwargs):
        """Wrapper to fetch youtube transcripts and create the appropriate
        transcript object depending on whether a manually generated (i.e.
        punctuated) transcript was retrieved.

        Parameters
        ----------
        url: str
        verbose: bool
            Consumed here (default True) and passed to `get_transcripts`.
        kwargs: any
            Remaining kwargs are forwarded to the transcript class.
        """
        # Pop rather than get: 'verbose' only concerns fetching, and leaving
        # it in kwargs would forward it into the punctuation query kwargs.
        verbose = kwargs.pop('verbose', True)
        df_gen, df_man, _ = self.get_transcripts(url, verbose=verbose)
        if df_man is None:
            return UnpunctuatedTranscript(df_gen, **kwargs)
        else:
            return PunctuatedTranscript(df_gen, df_man, **kwargs)

    @staticmethod
    def get_transcripts(url, verbose=True):
        """Fetch one or more transcripts for a youtube video given its URL.

        Parameters
        ----------
        url: str
            Don't include any channel-related suffix. E.G. use
            https://www.youtube.com/watch?v=OZbCRN3C_Hs, not
            https://www.youtube.com/watch?v=OZbCRN3C_Hs&ab_channel=BBC.
        verbose: bool
            If True, warn when no manual transcript is found and when a
            retrieved transcript's language is en-GB rather than en.

        Returns
        -------
        The object built by `Args(generated=..., manual=..., id=...)`.
        'generated' and 'manual' map to pandas dfs (or None if that
        transcript was not found) and 'id' maps to the video ID.
        `_fetch_transcripts` unpacks the result positionally in that order.
        Manual transcripts are human-created. Generated transcripts are a
        bit lower quality and tend to lack punctuation.

        Note that a missing generated transcript is not caught here, so
        whatever `find_generated_transcript` raises will propagate.
        """
        langs = ['en', 'en-GB']
        id_ = video_id(url)
        res = {'generated': None, 'manual': None}
        trans_list = YouTubeTranscriptApi.list_transcripts(id_)
        res['generated'] = trans_list.find_generated_transcript(langs)
        try:
            res['manual'] = trans_list.find_manually_created_transcript(langs)
        except NoTranscriptFound:
            if verbose: warnings.warn('No manual transcript found.')
        if verbose:
            non_eng = [k for k, v in res.items()
                       if v and ('United Kingdom' in v.language)]
            if non_eng:
                warnings.warn(
                    f'{non_eng} {"has" if len(non_eng) == 1 else "have"} '
                    'language en-GB, not en.'
                )
        return Args(**{k: pd.DataFrame(v.fetch()) if v else v
                       for k, v in res.items()},
                    id=id_)

    def __str__(self):
        return f'{type(self).__name__}(url={self.url}, '\
               f'is_generated={self.is_generated})'

In [189]:
gen_url = 'https://www.youtube.com/watch?v=AtTsn1Ia4JY&ab_channel=LukeThomas'
man_url = 'https://www.youtube.com/watch?v=NNnIGh9g6fA'

In [190]:
trans_gen = Transcript(gen_url)
trans_gen



You should probably adjust max_tokens based on the length of the input. Bumping up to engine 2 or 3 might help a little, but engine 1 is serviceable (probably best to avoid 0 though). You should probably try training a huggingface model to add punctuation instead of using gpt3 credits though.
-------------------------------------------------------------------------------



Transcript(url='https://www.youtube.com/watch?v=AtTsn1Ia4JY&ab_channel=LukeThomas')

In [191]:
trans_man = Transcript(man_url)
trans_man

Transcript(url='https://www.youtube.com/watch?v=NNnIGh9g6fA')

In [192]:
trans_man.time_range(7, 16)

Unnamed: 0,text,start,duration
1,Stanford University.,4.94,0.92
2,"This is BIO 150, isn't it?",9.36,3.37
3,"OK, just wanted to make sure.",12.73,2.74
4,So we start off with a scenario.,15.47,3.42
5,"40-year-old guy--\nquiet, suburban life.",18.89,4.06


In [193]:
trans_gen._time_range(60, 70)

Unnamed: 0,text,start,duration
35,stuff but there is,58.879,2.801
36,a little bit more to the story what was,60.16,3.44
37,dan hooker trying what was his,61.68,3.839
38,game plan here and more to the point,63.6,4.16
39,there are some finer details to that,65.519,3.121
40,finish that,67.76,3.52
41,really deserve to get mentioned and i,68.64,4.08
42,don't know exactly how far,71.28,2.64


In [194]:
trans_gen.time_range(60, 70)

Unnamed: 0,text,start,duration
35,stuff but there is,58.879,2.801
36,a little bit more to the story what was,60.16,3.44
37,dan hooker trying what was his,61.68,3.839
38,game plan here and more to the point,63.6,4.16
39,there are some finer details to that,65.519,3.121
40,finish that,67.76,3.52
41,really deserve to get mentioned and i,68.64,4.08
42,don't know exactly how far,71.28,2.64


In [177]:
res = trans_gen.time_range(60, 70, True)

[35, 36, 37, 38, 39, 40, 41, 42]


  'Max score < 80. Your rows may have gotten misaligned '


In [178]:
res

Unnamed: 0,text,start,duration
35,stuff but there is,58.879,2.801
36,a little bit more to the story. What was,60.16,3.44
37,Dan Hooker,61.68,3.839
38,"trying to do here? And more to the point,",63.6,4.16
39,there are some finer details to that,65.519,3.121
40,finish that,67.76,3.52
41,really deserve to get mentioned. And I,68.64,4.08
42,don't know exactly how far,71.28,2.64


In [179]:
res = trans_gen.time_range(50, 80, True)

[28, 29, 30, 31, 32, 33, 34]


  'Max score < 80. Your rows may have gotten misaligned '
  'Max score < 80. Your rows may have gotten misaligned '
  'Max score < 80. Your rows may have gotten misaligned '


[43, 44, 45, 46, 47]


  'Max score < 80. Your rows may have gotten misaligned '


In [180]:
res

Unnamed: 0,text,start,duration
28,"and um as always, the devil is in the",49.28,4.24
29,details it doesn't take a genius to look,51.84,2.96
30,at like what,53.52,2.48
31,"happened to be like, Michael Chandler, him with a",54.8,1.68
32,"big ass punch,",56.0,1.76
33,"you know, and kind of faded low and then",56.48,2.399
34,went high and then all that kind of,57.76,2.4
35,stuff but there is,58.879,2.801
36,a little bit more to the story. What was,60.16,3.44
37,Dan Hooker,61.68,3.839


In [181]:
trans_gen._transcript.df_punct.loc[25:50]

Unnamed: 0,text,start,duration
25,,44.719,3.52
26,,46.8,2.48
27,,48.239,3.601
28,"and um as always, the devil is in the",49.28,4.24
29,details it doesn't take a genius to look,51.84,2.96
30,at like what,53.52,2.48
31,"happened to be like, Michael Chandler, him with a",54.8,1.68
32,"big ass punch,",56.0,1.76
33,"you know, and kind of faded low and then",56.48,2.399
34,went high and then all that kind of,57.76,2.4


In [184]:
# Much faster this time because it's been pre-punctuated.
res = trans_gen.time_range(50, 80, True)

In [185]:
res

Unnamed: 0,text,start,duration
28,"and um as always, the devil is in the",49.28,4.24
29,details it doesn't take a genius to look,51.84,2.96
30,at like what,53.52,2.48
31,"happened to be like, Michael Chandler, him with a",54.8,1.68
32,"big ass punch,",56.0,1.76
33,"you know, and kind of faded low and then",56.48,2.399
34,went high and then all that kind of,57.76,2.4
35,stuff but there is,58.879,2.801
36,a little bit more to the story. What was,60.16,3.44
37,Dan Hooker,61.68,3.839


In [170]:
def na_index_chunks(chunk):
    """Group the index labels of null-text rows into contiguous runs.

    Standalone prototype of the logic that also exists as
    `UnpunctuatedTranscript.na_index_chunks`.

    Parameters
    ----------
    chunk: pd.DataFrame
        Must have a 'text' column. Labels are treated as contiguous when each
        equals the previous label plus one, so an integer index is expected.

    Returns
    -------
    list[list]: one inner list of index labels per contiguous run of rows
    whose text is null. Empty list if no text is null.
    """
    nans = chunk[chunk.text.isnull()]
    if nans.empty:
        return []
    labels = list(nans.index)
    chunks = [[labels[0]]]
    # A gap between consecutive labels starts a new run. The last run is
    # already in `chunks`, so it never needs to be flushed by comparing
    # against the final label (which misbehaves if labels repeat).
    for prev, curr in zip(labels, labels[1:]):
        if curr == prev + 1:
            chunks[-1].append(curr)
        else:
            chunks.append([curr])
    return chunks

In [117]:
na_index_chunks(tmp.head(3))

[[32, 33, 34]]

In [116]:
eprint([tmp.loc[row] for row in na_index_chunks(tmp)])

 0:    text  start  duration
32  NaN  56.00     1.760
33  NaN  56.48     2.399
34  NaN  57.76     2.400
 1:    text  start  duration
46  NaN  78.88     4.720
47  NaN  80.72     4.079
48  NaN  83.60     1.920


In [249]:
res = trans_gen.time_range(10, 30)

  'Max score < 80. Your rows may have gotten misaligned '


[PosixPath('data/prompts/simplify_ml'),
 PosixPath('data/prompts/shortest'),
 PosixPath('data/prompts/how_to'),
 PosixPath('data/prompts/README.md'),
 PosixPath('data/prompts/punctuate'),
 PosixPath('data/prompts/tldr'),
 PosixPath('data/prompts/eli'),
 PosixPath('data/prompts/short_dates')]

In [318]:
from functools import partialmethod
from glob import glob

In [None]:
from glob import glob

In [387]:
@auto_repr
class PromptManager:
    """Simple class that stores all the prompt templates and default kwargs
    so we don't need to load them repeatedly. Use this as an interface for
    performing tasks on a video Transcript object.
    """

    def __init__(self, *prompts):
        """
        Parameters
        ----------
        prompts: str
            Names of prompt directories under data/prompts to load. If none
            are given, every directory found there is loaded.
        """
        # dict.fromkeys drops duplicates while keeping the caller's order
        # (a set would make the order of self.prompts non-deterministic).
        self.prompts = self._load_templates(tuple(dict.fromkeys(prompts)))
        # TODO: prob rm. Kind of liked the idea of making each task its own
        # method (e.g. manager.tldr('...') instead of
        # manager.query('tldr', '...')). However, using partial obscures the
        # kwargs which is annoying. Making the defaults visible with shift-tab
        # would need more magic than it is probably worth; a debug option is
        # close enough. Sleeping on it before deleting this.
        for name in list(self.prompts):
            # Never let a prompt name replace an existing attribute or method
            # such as `prompts`, `query`, `kwargs` or `prompt`.
            if hasattr(self, name):
                warnings.warn(f'Prompt {name!r} clashes with an existing '
                              'attribute; use query() for this task.')
                continue
            # Bind the task positionally so the shortcut accepts the text
            # either positionally or by keyword. Defaults are not bound here:
            # query() looks them up in self.prompts at call time.
            meth = partial(self.query, name)
            setattr(self, name, update_wrapper(meth, self.query))

    def _load_templates(self, prompts):
        """Load the kwargs for each requested prompt.

        Parameters
        ----------
        prompts: collection of str
            Prompt directory names. If empty, every entry of data/prompts is
            considered and non-directories are skipped silently; otherwise a
            warning is issued for each requested name that is not a
            directory.

        Returns
        -------
        dict: maps prompt name to the value returned by `load_prompt`.
        """
        name2kwargs = {}
        dir_ = Path('data/prompts')
        paths = (dir_/p for p in prompts) if prompts else dir_.iterdir()
        for path in paths:
            if not path.is_dir():
                if prompts: warnings.warn(f'{path} is not a directory.')
                continue
            # Use the full directory name: `stem` would truncate a name that
            # contains a dot.
            name2kwargs[path.name] = load_prompt(path.name)
        return name2kwargs

    def _make_query(self, task, **kwargs):
        """Merge the stored defaults for `task` with per-call overrides.

        Returns
        -------
        dict: a new dict (stored defaults are not mutated) that still
        contains the 'prompt' template.
        """
        return {**self.prompts[task], **kwargs}

    def query(self, task, text, debug=False, **kwargs):
        """Format the task's prompt with `text` and send it to GPT-3.

        Parameters
        ----------
        task: str
            Name of a loaded prompt.
        text: str
            Inserted into the prompt template via str.format.
        debug: bool
            If True, print the task, text, formatted prompt and kwargs, then
            return None without querying.
        kwargs: any
            Override the task's stored default kwargs.

        Returns
        -------
        Whatever `query_gpt3` returns, or None in debug mode.
        """
        kwargs = self._make_query(task, **kwargs)
        prompt = kwargs.pop('prompt').format(text)
        if debug:
            print(task, text)
            print('prompt:\n' + prompt)
            print('kwargs:\n', kwargs)
            return
        return query_gpt3(prompt, **kwargs)

    def kwargs(self, task):
        """Return the stored default kwargs for `task`, minus the prompt."""
        return select(self.prompts[task], drop=['prompt'])

    def prompt(self, task, text='', print_=False):
        """Return (or print) the prompt for `task`.

        Parameters
        ----------
        task: str
        text: str
            If non-empty, the template is formatted with it. Otherwise the
            raw template is used.
        print_: bool
            If True, print the result and return None.
        """
        template = self.prompts[task]['prompt']
        res = template.format(text) if text else template
        if print_:
            print(res)
        else:
            return res

    def __str__(self):
        return f'{type(self).__name__}(prompts=[{", ".join(self.prompts)}])'

In [388]:
manager = PromptManager('eli', 'short_dates', 'punctuate', 'eli5')
print(manager)

short_dates: This prompt takes no input.
-------------------------------------------------------------------------------

eli: This uses the expensive davinci model and doesn't work so well without it.
-------------------------------------------------------------------------------

punctuate: You should probably adjust max_tokens based on the length of the input. Bumping up to engine 2 or 3 might help a little, but engine 1 is serviceable (probably best to avoid 0 though). You should probably try training a huggingface model to add punctuation instead of using gpt3 credits though.
-------------------------------------------------------------------------------

PromptManager(prompts=[short_dates, eli, punctuate])




In [389]:
{k: v.default for k, v in signature(manager.eli).parameters.items()}

{'debug': False,
 'kwargs': inspect._empty,
 'task': inspect._empty,
 'text': inspect._empty}

In [380]:
manager.eli(debug=True, text='text')

eli text
prompt:
text 

I rephrased this for my daughter, in plain language a second grader can understand:
kwargs:
 {'engine_i': 3, 'temperature': 0.3, 'stop': 'I rephrased this for my daughter, in plain language'}


In [334]:
manager = PromptManager()
print(manager)

simplify_ml: This uses the expensive davinci model and doesn't work so well without it. Temperature is set to 0.3 but this hasn't been extensively tuned.
-------------------------------------------------------------------------------

shortest: This prompt takes no input.
-------------------------------------------------------------------------------

how_to: Should be a single line starting with the words "How to" and ending in a colon. You may need a stronger engine for good results.
-------------------------------------------------------------------------------

punctuate: You should probably adjust max_tokens based on the length of the input. Bumping up to engine 2 or 3 might help a little, but engine 1 is serviceable (probably best to avoid 0 though). You should probably try training a huggingface model to add punctuation instead of using gpt3 credits though.
-------------------------------------------------------------------------------

tldr: This sets max tokens to 64. You may 

In [335]:
manager.prompt('tldr')

'{}\n\ntl;dr:'

In [336]:
manager.prompt('tldr', 'abcd')

'abcd\n\ntl;dr:'

In [337]:
manager.prompt('tldr', 'abcd', print_=True)

abcd

tl;dr:


In [309]:
manager.kwargs('tldr')

{'engine_i': 2, 'max_tokens': 64, 'temperature': 0.3}

In [338]:
manager.kwargs('tldr')

{'engine_i': 2, 'max_tokens': 64, 'temperature': 0.3}

In [339]:
vars(manager).keys()

dict_keys(['prompts'])

In [316]:
manager.query('eli', 'Attention enables the network to learn dynamic linear layers, essentially encoding the input sequence as a directed, weighted graph which we represent via a learned adjacency matrix.', debug=True, stream=True, engine_i=1)

prompt:
Attention enables the network to learn dynamic linear layers, essentially encoding the input sequence as a directed, weighted graph which we represent via a learned adjacency matrix. 

I rephrased this for my daughter, in plain language a second grader can understand:
kwargs:
 {'engine_i': 1, 'temperature': 0.3, 'stop': 'I rephrased this for my daughter, in plain language', 'stream': True}


In [273]:
manager.eli(text='Batch normalization decouples layer outputs from one another '
             'and reduces internal covariate shift. It leads to faster '
             'optimization by enabling the neural network to undergo stable '
             'training even with large learning rates.')

('Batch normalization decouples layer outputs from one another and reduces internal covariate shift. It leads to faster optimization by enabling the neural network to undergo stable training even with large learning rates. \n\nI rephrased this for my daughter, in plain language a second grader can understand:',
 'Batch normalization is a way to make the math easier when you are trying to learn how to do something.\n\nThe paper is very readable and I highly recommend it.\n\nThe paper is also available in the arXiv')

In [284]:
manager.query('eli', 'Batch normalization decouples layer outputs from one another '
             'and reduces internal covariate shift. It leads to faster '
             'optimization by enabling the neural network to undergo stable '
             'training even with large learning rates.', engine_i=3)

('Batch normalization decouples layer outputs from one another and reduces internal covariate shift. It leads to faster optimization by enabling the neural network to undergo stable training even with large learning rates. \n\nI rephrased this for my daughter, in plain language a second grader can understand:',
 'Batch normalization is a way to make the learning process faster.\n\nBatch normalization is a technique that allows you to use larger learning rates.\n\nBatch normalization makes the neural network more stable.\n\nB')

In [343]:
params(pfoo)

{'a': <Parameter "a">,
 'args': <Parameter "*args">,
 'b': <Parameter "b=99">,
 'kwargs': <Parameter "**kwargs">}

In [635]:
def foo(a, a2, b=2, *args, c=3, **kwargs):
    """Toy function covering every parameter kind used in the signature
    experiments: prints each argument's name followed by its value.
    """
    received = (
        ('a', a),
        ('a2', a2),
        ('b', b),
        ('args', args),
        ('c', c),
        ('kwargs', kwargs),
    )
    for label, value in received:
        print(label, value)

In [449]:
print(signature(foo))
print(foo.__name__)
print('def', foo.__defaults__)
print('kw def', foo.__kwdefaults__)

(a, b=2, *args, c=3, **kwargs)
foo
def (2,)
kw def {'c': 3}


In [450]:
pfoo = partial(foo, b=99, d='d')
print(signature(pfoo))
print(pfoo.__name__)

(a, *, b=99, c=3, **kwargs)


AttributeError: 'functools.partial' object has no attribute '__name__'

In [451]:
dfoo = partial(foo, b=99, d='d')
dfoo = update_wrapper(dfoo, foo)
dfoo.__dir__ = foo.__dir__
print(signature(dfoo))
print(dfoo.__name__)

(a, b=2, *args, c=3, **kwargs)
foo


In [398]:
params(pfoo)

{'a': <Parameter "a">,
 'args': <Parameter "*args">,
 'b': <Parameter "b=99">,
 'kwargs': <Parameter "**kwargs">}

In [358]:
dfoo()

TypeError: foo() missing 1 required positional argument: 'a'

In [460]:
from types import FunctionType, MethodType

In [461]:
{**(foo.__defaults__ or {})}

TypeError: 'tuple' object is not a mapping

In [462]:
foo.__defaults__

(2,)

In [463]:
bar.__kwdefaults__

In [476]:
dir(pfoo)

['__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'args',
 'func',
 'keywords']

In [477]:
pfoo.keywords

{'b': 99, 'd': 'd'}

In [473]:
pfoo.func.__defaults__

(2,)

In [474]:
pfoo.func.__kwdefaults__

{'c': 3}

In [475]:
pfoo.args

()

# TODO

- If we pass in a default for one parameter that has no default but not for a later one, the parameter order becomes invalid (all parameters without defaults must come first).
- When extra kwargs are passed in, how does the function actually use them? They are added to the signature, but since `**kwargs` has no defaults, the function would have to know which args are available before the partial is created.

In [687]:
def rigorous_partial(func, name=None, **kwargs):
    """Create a copy of `func` with new defaults baked into the function
    object itself, so `__defaults__`, `__kwdefaults__` and the signature all
    reflect them (unlike functools.partial).

    Parameters
    ----------
    func: function
        Plain Python function to copy. It is not modified.
    name: str or None
        `__name__` of the new function. Defaults to func's name.
    kwargs: any
        New default values. A name matching one of func's parameters
        replaces that parameter's default. Any other name is only advertised
        as a keyword-only parameter in the new signature (and recorded in
        `__kwdefaults__`); the copied code object has no such parameter, so
        its default is NOT injected at call time.

    Returns
    -------
    function: new function sharing func's code, globals and closure.

    Raises
    ------
    ValueError: if the new defaults produce an invalid signature, e.g. a
    parameter without a default following one that has a default.
    """
    Parameter = inspect.Parameter
    old_sig = signature(func)
    new_pars = []
    var_keyword = None
    for par_name, par in old_sig.parameters.items():
        # **kwargs is always last; hold it back so extras can go before it.
        if par.kind is Parameter.VAR_KEYWORD:
            var_keyword = par
            break
        if par_name in kwargs:
            par = par.replace(default=kwargs.pop(par_name))
        new_pars.append(par)

    # Leftover names do not exist in func's code, so they can only ever be
    # keyword-only. Marking them positional would shift __defaults__ onto the
    # wrong parameters.
    new_pars.extend(Parameter(k, Parameter.KEYWORD_ONLY, default=v)
                    for k, v in kwargs.items())
    if var_keyword is not None:
        new_pars.append(var_keyword)

    # Build the signature first so an invalid parameter order fails before
    # any function object is created.
    new_sig = old_sig.replace(parameters=new_pars)

    positional = (Parameter.POSITIONAL_ONLY, Parameter.POSITIONAL_OR_KEYWORD)
    new = FunctionType(
        func.__code__,
        func.__globals__,
        name=name or func.__name__,
        argdefs=tuple(p.default for p in new_pars
                      if p.kind in positional
                      and p.default is not Parameter.empty),
        closure=func.__closure__
    )
    # Required keyword-only parameters must stay out of __kwdefaults__,
    # otherwise the `empty` sentinel would be used as a real default.
    new.__kwdefaults__ = {p.name: p.default for p in new_pars
                          if p.kind is Parameter.KEYWORD_ONLY
                          and p.default is not Parameter.empty}
    new.__signature__ = new_sig
    return new

In [688]:
inspect.Parameter('a', 2).kind

<_ParameterKind.VAR_POSITIONAL: 2>

In [689]:
sorted([getattr(inspect.Parameter, k) for k, v in hdir(inspect.Parameter).items() if k.isupper()])

[<_ParameterKind.POSITIONAL_ONLY: 0>,
 <_ParameterKind.POSITIONAL_OR_KEYWORD: 1>,
 <_ParameterKind.VAR_POSITIONAL: 2>,
 <_ParameterKind.KEYWORD_ONLY: 3>,
 <_ParameterKind.VAR_KEYWORD: 4>]

In [690]:
signature(foo)

<Signature (a, a2, b=2, *args, c=3, **kwargs)>

In [691]:
inspect._ParameterKind.POSITIONAL_OR_KEYWORD < 3

True

In [692]:
{k: v.kind for k, v in params(foo).items()}

{'a': <_ParameterKind.POSITIONAL_OR_KEYWORD: 1>,
 'a2': <_ParameterKind.POSITIONAL_OR_KEYWORD: 1>,
 'args': <_ParameterKind.VAR_POSITIONAL: 2>,
 'b': <_ParameterKind.POSITIONAL_OR_KEYWORD: 1>,
 'c': <_ParameterKind.KEYWORD_ONLY: 3>,
 'kwargs': <_ParameterKind.VAR_KEYWORD: 4>}

In [693]:
foo.__defaults__

(2,)

In [694]:
foo.__kwdefaults__

{'c': 3}

In [695]:
foo('aaaa', 'a2')

a aaaa
a2 a2
b 2
args ()
c 3
kwargs {}


In [696]:
new = rigorous_partial(foo, a2=111, z=99)
new

<function __main__.foo>

In [697]:
signature(new)

<Signature (a, a2=111, b=2, *args, c=3, z=99, **kwargs)>

In [698]:
params(new)

{'a': <Parameter "a">,
 'a2': <Parameter "a2=111">,
 'args': <Parameter "*args">,
 'b': <Parameter "b=2">,
 'c': <Parameter "c=3">,
 'kwargs': <Parameter "**kwargs">,
 'z': <Parameter "z=99">}

In [699]:
{k: v.kind for k, v in params(new).items()}

{'a': <_ParameterKind.POSITIONAL_OR_KEYWORD: 1>,
 'a2': <_ParameterKind.POSITIONAL_OR_KEYWORD: 1>,
 'args': <_ParameterKind.VAR_POSITIONAL: 2>,
 'b': <_ParameterKind.POSITIONAL_OR_KEYWORD: 1>,
 'c': <_ParameterKind.KEYWORD_ONLY: 3>,
 'kwargs': <_ParameterKind.VAR_KEYWORD: 4>,
 'z': <_ParameterKind.KEYWORD_ONLY: 3>}

In [700]:
new.__defaults__

(111, 2)

In [701]:
new.__kwdefaults__

{'c': 3, 'z': 99}

In [703]:
new('a')

a a
a2 111
b 2
args ()
c 3
kwargs {}


In [705]:
new('a', 'a2')

a a
a2 a2
b 2
args ()
c 3
kwargs {}


In [497]:
sorted({k: v.kind for k, v in signature(new).parameters.items()}.items(), key=lambda x: x[1])

[('a', <_ParameterKind.POSITIONAL_OR_KEYWORD: 1>),
 ('b', <_ParameterKind.POSITIONAL_OR_KEYWORD: 1>),
 ('args', <_ParameterKind.VAR_POSITIONAL: 2>),
 ('c', <_ParameterKind.KEYWORD_ONLY: 3>),
 ('kwargs', <_ParameterKind.VAR_KEYWORD: 4>)]