In [1]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Summary

Start compiling humor dataset(s) for RL.

In [2]:
from concurrent.futures import ThreadPoolExecutor
from functools import partial
import numpy as np
import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm
import pandas as pd
from datasets import Dataset, load_dataset

from aeon.logging import logger

In [3]:
sess = requests.Session()

In [4]:
def _fetch_standup_transcript(url: str, sess: requests.Session) -> str:
    try:
        r = sess.get(url, timeout=5)
        r.raise_for_status()
    except Exception as e:
        logger.error(f'Failed to fetch url: {url} (error: {e})')
        return ''
    
    soup = BeautifulSoup(r.content, 'lxml')
    return '\n\n'.join(
        para.text 
        for para in soup.select("div.elementor-widget-container > p")
    )

fetch_standup_transcript = partial(_fetch_standup_transcript, sess=sess)

In [5]:
base_url = "https://scrapsfromtheloft.com/stand-up-comedy-scripts/"
url_prefix = 'https://scrapsfromtheloft.com/comedy/'

In [4]:
r = requests.get(base_url)

In [5]:
r

<Response [200]>

In [6]:
soup = BeautifulSoup(r.content, 'lxml')

In [47]:
transcript_urls = list(set(
    [link['href'] for link in soup.find_all('a')
    if link['href'].startswith(url_prefix)
    and link['href'] != url_prefix]
))

In [48]:
len(transcript_urls)

499

In [75]:
with ThreadPoolExecutor(max_workers=20) as exc:
    transcripts = list(tqdm(
        exc.map(fetch_standup_transcript, transcript_urls),
        total=len(transcript_urls)
    ))

  0%|                                                                                                  | 0/499 [00:00<?, ?it/s]ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/stewart-lee-90s-comedian-2006-full-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/stewart-lee-90s-comedian-2006-full-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/ellen-degeneres-the-beginning-2000-full-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/ellen-degeneres-the-beginning-2000-full-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/neal-brennan-women-and-black-dudes-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/neal-brennan-women-and-black-dudes-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/c

ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/fortune-feimster-good-fortune-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/fortune-feimster-good-fortune-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/jim-jefferies-alcoholocaust-2010-full-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/jim-jefferies-alcoholocaust-2010-full-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/dave-chappelle-acceptance-speech-2019-mark-twain-prize/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/dave-chappelle-acceptance-speech-2019-mark-twain-prize/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/eddie-izzard-dress-kill-1999-full-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfr

ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/dave-chappelle-846-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/dave-chappelle-846-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/billy-connolly-high-horse-tour-live-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/billy-connolly-high-horse-tour-live-transcript/)
 11%|█████████▋                                                                               | 54/499 [00:03<00:32, 13.72it/s]ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/mo-amer-mohammed-in-texas-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/mo-amer-mohammed-in-texas-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/chelsea-handler-the-feeling-transcript/ (error: 503 

ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/ralphie-may-filthy-animal-tour-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/ralphie-may-filthy-animal-tour-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/john-mulaney-snl-monologue-2018-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/john-mulaney-snl-monologue-2018-transcript/)
 17%|███████████████▌                                                                         | 87/499 [00:04<00:13, 31.21it/s]ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/george-carlin-dumb-americans-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/george-carlin-dumb-americans-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/gary-gulman-born-on-3rd-base-t

ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/sammy-obeid-how-to-save-gaza-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/sammy-obeid-how-to-save-gaza-transcript/)
 24%|████████████████████▊                                                                   | 118/499 [00:04<00:08, 43.59it/s]ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/hannah-berner-we-ride-at-dawn-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/hannah-berner-we-ride-at-dawn-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/eddie-izzard-glorious-1997-full-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/eddie-izzard-glorious-1997-full-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/sebastian-maniscalco-whats-wrong

ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/tom-segura-disgraceful-2018-full-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/tom-segura-disgraceful-2018-full-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/kevin-hart-zero-fks-given-2020-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/kevin-hart-zero-fks-given-2020-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/hannah-gadsby-nanette-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/hannah-gadsby-nanette-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/bert-kreischer-fighting-a-bear-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/bert-kreischer-fighting-a-bear-tra

ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/drew-michael-red-blue-green-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/drew-michael-red-blue-green-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/jim-gaffigan-quality-time-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/jim-gaffigan-quality-time-transcript/)
 36%|███████████████████████████████▋                                                        | 180/499 [00:06<00:06, 51.82it/s]ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/hasan-minhaj-homecoming-king-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/hasan-minhaj-homecoming-king-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/iliza-shlesinger-unveiled-transcript/ (error: 50

ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/ali-wong-baby-cobra-2016-full-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/ali-wong-baby-cobra-2016-full-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/richard-pryors-monologue-saturday-night-live-1975/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/richard-pryors-monologue-saturday-night-live-1975/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/cristela-alonzo-middle-classy-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/cristela-alonzo-middle-classy-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/jim-jefferies-intolerant-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/jim-jefferies-

ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/gabriel-iglesias-hot-and-fluffy-2007-full-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/gabriel-iglesias-hot-and-fluffy-2007-full-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/kevin-james-never-dont-give-up-full-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/kevin-james-never-dont-give-up-full-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/russell-brand-messiah-complex-2013-full-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/russell-brand-messiah-complex-2013-full-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/kevin-bridges-the-story-continues-transcript/ (error: 503 Server Error: Service Unavailable for url: http

ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/phil-wang-philly-philly-wang-wang-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/phil-wang-philly-philly-wang-wang-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/roy-wood-jr-no-one-loves-you-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/roy-wood-jr-no-one-loves-you-transcript/)
 53%|██████████████████████████████████████████████▉                                         | 266/499 [00:08<00:07, 32.36it/s]ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/jim-gaffigan-dark-pale-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/jim-gaffigan-dark-pale-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/cristela-alonzo-lower-classy-2017-full-tra

ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/leanne-morgan-im-every-woman-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/leanne-morgan-im-every-woman-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/rory-scovel-tries-stand-up-for-the-first-time-a-netflix-special/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/rory-scovel-tries-stand-up-for-the-first-time-a-netflix-special/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/taylor-tomlinson-have-it-all-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/taylor-tomlinson-have-it-all-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/iliza-shlesinger-elder-millennial-2018-full-transcript/ (error: 503 Server Error: Service Unavailable for url: https://

ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/colin-quinn-the-new-york-story-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/colin-quinn-the-new-york-story-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/tim-dillon-this-is-your-country-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/tim-dillon-this-is-your-country-transcript/)
 66%|██████████████████████████████████████████████████████████▎                             | 331/499 [00:10<00:04, 33.81it/s]ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/bo-burnham-what-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/bo-burnham-what-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/fred-armisen-standup-for-drummers-transcript/ (error: 50

ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/george-carlin-you-are-all-diseased-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/george-carlin-you-are-all-diseased-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/michelle-wolf-the-well-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/michelle-wolf-the-well-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/kevin-bridges-whole-different-story-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/kevin-bridges-whole-different-story-transcript/)
 72%|███████████████████████████████████████████████████████████████▎                        | 359/499 [00:11<00:04, 30.22it/s]ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/patton-oswalt-we-all-screa

ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/dave-chappelle-hbo-half-hour-1998-traduzione-italiana/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/dave-chappelle-hbo-half-hour-1998-traduzione-italiana/)
 78%|████████████████████████████████████████████████████████████████████▌                   | 389/499 [00:12<00:02, 38.88it/s]ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/w-kamau-bell-private-school-negro-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/w-kamau-bell-private-school-negro-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/amy-schumer-growing-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/amy-schumer-growing-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/eric-andre-legalize-

ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/mike-epps-only-one-mike-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/mike-epps-only-one-mike-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/enissa-amani-ehrenwort-2018-full-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/enissa-amani-ehrenwort-2018-full-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/daniel-sloss-x-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/daniel-sloss-x-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/david-spade-dandelion-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/david-spade-dandelion-transcript/)
ERROR:aeon.logging:Failed to fetch u

ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/kevin-james-irregardless-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/kevin-james-irregardless-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/nate-bargatze-nashville-christmas-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/nate-bargatze-nashville-christmas-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/t-j-miller-no-real-reason-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/t-j-miller-no-real-reason-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/louis-c-k-sorry-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/louis-c-k-sorry-transcript/)
ERROR:aeon.logging:Fai

ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/theo-von-no-offense-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/theo-von-no-offense-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/jimmy-carr-funny-business-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/jimmy-carr-funny-business-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/sam-morril-youve-changed-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/sam-morril-youve-changed-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/mae-martin-sap-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/mae-martin-sap-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scra

In [76]:
sum(bool(row) for row in transcripts)

7

In [69]:
pd.DataFrame({"url": transcript_urls, "transcript": transcripts}).iloc[10:50]

Unnamed: 0,url,transcript
10,https://scrapsfromtheloft.com/comedy/kevin-bri...,♪ I’m at the crossroads ♪\n♪ Getting drowned i...
11,https://scrapsfromtheloft.com/comedy/ali-wong-...,[“Get me Bodied (Extended Mix)” playing]\n\n[a...
12,https://scrapsfromtheloft.com/comedy/stewart-l...,"Recorded on 10 March 2006 at Chapter Arts, Can..."
13,https://scrapsfromtheloft.com/comedy/adam-devi...,[rock music playing]\n\n[indistinct chatter]\n...
14,https://scrapsfromtheloft.com/comedy/trevor-no...,"Filmed at The Lincoln Theatre in Washington, D..."
15,https://scrapsfromtheloft.com/comedy/dave-atte...,[Cheers and applause] – you guys ready to meet...
16,https://scrapsfromtheloft.com/comedy/nate-barg...,Nate Bargatze: Hello World (2023)\nGenre: Come...
17,https://scrapsfromtheloft.com/comedy/neal-bren...,[Hip-hop music plays] [cheers and applause] ma...
18,https://scrapsfromtheloft.com/comedy/ellen-deg...,"Ladies and gentlemen, please, welcome Ellen De..."
19,https://scrapsfromtheloft.com/comedy/hannah-ei...,(vinyl record popping)\n\n(“J’ai Du L’Oublier”...


Hitting rate limits. Could slow down dramatically + add backoff, orrr just revert to the existing huggingface dataset. Was going to do this myself for fun but if it's going to be a pain maybe not worth it, huggingface version has >80% of them.

## Huggingface dataset

In [3]:
ds = load_dataset("zachgitt/comedy-transcripts")

In [4]:
df = ds['train'].to_pandas()\
    .rename(columns=lambda x: x.replace('-', '_'))

In [5]:
df.transcript.sample().values[0]

'[instrumental music plays] ♪ All right\xa0♪ [screeching] [cheering and applause] Beautiful New York City. [cheering and applause continues] Now, I’ve got my father here. [cheering] And he’s planning the whole day today. He wanted to do lunch. He’s like, “Let’s go to the freedom thing. -Take a look at that. -[laughter] Maybe we go to Central Park.” I go, “Dad, I’m– I’m playing Radio City tonight. I got shit going on.” [laughter] He goes, “What, are you too big now? Big shot! [laughter] You’re a biggie, biggie shot. You can’t go out to lunch with your father? [laughter] Biggie, biggie shot.” [laughter] Came in on Wednesday, and it was… I don’t know what’s going on on these airplanes. Looks like a farm on the airplane now with the amount of animals coming down the aisle. [laughter] When did this…? Okay. Why– why is it okay to just bring the dog, a parrot. [laughter] You see a couple months ago, at the Newark Airport? Some idiot tried to bring a peacock. [laughter] She had a peacock on he

In [6]:
# Approximate transcript token counts
df.transcript.str.len().div(4).describe()

count      419.000000
mean     10993.677804
std       4452.077710
min        646.750000
25%       9332.750000
50%      11620.250000
75%      13910.375000
max      23058.500000
Name: transcript, dtype: float64

In [7]:
# Super rough napkin math, rounded everything
tokens_per_transcript = 10_000
n_transcripts = 400
n_100_token_calls = tokens_per_transcript * n_transcripts / 100
kb_per_100_token_call = np.array([10, 40, 100, 300])

kb_per_gb = 1_000_000
total_gb = n_100_token_calls * kb_per_100_token_call / kb_per_gb
total_gb

array([ 0.4,  1.6,  4. , 12. ])

## Labeling

Scratch code working out llm labeling stuff.

**Idea for later:** logprobs might actually be a really promising data source here to create more variants cheaply. Swap out some low probability word for a high probability one, joke ruined.

In [23]:
from aeon.labeling import LLMLabeler
from aeon.prompt import list_prompts, Prompt, Prompts

In [24]:
llm = LLMLabeler(
    Prompts.EXTRACT_JOKES,
    model='gpt-4.1'
)

In [25]:
res = llm.label(
    df.tail(10).assign(transcript=lambda x: x.transcript.str[:500])
)

Unstaged/uncommitted changes in git repository. You might want to commit them before running.


Labeling rows:   0%|          | 0/10 [00:00<?, ?it/s]

In [26]:
dfl = res['df']

In [27]:
dfl.response_content.str['items'].str.len()

0    1
1    2
2    2
3    1
4    2
5    5
6    1
7    4
8    2
9    3
Name: response_content, dtype: int64

In [30]:
dfl.response_content.sample().values[0]

{'items': [{'joke': 'I’ve been coming here for 150,000 years and I’m never bored.',
   'prompt': 'How long have you been coming to Denver?',
   'subtext': "I've visited Denver many times and still find it enjoyable.",
   'unfunny_variant': 'I’ve been coming here for a very long time and I’m never bored.'},
  {'joke': 'There’s always something to do and that is not the case every week on the road.',
   'prompt': 'Is it always fun traveling for shows?',
   'subtext': 'Some places are more interesting to visit than others when traveling for work.',
   'unfunny_variant': 'There’s always something to do here, but that’s not true everywhere I go.'}]}

## Scratch

Older code working out early labeling functionality.

In [100]:
from aeon.secrets import SecretManager
from openai import OpenAI
from pydantic import BaseModel, Field, create_model
import re

In [101]:
class Response(BaseModel):
    
    joke: str = Field(
        ...,
        description="One or more sentences containing a joke from the "
                    "input text."
    )
        
    subtext: str = Field(
        ...,
        description="The often banal observation underlying the joke. "
                    "This should not be funny."
    )
        
    joke_variant: str = Field(
        ...,
        description="Your attempt to modify the original joke while "
                    "maintaining the same subtext."
    )

In [102]:
Batch = create_model('Batch', items=list[Response])

In [103]:
instructions = """
You are a detail-oriented research assistant with a shrewd understanding of humor, human behavior, and writing.
Your job is to extract jokes from the user's message.

  <instructions>You are one step in a data pipeline for a humor-related research project. Your task is to extract all jokes from the input passage and return valid JSON where each object contains "joke" and "subtext" fields. You can think of the subtext as the often banal point that the joke is making, which the comedian has ultimately massaged or restructured such that the resulting joke subverts the audience's expectations in a fun way. The subtext should not be funny, e.g. it might plainly state "airplane food is bad". Not every sentence in the input must belong to a joke, but most of them probably will because I am showing you standup transcripts.
      
  <example_input>
  [comedian: Louis CK]
  I could never wear white pants because I’ll get my period, first of all. I know that. [Laughter] Or diarrhea, more likely. [Laughter] Which is — That’s really my period. Diarrhea. About once a month, I’m like, “Oh, fuck, here we go.” [Laughter] “Better just get home. And don’t make any big decisions today.” [Laughter] It’s true. Don’t make — You know, if you have diarrhea, don’t, like, negotiate. It’s a bad bargaining position. If I have diarrhea, you stand between me and the toilet, I’ll sell you my house for 10 cents.
  </example_input>
  <example_output>
  [
      {
          "joke": "I could never wear white pants because I’ll get my period, first of all. I know that. Or diarrhea, more likely.",
          "subtext": "Wearing white pants makes any bodily mishap immediately obvious, so it’s risky."
      },
      {
          "joke": "Which is — That’s really my period. Diarrhea. About once a month, I’m like, “Oh, fuck, here we go.”",
          "subtext": "Monthly digestive mishaps can feel as routine and unavoidable as menstruation."
      },
      {
          "joke": "“Better just get home. And don’t make any big decisions today.” It’s true. Don’t make — You know, if you have diarrhea, don’t, like, negotiate. It’s a bad bargaining position. If I have diarrhea, you stand between me and the toilet, I’ll sell you my house for 10 cents.",
          "subtext": "Being desperate for a bathroom gives you zero leverage in any negotiation."
      }
  ]
  </example_output>
  </instructions>
"""

In [104]:
# number of sentences in one mulaney transcript
len(df.transcript.iloc[418].split('.'))

863

In [105]:
unlabeled = '.'.join(df.transcript.iloc[418].split('.')[:25]) + '.'
user_message = f"<example_to_label>{unlabeled}</example_to_label>"

In [106]:
secrets = SecretManager().get_secrets()

In [107]:
openai_client = OpenAI(api_key=secrets['OPENAI_API_KEY'])

In [108]:
tmp = openai_client.chat.completions.parse(
    model="gpt-4.1-nano",
    messages=[
        {"role": "developer", "content": instructions},
        {"role": "user", "content": user_message}
    ],
    response_format=Batch,
    temperature=0.3,
    logprobs=True,
    top_logprobs=20,
)

In [109]:
type(tmp)

openai.types.chat.parsed_chat_completion.ParsedChatCompletion[Batch]

In [125]:
tmp.model_dump(mode='json')['choices'][0]['message']['content']

'{"items":[{"joke":"The past couple years, I’ve done a lot of work on myself. And I’ve realized that I’ll be fine as long as I get constant attention.","subtext":"The speaker values attention highly and believes it is essential for his well-being.","joke_variant":"I\'ve learned that I need constant attention to be okay."},{"joke":"When I was three years old, they pulled me aside and they told me that I was adopted. And that my real mother had been murdered… by Miss America.","subtext":"The speaker was told a fabricated, sensational story about his adoption and mother\'s death.","joke_variant":"As a child, I was told I was adopted and that my mother was murdered by Miss America."},{"joke":"They said, \'If you ask our mom about it, she’ll get really upset. So don’t ask her if it’s true unless you want to upset her.\' And they said, \'If you ask our dad about it, he’ll say that we’re lying.\' But he’s lying.","subtext":"The children were advised to avoid asking their parents about the fab

In [62]:
tmp = {
    "index": 0,
    "success": True,
    "error": "",
    "response": res.model_dump_json(),
    "api_kwargs": {},
}

In [63]:
with open('/tmp/0.json', "w") as f:
    json.dump(tmp, f)

Observation: joke variant actually maybe works better than subtext! Using kinda dumb models maybe works pretty well, you don't need to tell it to make the joke unfunny, it does it naturally!

In [40]:
json.loads(res.choices[0].message.content)['items']

[{'joke': 'The past couple years, I’ve done a lot of work on myself. And I’ve realized that I’ll be fine as long as I get constant attention.',
  'subtext': 'People often seek attention to feel okay about themselves.',
  'joke_variant': "I've learned that I need constant attention to feel okay."},
 {'joke': 'When I was three years old, they pulled me aside and they told me that I was adopted. And that my real mother had been murdered… by Miss America.',
  'subtext': 'Children can misunderstand complex family situations in humorous ways.',
  'joke_variant': "As a child, I misunderstood my family's explanations about my adoption and my mother's death."},
 {'joke': 'They thought of every angle.',
  'subtext': 'People often prepare for all possible questions or reactions.',
  'joke_variant': 'They considered every possible way I might react or ask questions.'},
 {'joke': 'And to compound the stress that I was under, when I was three years old, I thought that Miss America was the Statue of 

In [50]:
json.loads(res.model_dump_json()).keys()

dict_keys(['id', 'choices', 'created', 'model', 'object', 'service_tier', 'system_fingerprint', 'usage'])

In [63]:
res.usage

CompletionUsage(completion_tokens=508, prompt_tokens=1165, total_tokens=1673, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0))

In [175]:
# [row for row in res.choices[0].logprobs.content]

In [10]:
df[['transcript']].head().to_dict(orient='records')

[{'transcript': 'Full transcript of It’s Bad for Ya, final HBO stand-up comedy special by stand-up comedian George Carlin. It was televised live on March 1, 2008 on HBO. Filmed in the Wells Fargo Center for the Arts in Santa Rosa, California Thank you. Thank you. Thank you. I’d like to begin… I’d like to… Thank you. Thank you. I’d like to begin by saying fuck Lance Armstrong. Fuck him and his balls and his bicycles and his steroids and his yellow shirts and the dumb, empty expression on his face. I’m tired of that asshole. And while you’re at it fuck Tiger Woods, too. There’s another jack-off I can do without. I’m tired of being told who to admire in this country. Aren’t you sick of being told who your heroes ought to be? You know? Being told who you ought to be looking up to. I’ll choose my own heroes, thank you very much. And fuck Dr. Phil, too. Dr. Phil said I should express my emotions, so that’s what I’m doing. Now, since the last time I rolled through these parts, and I do roll t