In [1]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Summary

Start compiling humor dataset(s) for RL.

In [92]:
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from huggingface_hub import login, HfApi
import os
import numpy as np
import random
import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm
import pandas as pd
from datasets import Dataset, load_dataset
from openai import OpenAI
from pydantic import BaseModel

from aeon import config
from aeon.logging import logger
from aeon.secrets import SecretManager
from aeon.labeling import LLMLabeler
from aeon.prompt import list_prompts, Prompt, Prompts

In [4]:
secrets = SecretManager().get_secrets()

In [3]:
sess = requests.Session()

In [4]:
def _fetch_standup_transcript(url: str, sess: requests.Session) -> str:
    try:
        r = sess.get(url, timeout=5)
        r.raise_for_status()
    except Exception as e:
        logger.error(f'Failed to fetch url: {url} (error: {e})')
        return ''
    
    soup = BeautifulSoup(r.content, 'lxml')
    return '\n\n'.join(
        para.text 
        for para in soup.select("div.elementor-widget-container > p")
    )

fetch_standup_transcript = partial(_fetch_standup_transcript, sess=sess)

In [5]:
base_url = "https://scrapsfromtheloft.com/stand-up-comedy-scripts/"
url_prefix = 'https://scrapsfromtheloft.com/comedy/'

In [4]:
r = requests.get(base_url)

In [5]:
r

<Response [200]>

In [6]:
soup = BeautifulSoup(r.content, 'lxml')

In [47]:
transcript_urls = list(set(
    [link['href'] for link in soup.find_all('a')
    if link['href'].startswith(url_prefix)
    and link['href'] != url_prefix]
))

In [48]:
len(transcript_urls)

499

In [75]:
with ThreadPoolExecutor(max_workers=20) as exc:
    transcripts = list(tqdm(
        exc.map(fetch_standup_transcript, transcript_urls),
        total=len(transcript_urls)
    ))

  0%|                                                                                                  | 0/499 [00:00<?, ?it/s]ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/stewart-lee-90s-comedian-2006-full-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/stewart-lee-90s-comedian-2006-full-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/ellen-degeneres-the-beginning-2000-full-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/ellen-degeneres-the-beginning-2000-full-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/neal-brennan-women-and-black-dudes-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/neal-brennan-women-and-black-dudes-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/c

ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/fortune-feimster-good-fortune-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/fortune-feimster-good-fortune-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/jim-jefferies-alcoholocaust-2010-full-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/jim-jefferies-alcoholocaust-2010-full-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/dave-chappelle-acceptance-speech-2019-mark-twain-prize/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/dave-chappelle-acceptance-speech-2019-mark-twain-prize/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/eddie-izzard-dress-kill-1999-full-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfr

ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/dave-chappelle-846-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/dave-chappelle-846-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/billy-connolly-high-horse-tour-live-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/billy-connolly-high-horse-tour-live-transcript/)
 11%|█████████▋                                                                               | 54/499 [00:03<00:32, 13.72it/s]ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/mo-amer-mohammed-in-texas-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/mo-amer-mohammed-in-texas-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/chelsea-handler-the-feeling-transcript/ (error: 503 

ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/ralphie-may-filthy-animal-tour-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/ralphie-may-filthy-animal-tour-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/john-mulaney-snl-monologue-2018-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/john-mulaney-snl-monologue-2018-transcript/)
 17%|███████████████▌                                                                         | 87/499 [00:04<00:13, 31.21it/s]ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/george-carlin-dumb-americans-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/george-carlin-dumb-americans-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/gary-gulman-born-on-3rd-base-t

ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/sammy-obeid-how-to-save-gaza-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/sammy-obeid-how-to-save-gaza-transcript/)
 24%|████████████████████▊                                                                   | 118/499 [00:04<00:08, 43.59it/s]ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/hannah-berner-we-ride-at-dawn-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/hannah-berner-we-ride-at-dawn-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/eddie-izzard-glorious-1997-full-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/eddie-izzard-glorious-1997-full-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/sebastian-maniscalco-whats-wrong

ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/tom-segura-disgraceful-2018-full-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/tom-segura-disgraceful-2018-full-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/kevin-hart-zero-fks-given-2020-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/kevin-hart-zero-fks-given-2020-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/hannah-gadsby-nanette-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/hannah-gadsby-nanette-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/bert-kreischer-fighting-a-bear-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/bert-kreischer-fighting-a-bear-tra

ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/drew-michael-red-blue-green-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/drew-michael-red-blue-green-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/jim-gaffigan-quality-time-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/jim-gaffigan-quality-time-transcript/)
 36%|███████████████████████████████▋                                                        | 180/499 [00:06<00:06, 51.82it/s]ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/hasan-minhaj-homecoming-king-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/hasan-minhaj-homecoming-king-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/iliza-shlesinger-unveiled-transcript/ (error: 50

ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/ali-wong-baby-cobra-2016-full-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/ali-wong-baby-cobra-2016-full-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/richard-pryors-monologue-saturday-night-live-1975/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/richard-pryors-monologue-saturday-night-live-1975/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/cristela-alonzo-middle-classy-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/cristela-alonzo-middle-classy-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/jim-jefferies-intolerant-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/jim-jefferies-

ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/gabriel-iglesias-hot-and-fluffy-2007-full-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/gabriel-iglesias-hot-and-fluffy-2007-full-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/kevin-james-never-dont-give-up-full-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/kevin-james-never-dont-give-up-full-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/russell-brand-messiah-complex-2013-full-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/russell-brand-messiah-complex-2013-full-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/kevin-bridges-the-story-continues-transcript/ (error: 503 Server Error: Service Unavailable for url: http

ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/phil-wang-philly-philly-wang-wang-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/phil-wang-philly-philly-wang-wang-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/roy-wood-jr-no-one-loves-you-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/roy-wood-jr-no-one-loves-you-transcript/)
 53%|██████████████████████████████████████████████▉                                         | 266/499 [00:08<00:07, 32.36it/s]ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/jim-gaffigan-dark-pale-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/jim-gaffigan-dark-pale-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/cristela-alonzo-lower-classy-2017-full-tra

ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/leanne-morgan-im-every-woman-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/leanne-morgan-im-every-woman-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/rory-scovel-tries-stand-up-for-the-first-time-a-netflix-special/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/rory-scovel-tries-stand-up-for-the-first-time-a-netflix-special/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/taylor-tomlinson-have-it-all-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/taylor-tomlinson-have-it-all-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/iliza-shlesinger-elder-millennial-2018-full-transcript/ (error: 503 Server Error: Service Unavailable for url: https://

ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/colin-quinn-the-new-york-story-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/colin-quinn-the-new-york-story-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/tim-dillon-this-is-your-country-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/tim-dillon-this-is-your-country-transcript/)
 66%|██████████████████████████████████████████████████████████▎                             | 331/499 [00:10<00:04, 33.81it/s]ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/bo-burnham-what-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/bo-burnham-what-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/fred-armisen-standup-for-drummers-transcript/ (error: 50

ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/george-carlin-you-are-all-diseased-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/george-carlin-you-are-all-diseased-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/michelle-wolf-the-well-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/michelle-wolf-the-well-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/kevin-bridges-whole-different-story-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/kevin-bridges-whole-different-story-transcript/)
 72%|███████████████████████████████████████████████████████████████▎                        | 359/499 [00:11<00:04, 30.22it/s]ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/patton-oswalt-we-all-screa

ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/dave-chappelle-hbo-half-hour-1998-traduzione-italiana/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/dave-chappelle-hbo-half-hour-1998-traduzione-italiana/)
 78%|████████████████████████████████████████████████████████████████████▌                   | 389/499 [00:12<00:02, 38.88it/s]ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/w-kamau-bell-private-school-negro-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/w-kamau-bell-private-school-negro-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/amy-schumer-growing-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/amy-schumer-growing-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/eric-andre-legalize-

ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/mike-epps-only-one-mike-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/mike-epps-only-one-mike-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/enissa-amani-ehrenwort-2018-full-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/enissa-amani-ehrenwort-2018-full-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/daniel-sloss-x-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/daniel-sloss-x-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/david-spade-dandelion-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/david-spade-dandelion-transcript/)
ERROR:aeon.logging:Failed to fetch u

ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/kevin-james-irregardless-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/kevin-james-irregardless-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/nate-bargatze-nashville-christmas-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/nate-bargatze-nashville-christmas-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/t-j-miller-no-real-reason-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/t-j-miller-no-real-reason-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/louis-c-k-sorry-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/louis-c-k-sorry-transcript/)
ERROR:aeon.logging:Fai

ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/theo-von-no-offense-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/theo-von-no-offense-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/jimmy-carr-funny-business-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/jimmy-carr-funny-business-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/sam-morril-youve-changed-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/sam-morril-youve-changed-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scrapsfromtheloft.com/comedy/mae-martin-sap-transcript/ (error: 503 Server Error: Service Unavailable for url: https://scrapsfromtheloft.com/comedy/mae-martin-sap-transcript/)
ERROR:aeon.logging:Failed to fetch url: https://scra

In [76]:
sum(bool(row) for row in transcripts)

7

In [69]:
pd.DataFrame({"url": transcript_urls, "transcript": transcripts}).iloc[10:50]

Unnamed: 0,url,transcript
10,https://scrapsfromtheloft.com/comedy/kevin-bri...,♪ I’m at the crossroads ♪\n♪ Getting drowned i...
11,https://scrapsfromtheloft.com/comedy/ali-wong-...,[“Get me Bodied (Extended Mix)” playing]\n\n[a...
12,https://scrapsfromtheloft.com/comedy/stewart-l...,"Recorded on 10 March 2006 at Chapter Arts, Can..."
13,https://scrapsfromtheloft.com/comedy/adam-devi...,[rock music playing]\n\n[indistinct chatter]\n...
14,https://scrapsfromtheloft.com/comedy/trevor-no...,"Filmed at The Lincoln Theatre in Washington, D..."
15,https://scrapsfromtheloft.com/comedy/dave-atte...,[Cheers and applause] – you guys ready to meet...
16,https://scrapsfromtheloft.com/comedy/nate-barg...,Nate Bargatze: Hello World (2023)\nGenre: Come...
17,https://scrapsfromtheloft.com/comedy/neal-bren...,[Hip-hop music plays] [cheers and applause] ma...
18,https://scrapsfromtheloft.com/comedy/ellen-deg...,"Ladies and gentlemen, please, welcome Ellen De..."
19,https://scrapsfromtheloft.com/comedy/hannah-ei...,(vinyl record popping)\n\n(“J’ai Du L’Oublier”...


Hitting rate limits. Could slow down dramatically + add backoff, orrr just revert to the existing huggingface dataset. Was going to do this myself for fun but if it's going to be a pain maybe not worth it, huggingface version has >80% of them.

## Huggingface dataset

In [5]:
ds = load_dataset("zachgitt/comedy-transcripts")

In [6]:
df = ds['train'].to_pandas()\
    .rename(columns=lambda x: x.replace('-', '_'))

In [11]:
df.transcript.sample().values[0]

'Air date: August 14, 2015\n Recorded at The Lincoln Theater in Washington D.C. [rock music plays] [crowd cheering] Thank you. Thank you so much, everybody. Thank you for clapping. I appreciate that. Thank you. Thanks. Thanks a lot. Thank you. That’s cool. All right. This is a good crowd. This is… All right. Thank you. That guy has a high voice. – [laughter] – [laughs] All right, I got a lot of jokes, so I’m just gonna start telling them. Here we go. I went to a baseball game last summer in a stadium and they had a huge TV in the stadium. A Jumbotron. And this guy proposed to his girlfriend using the giant TV. He put her name up there, said, “Will you marry me?” She said yes. The crowd went wild. They found the couple in the audience. I was sitting there thinking, “God, that’s so romantic. That’s so cool.” And then I remembered thinking, you know, you could also use a screen like that if you’re having trouble breaking up with somebody. Be like, “Hey, I’m gonna grab a hot dog. But you s

In [52]:
df.loc[[257], ['transcript']]

Unnamed: 0,transcript
257,"Air date: August 14, 2015\n Recorded at The Li..."


In [39]:
# Approximate transcript token counts
df.transcript.str.len().div(4).describe()

count      419.000000
mean     10993.677804
std       4452.077710
min        646.750000
25%       9332.750000
50%      11620.250000
75%      13910.375000
max      23058.500000
Name: transcript, dtype: float64

In [40]:
# Approximate transcript token counts
df.transcript.str.len().div(4).sum()

np.float64(4606351.0)

In [7]:
# Super rough napkin math, rounded everything
tokens_per_transcript = 10_000
n_transcripts = 400
n_100_token_calls = tokens_per_transcript * n_transcripts / 100
kb_per_100_token_call = np.array([10, 40, 100, 300])

kb_per_gb = 1_000_000
total_gb = n_100_token_calls * kb_per_100_token_call / kb_per_gb
total_gb

array([ 0.4,  1.6,  4. , 12. ])

Looking for possible chunking strategies (lot of models with limited output lengths that will force us to label each transcript in several goes).

In [79]:
for term in ('[laugh]', '[laughs]', '[laughter]', '[pause]', '\n', '\n\n'):
    print(
        repr(term),
        df.transcript.apply(lambda x: x.count(term)).value_counts(normalize=True).head(),
        end='\n\n'
    )

'[laugh]' transcript
0    0.997613
1    0.002387
Name: proportion, dtype: float64

'[laughs]' transcript
0    0.758950
1    0.073986
2    0.023866
6    0.023866
3    0.021480
Name: proportion, dtype: float64

'[laughter]' transcript
0    0.825776
1    0.035800
2    0.016706
7    0.009547
3    0.009547
Name: proportion, dtype: float64

'[pause]' transcript
0    0.995227
4    0.002387
1    0.002387
Name: proportion, dtype: float64

'\n' transcript
0    0.431981
1    0.095465
2    0.059666
4    0.038186
3    0.031026
Name: proportion, dtype: float64

'\n\n' transcript
0    1.0
Name: proportion, dtype: float64



In [87]:
len(df.transcript.values[0].split('. '))

889

## Labeling

Scratch code working out llm labeling stuff.

**Idea for later:** logprobs might actually be a really promising data source here to create more variants cheaply. Swap out some low probability word for a high probability one, joke ruined.

In [243]:
p = Prompt(Prompts.EXTRACT_JOKES)

In [245]:
llm = LLMLabeler(
    Prompts.EXTRACT_JOKES,
)

In [246]:
res = llm.label(
    df,
    max_workers=30
)

Unstaged/uncommitted changes in git repository. You might want to commit them before running.
Labels will be saved in /Users/hmamin/aeon/aeon/data/labels/extract_jokes/2025.11.16_22.02.47-04aa238f9a397c33c7a1ae059552b838f0dec763


Labeling rows:   0%|          | 0/419 [00:00<?, ?it/s]

[row 7] API call failed with error: Could not parse response content as the request was rejected by the content filter
[row 58] API call failed with error: Could not parse response content as the request was rejected by the content filter
[row 85] API call failed with error: Could not parse response content as the request was rejected by the content filter
[row 129] API call failed with error: Could not parse response content as the request was rejected by the content filter
[row 130] API call failed with error: Could not parse response content as the request was rejected by the content filter
[row 180] API call failed with error: Could not parse response content as the request was rejected by the content filter
[row 213] API call failed with error: Could not parse response content as the request was rejected by the content filter
[row 226] API call failed with error: Could not parse response content as the request was rejected by the content filter
[row 351] API call failed with error

In [249]:
dfl = res['df']

In [251]:
# Total number of jokes
dfl.response_content.str['items'].str.len().sum()

np.float64(22929.0)

In [258]:
# Handful of failures, all due to content filter :/
dfl.loc[~dfl.success, 'error'].str.contains('content filter')

7      True
58     True
85     True
129    True
130    True
180    True
213    True
226    True
351    True
357    True
367    True
Name: error, dtype: bool

In [418]:
row = dfl[dfl.success].sample(1)
joke = random.choice(
    row.response_content.values[0]['items']
)
for field in ('prompt', 'subtext', 'unfunny_variant', 'joke'):
    print(f'{field}: {joke[field]}', end='\n---\n')

prompt: Have you seen crazy Craigslist ads?
---
subtext: Craigslist can host very explicit, sketchy offers.
---
unfunny_variant: I saw a Craigslist ad offering concert tickets in exchange for sexual acts, specifying women remove shirts and bras.
---
joke: Clap if you use Craigslist. I’m not talking about like, “oh, hey, I’m looking for an apartment,” Craigslist. I’m talking about, “give me a handjob, I’ll give you my coffee table.” I saw a post once where this guy was trying to sell concert tickets and he posted, “willing to give up two tickets for the sold-out show tonight. must be female, age 20 to 25, and be willing to perform oral sex for a half hour in my car. must remove shirt and bra. Your friend can be there for security.” now that guy’s insane and you can tell, ’cause he put that phrase in there. “must remove shirt and bra.”
---


### Cleanup and upload to huggingface hub

In [292]:
df_merged = pd.concat([df, dfl], axis=1)

In [298]:
# The ones that violated openai's safety policy 👀
df_merged[~df_merged.success].transcript_link.str.split(':').str[0].str.lower().value_counts()

transcript_link
bo burnham        3
dave chappelle    3
john mulaney      3
george carlin     1
jeff foxworthy    1
Name: count, dtype: int64

In [311]:
df_merged.tail(2)

Unnamed: 0,web_scraper_order,web_scraper_start_url,transcript_link,transcript_link_href,transcript,id,success,error,response_raw,response_content,api_kwargs,last_message
417,1686242980-418,https://scrapsfromtheloft.com/stand-up-comedy-...,Kathleen Madigan: Hunting Bigfoot (2023) | Tra...,https://scrapsfromtheloft.com/comedy/kathleen-...,[UPBEAT MUSIC PLAYING] [CROWD CHEERING] Wow. H...,417,True,,{'id': 'chatcmpl-CcmxQXLBfqK6BNwGF6FgKUwYE1hSj...,{'items': [{'joke': 'I’ve been coming here for...,"{'reasoning_effort': 'minimal', 'verbosity': '...",[UPBEAT MUSIC PLAYING] [CROWD CHEERING] Wow. H...
418,1686242983-419,https://scrapsfromtheloft.com/stand-up-comedy-...,John Mulaney: Baby J (2023) | Transcript,https://scrapsfromtheloft.com/comedy/john-mula...,"[John Mulaney] The past couple years, I’ve don...",418,True,,{'id': 'chatcmpl-CcmxQsk7AycGIPpVUkWb8O31oM2bX...,"{'items': [{'joke': 'The past couple years, I’...","{'reasoning_effort': 'minimal', 'verbosity': '...","[John Mulaney] The past couple years, I’ve don..."


In [328]:
df_merged = df_merged[df_merged.success]\
    .drop(columns=["last_message", "id", "success", "error", "response_raw", "api_kwargs", "web_scraper_start_url"])\
    .reset_index(drop=True)

In [None]:
jokes_long = df_merged.response_content.str['items'].explode().to_frame("joke")
jokes_long = pd.merge(
    jokes_long,
    df_merged.drop(columns=["transcript", "response_content"]),
    how="left", left_index=True, right_index=True
).reset_index(drop=True)
jokes_long = pd.concat([pd.json_normalize(jokes_long.pop("joke")), jokes_long], axis=1)
# Reordering so prompt comes first.
jokes_long = jokes_long[['prompt'] + [c for c in jokes_long.columns if c != 'prompt']]

In [360]:
assert jokes_long.isnull().sum().sum() == 0

In [47]:
out_dir = config.DATA_DIR/"datasets/extract_jokes"
os.makedirs(out_dir, exist_ok=True)

In [398]:
jokes_long.to_parquet(out_dir/"df.pq")

In [361]:
ds_long = Dataset.from_pandas(jokes_long)

In [375]:
ds_long[-1]

{'prompt': 'If you had a talk show, what would it be like?',
 'joke': "GQ asked if I'd want my own talk show. I said, 'I had two ideas for a talk show once.' And by ideas, I mean, I thought about them in the privacy of my own room. One idea was interviewing people who do anything that interests me. The other idea was to have a show and just have on only elderly people, especially if they were comfortable talking about being at the end of their lives.",
 'subtext': 'Some talk show concepts are oddly specific or impractical.',
 'unfunny_variant': 'I once thought about two talk show concepts, including one focused only on the elderly.',
 'web_scraper_order': '1686242983-419',
 'transcript_link': 'John Mulaney: Baby J (2023) | Transcript',
 'transcript_link_href': 'https://scrapsfromtheloft.com/comedy/john-mulaney-baby-j-transcript/'}

In [379]:
login(secrets['HUGGINGFACE_TOKEN'])

In [399]:
commit_info = ds_long.push_to_hub('hmamin/extract_jokes')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

In [None]:
hf_api = HfApi()
hf_api.add_collection_item(
    collection_slug="hmamin/aeon",
    item_id="hmamin/extract_jokes",
    item_type="dataset"
)

## Joke Variants Dataset

Saw some evidence that ~24b param models can do a decent job of punching up jokes or at least rewriting them in their own voice. If this works well, potentially have an even funnier level of joke for RL. If not so well, maybe I get an intermediate level between the original and the unfunny_variant/subtext from the previous labeling job. Either way, hoping more levels of humor will be helpful for transcendence.

In [49]:
jokes_long = pd.read_parquet(out_dir/"df.pq")

In [50]:
jokes_long.tail(2)

Unnamed: 0,prompt,joke,subtext,unfunny_variant,web_scraper_order,transcript_link,transcript_link_href
22927,Do you remember things you said on drugs?,"I gave an interview to GQ December 15th, 2020 ...",Substance use impairs memory and leads to inco...,I don't remember certain interviews I apparent...,1686242983-419,John Mulaney: Baby J (2023) | Transcript,https://scrapsfromtheloft.com/comedy/john-mula...
22928,"If you had a talk show, what would it be like?","GQ asked if I'd want my own talk show. I said,...",Some talk show concepts are oddly specific or ...,"I once thought about two talk show concepts, i...",1686242983-419,John Mulaney: Baby J (2023) | Transcript,https://scrapsfromtheloft.com/comedy/john-mula...


In [106]:
llm = LLMLabeler(
    Prompts.REWRITE_JOKE_VARIANT,
)

In [107]:
res = llm.label(
    jokes_long.tail(5), max_workers=5,
)

Unstaged/uncommitted changes in git repository. You might want to commit them before running.
Labels will be saved in /Users/hmamin/aeon/aeon/data/labels/rewrite_joke_variant/2025.11.20_00.29.31-b28a58532a825303c66cdb1ca0a602e28c5251b8


Labeling rows:   0%|          | 0/5 [00:00<?, ?it/s]

[TIMER] BLOCK executed in 6.928 s.
Removing intermediate results dir since job completed without interruption.


In [108]:
row = res['df'].sample(1)
print(row.last_message.values[0], end='\n\n')
row.response_content.values[0]

prompt: Any awkward public parenting moments?
joke: I was in a museum in Detroit with my son and needed to change a diaper. I go to the men’s room and look on the wall and see an old photo of myself in a GQ interview. 'Hello, old friend.'
subtext: Parents can be confronted with reminders of past behavior in public parenting situations.



{'joke_2': 'I was at a museum in Detroit with my kid and had to change a diaper. I go into the men’s room, and on the wall — framed, art-gallery style — is an old photo of me from a GQ interview. For a second I thought the museum had acquired a new exhibit: “The Evolution of Poor Life Choices.” There I am, suave and smug, and now I’m hunched over a changing table wrestling a diaper like it owes me money. Hello, old friend. Meet poop.',
 'joke_3': 'Took my son to a Detroit museum, emergency diaper change, head to the men’s room — and there’s a glossy GQ portrait of me hanging on the wall. The museum’s like, “Here’s cultural refinement.” My life’s like, “Here’s a man with a burp cloth, negotiating with an ass.” The universe really out here curating humility: framed GQ me staring down current me who’s trying to swaddle a baby and a small crime scene. Wave to your past self — he still owes you dignity and a spare diaper.',
 'ranking': [3, 2, 1]}

## Scratch

Prototyping openrouter support.

In [23]:
class MathResponse(BaseModel):
    answer: int
    explanation: str

In [24]:
openrouter_client = OpenAI(
  base_url="https://openrouter.ai/api/v1",
  api_key=secrets["OPENROUTER_API_KEY"],
)

In [27]:
res = openrouter_client.chat.completions.parse(
  model="cognitivecomputations/dolphin-mistral-24b-venice-edition:free",
  messages=[
    {
      "role": "user",
      "content": "1+1=",
    },
  ],
    logprobs=True,
    response_format=MathResponse
)

In [28]:
res

ParsedChatCompletion[MathResponse](id='gen-1763623748-xlHH7j0CsioQu84g7fJc', choices=[ParsedChoice[MathResponse](finish_reason='stop', index=0, logprobs=None, message=ParsedChatCompletionMessage[MathResponse](content='{ "answer": 2, "explanation": "The sum of 1 and 1 is calculated as follows: 1 + 1 = 2." }', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=None, parsed=MathResponse(answer=2, explanation='The sum of 1 and 1 is calculated as follows: 1 + 1 = 2.'), reasoning=None), native_finish_reason='stop')], created=1763623748, model='cognitivecomputations/dolphin-mistral-24b-venice-edition:free', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=36, prompt_tokens=787, total_tokens=823, completion_tokens_details=None, prompt_tokens_details=None), provider='Venice')

In [29]:
print(res.choices[0].message.content)

{ "answer": 2, "explanation": "The sum of 1 and 1 is calculated as follows: 1 + 1 = 2." }


In [32]:
print(res.choices[0].message.parsed)

answer=2 explanation='The sum of 1 and 1 is calculated as follows: 1 + 1 = 2.'


In [34]:
res.choices[0].logprobs

Notes from some intermediate runs. At first gpt was quitting early, but tweaked prompt a little and it seemingly got to the end.

run | jokes | char_count_jokes | approx char_count_transcript | output_tokens
---|---|---|---|---
original | 60 | 17.1k | 25.8k | 7.6k
ask nicely | 132 | 36.9k | 44.9k | 17.3k

Older code working out early labeling functionality.

In [100]:
from aeon.secrets import SecretManager
from openai import OpenAI
from pydantic import BaseModel, Field, create_model
import re

In [101]:
class Response(BaseModel):
    
    joke: str = Field(
        ...,
        description="One or more sentences containing a joke from the "
                    "input text."
    )
        
    subtext: str = Field(
        ...,
        description="The often banal observation underlying the joke. "
                    "This should not be funny."
    )
        
    joke_variant: str = Field(
        ...,
        description="Your attempt to modify the original joke while "
                    "maintaining the same subtext."
    )

In [102]:
Batch = create_model('Batch', items=list[Response])

In [103]:
instructions = p.static_messages[0]['content']

In [104]:
# number of sentences in one mulaney transcript
len(df.transcript.iloc[418].split('.'))

863

In [105]:
unlabeled = '.'.join(df.transcript.iloc[418].split('.')[:25]) + '.'
user_message = f"<example_to_label>{unlabeled}</example_to_label>"

In [106]:
secrets = SecretManager().get_secrets()

In [107]:
openai_client = OpenAI(api_key=secrets['OPENAI_API_KEY'])

In [108]:
tmp = openai_client.chat.completions.parse(
    model="gpt-4.1-nano",
    messages=[
        {"role": "developer", "content": instructions},
        {"role": "user", "content": user_message}
    ],
    response_format=Batch,
    temperature=0.3,
    logprobs=True,
    top_logprobs=20,
)

In [109]:
type(tmp)

openai.types.chat.parsed_chat_completion.ParsedChatCompletion[Batch]

In [125]:
tmp.model_dump(mode='json')['choices'][0]['message']['content']

'{"items":[{"joke":"The past couple years, I’ve done a lot of work on myself. And I’ve realized that I’ll be fine as long as I get constant attention.","subtext":"The speaker values attention highly and believes it is essential for his well-being.","joke_variant":"I\'ve learned that I need constant attention to be okay."},{"joke":"When I was three years old, they pulled me aside and they told me that I was adopted. And that my real mother had been murdered… by Miss America.","subtext":"The speaker was told a fabricated, sensational story about his adoption and mother\'s death.","joke_variant":"As a child, I was told I was adopted and that my mother was murdered by Miss America."},{"joke":"They said, \'If you ask our mom about it, she’ll get really upset. So don’t ask her if it’s true unless you want to upset her.\' And they said, \'If you ask our dad about it, he’ll say that we’re lying.\' But he’s lying.","subtext":"The children were advised to avoid asking their parents about the fab

In [62]:
tmp = {
    "index": 0,
    "success": True,
    "error": "",
    "response": res.model_dump_json(),
    "api_kwargs": {},
}

In [63]:
with open('/tmp/0.json', "w") as f:
    json.dump(tmp, f)

Observation: joke variant actually maybe works better than subtext! Using kinda dumb models maybe works pretty well, you don't need to tell it to make the joke unfunny, it does it naturally!

**Debugging single item run, possibly delete later**

In [416]:
row = dfl.sample()
print(row.index)
# row.response_content.iloc[-1]

Index([238], dtype='int64')


In [226]:
# Checking how many chars of transcript it kept. Looks like a very low %, 17/45, but
# in practice it stopped about 26k chars into the transcript so that looks like a reasonable % to me.
sum(len(line['joke']) for line in row.response_content.iloc[-1]['items'])

36943

In [227]:
len(row.api_kwargs.values[0]['messages'][-1]['content'])

45086

In [228]:
row.response_raw.values[0]['choices'][0]['finish_reason']

'stop'

In [234]:
term = 'Yep, yep, yep!'
x, y = row.api_kwargs.values[0]['messages'][-1]['content'].split(term)

In [235]:
len(x)

44890

In [236]:
len(y)

182

In [239]:
df.shape[0] * dfl.response_raw.values[0]['usage']['prompt_tokens']

5399653

In [238]:
df.shape[0] * dfl.response_raw.values[0]['usage']['completion_tokens']

7254147

In [240]:
dfl.response_raw.values[0]['usage']['completion_tokens']

17313

In [190]:
df.transcript.str.len().quantile([0, .1, .2, .5, .8, .9, .95, .99, 1])

0.00     2587.00
0.10    12012.20
0.20    33403.80
0.50    46481.00
0.80    57304.40
0.90    62747.20
0.95    67580.50
0.99    84103.42
1.00    92234.00
Name: transcript, dtype: float64