# Generation

In [1]:
#IMPORTS & SETUP
import asyncio
import json
import os
import random
import nest_asyncio
from datetime import datetime
from openai import AsyncOpenAI
from tqdm.asyncio import tqdm
from collections import Counter

# Apply patch for Notebook environments (the notebook already runs an event
# loop, and nest_asyncio lets asyncio calls nest inside it)
nest_asyncio.apply()

# Load DeepInfra API key from the local key file; the context manager closes
# the file handle as soon as the JSON has been parsed
with open("../apikeys.json") as key_file:
    keys = json.load(key_file)
DI_KEY = keys["DeepInfra"]["key"]

# Initialize the async OpenAI client against DeepInfra's OpenAI-compatible endpoint
client = AsyncOpenAI(
    api_key  = DI_KEY,
    base_url = "https://api.deepinfra.com/v1/openai"
)

## Prompts

In [2]:
# ANCHORS = ["You know what people on the internet say", "As the modern internet saying goes", "A common piece of internet wisdom is", "One popular saying on social media is", "A well-known internet adage states", "The hard truth of the modern world is that", "The unspoken rule of social media is"]

# TRIGGERED = [{"role": "system", "content": "You spend a lot of time on the  internet on social media and web forums. You've read and seen a lot of Facebook and Reddit posts, watched TikToks and other short videos. You are hip and knowledgeable. You like to repeat verbatim memorable common wisdoms about life online that you have read or seen."},
#             {"role": "user", "content": f"Complete the following prompt: {random_element}. Do not include the prompt in your response."} ]

# FOR WIZENED PROVERBS
# List of mood modifiers to keep things fresh
# MOODS = [
#     "mystical and cryptic", "grumpy and cynical", "optimistic", "short and punchy", "poetic and flowing", "obsessed with hardware", "obsessed with social media", "acting like an old 1990s hacker", "hipster", "hippy", "conspiracy theorist"
# ]

# WIZENED =   [{"role": "system", "content": f"You are a {current_mood} online denizen dispensing folk wisdom about modern life."},
#             {"role": "user", "content": "Say something about modern life, especially about life online. Single sentence, no quotes."}]

# Active prompt: a one-line system persona followed by a user turn that asks
# the model to repeat a single remembered piece of online common wisdom.
REPEATED = [
    dict(
        role="system",
        content="You spend a lot of time on the  internet on social media and web forums.",
    ),
    dict(
        role="user",
        content=(
            "You've read and seen a lot of Facebook and Reddit posts, watched TikToks and other short videos. "
            "You are hip and knowledgeable. "
            "Repeat one memorable sentence of common wisdom about life online that you have read or seen. "
            "Do not use quotation marks."
        ),
    ),
]

## Function to Generate Texts

In [3]:
async def fetch_proverb(semaphore, pbar):
    """Request one completion and append it to the global ``session_data`` list.

    Reads the module-level ``client``, ``MODEL_ID`` and ``REPEATED`` and
    appends a ``{"text", "timestamp"}`` record to ``session_data`` on success.

    Args:
        semaphore: Async context manager (an ``asyncio.Semaphore`` in
            ``run_batch``) that bounds the number of in-flight requests.
        pbar: Progress bar; advanced by one for each stored result.

    Returns:
        True if a record was stored, False if the request failed or the
        response carried no text. Failures are reported, never raised, so one
        bad request cannot abort the rest of the batch.
    """
    async with semaphore:
        # Pick a random mood for this specific request
        # current_mood = random.choice(MOODS)

        try:
            response = await client.chat.completions.create(
                model = MODEL_ID,
                messages = REPEATED,
                # REPETITION AVOIDANCE PARAMETERS
                temperature = 1.2,
                frequency_penalty = 0.8,
                presence_penalty = 0.6,
                # extra_body={
                #     "min_p": 0.05,
                #     "frequency_penalty": 1.0,
                #     "presence_penalty": 0.8
                # },
                max_tokens = 30
            )
            content = response.choices[0].message.content
            if content is None:
                # Previously this surfaced as an AttributeError that was
                # silently swallowed; report it explicitly instead.
                tqdm.write("Skipped a response with no text content.")
                return False

            session_data.append({
                "text": content.strip(),
                "timestamp": datetime.now().isoformat(),
                # "mood": current_mood  # Track which mood produced it!
            })
            pbar.update(1)
            return True
        except Exception as exc:
            # Best-effort: keep the batch going, but make the failure visible
            # (tqdm.write prints without breaking the progress bar).
            tqdm.write(f"Request failed ({type(exc).__name__}): {exc}")
            return False

async def run_batch(batch_size, concurrency, filename):
    """Generate ``batch_size`` proverbs and persist them to ``filename``.

    Existing records in ``filename`` are loaded first and the new ones are
    appended, so repeated runs against the same file accumulate results.

    Args:
        batch_size: Number of requests to issue.
        concurrency: Maximum number of requests in flight at once.
        filename: Path of the JSON file to extend (created if absent).

    Side effects:
        Rebinds the module-level ``session_data`` list and rewrites ``filename``.
    """
    global session_data

    # Refresh local session_data from the target file if it exists
    if os.path.exists(filename):
        with open(filename, "r") as f:
            session_data = json.load(f)
        print(f"File '{filename}' loaded. Existing count: {len(session_data)}.")
    else:
        session_data = []
        print(f"Creating new file: '{filename}'")

    # Create the output directory up front, so a missing folder is caught
    # before any API calls are made rather than at save time.
    out_dir = os.path.dirname(filename)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    starting_count = len(session_data)
    semaphore = asyncio.Semaphore(concurrency)
    print(f"Adding {batch_size} more proverbs...")

    try:
        with tqdm(total=batch_size) as pbar:
            tasks = [fetch_proverb(semaphore, pbar) for _ in range(batch_size)]
            await asyncio.gather(*tasks)
    finally:
        # Save even when the run is interrupted or cancelled, so collected
        # results are not lost. Write to a temp file and swap it in, so a
        # failed write cannot leave a truncated file in place of the old one.
        tmp_path = f"{filename}.tmp"
        with open(tmp_path, "w") as f:
            json.dump(session_data, f, indent=4)
        os.replace(tmp_path, filename)

    added = len(session_data) - starting_count
    if added < batch_size:
        print(f"Warning: only {added} of {batch_size} requests produced a result.")
    print(f"Batch complete. Total in '{filename}': {len(session_data)}")

## Execution

In [6]:
# Make changes to parameters without re-running the logic cell
# MY_BATCH_SIZE  -> passed to run_batch as batch_size (number of requests issued)
# MAX_CONCURRENT -> passed to run_batch as concurrency (semaphore size)
# MODEL_ID       -> read as a global by fetch_proverb for the `model` argument
# OUTPUT_FILE    -> JSON file run_batch loads (if present) and writes back to
MY_BATCH_SIZE = 5000
MAX_CONCURRENT = 25
MODEL_ID = "NousResearch/Hermes-3-Llama-3.1-405B" 
# NOTE(review): SAVE_INTERVAL is not referenced by any code visible in this
# notebook -- confirm whether periodic saving was intended.
SAVE_INTERVAL = 20
OUTPUT_FILE = "outputs/di-hermes-5000-1.json"

# Initialize list to hold session data
# (run_batch rebinds this global itself: from the file if it exists, else to [])
session_data = []

# Run the batch process
await run_batch(MY_BATCH_SIZE, MAX_CONCURRENT, OUTPUT_FILE)


Creating new file: 'outputs/di-hermes-5000-1.json'
Adding 5000 more proverbs...


100%|██████████| 5000/5000 [06:20<00:00, 13.12it/s]

Batch complete. Total in 'outputs/di-hermes-5000-1.json': 5000





In [None]:
# Roll call of models used
# These are bare string expressions kept as a record only; none is assigned to
# a variable in this cell. The string matching MODEL_ID in the execution cell
# is the last one.
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
"nvidia/Llama-3.1-Nemotron-70B-Instruct" # complete bust for this task
"deepseek-ai/DeepSeek-V3.2"
"Qwen/Qwen3-30B-A3B" # Also a bust (accidentally overwrote the file)
"Qwen/Qwen2.5-VL-32B-Instruct"
"NousResearch/Hermes-3-Llama-3.1-405B"