In [None]:
import asyncio
import os
import re

import pandas as pd
from openai import AsyncOpenAI

# Load the tab-separated input table; later cells read its 'corename' column.
combined_df = pd.read_csv(filepath_or_buffer="combined_df.tsv", sep="\t")


In [None]:
# Classify language origin using OpenAI async API (parallelized)

# The OpenAI credential comes from the environment only; stop immediately with
# setup instructions if it is absent or empty.
api_key = os.getenv('OPENAI_API_KEY')
if api_key is None or api_key == "":
    raise OSError(
        "OPENAI_API_KEY environment variable is not set.\n"
        "Set it with: export OPENAI_API_KEY='your-api-key'"
    )

# Shared async client used by every classification request below.
async_client = AsyncOpenAI(api_key=api_key)

# Distinct, non-missing corenames: each one is classified once and the result
# is mapped back onto every row that shares it.
unique_corenames = (
    combined_df['corename']
    .dropna()
    .unique()
    .tolist()
)
print(f"Classifying {len(unique_corenames):,} unique corenames...")

async def classify_single(name, *, client=None):
    """Classify the language origin of a single street name.

    Sends one chat-completion request asking for a one-word language origin
    (or UNKNOWN) and returns the reply with surrounding whitespace removed.

    Args:
        name: The street name to classify; it is interpolated into the prompt.
        client: Optional object whose ``chat.completions.create(...)`` call is
            awaitable. When omitted, the module-level ``async_client`` is used.

    Returns:
        A ``(name, language)`` tuple. ``language`` is the stripped reply, or
        the sentinel string ``"Error"`` when the request raised or the reply
        carried no text.
    """
    prompt = f"What is the language origin of '{name}'? Reply with ONE word or UNKNOWN if you cannot determine the language."
    try:
        api = async_client if client is None else client
        response = await api.chat.completions.create(
            model="gpt-4.1",
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            max_tokens=10,
        )
        content = response.choices[0].message.content
    except Exception as exc:
        # Best-effort on purpose: one failed request must not abort the whole
        # batch. Report the cause so "Error" rows can be traced afterwards.
        print(f"  Failed to classify {name!r}: {type(exc).__name__}: {exc}")
        return name, "Error"

    if content is None:
        # A reply without text cannot be stripped; keep the same sentinel the
        # failure path uses, but say why.
        print(f"  Failed to classify {name!r}: response contained no text")
        return name, "Error"

    return name, content.strip()

async def classify_all(names, concurrency=100):
    """Classify every name, running at most ``concurrency`` requests at once.

    Returns a dict mapping each name to the label produced by
    ``classify_single``. A progress line is printed after every 500
    completed names.
    """
    gate = asyncio.Semaphore(concurrency)

    async def _throttled(street_name):
        # Hold a semaphore slot for the full duration of the request.
        async with gate:
            outcome = await classify_single(street_name)
            return outcome

    pending = list(map(_throttled, names))
    results = {}

    # as_completed yields in finish order, so the counter tracks real progress.
    for i, finished in enumerate(asyncio.as_completed(pending)):
        key, label = await finished
        results[key] = label
        if (i + 1) % 500:
            continue
        print(f"  Processed {i + 1:,}/{len(names):,}")

    return results

# Run async classification (await works directly in Jupyter)
language_map = await classify_all(unique_corenames)
combined_df['language_gpt'] = combined_df['corename'].map(language_map)

print("\nGPT Language Distribution:")
print(combined_df['language_gpt'].value_counts(dropna=False))
combined_df.to_csv("combined_df_gpt_4.1_classified.tsv", sep="\t", index=False)