In [None]:
# SECURITY: never store API keys in a notebook. The key that used to be
# hardcoded in this cell has been exposed and should be revoked and rotated
# in the Groq console.
#
# The key is taken from the environment when already present; otherwise it is
# requested interactively so that it never lands in the saved notebook.
# (Alternatively, skip this cell and put GROQ_API_KEY in a local, untracked
# .env file, which the setup cell reads through load_dotenv().)
import os
from getpass import getpass

if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass("Enter GROQ_API_KEY: ").strip()

In [None]:
!pip install pandas tqdm pgeocode python-dotenv openai



In [None]:
import os, re, json
from dataclasses import dataclass
from typing import Dict, Any, List
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
from openai import OpenAI

AGE = 38  # average age of first-time homeowner
OUTPUT_CSV = "us_county_llm_estimates.csv"

# API Setup
# load_dotenv() reads a local .env file (if any) into the process environment.
load_dotenv()
API_KEY = os.getenv("GROQ_API_KEY")
if not API_KEY:
    # Fail here with an actionable message. Passing api_key=None to OpenAI()
    # would instead make the client look for OPENAI_API_KEY, which either
    # raises a misleading error or sends the wrong credential to BASE_URL.
    raise RuntimeError(
        "GROQ_API_KEY is not set. Export it in the environment or add it to a "
        "local .env file before running this cell."
    )
BASE_URL = "https://api.groq.com/openai/v1"
MODEL = "openai/gpt-oss-120b"

# OpenAI-compatible client pointed at BASE_URL instead of the default endpoint.
client = OpenAI(api_key=API_KEY, base_url=BASE_URL)

# System prompt sent with every request; the sentences are joined with single
# spaces into one line of text.
# NOTE(review): the 550–780 credit-score range given here is narrower than the
# 300..850 range stated in build_user_prompt — confirm which one is intended.
SYSTEM_MSG = " ".join(
    (
        "You are a careful estimator.",
        "For each county, produce distinct outputs.",
        "Distribute loan_confidence across 0.000–0.999, estimated_credit_score across 550–780.",
        "Use the full range: some values should be low, some medium, some high.",
        "Do not cluster everything around one number.",
        "Ensure consecutive counties are not identical.",
        "Always return only strict JSON with exactly these three keys:",
        '{"loan_confidence": <float>, "estimated_credit_score": <int>, "area_median_income": <int>}.',
    )
)

def build_user_prompt(county_name: str, county_fips: str, age: int) -> str:
    """Render the per-county user message.

    Args:
        county_name: County label inserted after "- County: ".
        county_fips: FIPS code inserted after "- FIPS code: ".
        age: Applicant age inserted after "- Age: ".

    Returns:
        The prompt text, beginning and ending with a newline, that describes
        the task, lists the inputs and spells out the required JSON shape.
    """
    prompt_lines = [
        "",  # leading newline
        "Task: Given a U.S. county and its FIPS code and a fixed applicant age, output three numeric fields estimating:",
        "1) loan_confidence: a float between 0 and 1 for how confident a typical lender should be about approving a standard home loan for an average applicant in this county.",
        "2) estimated_credit_score: an integer from 300 to 850 for a typical applicant in this county.",
        "3) area_median_income: an integer (USD) for median household income in this county.",
        "",
        "Inputs:",
        f"- County: {county_name}",
        f"- FIPS code: {county_fips}",
        f"- Age: {age}",
        "",
        "Assumptions:",
        "- No personal credit history provided; assume an average applicant in the area.",
        "- Use broad socioeconomic priors, not exact facts. Do not output null.",
        "",
        "Output requirements:",
        "- Output strictly valid JSON with exactly these three keys and no others:",
        '  {"loan_confidence": <float 0..1>, "estimated_credit_score": <int 300..850>, "area_median_income": <int>}',
        "- No additional text, no markdown, no commentary.",
        "",  # trailing newline
    ]
    return "\n".join(prompt_lines)

# Value passed as `response_format` on the chat-completions request in call_llm.
JSON_ONLY = dict(type="json_object")

@dataclass
class Estimation:
    """Typed record for one county's estimate.

    The last three fields carry the same names as the JSON keys requested
    from the model. NOTE(review): nothing in the visible notebook constructs
    this class — confirm whether it is still needed.
    """
    # County identifier; presumably the 5-digit state+county FIPS string
    # built in get_us_counties — TODO confirm.
    county_fips: str
    # County label (presumably the "<name>, <state>" display string — confirm).
    county: str
    # Applicant age used for the estimate.
    age: int
    # Model-reported confidence; the prompts ask for a float between 0 and 1.
    loan_confidence: float
    # Model-reported credit score; the user prompt asks for an int in 300..850.
    estimated_credit_score: int
    # Model-reported median household income; the prompt asks for an int in USD.
    area_median_income: int

class ParseError(Exception):
    """Raised when a model reply cannot be turned into a JSON object."""

def extract_json(text: str) -> Dict[str, Any]:
    """Parse a model reply into a JSON object (dict).

    The whole reply is parsed first. If that fails, or yields something other
    than an object, the span from the first "{" to the last "}" is parsed
    instead, which tolerates stray text around the object.

    Args:
        text: Raw reply text. None and blank strings are rejected.

    Returns:
        The decoded JSON object.

    Raises:
        ParseError: If the reply is empty, contains no "{...}" span, or that
            span is not valid JSON.
    """
    if not text or not text.strip():
        raise ParseError(f"Empty model reply: {text!r}")

    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        parsed = None
    # A bare list/number/string is valid JSON but not the object callers index
    # by key, so only a dict is accepted from the direct parse.
    if isinstance(parsed, dict):
        return parsed

    m = re.search(r"\{.*\}", text, flags=re.S)
    if not m:
        raise ParseError(f"Could not find JSON object in: {text[:200]!r}")
    try:
        # The span starts with "{", so a successful decode is always a dict.
        return json.loads(m.group(0))
    except json.JSONDecodeError as exc:
        raise ParseError(f"Invalid JSON object in: {text[:200]!r}") from exc

def call_llm(county_name: str, county_fips: str, age: int) -> Dict[str, Any]:
    """Ask the model for one county's estimate and return the parsed JSON.

    Args:
        county_name: County label placed in the user prompt.
        county_fips: FIPS code placed in the user prompt.
        age: Applicant age placed in the user prompt.

    Returns:
        The JSON object decoded from the first choice of the reply.

    Raises:
        ParseError: If the reply has no content or cannot be parsed as a JSON
            object. Errors raised by the client call itself propagate unchanged.
    """
    resp = client.chat.completions.create(
        model=MODEL,
        temperature=0.5,
        max_tokens=700,  #more room than before
        response_format=JSON_ONLY,
        messages=[
            {"role": "system", "content": SYSTEM_MSG},
            {"role": "user", "content": build_user_prompt(county_name, county_fips, age)},
        ],
    )
    choice = resp.choices[0]
    text = choice.message.content
    if not text:
        # content can be None/empty (e.g. the token budget ran out before any
        # answer was written); report that explicitly instead of letting
        # json.loads fail with an unrelated TypeError.
        raise ParseError(
            f"Model returned no content for {county_name} ({county_fips}); "
            f"finish_reason={choice.finish_reason!r}"
        )
    return extract_json(text)

# Census file of all counties and FIPS
CENSUS_COUNTY_URL = "https://www2.census.gov/geo/docs/reference/codes/files/national_county.txt"

def get_us_counties(source: Any = CENSUS_COUNTY_URL) -> pd.DataFrame:
    """Load the county list and build the identifiers used in the prompts.

    Args:
        source: Anything pandas.read_csv accepts (URL, path or file-like
            object) holding header-less rows of
            state, state_fips, county_fips, county_name, class_fips.
            Defaults to the Census national county file.

    Returns:
        DataFrame with two string columns:
        county_fips_full (2-digit state code + 3-digit county code) and
        county_display ("<county name>, <state>").
    """
    df = pd.read_csv(
        source,
        header=None,
        dtype=str,  # keep leading zeros in the FIPS codes
        names=["state", "state_fips", "county_fips", "county_name", "class_fips"],
    )
    fips_full = df["state_fips"].str.zfill(2) + df["county_fips"].str.zfill(3)
    # Names are used as given: str.title() would damage correctly cased names
    # ("Prince George's" -> "Prince George'S", "DeKalb" -> "Dekalb").
    display = df["county_name"] + ", " + df["state"]
    return pd.DataFrame({"county_fips_full": fips_full, "county_display": display})

def main():
    """Estimate every county with the LLM and write the results to OUTPUT_CSV.

    Each county is submitted to a small thread pool. A failed request does not
    stop the run: its row is written with empty estimates and the failure text
    in the "error" column. Rows are written in the same order as the input
    county list, always with the same columns. The model's area_median_income
    value is not written to the CSV.
    """
    counties = get_us_counties()
    print(f"Found {len(counties)} U.S. counties.")

    columns = ["county_fips", "county", "predicted_credit_score", "loan_confidence", "error"]
    # One slot per input row: futures finish out of order, but each result is
    # stored at its input position so the CSV order is reproducible.
    out_rows = [None] * len(counties)

    max_workers = 2  # careful with rate limits
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(call_llm, county, fips, AGE): (pos, fips, county)
            for pos, (fips, county) in enumerate(
                zip(counties["county_fips_full"], counties["county_display"])
            )
        }

        for future in tqdm(as_completed(futures), total=len(futures), desc="Estimating"):
            pos, fips, county = futures[future]
            record = {
                "county_fips": fips,
                "county": county,
                "predicted_credit_score": None,
                "loan_confidence": None,
                "error": None,
            }
            try:
                d = future.result()
                # Read both keys before storing either, so a reply missing one
                # key yields an all-empty row rather than a half-filled one.
                score = d["estimated_credit_score"]
                confidence = d["loan_confidence"]
            except Exception as e:
                # Deliberate best-effort: record the failure and keep going.
                # The type name is included because str(KeyError) is only the
                # bare key.
                record["error"] = f"{type(e).__name__}: {e}"
            else:
                record["predicted_credit_score"] = score
                record["loan_confidence"] = confidence
            out_rows[pos] = record

    # Explicit columns keep the schema fixed even when no request failed.
    df = pd.DataFrame(out_rows, columns=columns)
    df.to_csv(OUTPUT_CSV, index=False)
    print(f"Saved {len(df)} rows to {OUTPUT_CSV}")

    # For Colab: auto-download
    try:
        from google.colab import files
        files.download(OUTPUT_CSV)
    except ImportError:
        pass

if __name__ == "__main__":
    main()

Found 3235 U.S. counties.


Estimating: 100%|██████████| 3235/3235 [23:08<00:00,  2.33it/s]

Saved 3235 rows to us_county_llm_estimates.csv





<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>