In [None]:
# SECURITY: a live Groq API key used to be hard-coded in this cell (both in a
# comment and in an os.environ assignment). Treat that key as compromised and
# revoke it with the provider; secrets must never be stored in the notebook.
#
# Supply the key from outside the source instead: export GROQ_API_KEY in the
# environment that launches the notebook, or keep it in a local, untracked
# .env file (the setup cell below calls load_dotenv() before reading it). In an
# interactive session it can also be entered at runtime with getpass.getpass().

import os

In [None]:
!pip install pandas tqdm pgeocode python-dotenv openai

In [None]:

import os, re, json
from dataclasses import dataclass
from typing import Dict, Any, List
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
import pgeocode
from tqdm import tqdm
from dotenv import load_dotenv
from openai import OpenAI

AGE = 38  # fixed applicant age sent with every prompt (average age of first-time homeowner)
OUTPUT_CSV = "pa_zip_llm_estimates.csv"  # file written by main()

# API Setup
load_dotenv()  # pick up variables from a local .env file, if one exists
API_KEY = os.getenv("GROQ_API_KEY")
if not API_KEY:
    # Fail fast with an actionable message. Passing api_key=None to OpenAI()
    # would otherwise surface as a less obvious client error (or, presumably,
    # let the client fall back to an unrelated OPENAI_API_KEY -- see the
    # openai-python README).
    raise RuntimeError(
        "GROQ_API_KEY is not set. Export it in the environment or add it to a "
        ".env file before running this notebook."
    )
BASE_URL = "https://api.groq.com/openai/v1"
MODEL = "llama-3.1-8b-instant"

# Shared client used by call_llm(); requests go to BASE_URL rather than the
# OpenAI default endpoint.
client = OpenAI(api_key=API_KEY, base_url=BASE_URL)

# System-role message sent with every request in call_llm(). The sentences are
# joined with single spaces into one line of text.
# NOTE(review): the ranges stated here (0.000–0.999, 550–780) differ from the
# ones in build_user_prompt (0..1, 300..850) -- confirm which is intended.
SYSTEM_MSG = " ".join(
    [
        "You are a careful estimator.",
        "For each ZIP code, produce distinct outputs.",
        "Distribute loan_confidence across 0.000–0.999, estimated_credit_score across 550–780,",
        "Use the full range: some values should be low, some medium, some high.",
        "Do not cluster everything around one number.",
        "Ensure consecutive ZIP codes are not identical.",
        "Always return only strict JSON with exactly these three keys:",
        '{"loan_confidence": <float>, "estimated_credit_score": <int>, "area_median_income": <int>}.',
    ]
)

def build_user_prompt(zip_code: str, age: int) -> str:
    """Render the user-role prompt for one ZIP code.

    Args:
        zip_code: ZIP code substituted into the "Inputs" section.
        age: Applicant age substituted into the "Inputs" section.

    Returns:
        The multi-line prompt text. It begins with a newline and ends with a
        newline after the last output requirement.
    """
    # Doubled braces are literal braces in the rendered JSON example; only
    # {zip_code} and {age} are substitution fields.
    template = """
Task: Given a U.S. ZIP code and a fixed applicant age, output three numeric fields estimating:
1) loan_confidence: a float between 0 and 1 for how confident a typical lender should be about approving a standard home loan for an average applicant in this ZIP.
2) estimated_credit_score: an integer from 300 to 850 for a typical applicant in this ZIP.
3) area_median_income: an integer (USD) for median household income in this ZIP area.

Inputs:
- ZIP code: {zip_code}
- Age: {age}

Assumptions:
- No personal credit history provided; assume an average applicant in the area.
- Use broad socioeconomic priors, not exact facts. Do not output null.
- You are a careful estimator.
- For each ZIP code, produce distinct outputs.
- Distribute loan_confidence across 0.000–0.999, estimated_credit_score across 550–780,
- Use the full range: some values should be low, some medium, some high.
- Do not cluster everything around one number.
- Ensure consecutive ZIP codes are not identical.


Output requirements:
- Output strictly valid JSON with exactly these three keys and no others:
  {{"loan_confidence": <float 0..1>, "estimated_credit_score": <int 300..850>, "area_median_income": <int>}}
- No additional text, no markdown, no commentary.
"""
    return template.format(zip_code=zip_code, age=age)
# Disabled prompt line, kept for reference: "If uncertain, give your best reasonable estimate (do not output null)."

# Value passed as `response_format` in call_llm() to request a JSON object reply.
JSON_ONLY = dict(type="json_object")

@dataclass
class Estimation:
    """Record pairing the request inputs (zip, age) with the three estimated fields.

    Field names for the estimates mirror the JSON keys requested from the
    model. Nothing else in the visible part of this file constructs it.
    """

    # Request inputs.
    zip: str
    age: int
    # Estimated values.
    loan_confidence: float
    estimated_credit_score: int
    area_median_income: int

class ParseError(Exception):
    """Raised by extract_json when no JSON object can be found in the text."""

def extract_json(text: str) -> Dict[str, Any]:
    """Return the first JSON object found in *text*.

    The whole string is tried first. If it is not a JSON object on its own
    (e.g. the model wrapped it in prose or a code fence), every ``{`` is tried
    in turn as the start of an embedded object and the first one that decodes
    is returned.

    Args:
        text: Raw model output.

    Returns:
        The decoded JSON object as a dict.

    Raises:
        ParseError: If *text* is not a string, or contains no decodable JSON
            object.
    """
    if not isinstance(text, str):
        # e.g. a response whose message content is None
        raise ParseError(f"Expected text, got {type(text).__name__}: {text!r}")

    try:
        whole = json.loads(text)
    except json.JSONDecodeError:
        whole = None
    if isinstance(whole, dict):
        return whole

    # raw_decode parses one value starting at the given index and ignores
    # whatever follows, so trailing text or a second object cannot break it
    # (unlike a greedy "{...}" regex spanning first-to-last brace).
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            start = text.find("{", start + 1)
        else:
            return candidate

    raise ParseError(f"Could not find JSON object in: {text[:200]!r}")

def call_llm(zip_code: str, age: int) -> Dict[str, Any]:
    """Request an estimate for one ZIP code and return the parsed reply.

    Sends SYSTEM_MSG plus the prompt from build_user_prompt to MODEL through
    the module-level ``client``, then parses the first choice's message
    content with extract_json.

    Args:
        zip_code: ZIP code to estimate for.
        age: Applicant age included in the prompt.

    Returns:
        Whatever extract_json returns for the reply text.
    """
    conversation = [
        {"role": "system", "content": SYSTEM_MSG},
        {"role": "user", "content": build_user_prompt(zip_code, age)},
    ]
    completion = client.chat.completions.create(
        model=MODEL,
        messages=conversation,
        response_format=JSON_ONLY,
        temperature=0.5,
        max_tokens=150,
        seed=1234,
    )
    reply = completion.choices[0].message.content
    return extract_json(reply)

def get_pa_zips() -> List[str]:
    """Return the sorted, de-duplicated 5-digit postal codes whose state_code is "PA".

    Reads the table held in pgeocode's ``Nominatim("us")._data``.
    NOTE(review): ``_data`` is a private attribute of pgeocode -- confirm it
    is still available when upgrading that package.
    """
    us_table = pgeocode.Nominatim("us")._data
    in_pa = us_table[us_table["state_code"] == "PA"].copy()

    # Keep only rows whose postal code is exactly five digits; missing values
    # count as non-matching.
    is_five_digit = in_pa["postal_code"].astype(str).str.fullmatch(r"\d{5}", na=False)
    codes = in_pa[is_five_digit]["postal_code"].astype(str)

    return sorted(codes.unique().tolist())

def main():
    """Estimate every Pennsylvania ZIP code concurrently and write the results to OUTPUT_CSV.

    One call_llm request is submitted per ZIP code to a thread pool. Each ZIP
    yields exactly one output row; a failed request or a reply missing a
    required key is recorded in the row's ``error`` column instead of aborting
    the batch. Rows are written sorted by ZIP code with a fixed set of
    columns, so the CSV layout does not depend on completion order or on
    whether any request failed. When run under Google Colab the file is also
    offered for download.
    """
    columns = ["zip_code", "predicted_credit_score", "loan_confidence", "error"]

    zips = get_pa_zips()
    print(f"Found {len(zips)} Pennsylvania ZIP codes.")
    out_rows = []

    max_workers = 10  # adjust concurrency
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(call_llm, z, AGE): z for z in zips}

        for future in tqdm(as_completed(futures), total=len(futures), desc="Estimating"):
            row = {
                "zip_code": futures[future],
                "predicted_credit_score": None,
                "loan_confidence": None,
                "error": None,
            }
            try:
                d = future.result()
                # Read both keys before touching the row so a reply missing
                # either one leaves both estimate columns empty.
                score = d["estimated_credit_score"]
                confidence = d["loan_confidence"]
            except Exception as e:
                # Deliberately broad: this is the per-item boundary of a
                # best-effort batch. Fall back to the exception type when the
                # message is empty so the failure is still visible in the CSV.
                row["error"] = str(e) or type(e).__name__
            else:
                row["predicted_credit_score"] = score
                row["loan_confidence"] = confidence
            out_rows.append(row)

    # Explicit columns keep the header stable even with zero rows or zero errors.
    df = pd.DataFrame(out_rows, columns=columns).sort_values("zip_code")
    df.to_csv(OUTPUT_CSV, index=False)
    print(f"Saved {len(df)} rows to {OUTPUT_CSV}")

    # For Colab: auto-download
    try:
        from google.colab import files
    except ImportError:
        return
    files.download(OUTPUT_CSV)

# Run the full estimation when this file is executed directly (not on import).
if __name__ == "__main__":
    main()