### Set up the catalog and schemas

In [0]:
from pyspark.sql import SparkSession

# Get or create Spark Session
spark = SparkSession.builder.appName("SetupCatalogAndSchemas").getOrCreate()

# --- Configuration ---
# These names assume Unity Catalog; without it, 'CATALOG_NAME' can be omitted.
CATALOG_NAME = "football_data_org"
BRONZE_SCHEMA = "bronze"
SILVER_SCHEMA = "silver"
GOLD_SCHEMA = "gold"

# 1. Set the Catalog Context (Optional, depending on your environment)
# If you are NOT using Unity Catalog, skip this step and assume the 'default' catalog.
print(f"Using Catalog: {CATALOG_NAME}...")
spark.sql(f"CREATE CATALOG IF NOT EXISTS {CATALOG_NAME}")
spark.sql(f"USE CATALOG {CATALOG_NAME}")

# 2. Create the Schemas (Databases) within the Catalog
# Bronze: Raw, source-aligned data
BRONZE_PATH = f"{CATALOG_NAME}.{BRONZE_SCHEMA}"
# Silver: Cleaned, validated, and conformed data
SILVER_PATH = f"{CATALOG_NAME}.{SILVER_SCHEMA}"
# Gold: Aggregated, business-ready data
GOLD_PATH = f"{CATALOG_NAME}.{GOLD_SCHEMA}"

# Every layer is created the same way, so issue the statements from one loop
# (bronze -> silver -> gold) instead of three copy-pasted stanzas.
for schema_path in (BRONZE_PATH, SILVER_PATH, GOLD_PATH):
    print(f"Creating Schema: {schema_path} IF NOT EXISTS...")
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS {schema_path}")

# The emoji is written as a Unicode escape (U+1F389, party popper) so it cannot be
# corrupted again by a mis-decoded save; the message previously printed mojibake.
print("\n\U0001F389 Success! ETL Lakehouse structure is ready.")

### Create Bronze tables and ingest raw data from API

In [0]:
import requests
import json
import time
from pyspark.sql.functions import current_timestamp, lit

# football-data.org endpoints queried by this notebook
MATCHES_API_URL = "https://api.football-data.org/v4/competitions/PL/matches"
TEAMS_API_URL = "https://api.football-data.org/v4/competitions/PL/teams"
PLAYERS_API_URL = "https://api.football-data.org/v4/persons"

# Fully qualified (catalog.schema.table) names of the bronze tables
MATCH_TABLE_PATH = ".".join((CATALOG_NAME, BRONZE_SCHEMA, "raw_matches"))
TEAM_TABLE_PATH = ".".join((CATALOG_NAME, BRONZE_SCHEMA, "raw_teams"))
PLAYER_TABLE_PATH = ".".join((CATALOG_NAME, BRONZE_SCHEMA, "raw_players"))

# Where the API key lives in the secret store, and which season to request
API_SCOPE = "football_secrets"
API_KEY_NAME = "football_api_key"
SEASON = 2025

# Read the API key from Databricks secrets; when `dbutils` is not defined
# in this runtime, warn and continue with a placeholder key instead.
try:
    api_key = dbutils.secrets.get(scope=API_SCOPE, key=API_KEY_NAME)
except NameError:
    print("WARNING: dbutils not available, using placeholder key")
    api_key = "DUMMY_KEY"

# Request settings passed to the ingestion run below
headers = dict([("X-Auth-Token", api_key)])
params = dict(season=SEASON)

####################
# Helper functions #
####################

# Fetch JSON data from the API
def fetch_json(url, headers, params=None, timeout=30):
    """Send a GET request and return the decoded JSON body.

    Args:
        url: Endpoint to call.
        headers: HTTP headers sent with the request.
        params: Optional query-string parameters.
        timeout: Seconds to wait for the server before giving up. Without an
            explicit timeout `requests` waits indefinitely, so a stalled
            connection would hang the whole notebook run.

    Returns:
        The parsed JSON response.

    Raises:
        requests.exceptions.HTTPError: The response status was 4xx/5xx.
        requests.exceptions.Timeout: The server did not answer within
            ``timeout`` seconds. Both are ``RequestException`` subclasses.
    """
    resp = requests.get(url, headers=headers, params=params, timeout=timeout)
    resp.raise_for_status()
    return resp.json()

# Call the API for matches
def fetch_matches(headers, params):
    """Request MATCHES_API_URL and return the list stored under "matches".

    An empty list is returned when the response has no "matches" key. The
    number of matches retrieved is printed as progress output.
    """
    found = fetch_json(MATCHES_API_URL, headers, params).get("matches", [])
    print(f"Fetched {len(found)} matches")
    return found

# Call the API for teams
def fetch_teams(headers, params):
    """Request TEAMS_API_URL and return the list stored under "teams".

    An empty list is returned when the response has no "teams" key. The
    number of teams retrieved is printed as progress output.
    """
    found = fetch_json(TEAMS_API_URL, headers, params).get("teams", [])
    print(f"Fetched {len(found)} teams")
    return found

# Fetch a single person
def fetch_person(person_id, headers):
    """Fetch one person record.

    Returns:
        A ``(url, payload)`` tuple: the URL that was requested (PLAYERS_API_URL
        followed by the person id) and the JSON payload returned for it.
    """
    person_url = f"{PLAYERS_API_URL}/{person_id}"
    return person_url, fetch_json(person_url, headers)

# Convert a list of items to a list of (id, json) tuples
def to_records(items, id_key):
    """Pair each item's id with the item serialized as a JSON string.

    Args:
        items: Iterable of dict-like objects.
        id_key: Key whose value becomes the first tuple element; the element
            is None when the item lacks that key.

    Returns:
        A list of ``(id, json_string)`` tuples in input order.
    """
    records = []
    for entry in items:
        records.append((entry.get(id_key), json.dumps(entry)))
    return records

# Write the matches to the bronze table
def write_matches_bronze(matches):
    """Store the match payloads in the bronze Delta table MATCH_TABLE_PATH.

    Each match becomes a (matchId, payload_json) row; `ingest_ts` and
    `source_api_url` columns are added before the table is overwritten.

    Returns:
        The two-column DataFrame built from the matches (without the added
        columns), or None when `matches` is empty and nothing is written.
    """
    if not matches:
        print(f"No data for {MATCHES_API_URL}")
        return

    raw_df = spark.createDataFrame(
        to_records(matches, "id"),
        schema=["matchId", "payload_json"],
    )

    # Add ingestion metadata one column at a time
    enriched_df = raw_df.withColumn("ingest_ts", current_timestamp())
    enriched_df = enriched_df.withColumn("source_api_url", lit(MATCHES_API_URL))

    # Configure the Delta writer step by step, then save
    writer = enriched_df.write.format("delta")
    writer = writer.mode("overwrite")
    writer = writer.option("mergeSchema", "true")
    writer.saveAsTable(MATCH_TABLE_PATH)

    print(f"Data written to {MATCH_TABLE_PATH}")
    return raw_df

# Write the teams to the bronze table
def write_teams_bronze(teams):
    """Store the team payloads in the bronze Delta table TEAM_TABLE_PATH.

    Each team becomes a (teamId, payload_json) row; `ingest_ts` and
    `source_api_url` columns are added before the table is overwritten.

    Returns:
        The two-column DataFrame built from the teams (without the added
        columns), or None when `teams` is empty and nothing is written.
    """
    if not teams:
        print(f"No data for {TEAMS_API_URL}")
        return

    raw_df = spark.createDataFrame(
        to_records(teams, "id"),
        schema=["teamId", "payload_json"],
    )

    # Add ingestion metadata one column at a time
    enriched_df = raw_df.withColumn("ingest_ts", current_timestamp())
    enriched_df = enriched_df.withColumn("source_api_url", lit(TEAMS_API_URL))

    # Configure the Delta writer step by step, then save
    writer = enriched_df.write.format("delta")
    writer = writer.mode("overwrite")
    writer = writer.option("mergeSchema", "true")
    writer.saveAsTable(TEAM_TABLE_PATH)

    print(f"Data written to {TEAM_TABLE_PATH}")
    return raw_df

# Extract all person ids from the teams
def extract_person_ids_from_teams(teams):
    """Collect the distinct squad-member ids found across all teams.

    Teams without a "squad" key contribute nothing, and members whose "id" is
    missing or falsy are skipped.

    Returns:
        The unique ids as a sorted list; the count is also printed.
    """
    candidate_ids = (
        member.get("id")
        for team in teams
        for member in team.get("squad", [])
    )
    ordered_ids = sorted({pid for pid in candidate_ids if pid})
    print(f"Found {len(ordered_ids)} unique person ids in PL squads")
    return ordered_ids

# Fetch all persons from the API
def fetch_all_persons(person_ids, headers, sleep_seconds=2):
    """Fetch every person in `person_ids`, pausing after each request.

    Args:
        person_ids: Ids to request, processed in order.
        headers: HTTP headers forwarded to `fetch_person`.
        sleep_seconds: Pause after each request. The default of 2 is there
            because of the rate limit of 30 requests per minute.

    Returns:
        A list of ``(person_id, payload_json, url)`` tuples, one per id.
    """
    collected = []
    position = 0
    for person_id in person_ids:
        position += 1
        source_url, body = fetch_person(person_id, headers)
        collected.append((person_id, json.dumps(body), source_url))

        print(f"Fetched {position}/{len(person_ids)} person_id={person_id}")
        # Throttle between calls to stay under the API rate limit
        time.sleep(sleep_seconds)

    return collected

# Write the players to the bronze table
def write_players_bronze(person_records):
    """Store the person records in the bronze Delta table PLAYER_TABLE_PATH.

    Args:
        person_records: Rows of (personId, payload_json, source_api_url).

    The `ingest_ts`, `competition_code` and `source_system` columns are added
    before the table is overwritten. Nothing is written (and a notice is
    printed) when `person_records` is empty. Always returns None.
    """
    if not person_records:
        print("No person records to write")
        return

    raw_df = spark.createDataFrame(
        person_records,
        schema=["personId", "payload_json", "source_api_url"],
    )

    # Tag each row with its ingestion time and origin
    enriched_df = raw_df.withColumn("ingest_ts", current_timestamp())
    enriched_df = enriched_df.withColumn("competition_code", lit("PL"))
    enriched_df = enriched_df.withColumn("source_system", lit("football-data.org"))

    # Configure the Delta writer step by step, then save
    writer = enriched_df.write.format("delta")
    writer = writer.mode("overwrite")
    writer = writer.option("mergeSchema", "true")
    writer.saveAsTable(PLAYER_TABLE_PATH)

    print(f"Players written to {PLAYER_TABLE_PATH}")

def run_ingestion(headers, params, sleep_seconds=2):
    """Run the bronze ingestion end to end: matches, teams, then squad members.

    Args:
        headers: HTTP headers used for every API request.
        params: Query parameters for the matches and teams requests.
        sleep_seconds: Pause between consecutive person requests, forwarded to
            `fetch_all_persons`. Defaults to 2, the value previously
            hard-coded here.

    Any exception raised by the fetch or write helpers propagates to the
    caller; steps that already completed are not undone.
    """
    # The writers' return values were assigned but never read, so they are
    # no longer bound to locals.
    matches = fetch_matches(headers=headers, params=params)
    write_matches_bronze(matches)

    teams = fetch_teams(headers=headers, params=params)
    write_teams_bronze(teams)

    # Person ids come from the team squads, so this step must follow the teams fetch.
    person_ids = extract_person_ids_from_teams(teams)
    person_records = fetch_all_persons(
        person_ids, headers=headers, sleep_seconds=sleep_seconds
    )
    write_players_bronze(person_records)

# Kick off the ingestion; an HTTP/network failure is reported rather than
# raised, so the cell finishes without a traceback.
try:
    run_ingestion(params=params, headers=headers)
except requests.exceptions.RequestException as request_error:
    print(f"API request failed: {request_error}")
    

### Exploring endpoints

In [0]:
# --- API Exploration Utility ---
def explore_api_endpoint(endpoint_url, params=None, headers=None, max_keys=20, timeout=30):
    """
    Fetches and displays a sample of the JSON response from the given API endpoint.
    Does not ingest or persist any data.

    Args:
        endpoint_url: URL to request.
        params: Optional query-string parameters.
        headers: Optional HTTP headers (e.g. the auth token).
        max_keys: Maximum number of top-level keys to list.
        timeout: Seconds to wait for the server; without a timeout `requests`
            waits indefinitely and a stalled connection would hang the cell.

    Returns:
        None. Output is printed; any failure is printed as well, not raised.
    """
    import requests
    import json

    try:
        resp = requests.get(endpoint_url, headers=headers, params=params, timeout=timeout)
        resp.raise_for_status()
        data = resp.json()
        print(f"Endpoint: {endpoint_url}")
        if isinstance(data, dict):
            print(f"Top-level keys: {list(data.keys())[:max_keys]}")
        else:
            # A JSON array/scalar has no keys; calling .keys() on it used to raise
            # and be misreported as a fetch failure below.
            print(f"Top-level type: {type(data).__name__} (no keys to list)")
        print("Sample data (truncated):")
        print(json.dumps(data, indent=2)[:5000])
    except Exception as e:
        # Deliberately broad: this is a best-effort exploration helper.
        print(f"Failed to fetch or parse data from {endpoint_url}: {e}")

# Example usage: preview the Premier League teams endpoint with the auth headers
explore_api_endpoint(
    "https://api.football-data.org/v4/competitions/PL/teams",
    headers=headers,
)