In [0]:
# %python
# %pip install databricks-dlt
# %restart_python

In [0]:
import os

import requests
import pandas as pd
import dlt
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, BooleanType, DoubleType

# Spark schema for one stadium record. Every column is declared nullable
# (third argument of add), matching the original StructField definitions.
stadium_schema = (
    StructType()
    .add("StadiumID", IntegerType(), True)
    .add("Active", BooleanType(), True)
    .add("Name", StringType(), True)
    .add("Address", StringType(), True)
    .add("City", StringType(), True)
    .add("State", StringType(), True)
    .add("Zip", StringType(), True)
    .add("Country", StringType(), True)
    .add("Capacity", IntegerType(), True)
    .add("GeoLat", DoubleType(), True)
    .add("GeoLong", DoubleType(), True)
)

# Fetch data from the API
def fetch_data_from_api(api_url, timeout=30):
    """Issue an HTTP GET against ``api_url`` and return the decoded JSON body.

    Args:
        api_url: Full URL of the endpoint, including any query string.
        timeout: Seconds to wait for the server before giving up. ``requests``
            applies no timeout by default, so without one a stalled
            connection would block the notebook indefinitely.

    Returns:
        The parsed JSON payload, exactly as produced by ``response.json()``.

    Raises:
        requests.HTTPError: If the response carries a 4xx/5xx status code.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(api_url, timeout=timeout)
    response.raise_for_status()  # Surface bad status codes instead of parsing an error body
    return response.json()

# Stadium list endpoint. The SportsData.io key is a credential, so it is read
# from the SPORTSDATA_API_KEY environment variable rather than being committed
# in the notebook source. NOTE(review): the key that was previously hardcoded
# here should be treated as leaked and rotated.
STADIUMS_ENDPOINT = "https://replay.sportsdata.io/api/v3/cbb/scores/json/stadiums"
_api_key = os.environ.get("SPORTSDATA_API_KEY")
if not _api_key:
    # Fail early with an actionable message instead of sending a keyless request.
    raise RuntimeError(
        "SPORTSDATA_API_KEY is not set; provide the SportsData.io API key "
        "through the environment before running this notebook."
    )
api_url = f"{STADIUMS_ENDPOINT}?key={_api_key}"
json_data = fetch_data_from_api(api_url)

# Convert the JSON data to a pandas DataFrame
pdf = pd.DataFrame(json_data)


def _cast_keep_nulls(series, caster):
    """Apply ``caster`` to every non-missing value; keep missing values as ``None``.

    A plain ``astype(str)`` turns missing values into the literal strings
    "None"/"nan", and ``astype(bool)`` turns them into False/True, which would
    reach the table as real data instead of nulls.
    """
    values = [None if pd.isna(value) else caster(value) for value in series]
    # dtype=object keeps None as None instead of letting pandas re-infer a dtype.
    return pd.Series(values, index=series.index, dtype=object)


# Ensure the data types match the schema and handle missing values.
# StadiumID is cast strictly: a missing ID raises here rather than being hidden.
pdf['StadiumID'] = pdf['StadiumID'].astype(int)
pdf['Active'] = _cast_keep_nulls(pdf['Active'], bool)
for _text_column in ('Name', 'Address', 'City', 'State', 'Zip', 'Country'):
    pdf[_text_column] = _cast_keep_nulls(pdf[_text_column], str)
pdf['Capacity'] = pdf['Capacity'].fillna(0).astype(int)  # Fill NA with 0 and cast to int
pdf['GeoLat'] = pdf['GeoLat'].astype(float)
pdf['GeoLong'] = pdf['GeoLong'].astype(float)

# Reuse the active Spark session (or start one), then turn the pandas frame
# into a Spark DataFrame using the explicit stadium_schema.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)
df = spark.createDataFrame(data=pdf, schema=stadium_schema)

# Register the stadium DataFrame with Delta Live Tables
@dlt.table(
    comment="Table containing JSON data loaded from a URL",
    name="jw_raw_stadiums",
)
def load_json_data():
    """Return the module-level Spark DataFrame ``df`` as this table's contents."""
    return df