In [0]:
# Standard library imports
import os

# Third-party library imports
import requests as r
from dotenv import load_dotenv
from pyspark.sql.functions import current_timestamp
from pyspark.sql.types import (
    DateType,
    DecimalType,
    DoubleType,
    IntegerType,
    StringType,
    StructField,
    StructType,
    TimestampType
)

# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()


def _require_env(name):
    """Return the value of environment variable `name`.

    Raises:
        RuntimeError: if the variable is unset or empty. Without this check,
            os.getenv would return None and the f-strings that build the file
            path and table name would silently contain the text "None".
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(f"Required environment variable {name!r} is not set.")
    return value


# Directory where the downloaded raw CSV is written and read back from.
raw_zone_path = _require_env('RAW_ZONE_PATH')
# Catalog and schema used to build the fully qualified target table name.
catalog_name = _require_env('DATABRICKS_CATALOG_NAME')
schema_name = _require_env('DATABRICKS_SCHEMA_NAME')

In [0]:
# Schema handed to spark.read.csv for the S&P 500 constituents file.
# Fields are declared in column order and every one of them is nullable.
sp500_companies_source_schema = (
    StructType()
    .add("Symbol", StringType(), True)
    .add("Security", StringType(), True)
    .add("GICS_Sector", StringType(), True)
    .add("GICS_Sub_Industry", StringType(), True)
    .add("Headquarters_Location", StringType(), True)
    .add("Date_added", DateType(), True)
    .add("CIK", IntegerType(), True)
    .add("Founded", IntegerType(), True)
    # The load step assigns this column with current_timestamp().
    .add("load_date_ts", TimestampType(), True)
)

## Load list of S&P 500 Companies

In [0]:
# URL of the CSV file
url = 'https://datahub.io/core/s-and-p-500-companies/r/constituents.csv'

# Landing location of the raw file: written by the download, read back by Spark.
csv_path = f'{raw_zone_path}/s-and-p-500-companies.csv'

# Download the CSV file. The timeout keeps the cell from hanging indefinitely
# if the server stops responding.
response = r.get(url, timeout=30)
print(response)

# Stop on a failed download: carrying on would load whatever file already sits
# at csv_path (possibly stale) and overwrite the table with it, or fail further
# down with a less obvious error.
if response.status_code != 200:
    raise RuntimeError(f"Failed to download the file. Status code: {response.status_code}")

with open(csv_path, 'wb') as file:
    file.write(response.content)
print("CSV file downloaded successfully.")

# Read the raw file with the explicit source schema.
df = spark.read.csv(csv_path, header=True, schema=sp500_companies_source_schema)

# Lower-case every column name before renaming.
df = df.toDF(*[c.lower() for c in df.columns])

# Rename the identifying columns and stamp each row with the load time.
df = (df.withColumnRenamed('symbol', 'ticker_symbol')
        .withColumnRenamed('security', 'company_name')
        .withColumn('load_date_ts', current_timestamp()))

# Replace the contents of the bronze table with this load.
df.write.mode("overwrite").format("delta").saveAsTable(f"{catalog_name}.{schema_name}.kdayno_bronze_SP500_companies")