## Pipeline Parameters

In [0]:
# Run parameter: the date to load, entered in YYYY-MM-DD form.
dbutils.widgets.text("input_load_date", "YYYY-MM-DD", "Input Load Date")

# Audit parameters: one free-text widget per field, each defaulting to an empty string.
for audit_widget_name in (
    "job_id",
    "job_name",
    "job_start_date",
    "job_start_datetime",
    "task_run_id",
    "task_name",
):
    dbutils.widgets.text(audit_widget_name, "")

In [0]:
%run ../utils/aws_secret_manager

In [0]:
%run ../utils/loggers

In [0]:
# Standard library imports
import csv
import datetime as dt
import os
from ast import literal_eval

# Third-party library imports
import pyarrow as pa
import requests as r
from dotenv import load_dotenv
from pyspark.sql.functions import cast, col, current_timestamp, current_date
from pyspark.sql.types import (
    DateType,
    DecimalType,
    DoubleType,
    IntegerType,
    StringType,
    StructField,
    StructType,
    TimestampType
)

In [0]:
# Load variables from a .env file into the process environment before reading them.
load_dotenv()

# Environment-driven settings
catalog_name = os.getenv('DATABRICKS_CATALOG_NAME')
schema_name = os.getenv('DATABRICKS_SCHEMA_NAME')
raw_zone_path = os.getenv('RAW_ZONE_PATH')

# Run parameter supplied through the notebook widget
input_load_date = dbutils.widgets.get("input_load_date")

# Table that provides the ticker symbols, and table that receives the daily prices
source_table_name = 'kdayno_bronze_SP500_companies'
target_table_name = 'kdayno_bronze_SP500_stock_prices'

# Audit variables, read from the widgets of the same name
(job_id, job_name, job_start_date,
 job_start_datetime, task_run_id, task_name) = (
    dbutils.widgets.get(audit_widget)
    for audit_widget in ('job_id', 'job_name', 'job_start_date',
                         'job_start_datetime', 'task_run_id', 'task_name')
)

# Schema of the raw price rows: prices are declared as strings here and are
# cast to decimals later in the notebook; every field is nullable.
price_columns = ("open_price", "close_price", "highest_price", "lowest_price")
stock_price_schema = StructType(
    [StructField("ticker_symbol", StringType(), True)]
    + [StructField(price_column, StringType(), True) for price_column in price_columns]
    + [StructField("trading_date", DateType(), True)]
)

## Pipeline Logging

In [0]:
# Pass the run's audit fields, load date and source/target table names to the audit logger.
audit_logger(
    job_id,
    job_name,
    input_load_date,
    job_start_date,
    job_start_datetime,
    task_run_id,
    task_name,
    source_table_name,
    target_table_name,
)

# NOTE(review): this rebinds the name `etl_logger` from the factory to the object it
# returns, so re-running this cell without re-running the loggers notebook may fail
# (depends on whether the returned object is callable) - confirm before re-executing.
etl_logger = etl_logger()

In [0]:
# Stop early when the load date is a Saturday or Sunday.
# datetime.weekday() numbers Monday as 0, so 5 and 6 are the weekend days.
# Only weekends are detected here; other non-trading days are not checked in this cell.
load_weekday = dt.datetime.strptime(input_load_date, '%Y-%m-%d').weekday()
is_weekend = load_weekday >= 5

if is_weekend:
    etl_logger.warning(f"WARNING: Input date: {input_load_date} lands on a weekend. No data available for the specified date.")
    dbutils.notebook.exit('terminating ETL process')

## Load S&P 500 Ticker Symbols

In [0]:
# Read the ticker symbols from the source companies table.
companies_table = f'{catalog_name}.{schema_name}.{source_table_name}'
SP500_tickers = spark.read.table(companies_table).select('ticker_symbol')

# Bring the symbols to the driver as a plain Python list for the API loop.
collected_ticker_rows = SP500_tickers.collect()
SP500_tickers_list = [ticker_row['ticker_symbol'] for ticker_row in collected_ticker_rows]

## ETL

In [0]:
# Daily stock price load:
#   1. fetch one daily aggregate bar per ticker from the Polygon API for the load date,
#   2. cast the prices to decimal(10,2) and stamp the load time,
#   3. delete any rows already stored for that trading date and append the new ones.

# get_secret is expected to return the string form of a mapping; the API key is read
# from its 'AWS_SECRET_ACCESS_KEY' entry.
polygon_api_key = literal_eval(get_secret("POLYGON_CREDENTIALS"))['AWS_SECRET_ACCESS_KEY']

# NOTE(review): this name shadows the `current_date` function imported from
# pyspark.sql.functions; it is kept so any later use of the variable still works.
current_date = input_load_date
target_table_fqn = f'{catalog_name}.{schema_name}.{target_table_name}'
stock_data = {'ticker_symbol':[], 'open_price':[], 'close_price':[], 'highest_price':[], 'lowest_price':[], 'trading_date':[]}

etl_logger.info(f'Loading data for date: {current_date}')

# Probe a single ticker (VOO) first: when it has no bar for the date, the per-ticker
# loop is skipped entirely (presumably a non-trading day - confirm).
SP500_data_availability_check = r.get(f'https://api.polygon.io/v2/aggs/ticker/VOO/range/1/day/{current_date}/{current_date}?adjusted=true&sort=asc&apiKey={polygon_api_key}').json()['resultsCount']

if SP500_data_availability_check:

    for stock in SP500_tickers_list:
        polygon_url = f"https://api.polygon.io/v2/aggs/ticker/{stock}/range/1/day/{current_date}/{current_date}?adjusted=true&sort=asc&apiKey="

        etl_logger.info(f'Getting data for ticker: {stock} ...')
        data = r.get(f'{polygon_url}{polygon_api_key}').json()

        if data['resultsCount']:
            # The request covers a single day, so only the first result is used.
            daily_bar = data['results'][0]
            stock_data['ticker_symbol'].append(data['ticker'])
            stock_data['open_price'].append(daily_bar['o'])
            stock_data['close_price'].append(daily_bar['c'])
            stock_data['highest_price'].append(daily_bar['h'])
            stock_data['lowest_price'].append(daily_bar['l'])
            # 't' is divided by 1000 to turn milliseconds into seconds.
            # NOTE(review): fromtimestamp() converts using the machine's local time zone,
            # so the resulting date depends on the cluster time zone - confirm it matches
            # the date used in the DELETE below.
            stock_data['trading_date'].append(dt.datetime.fromtimestamp(daily_bar['t'] / 1000))

        else:
            etl_logger.warning(f'WARNING: No data available for ticker: {stock} on the specified date: {current_date}')

    # Turn the column-oriented lists into row tuples matching stock_price_schema.
    stock_df = spark.createDataFrame(list(zip(*stock_data.values())), stock_price_schema)

    # Cast every price column from its own values. The previous version built
    # 'lowest_price' from 'open_price', which overwrote the low with the open price.
    stock_transformed_df = stock_df
    for price_column in ('open_price', 'close_price', 'highest_price', 'lowest_price'):
        stock_transformed_df = stock_transformed_df.withColumn(
            price_column, col(price_column).cast('decimal(10,2)')
        )
    stock_transformed_df = stock_transformed_df.withColumn('load_date_ts', current_timestamp())

    # Remove rows already loaded for this date so a re-run does not duplicate them.
    spark.sql(f"""
                DELETE FROM {target_table_fqn}
                WHERE trading_date = '{input_load_date}'
                """)

    etl_logger.info(f'Loading: {stock_transformed_df.count()} rows to: {target_table_name}')

    (stock_transformed_df.write.format("delta")
                                .mode("append")
                                .saveAsTable(target_table_fqn))

    etl_logger.info(f'Run successful for: {current_date}')

else:
    # Previously this case ended without any log entry.
    etl_logger.warning(f'WARNING: No market data available for date: {current_date}. Nothing was loaded.')
