## Pipeline Parameters

In [0]:
# dbutils.widgets.text("input_load_date", "YYYY-MM-DD", "Input Load Date")

In [0]:
%run ./aws_secret_manager

In [0]:
# Standard library imports
import csv
import datetime as dt
import os
from ast import literal_eval

# Third-party library imports
import pyarrow as pa
import requests as r
from dotenv import load_dotenv
from pyspark.sql.functions import cast, col, current_timestamp
from pyspark.sql.types import (
    DateType,
    DoubleType,
    StringType,
    StructField,
    StructType,
)


In [0]:
# Populate the process environment from a .env file so the settings below can
# be supplied without hard-coding them in the notebook.
load_dotenv()

catalog_name = os.getenv('DATABRICKS_CATALOG_NAME')
schema_name = os.getenv('DATABRICKS_SCHEMA_NAME')

# Fail fast when a setting is absent: os.getenv returns None for unset
# variables, which would otherwise be interpolated into table names as the
# literal text "None" and only surface later as a confusing table-not-found error.
missing_settings = [
    setting_name
    for setting_name, setting_value in (
        ('DATABRICKS_CATALOG_NAME', catalog_name),
        ('DATABRICKS_SCHEMA_NAME', schema_name),
    )
    if not setting_value
]
if missing_settings:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(missing_settings)}"
    )

In [0]:
# Column names of the raw financials frame, in the order the rows are built.
financials_column_names = (
    "ticker_symbol",
    "start_date",
    "end_date",
    "filing_date",
    "fiscal_period",
    "fiscal_year",
    "diluted_earnings_per_share",
    "net_income_loss",
    "equity",
    "long_term_debt",
    "diluted_average_shares",
)

# Every column is declared as a nullable string; the date and numeric columns
# are cast to their final types after the DataFrame has been created.
stock_financials_schema = StructType(
    [StructField(column_name, StringType(), True) for column_name in financials_column_names]
)

## Load Stock Financials Data for S&P 500 Companies

In [0]:
# Fully qualified name of the bronze table that holds the S&P 500 constituents.
sp500_companies_table = f'{catalog_name}.{schema_name}.kdayno_bronze_SP500_companies'

# Only the ticker column is needed to drive the API calls.
SP500_tickers = spark.read.table(sp500_companies_table).select('ticker_symbol')

# Collect the tickers to the driver as a plain Python list for iteration.
SP500_tickers_list = [ticker_row['ticker_symbol'] for ticker_row in SP500_tickers.collect()]

In [0]:
# The secret is stored as a Python-literal mapping; the Polygon API key is kept
# under its 'AWS_SECRET_ACCESS_KEY' entry.
# NOTE(review): get_secret is presumably defined by the ./aws_secret_manager
# notebook pulled in via %run — confirm.
polygon_api_key = literal_eval(get_secret("POLYGON_CREDENTIALS"))['AWS_SECRET_ACCESS_KEY']

polygon_financials_url = "https://api.polygon.io/vX/reference/financials"
# Upper bound (seconds) on each HTTP call so one stalled request cannot hang the run.
polygon_request_timeout_seconds = 30

# Column-oriented accumulator: one list per schema column, created in schema
# order so the lists can later be zipped straight into rows.
stock_financials = {column_name: [] for column_name in stock_financials_schema.fieldNames()}


def _metric_value(statement, metric_name):
    """Return the 'value' of `metric_name` in `statement`, or 0 when it is absent.

    NOTE(review): a missing metric is recorded as 0, which is indistinguishable
    from a reported zero — confirm this default is intended.
    """
    return statement.get(metric_name, {}).get('value', 0)


# A single session reuses the HTTP connection across all ticker requests.
with r.Session() as polygon_session:
    for ticker_symbol in SP500_tickers_list:
        print(f'Getting financial data for ticker: {ticker_symbol} ...')

        # Passing the query via `params` lets requests URL-encode the ticker.
        # NOTE(review): up to 10 filings are requested but only the first is used.
        response = polygon_session.get(
            polygon_financials_url,
            params={
                'ticker': ticker_symbol,
                'timeframe': 'annual',
                'limit': 10,
                'apiKey': polygon_api_key,
            },
            timeout=polygon_request_timeout_seconds,
        )

        # Abort on HTTP errors rather than skipping the ticker: the table is
        # written in overwrite mode, so a partial load would replace good data.
        # The message deliberately omits the URL because it contains the API key.
        if not response.ok:
            raise RuntimeError(
                f'Polygon financials request for ticker {ticker_symbol} '
                f'failed with HTTP status {response.status_code}.'
            )

        results = response.json().get('results')
        if not results:
            print(f'No financial data found for ticker: {ticker_symbol}.')
            continue

        # Only the first returned filing is recorded.
        # NOTE(review): presumably the most recent annual filing — confirm the API's sort order.
        first_filing = results[0]
        financials = first_filing.get('financials', {})
        income_statement = financials.get('income_statement', {})
        balance_sheet = financials.get('balance_sheet', {})

        financials_record = {
            'ticker_symbol': ticker_symbol,
            'start_date': first_filing.get('start_date'),
            'end_date': first_filing.get('end_date'),
            'filing_date': first_filing.get('filing_date'),
            'fiscal_period': first_filing.get('fiscal_period'),
            'fiscal_year': first_filing.get('fiscal_year'),
            'diluted_earnings_per_share': _metric_value(income_statement, 'diluted_earnings_per_share'),
            'net_income_loss': _metric_value(income_statement, 'net_income_loss'),
            'equity': _metric_value(balance_sheet, 'equity'),
            'long_term_debt': _metric_value(balance_sheet, 'long_term_debt'),
            'diluted_average_shares': _metric_value(income_statement, 'diluted_average_shares'),
        }

        # Append one value to every column list so all lists stay the same length.
        for column_name, column_values in stock_financials.items():
            column_values.append(financials_record[column_name])

# Pivot the per-column lists into one tuple per ticker. This relies on the
# dict's key order matching the field order of stock_financials_schema.
financial_rows = list(zip(*stock_financials.values()))
stock_financials_df = spark.createDataFrame(financial_rows, stock_financials_schema)

# Target Spark type for each string column that is cast after loading.
# Columns not listed here keep their string type.
cast_targets = {
    'start_date': 'date',
    'end_date': 'date',
    'filing_date': 'date',
    'diluted_earnings_per_share': 'double',
    'net_income_loss': 'double',
    'equity': 'double',
    'long_term_debt': 'double',
    'diluted_average_shares': 'double',
}

stock_financials_transformed_df = stock_financials_df
for column_name, spark_type in cast_targets.items():
    stock_financials_transformed_df = stock_financials_transformed_df.withColumn(
        column_name, col(column_name).cast(spark_type)
    )

# Stamp every row with the time of this load.
stock_financials_transformed_df = stock_financials_transformed_df.withColumn(
    'load_date_ts', current_timestamp()
)


# Replace the contents of the bronze financials Delta table with this run's data.
financials_table_name = f'{catalog_name}.{schema_name}.kdayno_bronze_SP500_stock_financials'
financials_writer = stock_financials_transformed_df.write.format("delta").mode("overwrite")
financials_writer.saveAsTable(financials_table_name)

print('Run successful.')
