# Download financial data

Use `Serverless` compute

In [None]:
%pip install yfinance --quiet
%restart_python

In [None]:
# Catalog and schema that prefix every table name written by this notebook
# (tables are saved as "<catalog>.<schema>.<table>").
catalog = "your_catalog"  # TODO
schema = "your_schema"  # TODO

# Ticker symbols passed to both the yfinance and the Alpha Vantage downloads.
top_us_companies = [
    "NVDA", "AAPL", "MSFT", "GOOGL", "AMZN",
    "META", "AVGO", "BRK-B", "TSLA", "JPM",
]

## Get stock prices from `yfinance`

In [None]:
import yfinance as yf
import pandas as pd

# Download 10 years of price history for every configured ticker in one call.
# NOTE(review): `hist_prices` is not referenced by any later cell visible here
# (get_yfinance_stock_prices downloads its own copy) — confirm whether this
# exploratory cell is still needed.
hist_prices = yf.Tickers(top_us_companies).history(period="10y")

In [None]:
def get_yfinance_stock_prices(tickers: list, period: str = "10y") -> pd.DataFrame:
    """
    Download Yahoo Finance price history and reshape it to one row per (date, ticker).

    Args:
        tickers: List of ticker symbols to download
        period: Look-back window passed through to yfinance (e.g., '1mo', '1y', '10y', 'ytd', 'max')

    Returns:
        pandas DataFrame with columns: date, ticker, open, high, low, close, volume.
        Prices are rounded to 2 decimals and `date` holds plain calendar dates.
    """
    # Wide frame: rows are timestamps, columns are (field, ticker) pairs
    wide = yf.Tickers(tickers).history(period=period)

    # Distinct field names found on level 0 of the columns
    fields = wide.columns.get_level_values(0).unique()

    def to_long(symbol):
        # Build the per-ticker frame: date, ticker, then every field that
        # exists for this symbol (lower-cased column names)
        long_df = pd.DataFrame()
        long_df["date"] = wide.index
        long_df["ticker"] = symbol
        for field in fields:
            key = (field, symbol)
            if key in wide.columns:
                long_df[field.lower()] = wide[key].values
        return long_df

    # Stack the per-ticker frames on top of each other
    stacked = pd.concat([to_long(symbol) for symbol in tickers], ignore_index=True)

    # Drop the time-of-day part, keeping only the calendar date
    stacked["date"] = pd.to_datetime(stacked["date"]).dt.date

    # Prices are kept to 2 decimals; volume is left untouched
    for price_col in ("open", "high", "low", "close"):
        if price_col in stacked.columns:
            stacked[price_col] = stacked[price_col].round(2)

    # Restrict the output to the documented columns, in the documented order
    return stacked[["date", "ticker", "open", "high", "low", "close", "volume"]]


# Fetch 3 years of stock prices for the configured tickers (one row per date and ticker)
price_df_final = get_yfinance_stock_prices(tickers=top_us_companies, period="3y")

# Convert the pandas result to a Spark DataFrame
# (`spark` and `display` are not defined in this file — presumably provided by the notebook runtime)
price_sdf = spark.createDataFrame(price_df_final)

# Preview the first 10 rows
display(price_sdf.limit(10))

In [None]:
# Persist the price history as <catalog>.<schema>.cfa_stock_price, replacing any
# existing contents of that table. The explicit select pins the column order.
price_sdf = spark.createDataFrame(price_df_final).select(
    "date", "ticker", "open", "high", "low", "close", "volume"
)
# The table name previously contained a stray ")" after the schema, which
# produced an invalid identifier.
price_sdf.write.mode("overwrite").saveAsTable(f"{catalog}.{schema}.cfa_stock_price")

## Get financial statements from `Alpha Vantage`

See the [Alpha Vantage API documentation](https://www.alphavantage.co/documentation/) for details. You need to get an API key before running the cells below.

In [None]:
import requests
import time
import pandas as pd


def get_alpha_vantage_financial_statement(
    tickers: list,
    api_key: str,
    statement_type: str,
    request_delay: float = 12.0,
    timeout: float = 30.0,
) -> pd.DataFrame:
    """
    Fetch annual financial statement data from Alpha Vantage for multiple tickers.

    The download is best-effort: a ticker that fails (HTTP error, API error
    message, rate-limit notice, no annual reports, or any processing error) is
    reported with ``print`` and skipped without aborting the other tickers.

    Args:
        tickers: List of ticker symbols
        api_key: Alpha Vantage API key
        statement_type: One of 'income_stmt', 'balance_sheet', 'cashflow'
        request_delay: Seconds to wait between consecutive API requests
            (default 12 keeps the call rate under 5 requests/minute)
        timeout: Seconds to wait for each HTTP response before giving up

    Returns:
        pandas DataFrame with the annual reports of all successfully processed
        tickers, sorted by ticker (ascending) and reporting_date (descending).
        Column names are lower-cased. An empty DataFrame is returned when no
        ticker produced any data.

    Raises:
        ValueError: If statement_type is not one of the supported values
    """
    # Map statement types to API functions
    function_map = {
        "income_stmt": "INCOME_STATEMENT",
        "balance_sheet": "BALANCE_SHEET",
        "cashflow": "CASH_FLOW",
    }

    if statement_type not in function_map:
        raise ValueError(
            f"Invalid statement_type: {statement_type}. Must be one of {list(function_map.keys())}"
        )

    api_function = function_map[statement_type]
    dfs = []

    for position, ticker in enumerate(tickers):
        # Be respectful of API rate limits (free tier: 25 requests/day, 5 requests/minute).
        # Throttle before every request after the first one, so that failed or
        # rate-limited requests are spaced out too (they also count as calls).
        if position > 0:
            time.sleep(request_delay)

        try:
            # API call; `params` takes care of URL-encoding, and the timeout
            # prevents a stalled connection from hanging the notebook forever
            r = requests.get(
                "https://www.alphavantage.co/query",
                params={
                    "function": api_function,
                    "symbol": ticker,
                    "apikey": api_key,
                },
                timeout=timeout,
            )
            r.raise_for_status()
            data = r.json()

            # Check for API errors
            if "Error Message" in data:
                print(f"Error fetching {ticker}: {data['Error Message']}")
                continue

            if "Information" in data:
                print(f"API rate limit hit for {ticker}: {data['Information']}")
                continue

            # Extract annual reports
            annual_reports = data.get("annualReports", [])

            if not annual_reports:
                print(f"No annual reports found for {ticker}")
                continue

            # Convert to DataFrame
            ticker_df = pd.DataFrame(annual_reports)

            # Add symbol column at the beginning
            ticker_df.insert(0, "ticker", ticker)

            # Convert fiscalDateEnding to date
            ticker_df["fiscalDateEnding"] = pd.to_datetime(
                ticker_df["fiscalDateEnding"]
            ).dt.date
            ticker_df.rename(
                columns={"fiscalDateEnding": "reporting_date"}, inplace=True
            )

            # Convert numeric columns from strings to floats. errors="coerce"
            # already maps non-numeric text such as 'None' to NaN, so no
            # separate replace step is needed (Series.replace with a None
            # value forward-fills on older pandas versions).
            numeric_columns = ticker_df.columns.drop(
                ["ticker", "reporting_date", "reportedCurrency"]
            )

            for col in numeric_columns:
                ticker_df[col] = pd.to_numeric(ticker_df[col], errors="coerce")

            # Clean column names: lowercase and replace spaces with underscores
            ticker_df.columns = ticker_df.columns.str.lower().str.replace(" ", "_")

            dfs.append(ticker_df)

            print(f"Successfully processed {ticker} for {statement_type}")

        except Exception as e:
            # Deliberate best-effort: report the failure and move on to the next ticker
            print(f"Error processing {ticker}: {e}")
            continue

    # Nothing could be downloaded
    if not dfs:
        return pd.DataFrame()

    # Combine all tickers
    final_df = pd.concat(dfs, ignore_index=True)

    # Sort by ticker and date (most recent report first within each ticker)
    final_df = final_df.sort_values(
        ["ticker", "reporting_date"], ascending=[True, False]
    )
    return final_df.reset_index(drop=True)

In [None]:
# Set your Alpha Vantage API key (passed as `api_key` to every statement download below)
# NOTE(review): avoid committing a real key together with the notebook; consider
# reading it from a secret store instead — confirm what is available in your workspace.
ALPHAVANTAGE_API_KEY = ""  # TODO

### Income statement

In [None]:
# Download the annual income statements for every configured ticker
# (one Alpha Vantage request per ticker; failed tickers are printed and skipped)
av_income_stmt_df = get_alpha_vantage_financial_statement(
    tickers=top_us_companies, api_key=ALPHAVANTAGE_API_KEY, statement_type="income_stmt"
)

# Report how much data came back and preview the first rows
print(f"Alpha Vantage income statement shape: {av_income_stmt_df.shape}")
display(av_income_stmt_df.head())

In [None]:
# Save the income statements as <catalog>.<schema>.cfa_income_statement,
# replacing any existing contents of that table.
# NOTE(review): av_income_stmt_df is an empty DataFrame when no ticker could be
# downloaded; presumably createDataFrame cannot infer a schema from it —
# confirm and guard if needed.
av_income_stmt_sdf = spark.createDataFrame(av_income_stmt_df)
av_income_stmt_sdf.write.mode("overwrite").saveAsTable(
    f"{catalog}.{schema}.cfa_income_statement"
)

### Balance sheet

In [None]:
# Download the annual balance sheets for every configured ticker
# (one Alpha Vantage request per ticker; failed tickers are printed and skipped)
av_balance_sheet_df = get_alpha_vantage_financial_statement(
    tickers=top_us_companies,
    api_key=ALPHAVANTAGE_API_KEY,
    statement_type="balance_sheet",
)

# Report how much data came back and preview the first rows
print(f"Alpha Vantage balance sheet shape: {av_balance_sheet_df.shape}")
display(av_balance_sheet_df.head())

In [None]:
# Save the balance sheets as <catalog>.<schema>.cfa_balance_sheet,
# replacing any existing contents of that table.
# NOTE(review): av_balance_sheet_df is an empty DataFrame when no ticker could
# be downloaded; presumably createDataFrame cannot infer a schema from it —
# confirm and guard if needed.
av_balance_sheet_sdf = spark.createDataFrame(av_balance_sheet_df)
av_balance_sheet_sdf.write.mode("overwrite").saveAsTable(
    f"{catalog}.{schema}.cfa_balance_sheet"
)

### Cash flow

In [None]:
# Download the annual cash flow statements for every configured ticker
# (one Alpha Vantage request per ticker; failed tickers are printed and skipped).
# NOTE(review): with 10 tickers and three statement types this notebook issues
# 30 requests in total, more than the 25 requests/day free tier mentioned in
# get_alpha_vantage_financial_statement — this download may hit the daily limit.
av_cashflow_df = get_alpha_vantage_financial_statement(
    tickers=top_us_companies, api_key=ALPHAVANTAGE_API_KEY, statement_type="cashflow"
)

# Report how much data came back and preview the first rows
print(f"Alpha Vantage cash flow shape: {av_cashflow_df.shape}")
display(av_cashflow_df.head())

In [None]:
# Save the cash flow statements as <catalog>.<schema>.cfa_cashflow,
# replacing any existing contents of that table.
# NOTE(review): av_cashflow_df is an empty DataFrame when no ticker could be
# downloaded; presumably createDataFrame cannot infer a schema from it —
# confirm and guard if needed.
av_cashflow_sdf = spark.createDataFrame(av_cashflow_df)
av_cashflow_sdf.write.mode("overwrite").saveAsTable(f"{catalog}.{schema}.cfa_cashflow")