In [1]:
import pandas as pd
import requests
import time

from etl_resources import sqlite_connection, get_api_key, get_symbol_list

In [2]:
def build_statements_dataset(request_delay=20, timeout=30):
    
    '''
    Pull quarterly financial statements from the Alpha Vantage API and append
    them to SQLite tables.

    For every symbol returned by get_symbol_list() (described by the original
    author as S&P 100 equities) the income statement, balance sheet and cash
    flow endpoints are queried. The "quarterlyReports" list of each response
    is loaded into a DataFrame, tagged with a `ticker` column and appended to
    the table named after the lower-cased endpoint (`income_statement`,
    `balance_sheet`, `cash_flow`).

    Parameters
    ----------
    request_delay : float, default 20
        Seconds to sleep before every request, to stay under the API rate
        limit.
    timeout : float, default 30
        Per-request timeout in seconds passed to requests.get, so one stalled
        connection cannot hang the whole run.

    Failures for a single statement/symbol pair are reported on stdout and
    skipped; the run continues with the next pair.
    '''
    
    api_key = get_api_key()
    symbol_list = get_symbol_list()
    # NOTE(review): the connection is left open, as before - sqlite_connection()
    # may hand out a shared handle; confirm before closing it here.
    con = sqlite_connection()
    
    base_url = "https://www.alphavantage.co/query"
    
    # endpoint name -> key of the response payload that holds the report rows
    statements = {'INCOME_STATEMENT':'quarterlyReports',
                  'BALANCE_SHEET':'quarterlyReports',
                  'CASH_FLOW':'quarterlyReports'}
    
    for statement, report_key in statements.items():
        
        table_name = statement.lower()
        
        for symbol in symbol_list:

            time.sleep(request_delay) # Handle the rate limiter
            
            # Key-free rendering of the request; the only form that is printed,
            # so the API key never ends up in notebook output.
            safe_url = f"{base_url}?function={statement}&symbol={symbol}"

            try:
                
                print(f"Parsing {statement} for {symbol}")
                # `params` lets requests URL-encode the values.
                response = requests.get(
                    base_url,
                    params={'function': statement,
                            'symbol': symbol,
                            'apikey': api_key},
                    timeout=timeout,
                )
                response.raise_for_status()
                
                # A payload without the report key raises KeyError here and is
                # reported as a failure below.
                reports = response.json()[report_key]
                
                if not reports:
                    # Writing an empty frame could create the table with only
                    # a `ticker` column and break every later append.
                    print(f"No {report_key} returned: {safe_url}")
                    continue
                
                df = pd.DataFrame(reports)
                df['ticker'] = symbol

                df.to_sql(name=table_name, if_exists='append', index=False, con=con)

            except Exception as exc:
                # `Exception` (not a bare except) so Ctrl-C still stops the run.
                # Only the exception type is printed: the messages of requests
                # errors can embed the full URL, API key included.
                # TODO: Better failure logging
                print(f"Failed: {safe_url} ({type(exc).__name__})")
 


In [3]:
def process_financials():
    
    '''
    Build a typed `<table>_clean` copy of each raw statement table.

    For `balance_sheet`, `cash_flow` and `income_statement` the whole table is
    read, every column other than fiscalDateEnding / reportedCurrency / ticker
    is coerced to a number (values that cannot be parsed become NaN),
    fiscalDateEnding is parsed as a datetime, column names are lower-cased and
    stripped, and the result replaces `<table>_clean`. The DataFrame index is
    written along with the data.
    '''
    
    connection = sqlite_connection()
    
    # Columns that must keep their textual / date meaning.
    passthrough = ['fiscalDateEnding','reportedCurrency','ticker']
    
    for source_table in ('balance_sheet', 'cash_flow', 'income_statement'):
        print(f'Processing {source_table}')
        raw = pd.read_sql(f"select * from {source_table}", con=connection)
        
        cleaned = raw.copy()
        for name in raw.columns:
            if name in passthrough:
                continue
            cleaned[name] = pd.to_numeric(raw[name], errors='coerce')
        
        cleaned['fiscalDateEnding'] = pd.to_datetime(raw['fiscalDateEnding'])
        
        cleaned = cleaned.rename(columns=lambda name: name.lower().strip())
        
        cleaned.to_sql(name=f"{source_table}_clean", con=connection, if_exists='replace')
        
    

In [4]:
def process_timeseries_differences():
    
    '''
    Build `<table>_qtr` tables holding period-over-period changes.

    For each of `balance_sheet`, `cash_flow` and `income_statement`, the
    `<table>_clean` rows of every ticker are joined to the `calendar` table
    (on fiscaldateending = calendar.date) to pick up `quarter` and `year`,
    ordered chronologically, and extended with two derived columns per value
    column:

      * `<col>_pct` - fractional change versus the previous row; missing
        values are forward-filled first.
      * `<col>_val` - absolute change versus the previous row; missing
        values are treated as 0.

    The target table is dropped and rebuilt on every call, one ticker at a
    time. The key columns (fiscaldateending, quarter, year, ticker) are
    written as the frame's index.
    '''
    
    con = sqlite_connection()
    cur = con.cursor()
    tables = ['balance_sheet','cash_flow','income_statement']
    key_cols = ['fiscaldateending', 'quarter', 'year', 'ticker']
    
    for table in tables:
        
        print(f"Working on {table}s")
        # `if exists` replaces a bare try/except that hid every error.
        cur.execute(f"drop table if exists {table}_qtr")

        cur.execute(f"select distinct ticker from {table}_clean")
        tickers = [row[0] for row in cur.fetchall()]

        # The ticker is a bound parameter rather than text pasted into the
        # statement. fiscaldateending is a final sort key so the row order
        # stays deterministic when the calendar join yields NULL year/quarter.
        qry = f'''
        select b.*, c.quarter, c.year from {table}_clean b

        left join calendar c on b.fiscaldateending = c.date
        where b.ticker = ?
        order by b.ticker, c.year, c.quarter, b.fiscaldateending
        '''

        for ticker in tickers:

            df = pd.read_sql(qry, con=con, params=(ticker,))
            if df.empty:
                # Nothing to difference (e.g. a NULL ticker).
                continue
            
            # errors='ignore': `index` is only present when the clean table
            # was written together with its DataFrame index.
            df = df.drop(columns=['reportedcurrency','index'], errors='ignore')
            df = df.set_index(key_cols)
            
            # Explicit forward-fill reproduces the historical pct_change()
            # default (fill_method='pad'), which pandas has deprecated.
            perc_df = df.ffill().pct_change(fill_method=None)
            diff_df = df.fillna(0).diff()

            # Naming the suffixes in the merge avoids rewriting '_x'/'_y'
            # afterwards, which mangled any column containing those substrings.
            combined_df = pd.merge(perc_df, diff_df, on=key_cols, suffixes=('_pct', '_val'))
            final_df = pd.merge(df, combined_df, on=key_cols)

            final_df.to_sql(name=f'{table}_qtr',if_exists='append', con=con)

In [5]:
# Pipeline stages, listed in dependency order:
#   1. build_statements_dataset       - downloads the raw statement tables
#                                       (sleeps before every API request).
#   2. process_financials             - reads the raw tables and writes the
#                                       typed `<table>_clean` tables.
#   3. process_timeseries_differences - reads `<table>_clean` and rebuilds the
#                                       `<table>_qtr` tables.
# Stages 1 and 2 are disabled here, so this cell relies on the `_clean` tables
# already being present in the database; uncomment them for a full rebuild.
#build_statements_dataset()
#process_financials()
process_timeseries_differences()

Working on balance_sheets
Working on cash_flows
Working on income_statements
