In [1]:
import pandas as pd
import requests
import time
from pandas_profiling import ProfileReport
from sklearn.preprocessing import StandardScaler

from etl_resources import sqlite_connection, get_api_key, get_symbol_list

In [2]:
def base_data():
    '''
    This method builds the raw training frame and stores it in the `training` table

    weekly_prices_qtr is left joined to the balance sheet, cash flow and income
    statement tables on ticker / quarter / year and to the cpi, federal funds, gdp,
    retail sales and unemployment tables on quarter / year. Only rows with a non-null
    balance sheet fiscaldateending are kept, ordered by ticker, year and quarter.

    Column names that appear more than once in the result keep their first occurrence
    only. The resulting frame replaces any existing `training` table and is returned.
    '''
    query = '''
select 
w.*,
b.*,
c.*,
i.*,
cq.percentchange as cpi_perc_change,
cq.valuechange as cpi_val_change,
cq.value as cpi_value,
f.percentchange as effr_perc_change,
f.valuechange as effr_val_change,
f.value as effr_value,
g.percentchange as gdp_perc_change,
g.valuechange as gdp_val_change,
g.value as gdp_value,
r.percentchange as retail_perc_change,
r.valuechange as retail_val_change,
r.value as retail_value,
u.percentchange as unemp_perc_change,
u.valuechange as unemp_val_change,
u.value as unemp_value


from 

weekly_prices_qtr w

left join balance_sheet_qtr b on b.ticker = w.ticker and b.quarter = w.quarter and b.year = w.year
left join cash_flow_qtr c on c.ticker = w.ticker and c.quarter = w.quarter and c.year = w.year
left join income_statement_qtr i on i.ticker = w.ticker and i.quarter = w.quarter and i.year = w.year
left join cpi_qtr cq on cq.quarter = w.quarter and cq.year = w.year
left join federal_funds_qtr f on f.quarter = w.quarter and f.year = w.year
left join gdp_qtr g on g.quarter = w.quarter and g.year = w.year
left join retail_sales_qtr r on r.quarter = w.quarter and r.year = w.year
left join unemployment_qtr u on u.quarter = w.quarter and u.year = w.year

where b.fiscaldateending is not null

order by w.ticker, w.year, w.quarter
    '''

    connection = sqlite_connection()
    joined = pd.read_sql(query, con=connection)

    # the w.*, b.*, c.*, i.* selections repeat the join keys; keep the first copy of each name
    repeated_name = joined.columns.duplicated()
    training = joined.loc[:, ~repeated_name]

    training.to_sql(name='training', con=connection, if_exists='replace')

    return training

In [3]:
def profile_data():
    '''
    This method reads the whole `training` table and writes a ProfileReport
    titled 'training profile' for it to ../data/profiles/pre-training.html
    '''
    training = pd.read_sql('''select * from training''', con=sqlite_connection())

    report = ProfileReport(training, title='training profile')
    report.to_file('../data/profiles/pre-training.html')

In [4]:
def get_columns(df):
    '''
    This method returns, in their original order, the names of all columns of df
    that are not part of the hard-coded exclusion list below
    '''
    excluded = (
        'date', 'index', 'quarter', 'year', 'ticker',
        'fiscaldateending', 'close', 'close_pct', 'close_val',
    )

    selected = []
    for name in df.columns:
        if name in excluded:
            continue
        selected.append(name)

    return selected

In [5]:
def bump_outliers(df, n_std=3):

    '''
    This method iterates over the columns returned by get_columns and removes any rows
    that are n_std (default 3) or more std devs away from the column mean in either
    direction

    Columns are handled one after another, so the mean / std dev of a column are
    computed on the rows that survived the columns before it. Rows holding NaN in a
    processed column are removed as well, because NaN fails both bound checks.

    Columns whose std dev is zero or NaN (constant column, all-NaN column, a single
    remaining row) are skipped: the strict bound checks would otherwise be false for
    every row and silently empty the whole frame.

    df:     pandas DataFrame to filter; it is not modified in place
    n_std:  width of the accepted band in std devs on each side of the mean

    Returns the filtered DataFrame. A column on which the logic raises is reported
    with a printed message and left unfiltered.
    '''
    outlier_cols = get_columns(df)

    for col in outlier_cols:

        try:
            std_dev = df[col].std()
            mean = df[col].mean()

            # no spread to measure against -> nothing can be called an outlier
            if pd.isna(std_dev) or std_dev == 0:
                continue

            upper_bound = mean + n_std * std_dev
            lower_bound = mean - n_std * std_dev

            df = df[(df[col] > lower_bound) & (df[col] < upper_bound)]

        except Exception:
            # best effort, presumably for columns whose dtype has no mean / std dev;
            # Exception (not a bare except) so Ctrl-C and SystemExit still propagate
            print(f'Outlier logic failed on {col}')

    return df



In [6]:
def apply_standard_scaler(df):

    '''
    This method applies the standard scaler transformation from scikit

    Every column returned by get_columns whose name does not contain '_pct' is
    scaled on its own and then rounded to 4 decimals. The columns are overwritten
    on the frame that is passed in, and that same frame is returned.

    A column on which scaling raises is reported with a printed message and the
    loop moves on to the next column.
    '''
    scale_cols = [col for col in get_columns(df) if '_pct' not in col]

    std_scaler = StandardScaler()

    for col in scale_cols:
        try:
            # fit_transform is called per column, so each column is fitted separately
            df[col] = std_scaler.fit_transform(df[[col]])
            df[col] = df[col].round(4)
        except Exception:
            # Exception (not a bare except) so Ctrl-C and SystemExit still propagate
            print(f"Scaling failed on {col}")

    return df

In [7]:
def main():
    '''
    This method runs the preparation pipeline: build the training frame with
    base_data, replace missing values with 0, apply the standard scaler and store
    the result in the `training_clean` table (replacing an existing one)
    '''
    # outlier removal (bump_outliers) would sit between fillna and scaling; it is switched off
    prepared = apply_standard_scaler(base_data().fillna(0))

    prepared.to_sql(name='training_clean', con=sqlite_connection(), if_exists='replace')


main()