In [1]:
import os

path_parent = os.path.dirname(os.getcwd())
os.chdir(path_parent)

In [160]:
import pandas as pd
import numpy as np
from datetime import datetime, date
from sklearn import preprocessing
from app.utils.visualization import plot_data
from app.utils.fetch_data import fred_fred, investing_api, alpha_vantage_api, alpha_vantage_api_financial_statements
from app.utils.fetch_data_fmp import FMP

from interpret import glassbox
from interpret import show
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
pd.set_option('display.max_columns', None)

In [18]:
start_date = '2000-01-01'
today = date.today().strftime("%Y-%m-%d")

In [5]:
fmp = FMP()

2022-10-01 18:57:31,859 [app.utils.fetch_data_fmp] [INFO] Financial Modeling Prep api ready.


# Gather data

In [525]:
apple = fmp.get_historical_daily_price('stock', 'aapl', start_date, today)

2022-10-02 01:32:10,250 [app.utils.fetch_data_fmp] [INFO] Fetching daily price data of aapl


In [526]:
# market situation
sp500 = fmp.get_historical_daily_price("index", "%5EGSPC", start_date, today)

2022-10-02 01:32:12,630 [app.utils.fetch_data_fmp] [INFO] Fetching daily price data of %5EGSPC


In [268]:
econ_indices = list()

for i in ["GDP", "realGDP", "nominalPotentialGDP", "realGDPPerCapita",
        "federalFunds", "CPI", "inflationRate", "inflation", "retailSales", "consumerSentiment", "durableGoods",
        "unemploymentRate", "totalNonfarmPayroll", "initialClaims", "industrialProductionTotalIndex",
        "newPrivatelyOwnedHousingUnitsStartedTotalUnits", "totalVehicleSales", "retailMoneyFunds",
        "smoothedUSRecessionProbabilities", "3MonthOr90DayRatesAndYieldsCertificatesOfDeposit",
        "commercialBankInterestRateOnCreditCardPlansAllAccounts", "30YearFixedRateMortgageAverage",
        "15YearFixedRateMortgageAverage"]:
    df = fmp.get_economic_index(i, start_date, today)
    econ_indices.append(df)

2022-10-02 00:21:56,048 [app.utils.fetch_data_fmp] [INFO] Fetching economic index data: GDP
2022-10-02 00:21:56,864 [app.utils.fetch_data_fmp] [INFO] Fetching economic index data: realGDP
2022-10-02 00:21:57,701 [app.utils.fetch_data_fmp] [INFO] Fetching economic index data: nominalPotentialGDP
2022-10-02 00:21:58,536 [app.utils.fetch_data_fmp] [INFO] Fetching economic index data: realGDPPerCapita
2022-10-02 00:21:59,364 [app.utils.fetch_data_fmp] [INFO] Fetching economic index data: federalFunds
2022-10-02 00:22:00,257 [app.utils.fetch_data_fmp] [INFO] Fetching economic index data: CPI
2022-10-02 00:22:01,170 [app.utils.fetch_data_fmp] [INFO] Fetching economic index data: inflationRate
2022-10-02 00:22:02,895 [app.utils.fetch_data_fmp] [INFO] Fetching economic index data: inflation
2022-10-02 00:22:03,724 [app.utils.fetch_data_fmp] [INFO] Fetching economic index data: retailSales
2022-10-02 00:22:04,558 [app.utils.fetch_data_fmp] [INFO] Fetching economic index data: consumerSentiment


# Preprocessing master table

In [527]:
dates = pd.date_range(start_date, today).tolist()
dates = [i.strftime("%Y-%m-%d") for i in dates]

master = pd.DataFrame(dates, columns=['date'])
master['date'] = pd.to_datetime(master['date'], format='%Y-%m-%d')

In [528]:
# apple stock related features
# create lag features
apple = apple[['date', 'adjClose', 'volume']]
apple.sort_values(by='date', ascending=False, inplace=True)
apple['volume'] = apple['volume'].shift(-1)
apple.rename(columns={"volume":"volume_lag1"}, inplace=True)

In [529]:
# market
sp500 = sp500[['date', 'adjClose', 'volume']]
sp500['adjClose'] = sp500['adjClose'].shift(-1)
sp500['volume'] = sp500['volume'].shift(-1)

sp500.sort_values(by='date', ascending=False, inplace=True)
sp500.rename(columns={"adjClose":"sp500_adjClose_lag1",
                      "volume":"sp500_volume_lag1"}, inplace=True)

create master dataset

In [530]:
master = master.merge(apple, on='date', how='left')
master = master.merge(sp500, on='date', how='left')

for econ_index_df in econ_indices:
    master = master.merge(econ_index_df, on='date', how='left')

preprocessing

In [522]:
def days_after_data_seen(s):
    """
    The days after the data release is the main feature that 'extends' the sparse data into daily data
    It comes from the logic that after an economic index is released,
        it might have an impact on the stock market, but the effect it has will gradually diminsh
    """
    not_na_indices= list(s.loc[s.notna()].index)
    num_days_after = list()

    for i in range(len(not_na_indices)-1):    
        if i==0:
            start = not_na_indices[i]

            num_days = start
            num_days_after.extend( list(np.sqrt(np.array(sorted(range(num_days+1), reverse=True)) + 1)) )

        start = not_na_indices[i]
        end = not_na_indices[i+1]

        num_days = end-start
        num_days_after.extend( list(np.sqrt(np.array(sorted(range(num_days), reverse=True)) + 1)) ) 
    
    if len(num_days_after) < len(s):
        fill_num = len(s) - len(num_days_after)
        num_days_after.extend([np.nan]*fill_num)
    elif len(num_days_after) > len(s):
        assert False, "Something is wrong"
        
    return num_days_after

fill in missing values of economic indices by considering the 'num_days_after_data_release'

In [532]:
# create num_days until missing
base_columns = [i for i in master.columns if i not in ['date', 'adjClose', 'volume_lag1',
                                                       'sp500_adjClose_lag1', 'sp500_volume_lag1']]

for c in base_columns:
    master[c+"_days_after_seen"] = days_after_data_seen(master[c])

In [533]:
# forward filling
base_columns = [i for i in master.columns if i not in ['date', 'adjClose']]
master.sort_values(by='date', inplace=True)
for i in base_columns:
    master[i] = master[i].fillna(method='ffill')
    
master.dropna(subset='adjClose', inplace=True)
master.reset_index(inplace=True, drop=True) # drop rows that are not trading days
# the reason this was done here is to merge all the economic statistics first
# the economic stats sometimes are released at a non-trading day

In [534]:
# cut the early days that cannot be inferred from the current data
nas = master.isna().sum(axis=1)
start_index = nas[nas==0].index.min()

master = master.loc[start_index:].reset_index(drop=True)

days_after_index_release * index

In [535]:
base_columns = [i for i in master.columns if i not in ['date', 'adjClose', 'volume_lag1',
                                                       'sp500_adjClose_lag1', 'sp500_volume_lag1']]

index_columns = [i for i in base_columns if 'days_after_seen' not in i]
days_columns = [i for i in base_columns if 'days_after_seen' in i]

assert len(index_columns)==len(days_columns)

for index in index_columns:
    days = index + "_days_after_seen"
    
    master[index + "_day_weighted"] = master[index] / master[days]

master.drop(columns = index_columns + days_columns, inplace=True)

In [537]:
# scale
base_columns = [i for i in master.columns if i not in ['date']]

scaler = preprocessing.StandardScaler(with_mean=False)
scaler.fit(master[base_columns])

scaled_values = scaler.transform(master[base_columns])
scaled_df = pd.DataFrame(scaled_values, columns=[i+'_scaled' for i in base_columns])

# append the existing target and date column
master = pd.concat([master, scaled_df], axis=1)

master.drop(columns=base_columns, inplace=True)

In [538]:
master

Unnamed: 0,date,adjClose_scaled,volume_lag1_scaled,sp500_adjClose_lag1_scaled,sp500_volume_lag1_scaled,GDP_day_weighted_scaled,realGDP_day_weighted_scaled,nominalPotentialGDP_day_weighted_scaled,realGDPPerCapita_day_weighted_scaled,federalFunds_day_weighted_scaled,CPI_day_weighted_scaled,inflationRate_day_weighted_scaled,inflation_day_weighted_scaled,retailSales_day_weighted_scaled,consumerSentiment_day_weighted_scaled,durableGoods_day_weighted_scaled,unemploymentRate_day_weighted_scaled,totalNonfarmPayroll_day_weighted_scaled,initialClaims_day_weighted_scaled,industrialProductionTotalIndex_day_weighted_scaled,newPrivatelyOwnedHousingUnitsStartedTotalUnits_day_weighted_scaled,totalVehicleSales_day_weighted_scaled,retailMoneyFunds_day_weighted_scaled,smoothedUSRecessionProbabilities_day_weighted_scaled,3MonthOr90DayRatesAndYieldsCertificatesOfDeposit_day_weighted_scaled,commercialBankInterestRateOnCreditCardPlansAllAccounts_day_weighted_scaled,30YearFixedRateMortgageAverage_day_weighted_scaled,15YearFixedRateMortgageAverage_day_weighted_scaled
0,2003-01-02,0.005223,0.510933,0.900103,0.796415,0.283454,0.421555,0.351135,0.478375,0.352722,0.718465,3.908194,0.092039,0.571302,0.972343,0.711909,0.803754,0.897218,1.127376,0.896950,1.131041,1.016251,0.949278,0.020879,0.344389,0.927858,3.385242,3.188840
1,2003-01-03,0.005259,0.461813,0.929987,0.899360,0.285042,0.423916,0.353102,0.481055,0.358553,0.730341,3.860534,0.092166,0.580745,0.988416,0.723677,0.817040,0.912049,1.380748,0.911776,1.149737,1.033050,0.964970,0.021224,0.350081,0.943196,4.722869,4.441886
2,2003-01-06,0.005259,0.375332,0.929537,0.827364,0.289971,0.431247,0.359208,0.489373,0.377948,0.769847,3.884364,0.092548,0.612159,1.041882,0.762823,0.861236,0.961384,0.765990,0.961097,1.211929,1.088930,1.017168,0.022372,0.369018,0.994215,2.112131,1.986472
3,2003-01-07,0.005241,0.994071,0.950427,1.050595,0.291671,0.433776,0.361315,0.492243,0.385148,0.784512,3.860534,0.092676,0.623821,1.061729,0.777354,0.877642,0.979698,0.839100,0.979405,1.235015,1.109673,1.036544,0.022799,0.376048,1.013155,2.361435,2.220943
4,2003-01-08,0.005135,0.871412,0.944207,1.130565,0.293402,0.436351,0.363459,0.495165,0.392775,0.800049,4.075008,0.092805,0.636175,1.082756,0.792749,0.895023,0.999099,0.938142,0.998801,1.259474,1.131649,1.057072,0.023250,0.383495,1.033219,2.726750,2.564524
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4967,2022-09-26,3.489974,0.244206,3.778373,3.764246,6.076098,5.842059,7.338180,5.766651,3.690186,6.476187,5.528665,3.638951,7.107717,3.823810,6.357579,2.854815,5.843408,0.921432,5.722360,5.352593,4.613495,6.599474,0.022794,4.102505,5.962091,2.539047,2.305712
4968,2022-09-27,3.512890,0.237588,3.739303,3.575226,6.076098,5.842059,7.338180,5.766651,3.690186,6.476187,5.552496,3.638951,7.107717,3.823810,6.357579,2.854815,5.843408,0.921432,5.722360,5.352593,4.613495,6.599474,0.022794,4.102505,5.962091,2.931839,2.662406
4969,2022-09-28,3.468446,0.214942,3.731374,3.349600,6.076098,5.842059,7.338180,5.766651,3.690186,6.476187,5.552496,3.638951,7.107717,3.823810,6.357579,2.854815,5.843408,0.921432,5.722360,5.352593,4.613495,6.599474,0.022794,4.102505,5.962091,3.590754,3.260769
4970,2022-09-29,3.298080,0.373391,3.804778,3.427760,6.076098,5.842059,7.338180,5.766651,3.690186,6.476187,5.218869,3.638951,7.107717,3.823810,6.357579,2.854815,5.843408,0.921432,5.722360,5.352593,4.613495,6.599474,0.022794,4.102505,5.962091,5.409098,5.052221


# split dataset

In [539]:
master['date_y'] = master['date'].dt.year
master['date_m'] = master['date'].dt.month

In [540]:
train_end_date = '2017-01-01'
test_start_date = '2018-01-01'
test_end_date = '2021-01-01'
validation_date = '2022-01-01'

train = master.loc[master.date <= train_end_date].reset_index(drop=True)
test = master.loc[master.date.between(test_start_date, test_end_date, inclusive='left')].reset_index(drop=True)
valid = master.loc[master.date >= validation_date].reset_index(drop=True)

In [541]:
def split_xy(df, drop_list, target):
    x = df.drop(columns = drop_list + [target])
    y = df[target]
    return x,y

train_x, train_y = split_xy(train, ['date'], 'adjClose_scaled')
test_x, test_y = split_xy(test, ['date'], 'adjClose_scaled')
valid_x, valid_y = split_xy(valid, ['date'], 'adjClose_scaled')

# Glassbox model

In [542]:
from interpret import show
from interpret.data import Marginal

marginal = Marginal().explain_data(train_x, train_y, name = 'Train Data')
show(marginal)

In [544]:
from interpret.glassbox import ExplainableBoostingRegressor, LinearRegression, RegressionTree

ebm = ExplainableBoostingRegressor(random_state=123, n_jobs=-1)
ebm.fit(train_x, train_y)   #Works on dataframes and numpy arrays

ExplainableBoostingRegressor(feature_names=['volume_lag1_scaled',
                                            'sp500_adjClose_lag1_scaled',
                                            'sp500_volume_lag1_scaled',
                                            'GDP_day_weighted_scaled',
                                            'realGDP_day_weighted_scaled',
                                            'nominalPotentialGDP_day_weighted_scaled',
                                            'realGDPPerCapita_day_weighted_scaled',
                                            'federalFunds_day_weighted_scaled',
                                            'CPI_day_weighted_scaled',
                                            'inflationRate_day_weighted_sca...
                                            'continuous', 'continuous',
                                            'continuous', 'continuous',
                                            'continuous', 'continuous',
                         

#### Gloabl explanation

In [545]:
ebm_global = ebm.explain_global(name='EBM')
show(ebm_global)

In [547]:
ebm_local = ebm.explain_local(test_x[:5], test_y[:5], name='EBM')
show(ebm_local)

In [548]:
from interpret import show
from interpret.perf import RegressionPerf

ebm_perf = RegressionPerf(ebm.predict).explain_perf(test_y, test_y, name='EBM')
show(ebm_perf)

AttributeError: 'Series' object has no attribute 'columns'