In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', None)
import warnings
warnings.filterwarnings('ignore')
from arch.unitroot import PhillipsPerron
from scipy.stats import norm
from sklearn.metrics import r2_score

In [2]:
# Quarterly default-rate history, read from the notebook's working directory.
def_data = pd.read_csv(filepath_or_buffer="DefRate-Quarterly22.csv")

In [3]:
# Candidate target series: each one is a link transform of the observed
# default rate minus the same transform of the sample-average default rate.
avg = def_data['DefRate'].mean()
_ci_links = {
    'log_CI': np.log,
    'logit_CI': lambda p: np.log(p/(1-p)),
    'probit_CI': norm.ppf,
}
for _ci_name, _link in _ci_links.items():
    def_data[_ci_name] = _link(def_data['DefRate']) - _link(avg)

In [4]:
# Macro-economic variables (MEV). The file name previously started with a
# backslash: in a normal (non-raw) string "\U" opens an 8-hex-digit unicode
# escape, so "\UAE ..." is a SyntaxError. Read the file relative to the working
# directory instead, the same way the default-rate CSV is loaded.
# NOTE(review): if the file lives in another folder, build the path with
# pathlib.Path (or a raw string) rather than a backslash-prefixed literal.
mev = pd.read_csv("UAE MEV Jan22 v2.csv")

# Drop every column whose name contains "S1" or "S3"
# (presumably alternative-scenario series - confirm against the data dictionary).
subset_cols = [x for x in mev.columns if "S1" not in x and "S3" not in x]
mev_df = mev[subset_cols]

In [5]:
# Left join on the quarter key: every default-rate row is kept, and macro
# columns are NaN where the MEV file has no matching quarter.
model_data = def_data.merge(mev_df, how='left', on='YrQuarter')

In [6]:
# Preview the first five merged rows.
model_data.head(n=5)

Unnamed: 0,YrQuarter,DefRate,log_CI,logit_CI,probit_CI,OilPrice,NominalGDPBill,UnemployRate,EquityIndex,DebtToGDP,IndProdIndex,CurrAccToGDP,CPI,EIBOR3M,PPI,HPI,RealHPI,GDPBill
0,2008Q4,0.03551,0.554292,0.569838,0.240213,57.547692,1137.800006,4.108209,2390.01,11.926268,104.434728,6.043681,95.093333,,104.434728,147.310502,148.662182,325.25526
1,2009Q1,0.037301,0.603489,0.620893,0.262655,45.68381,1041.624075,4.165888,2487.92,16.218479,96.439725,4.705116,94.45,,96.439725,133.67175,134.530544,317.490725
2,2009Q2,0.049531,0.887073,0.917262,0.396143,59.860625,944.682562,4.22325,2631.32,20.81228,94.940662,3.301876,93.593333,,94.940662,124.809195,126.12391,311.434061
3,2009Q3,0.046061,0.814423,0.840967,0.361244,68.82803,915.223952,4.196574,3124.22,23.021086,95.940037,2.158828,94.416667,,95.940037,115.683436,117.09613,309.949323
4,2009Q4,0.018304,-0.108424,-0.110561,-0.044556,75.521692,964.774573,4.079907,2743.61,21.806943,98.438476,1.589596,94.92,1.937831,98.438476,113.896277,115.060357,315.473074


In [7]:
# Everything from the sixth column onwards is treated as a macro variable to
# transform; the first five columns (quarter key, default rate and the three
# CI targets) are skipped.
transform_cols = model_data.columns[5:].tolist()

In [8]:
def feature_transform(data, cols, max_lag=6):
    """Add log / lag / difference features for each column in ``cols``.

    For every ``col`` the following columns are appended, in this order
    (``i`` runs from 1 to ``max_lag``):

    * ``log_<col>``               natural log of the level
    * ``lag_<i>_<col>``           level shifted down by ``i`` rows
    * ``diff_<i>Q_<col>``         level minus the level ``i`` rows earlier
    * ``Per_diff_<i>Q_<col>``     level / level ``i`` rows earlier, minus 1
    * ``log_diff_<col>``          one-row difference of the log level
    * ``log_diff_lag_<i>_<col>``  ``log_diff_<col>`` shifted down by ``i`` rows
    * ``diff_lag_<i>Q_<col>``     ``diff_<i>Q_<col>`` shifted down by ``i`` rows

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the source columns. It is modified IN PLACE: the new
        columns are added to this very object.
    cols : iterable of str
        Names of the columns to transform.
    max_lag : int, default 6
        Largest lag / difference horizon, in rows. The default reproduces the
        previously hard-coded horizons 1..6.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in as ``data``.

    Notes
    -----
    Rows are assumed to be ordered in time, because ``shift`` works on row
    position. ``np.log`` yields NaN / -inf for non-positive inputs.
    """
    horizons = range(1, max_lag + 1)

    for col in cols:
        level = data[col]
        # Computed once and reused for both log_<col> and log_diff_<col>.
        log_level = np.log(level)

        # Log transformation
        data['log_'+col] = log_level

        # Lags of the level
        for i in horizons:
            data['lag_'+str(i)+'_'+col] = level.shift(i)

        # Absolute change over i rows
        for i in horizons:
            data['diff_'+str(i)+"Q_"+col] = level - level.shift(i)

        # Relative change over i rows
        for i in horizons:
            data['Per_diff_'+str(i)+"Q_"+col] = (level/level.shift(i))-1

        # One-row log difference; the log of the shifted level equals the
        # shifted log, so this matches log(x_t) - log(x_{t-1}).
        data['log_diff_'+col] = log_level - log_level.shift(1)

        # Lags of the log difference
        for i in horizons:
            data['log_diff_lag_'+str(i)+'_'+col] = data['log_diff_'+col].shift(i)

        # The i-row difference, itself lagged by i rows
        for i in horizons:
            data['diff_lag_'+str(i)+"Q_"+col] = data['diff_'+str(i)+"Q_"+col].shift(i)

    return data

In [9]:
# feature_transform adds its columns to the frame it is given and returns that
# same frame, so `transformed_data` and `model_data` refer to one object.
transformed_data = feature_transform(data=model_data, cols=transform_cols)

In [10]:
# Preview the first five rows of the widened frame.
transformed_data.head(n=5)

Unnamed: 0,YrQuarter,DefRate,log_CI,logit_CI,probit_CI,OilPrice,NominalGDPBill,UnemployRate,EquityIndex,DebtToGDP,IndProdIndex,CurrAccToGDP,CPI,EIBOR3M,PPI,HPI,RealHPI,GDPBill,log_OilPrice,lag_1_OilPrice,lag_2_OilPrice,lag_3_OilPrice,lag_4_OilPrice,lag_5_OilPrice,lag_6_OilPrice,diff_1Q_OilPrice,diff_2Q_OilPrice,diff_3Q_OilPrice,diff_4Q_OilPrice,diff_5Q_OilPrice,diff_6Q_OilPrice,Per_diff_1Q_OilPrice,Per_diff_2Q_OilPrice,Per_diff_3Q_OilPrice,Per_diff_4Q_OilPrice,Per_diff_5Q_OilPrice,Per_diff_6Q_OilPrice,log_diff_OilPrice,log_diff_lag_1_OilPrice,log_diff_lag_2_OilPrice,log_diff_lag_3_OilPrice,log_diff_lag_4_OilPrice,log_diff_lag_5_OilPrice,log_diff_lag_6_OilPrice,diff_lag_1Q_OilPrice,diff_lag_2Q_OilPrice,diff_lag_3Q_OilPrice,diff_lag_4Q_OilPrice,diff_lag_5Q_OilPrice,diff_lag_6Q_OilPrice,log_NominalGDPBill,lag_1_NominalGDPBill,lag_2_NominalGDPBill,lag_3_NominalGDPBill,lag_4_NominalGDPBill,lag_5_NominalGDPBill,lag_6_NominalGDPBill,diff_1Q_NominalGDPBill,diff_2Q_NominalGDPBill,diff_3Q_NominalGDPBill,diff_4Q_NominalGDPBill,diff_5Q_NominalGDPBill,diff_6Q_NominalGDPBill,Per_diff_1Q_NominalGDPBill,Per_diff_2Q_NominalGDPBill,Per_diff_3Q_NominalGDPBill,Per_diff_4Q_NominalGDPBill,Per_diff_5Q_NominalGDPBill,Per_diff_6Q_NominalGDPBill,log_diff_NominalGDPBill,log_diff_lag_1_NominalGDPBill,log_diff_lag_2_NominalGDPBill,log_diff_lag_3_NominalGDPBill,log_diff_lag_4_NominalGDPBill,log_diff_lag_5_NominalGDPBill,log_diff_lag_6_NominalGDPBill,diff_lag_1Q_NominalGDPBill,diff_lag_2Q_NominalGDPBill,diff_lag_3Q_NominalGDPBill,diff_lag_4Q_NominalGDPBill,diff_lag_5Q_NominalGDPBill,diff_lag_6Q_NominalGDPBill,log_UnemployRate,lag_1_UnemployRate,lag_2_UnemployRate,lag_3_UnemployRate,lag_4_UnemployRate,lag_5_UnemployRate,lag_6_UnemployRate,diff_1Q_UnemployRate,diff_2Q_UnemployRate,diff_3Q_UnemployRate,diff_4Q_UnemployRate,diff_5Q_UnemployRate,diff_6Q_UnemployRate,Per_diff_1Q_UnemployRate,Per_diff_2Q_UnemployRate,Per_diff_3Q_UnemployRate,Per_diff_4Q_UnemployRate,Per_diff_5Q_UnemployRate,Per_diff_6Q_
UnemployRate,log_diff_UnemployRate,log_diff_lag_1_UnemployRate,log_diff_lag_2_UnemployRate,log_diff_lag_3_UnemployRate,log_diff_lag_4_UnemployRate,log_diff_lag_5_UnemployRate,log_diff_lag_6_UnemployRate,diff_lag_1Q_UnemployRate,diff_lag_2Q_UnemployRate,diff_lag_3Q_UnemployRate,diff_lag_4Q_UnemployRate,diff_lag_5Q_UnemployRate,diff_lag_6Q_UnemployRate,log_EquityIndex,lag_1_EquityIndex,lag_2_EquityIndex,lag_3_EquityIndex,lag_4_EquityIndex,lag_5_EquityIndex,lag_6_EquityIndex,diff_1Q_EquityIndex,diff_2Q_EquityIndex,diff_3Q_EquityIndex,diff_4Q_EquityIndex,diff_5Q_EquityIndex,diff_6Q_EquityIndex,Per_diff_1Q_EquityIndex,Per_diff_2Q_EquityIndex,Per_diff_3Q_EquityIndex,Per_diff_4Q_EquityIndex,Per_diff_5Q_EquityIndex,Per_diff_6Q_EquityIndex,log_diff_EquityIndex,log_diff_lag_1_EquityIndex,log_diff_lag_2_EquityIndex,log_diff_lag_3_EquityIndex,log_diff_lag_4_EquityIndex,log_diff_lag_5_EquityIndex,log_diff_lag_6_EquityIndex,diff_lag_1Q_EquityIndex,diff_lag_2Q_EquityIndex,diff_lag_3Q_EquityIndex,diff_lag_4Q_EquityIndex,diff_lag_5Q_EquityIndex,diff_lag_6Q_EquityIndex,log_DebtToGDP,lag_1_DebtToGDP,lag_2_DebtToGDP,lag_3_DebtToGDP,...,diff_4Q_EIBOR3M,diff_5Q_EIBOR3M,diff_6Q_EIBOR3M,Per_diff_1Q_EIBOR3M,Per_diff_2Q_EIBOR3M,Per_diff_3Q_EIBOR3M,Per_diff_4Q_EIBOR3M,Per_diff_5Q_EIBOR3M,Per_diff_6Q_EIBOR3M,log_diff_EIBOR3M,log_diff_lag_1_EIBOR3M,log_diff_lag_2_EIBOR3M,log_diff_lag_3_EIBOR3M,log_diff_lag_4_EIBOR3M,log_diff_lag_5_EIBOR3M,log_diff_lag_6_EIBOR3M,diff_lag_1Q_EIBOR3M,diff_lag_2Q_EIBOR3M,diff_lag_3Q_EIBOR3M,diff_lag_4Q_EIBOR3M,diff_lag_5Q_EIBOR3M,diff_lag_6Q_EIBOR3M,log_PPI,lag_1_PPI,lag_2_PPI,lag_3_PPI,lag_4_PPI,lag_5_PPI,lag_6_PPI,diff_1Q_PPI,diff_2Q_PPI,diff_3Q_PPI,diff_4Q_PPI,diff_5Q_PPI,diff_6Q_PPI,Per_diff_1Q_PPI,Per_diff_2Q_PPI,Per_diff_3Q_PPI,Per_diff_4Q_PPI,Per_diff_5Q_PPI,Per_diff_6Q_PPI,log_diff_PPI,log_diff_lag_1_PPI,log_diff_lag_2_PPI,log_diff_lag_3_PPI,log_diff_lag_4_PPI,log_diff_lag_5_PPI,log_diff_lag_6_PPI,diff_lag_1Q_PPI,diff_lag_2Q_PPI,diff_lag_3Q_PPI,diff_lag_4Q_
PPI,diff_lag_5Q_PPI,diff_lag_6Q_PPI,log_HPI,lag_1_HPI,lag_2_HPI,lag_3_HPI,lag_4_HPI,lag_5_HPI,lag_6_HPI,diff_1Q_HPI,diff_2Q_HPI,diff_3Q_HPI,diff_4Q_HPI,diff_5Q_HPI,diff_6Q_HPI,Per_diff_1Q_HPI,Per_diff_2Q_HPI,Per_diff_3Q_HPI,Per_diff_4Q_HPI,Per_diff_5Q_HPI,Per_diff_6Q_HPI,log_diff_HPI,log_diff_lag_1_HPI,log_diff_lag_2_HPI,log_diff_lag_3_HPI,log_diff_lag_4_HPI,log_diff_lag_5_HPI,log_diff_lag_6_HPI,diff_lag_1Q_HPI,diff_lag_2Q_HPI,diff_lag_3Q_HPI,diff_lag_4Q_HPI,diff_lag_5Q_HPI,diff_lag_6Q_HPI,log_RealHPI,lag_1_RealHPI,lag_2_RealHPI,lag_3_RealHPI,lag_4_RealHPI,lag_5_RealHPI,lag_6_RealHPI,diff_1Q_RealHPI,diff_2Q_RealHPI,diff_3Q_RealHPI,diff_4Q_RealHPI,diff_5Q_RealHPI,diff_6Q_RealHPI,Per_diff_1Q_RealHPI,Per_diff_2Q_RealHPI,Per_diff_3Q_RealHPI,Per_diff_4Q_RealHPI,Per_diff_5Q_RealHPI,Per_diff_6Q_RealHPI,log_diff_RealHPI,log_diff_lag_1_RealHPI,log_diff_lag_2_RealHPI,log_diff_lag_3_RealHPI,log_diff_lag_4_RealHPI,log_diff_lag_5_RealHPI,log_diff_lag_6_RealHPI,diff_lag_1Q_RealHPI,diff_lag_2Q_RealHPI,diff_lag_3Q_RealHPI,diff_lag_4Q_RealHPI,diff_lag_5Q_RealHPI,diff_lag_6Q_RealHPI,log_GDPBill,lag_1_GDPBill,lag_2_GDPBill,lag_3_GDPBill,lag_4_GDPBill,lag_5_GDPBill,lag_6_GDPBill,diff_1Q_GDPBill,diff_2Q_GDPBill,diff_3Q_GDPBill,diff_4Q_GDPBill,diff_5Q_GDPBill,diff_6Q_GDPBill,Per_diff_1Q_GDPBill,Per_diff_2Q_GDPBill,Per_diff_3Q_GDPBill,Per_diff_4Q_GDPBill,Per_diff_5Q_GDPBill,Per_diff_6Q_GDPBill,log_diff_GDPBill,log_diff_lag_1_GDPBill,log_diff_lag_2_GDPBill,log_diff_lag_3_GDPBill,log_diff_lag_4_GDPBill,log_diff_lag_5_GDPBill,log_diff_lag_6_GDPBill,diff_lag_1Q_GDPBill,diff_lag_2Q_GDPBill,diff_lag_3Q_GDPBill,diff_lag_4Q_GDPBill,diff_lag_5Q_GDPBill,diff_lag_6Q_GDPBill
0,2008Q4,0.03551,0.554292,0.569838,0.240213,57.547692,1137.800006,4.108209,2390.01,11.926268,104.434728,6.043681,95.093333,,104.434728,147.310502,148.662182,325.25526,4.052614,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,7.036852,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.412987,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,7.779053,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.478743,,,,...,,,,,,,,,,,,,,,,,,,,,,,4.648562,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4.992543,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5.001676,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5.78461,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2009Q1,0.037301,0.603489,0.620893,0.262655,45.68381,1041.624075,4.165888,2487.92,16.218479,96.439725,4.705116,94.45,,96.439725,133.67175,134.530544,317.490725,3.821744,57.547692,,,,,,-11.863883,,,,,,-0.206157,,,,,,-0.23087,,,,,,,,,,,,,6.948536,1137.800006,,,,,,-96.175931,,,,,,-0.084528,,,,,,-0.088315,,,,,,,,,,,,,1.426929,4.108209,,,,,,0.057679,,,,,,0.01404,,,,,,0.013942,,,,,,,,,,,,,7.819202,2390.01,,,,,,97.91,,,,,,0.040966,,,,,,0.040149,,,,,,,,,,,,,2.786151,11.926268,,,...,,,,,,,,,,,,,,,,,,,,,,,4.568918,104.434728,,,,,,-7.995003,,,,,,-0.076555,,,,,,-0.079644,,,,,,,,,,,,,4.895387,147.310502,,,,,,-13.638752,,,,,,-0.092585,,,,,,-0.097155,,,,,,,,,,,,,4.901791,148.662182,,,,,,-14.131637,,,,,,-0.095059,,,,,,-0.099885,,,,,,,,,,,,,5.760449,325.25526,,,,,,-7.764534,,,,,,-0.023872,,,,,,-0.024162,,,,,,,,,,,,
2,2009Q2,0.049531,0.887073,0.917262,0.396143,59.860625,944.682562,4.22325,2631.32,20.81228,94.940662,3.301876,93.593333,,94.940662,124.809195,126.12391,311.434061,4.092019,45.68381,57.547692,,,,,14.176815,2.312933,,,,,0.310325,0.040192,,,,,0.270275,-0.23087,,,,,,-11.863883,,,,,,6.850849,1041.624075,1137.800006,,,,,-96.941513,-193.117444,,,,,-0.093068,-0.169729,,,,,-0.097687,-0.088315,,,,,,-96.175931,,,,,,1.440605,4.165888,4.108209,,,,,0.057362,0.115041,,,,,0.013769,0.028003,,,,,0.013676,0.013942,,,,,,0.057679,,,,,,7.875241,2487.92,2390.01,,,,,143.4,241.31,,,,,0.057639,0.100966,,,,,0.056039,0.040149,,,,,,97.91,,,,,,3.035543,16.218479,11.926268,,...,,,,,,,,,,,,,,,,,,,,,,,4.553252,96.439725,104.434728,,,,,-1.499063,-9.494066,,,,,-0.015544,-0.090909,,,,,-0.015666,-0.079644,,,,,,-7.995003,,,,,,4.826786,133.67175,147.310502,,,,,-8.862555,-22.501306,,,,,-0.066301,-0.152747,,,,,-0.068601,-0.097155,,,,,,-13.638752,,,,,,4.837265,134.530544,148.662182,,,,,-8.406634,-22.538271,,,,,-0.062489,-0.151607,,,,,-0.064526,-0.099885,,,,,,-14.131637,,,,,,5.741188,317.490725,325.25526,,,,,-6.056665,-13.821199,,,,,-0.019077,-0.042493,,,,,-0.019261,-0.024162,,,,,,-7.764534,,,,,
3,2009Q3,0.046061,0.814423,0.840967,0.361244,68.82803,915.223952,4.196574,3124.22,23.021086,95.940037,2.158828,94.416667,,95.940037,115.683436,117.09613,309.949323,4.231611,59.860625,45.68381,57.547692,,,,8.967405,23.144221,11.280338,,,,0.149805,0.506618,0.196017,,,,0.139592,0.270275,-0.23087,,,,,14.176815,,,,,,6.819169,944.682562,1041.624075,1137.800006,,,,-29.45861,-126.400123,-222.576054,,,,-0.031184,-0.121349,-0.19562,,,,-0.03168,-0.097687,-0.088315,,,,,-96.941513,,,,,,1.434269,4.22325,4.165888,4.108209,,,,-0.026676,0.030686,0.088366,,,,-0.006316,0.007366,0.02151,,,,-0.006336,0.013676,0.013942,,,,,0.057362,,,,,,8.04694,2631.32,2487.92,2390.01,,,,492.9,636.3,734.21,,,,0.18732,0.255756,0.3072,,,,0.171699,0.056039,0.040149,,,,,143.4,,,,,,3.136411,20.81228,16.218479,11.926268,...,,,,,,,,,,,,,,,,,,,,,,,4.563723,94.940662,96.439725,104.434728,,,,0.999375,-0.499688,-8.494691,,,,0.010526,-0.005181,-0.08134,,,,0.010471,-0.015666,-0.079644,,,,,-1.499063,,,,,,4.750857,124.809195,133.67175,147.310502,,,,-9.12576,-17.988315,-31.627066,,,,-0.073118,-0.134571,-0.214697,,,,-0.075929,-0.068601,-0.097155,,,,,-8.862555,,,,,,4.762995,126.12391,134.530544,148.662182,,,,-9.02778,-17.434414,-31.566051,,,,-0.071579,-0.129594,-0.212334,,,,-0.07427,-0.064526,-0.099885,,,,,-8.406634,,,,,,5.736409,311.434061,317.490725,325.25526,,,,-1.484737,-7.541402,-15.305936,,,,-0.004767,-0.023753,-0.047058,,,,-0.004779,-0.019261,-0.024162,,,,,-6.056665,,,,,
4,2009Q4,0.018304,-0.108424,-0.110561,-0.044556,75.521692,964.774573,4.079907,2743.61,21.806943,98.438476,1.589596,94.92,1.937831,98.438476,113.896277,115.060357,315.473074,4.32442,68.82803,59.860625,45.68381,57.547692,,,6.693662,15.661067,29.837883,17.974,,,0.097252,0.261626,0.653139,0.312332,,,0.092809,0.139592,0.270275,-0.23087,,,,8.967405,2.312933,,,,,6.871894,915.223952,944.682562,1041.624075,1137.800006,,,49.550621,20.092011,-76.849502,-173.025433,,,0.05414,0.021269,-0.073779,-0.15207,,,0.052726,-0.03168,-0.097687,-0.088315,,,,-29.45861,-193.117444,,,,,1.406074,4.196574,4.22325,4.165888,4.108209,,,-0.116667,-0.143343,-0.085981,-0.028302,,,-0.027801,-0.033941,-0.020639,-0.006889,,,-0.028194,-0.006336,0.013676,0.013942,,,,-0.026676,0.115041,,,,,7.91703,3124.22,2631.32,2487.92,2390.01,,,-380.61,112.29,255.69,353.6,,,-0.121826,0.042674,0.102773,0.147949,,,-0.12991,0.171699,0.056039,0.040149,,,,492.9,241.31,,,,,3.082228,23.021086,20.81228,16.218479,...,,,,,,,,,,,,,,,,,,,,,,,4.589432,95.940037,94.940662,96.439725,104.434728,,,2.498438,3.497814,1.998751,-5.996252,,,0.026042,0.036842,0.020725,-0.057416,,,0.025708,0.010471,-0.015666,-0.079644,,,,0.999375,-9.494066,,,,,4.735288,115.683436,124.809195,133.67175,147.310502,,,-1.787158,-10.912918,-19.775473,-33.414224,,,-0.015449,-0.087437,-0.147941,-0.226829,,,-0.015569,-0.075929,-0.068601,-0.097155,,,,-9.12576,-22.501306,,,,,4.745457,117.09613,126.12391,134.530544,148.662182,,,-2.035774,-11.063554,-19.470188,-33.601825,,,-0.017385,-0.08772,-0.144727,-0.226028,,,-0.017538,-0.07427,-0.064526,-0.099885,,,,-9.02778,-22.538271,,,,,5.754073,309.949323,311.434061,317.490725,325.25526,,,5.523751,4.039014,-2.017651,-9.782185,,,0.017821,0.012969,-0.006355,-0.030075,,,0.017665,-0.004779,-0.019261,-0.024162,,,,-1.484737,-13.821199,,,,


In [13]:
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tsa.stattools import adfuller,kpss

In [None]:
# Classify every column after the quarter key as stationary or not by a
# majority vote (at least 2 of 3) over the ADF, KPSS and Phillips-Perron
# tests, each judged at the 5% level.
_LABEL = {1: 'Stationary', 0: 'Non-Stationary'}

# Collect one record per column and build the frame once at the end, instead
# of growing a DataFrame row by row with .loc.
stationarity_records = []
for col in model_data.columns[1:]:
    # Lag/diff features start with NaNs; drop them once and reuse for all tests.
    series = model_data[col].dropna()

    # ADF and Phillips-Perron: a p-value below 0.05 is read as "stationary".
    adf_val = 1 if adfuller(series)[1]<0.05 else 0
    pp_val = 1 if PhillipsPerron(series).pvalue<0.05 else 0

    # KPSS is read the other way round: a p-value below 0.05 is "non-stationary".
    kpss_pval = kpss(series)[1]
    kpss_val = 0 if kpss_pval<0.05 else 1

    overall = 'Stationary' if (adf_val+kpss_val+pp_val)>=2 else 'Non-Stationary'

    stationarity_records.append(
        [col, _LABEL[adf_val], _LABEL[kpss_val], _LABEL[pp_val], overall]
    )

stationarity_test = pd.DataFrame(
    stationarity_records,
    columns=['ColName','ADF_Status','KPSS_Status','PP_Status','Overall'],
)


In [15]:
# Candidate predictors: everything after the quarter key, the default rate and
# the three CI target columns.
all_cols = list(model_data.columns[5:])

# The stationarity table is built from model_data.columns[1:], so it also
# contains DefRate / log_CI / logit_CI / probit_CI. Keep only names that are
# genuine candidates; otherwise a target series judged stationary would be
# offered to the model as a predictor of itself (target leakage).
_candidate_set = set(all_cols)
stationary_cols = [
    name
    for name in stationarity_test.loc[stationarity_test['Overall']=='Stationary', 'ColName']
    if name in _candidate_set
]

In [16]:
# Re-label the rows 0..n-1.
model_data.index = np.arange(model_data.shape[0])

In [17]:
# Chronological hold-out split. The index *labels* of the boundary quarters are
# used as iloc *positions*, which relies on the index being 0..n-1.
quarter = model_data['YrQuarter']
train_end = model_data[quarter=='2018Q4'].index.values[0]+1   # one past 2018Q4
test_end = model_data[quarter=='2020Q2'].index.values[0]+1    # one past 2020Q2

train_data = model_data.iloc[:train_end]            # first row .. 2018Q4
test_data = model_data.iloc[train_end:test_end]     # rows after 2018Q4 .. 2020Q2

# Logit CI

In [19]:
# Target for this section and the stationary candidate predictors, both taken
# from the training window.
output_col = 'logit_CI'
X = train_data.loc[:, stationary_cols]
y = train_data.loc[:, output_col]

In [20]:
# Univariate screen: regress the target on each candidate (plus an intercept)
# and record the in-sample R-squared, measured as the squared correlation
# between the target and the fitted values.
univariate_rows = []
for col in X.columns:
    # Use only the quarters where both the candidate and the target are present.
    pair = train_data[[col, output_col]].dropna()
    selected_X = pair[[col]]
    selected_Y = pair[output_col]

    model = sm.OLS(selected_Y, sm.add_constant(selected_X)).fit()
    y_preds = model.predict(sm.add_constant(selected_X))

    r2 = np.corrcoef(selected_Y, y_preds)[0, 1]**2
    univariate_rows.append((col, r2))

# One row per candidate, best fit first.
function_df = (
    pd.DataFrame(univariate_rows, columns=['predictor', 'r-squared'])
    .sort_values(by=['r-squared'], ascending=False)
)


In [21]:
# Ten best single predictors by in-sample R-squared.
function_df.head(n=10)

Unnamed: 0,predictor,r-squared
124,Per_diff_5Q_EIBOR3M,0.573151
167,lag_6_RealHPI,0.556448
123,Per_diff_3Q_EIBOR3M,0.549446
122,Per_diff_2Q_EIBOR3M,0.494866
166,lag_5_RealHPI,0.427014
121,Per_diff_1Q_EIBOR3M,0.404661
125,log_diff_EIBOR3M,0.397237
126,log_diff_lag_1_EIBOR3M,0.39291
59,log_diff_lag_1_DebtToGDP,0.369823
38,diff_6Q_EquityIndex,0.360948


In [22]:
def next_possible_feature(data, all_features, current_features, ignore_features=None, target_col=None):
    '''
    Rank the remaining candidate features by the R-squared the model would
    reach if each one were added to the features already selected.

    For every candidate an OLS model with intercept is fitted on
    ``current_features + [candidate]`` (rows with a NaN in any of those columns
    or in the target are dropped), and the squared correlation between the
    target and the fitted values is recorded.

    Previously only the candidate column was passed to the regression, so
    ``current_features`` affected which rows were kept but not the fit itself,
    contrary to the stated purpose. The regression now uses all of them.

    data             = DataFrame holding the features and the target
    all_features     = iterable of candidate column names
    current_features = list of features that are already in your model
    ignore_features  = optional list of unused features to skip (default: none)
    target_col       = name of the target column; when omitted, the
                       notebook-level variable ``output_col`` is used

    Returns a DataFrame with columns 'predictor' and 'r-squared', sorted by
    r-squared in descending order.
    '''
    if target_col is None:
        # Backward compatible: earlier versions always read this global.
        target_col = output_col

    current_features = list(current_features)
    # Build the exclusion set once instead of concatenating lists per candidate.
    excluded = set(current_features) | set(ignore_features or ())

    rows = []
    for col in all_features:
        # Only evaluate features that are neither selected nor ignored.
        if col in excluded:
            continue

        trial_features = current_features + [col]
        t_data = data[trial_features + [target_col]].dropna()

        selected_X = sm.add_constant(t_data[trial_features])
        selected_Y = t_data[target_col].values

        model = sm.OLS(selected_Y, selected_X).fit()
        y_preds = model.predict(selected_X)

        r2 = np.corrcoef(selected_Y, y_preds)[0, 1]**2
        rows.append((col, r2))

    function_df = pd.DataFrame(rows, columns=['predictor', 'r-squared'])
    return function_df.sort_values(by=['r-squared'], ascending=False)

In [543]:
from mlxtend.feature_selection import SequentialFeatureSelector


In [544]:
# LinearRegression is not imported by any cell visible in this notebook, so on
# a fresh kernel this cell raised NameError. Import it here.
from sklearn.linear_model import LinearRegression

# Forward sequential selection of 4 features around a plain linear regression.
sfs = SequentialFeatureSelector(LinearRegression(), forward=True, k_features=4)

In [545]:
# Keep only training rows that have no NaN in any column.
# NOTE(review): this looks at every column of train_data, not just the ones
# the selector uses, so long-lag features may remove many quarters - confirm
# this is intended.
temp_data = train_data.dropna(axis=0, how='any')

In [546]:
# Select features against the same target the rest of this section models
# (`output_col`, i.e. 'logit_CI'); the call previously used 'log_CI', which is
# inconsistent with the regression fitted afterwards.
sfs.fit(temp_data[stationary_cols], temp_data[output_col])

SequentialFeatureSelector(estimator=LinearRegression(), k_features=4)

In [547]:
# NOTE(review): `final_feat` is not defined in any cell visible here; it must
# be a list of column names for the concatenation below to work. Confirm where
# it comes from before relying on Restart & Run All.
final_selected_cols = final_feat

# Training rows with complete data for the chosen features and the target.
final_df = train_data.loc[:, final_selected_cols+[output_col]].dropna()

# Target as a plain array; design matrix with an intercept column added.
selected_Y = final_df[output_col].to_numpy()
selected_X = sm.add_constant(final_df[final_selected_cols])


In [550]:
# Specify and fit the final OLS model, then print its summary table.
mod = sm.OLS(endog=selected_Y, exog=selected_X)
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.568
Model:                            OLS   Adj. R-squared:                  0.541
Method:                 Least Squares   F-statistic:                     21.05
Date:                Mon, 25 Jul 2022   Prob (F-statistic):           1.46e-06
Time:                        10:30:16   Log-Likelihood:                -18.321
No. Observations:                  35   AIC:                             42.64
Df Residuals:                      32   BIC:                             47.31
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     