In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats

%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

# We are investigating how well central bank policy rates and stock market indices predict the price of gold.

## Consolidate Central Bank Rates

In [2]:
# Start by consolidating central bank rate data:
# restore the per-currency rate frames that were saved earlier with %store

%store -r df_aud
%store -r df_eur
%store -r df_gbp
%store -r df_usd
%store -r df_yen

In [3]:
# Earliest AUD rate date (the start dates are compared to pick a common cut-off below)
df_aud['date'].min()

Timestamp('1990-01-23 00:00:00')

In [4]:
# Earliest EUR rate date
df_eur['date'].min()

Timestamp('1999-01-04 00:00:00')

In [5]:
# Earliest GBP rate date
df_gbp['date'].min()

Timestamp('1975-01-20 00:00:00')

In [6]:
# Earliest USD rate date
df_usd['date'].min()

Timestamp('1990-01-01 00:00:00')

In [7]:
# Earliest JPY rate date; this is the latest start of the five series
df_yen['date'].min()

Timestamp('2000-01-01 00:00:00')

In [8]:
# Outer-join every rate frame on 'date', starting from df_gbp (the series with
# the earliest start date) and folding the remaining frames in one at a time.
df_rates = df_gbp
for df_next_rate in (df_usd, df_eur, df_aud, df_yen):
    df_rates = pd.merge(df_rates, df_next_rate, how = 'outer', on = ['date'])

In [9]:
# drop all values before 2000-01-01 - because we have no yen data prior to then
# (.copy() gives an independent frame, so the in-place rename/drop in the cells
# below do not operate on a slice of the merged frame)

df_rates = df_rates[df_rates['date']>'1999-12-31'].copy()

In [10]:
# Inspect column names and non-null counts after the merge and date filter
df_rates.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7457 entries, 202 to 11272
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   date              7457 non-null   datetime64[ns]
 1   gbp_rate          36 non-null     float64       
 2   gbp_rate_lag1     36 non-null     float64       
 3   gbp_rate_dif      36 non-null     float64       
 4   gbp_rate_change%  36 non-null     float64       
 5   usd_rate          7457 non-null   float64       
 6   usd_rate_lag1     7457 non-null   float64       
 7   usd_rate_dif      7457 non-null   float64       
 8   usd_rate_change%  7457 non-null   float64       
 9   eur_rate          44 non-null     float64       
 10  eur_rate_lag1     44 non-null     float64       
 11  eur_rate_dif      44 non-null     float64       
 12  eur_rate_change%  44 non-null     float64       
 13  aud_rate_dif      225 non-null    float64       
 14  aud_rate          225

In [11]:
# Give the AUD target-rate column the same '<ccy>_rate' name as the other currencies.
# NOTE(review): the .info() output above already lists 'aud_rate'; rename() ignores
# missing keys, so this is a no-op if the column was renamed upstream - confirm.
df_rates = df_rates.rename(columns={'AUD_rate_tgt%': 'aud_rate'})

In [12]:
# Collect the lag columns that came with the source frames; they are dropped
# next and rebuilt further down as lag1..lag10.
to_drop = [name for name in df_rates.columns if 'lag' in name]

In [13]:
# Remove the source lag columns collected in to_drop
df_rates = df_rates.drop(columns=to_drop)

In [14]:
# Confirm the source lag columns are gone
df_rates.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7457 entries, 202 to 11272
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   date              7457 non-null   datetime64[ns]
 1   gbp_rate          36 non-null     float64       
 2   gbp_rate_dif      36 non-null     float64       
 3   gbp_rate_change%  36 non-null     float64       
 4   usd_rate          7457 non-null   float64       
 5   usd_rate_dif      7457 non-null   float64       
 6   usd_rate_change%  7457 non-null   float64       
 7   eur_rate          44 non-null     float64       
 8   eur_rate_dif      44 non-null     float64       
 9   eur_rate_change%  44 non-null     float64       
 10  aud_rate_dif      225 non-null    float64       
 11  aud_rate          225 non-null    float64       
 12  aud_rate_change%  225 non-null    float64       
 13  yen_rate          3839 non-null   float64       
 14  yen_rate_dif      368

In [15]:
# Order rows chronologically so the row-based shift() lags below follow time order
df_rates = df_rates.sort_values('date')

In [16]:
# Use the date as the index; the later merges join on it
df_rates = df_rates.set_index(keys='date')

In [17]:
# Preview the date-indexed rate frame
df_rates.head()

Unnamed: 0_level_0,gbp_rate,gbp_rate_dif,gbp_rate_change%,usd_rate,usd_rate_dif,usd_rate_change%,eur_rate,eur_rate_dif,eur_rate_change%,aud_rate_dif,aud_rate,aud_rate_change%,yen_rate,yen_rate_dif,yen_rate_change%
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2000-01-01,,,,5.5,0.0,0.0,,,,,,,,,
2000-01-02,,,,5.5,0.0,0.0,,,,,,,,,
2000-01-03,,,,5.5,0.0,0.0,,,,,,,,,
2000-01-04,,,,5.5,0.0,0.0,,,,,,,,,
2000-01-05,,,,5.5,0.0,0.0,,,,,,,,,


In [18]:
# add up to 10 day lag for all rates

# Lag only the base columns so that re-running this cell cannot create
# "<col>_lag1_lag1"-style features from columns added by a previous run.
base_cols = [col for col in df_rates.columns if '_lag' not in col]

# Build every lagged series first and attach them with a single concat:
# inserting 150 columns one at a time fragments the frame and can trigger a
# pandas PerformanceWarning. Column order matches the original nested loop
# (lag1..lag10 for the first column, then the next column, ...).
# shift(i) moves values down by i rows, so a "day" here is one row of df_rates.
lagged = pd.DataFrame({
    f'{col}_lag{i}': df_rates[col].shift(i)
    for col in base_cols
    for i in range(1, 11)
})
df_rates = pd.concat([df_rates[base_cols], lagged], axis=1)

In [19]:
# Confirm the frame now holds the base columns plus their lag1..lag10 columns
df_rates.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 7457 entries, 2000-01-01 to 2020-05-31
Columns: 165 entries, gbp_rate to yen_rate_change%_lag10
dtypes: float64(165)
memory usage: 9.4 MB


In [20]:
# Cast every column to float in one call.
# (The .info() output above already reports float64 for all columns.)
df_rates = df_rates.astype('float')

In [21]:
# Re-check dtypes after the float cast
df_rates.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 7457 entries, 2000-01-01 to 2020-05-31
Columns: 165 entries, gbp_rate to yen_rate_change%_lag10
dtypes: float64(165)
memory usage: 9.4 MB


In [22]:
# Preview the frame before NaN handling
df_rates.head()

Unnamed: 0_level_0,gbp_rate,gbp_rate_dif,gbp_rate_change%,usd_rate,usd_rate_dif,usd_rate_change%,eur_rate,eur_rate_dif,eur_rate_change%,aud_rate_dif,...,yen_rate_change%_lag1,yen_rate_change%_lag2,yen_rate_change%_lag3,yen_rate_change%_lag4,yen_rate_change%_lag5,yen_rate_change%_lag6,yen_rate_change%_lag7,yen_rate_change%_lag8,yen_rate_change%_lag9,yen_rate_change%_lag10
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-01-01,,,,5.5,0.0,0.0,,,,,...,,,,,,,,,,
2000-01-02,,,,5.5,0.0,0.0,,,,,...,,,,,,,,,,
2000-01-03,,,,5.5,0.0,0.0,,,,,...,,,,,,,,,,
2000-01-04,,,,5.5,0.0,0.0,,,,,...,,,,,,,,,,
2000-01-05,,,,5.5,0.0,0.0,,,,,...,,,,,,,,,,


In [23]:
# address NaN values

# d_non_NaN maps each forward-filled column to the row position of its first
# non-NaN value (printed in the next cell as a sanity check)
d_non_NaN = dict()
for col in df_rates.columns:

    # we can simply fill NaNs in the 'change' columns with zeros, to represent no change
    if 'change' in col:
        # assign the result back instead of calling fillna(inplace=True) on
        # df_rates[col]: that is a chained in-place update and does not write
        # through to df_rates under pandas Copy-on-Write
        df_rates[col] = df_rates[col].fillna(0)
        continue

    # otherwise, we'll need to fill NaNs with the most recent non-NaN value
    has_value = df_rates[col].notna().to_numpy()
    if not has_value.any():
        # an all-NaN column has nothing to carry forward; leave it untouched
        # rather than failing on a missing d_non_NaN entry
        continue
    d_non_NaN[col] = int(has_value.argmax())  # argmax of a bool array = first True

    # ffill() keeps the leading NaNs (rows before the first observation) and
    # carries each observation forward to the following NaN rows
    # NOTE(review): this branch also covers the '_dif' columns, so a rate
    # difference is repeated on every following row until the next observation,
    # while the matching 'change%' column is filled with 0 - confirm this is intended
    df_rates[col] = df_rates[col].ffill()

In [24]:
# Row position of the first non-NaN value for each forward-filled column
print(d_non_NaN)

{'gbp_rate': 12, 'gbp_rate_dif': 12, 'usd_rate': 0, 'usd_rate_dif': 0, 'eur_rate': 34, 'eur_rate_dif': 34, 'aud_rate_dif': 32, 'aud_rate': 32, 'yen_rate': 223, 'yen_rate_dif': 224, 'gbp_rate_lag1': 13, 'gbp_rate_lag2': 14, 'gbp_rate_lag3': 15, 'gbp_rate_lag4': 16, 'gbp_rate_lag5': 17, 'gbp_rate_lag6': 18, 'gbp_rate_lag7': 19, 'gbp_rate_lag8': 20, 'gbp_rate_lag9': 21, 'gbp_rate_lag10': 22, 'gbp_rate_dif_lag1': 13, 'gbp_rate_dif_lag2': 14, 'gbp_rate_dif_lag3': 15, 'gbp_rate_dif_lag4': 16, 'gbp_rate_dif_lag5': 17, 'gbp_rate_dif_lag6': 18, 'gbp_rate_dif_lag7': 19, 'gbp_rate_dif_lag8': 20, 'gbp_rate_dif_lag9': 21, 'gbp_rate_dif_lag10': 22, 'usd_rate_lag1': 1, 'usd_rate_lag2': 2, 'usd_rate_lag3': 3, 'usd_rate_lag4': 4, 'usd_rate_lag5': 5, 'usd_rate_lag6': 6, 'usd_rate_lag7': 7, 'usd_rate_lag8': 8, 'usd_rate_lag9': 9, 'usd_rate_lag10': 10, 'usd_rate_dif_lag1': 1, 'usd_rate_dif_lag2': 2, 'usd_rate_dif_lag3': 3, 'usd_rate_dif_lag4': 4, 'usd_rate_dif_lag5': 5, 'usd_rate_dif_lag6': 6, 'usd_rate_d

In [25]:
# Spot-check rows 33..35 (by position) of gbp_rate after the fill
df_rates['gbp_rate'].iloc[33:36]

date
2000-02-03    5.75
2000-02-04    5.75
2000-02-05    5.75
Name: gbp_rate, dtype: float64

In [26]:
# Preview after NaN handling; rows before a column's first observation remain NaN
df_rates.head()

Unnamed: 0_level_0,gbp_rate,gbp_rate_dif,gbp_rate_change%,usd_rate,usd_rate_dif,usd_rate_change%,eur_rate,eur_rate_dif,eur_rate_change%,aud_rate_dif,...,yen_rate_change%_lag1,yen_rate_change%_lag2,yen_rate_change%_lag3,yen_rate_change%_lag4,yen_rate_change%_lag5,yen_rate_change%_lag6,yen_rate_change%_lag7,yen_rate_change%_lag8,yen_rate_change%_lag9,yen_rate_change%_lag10
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-01-01,,,0.0,5.5,0.0,0.0,,,0.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2000-01-02,,,0.0,5.5,0.0,0.0,,,0.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2000-01-03,,,0.0,5.5,0.0,0.0,,,0.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2000-01-04,,,0.0,5.5,0.0,0.0,,,0.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2000-01-05,,,0.0,5.5,0.0,0.0,,,0.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
# Persist the consolidated rates frame with IPython's %store
%store df_rates

Stored 'df_rates' (DataFrame)


## Consolidate stock indices

In [28]:
# Restore the per-index stock frames that were saved earlier with %store
%store -r df_hsi
%store -r df_sse
%store -r df_jeg
%store -r df_nasdaq
%store -r df_nyse

In [29]:
# Earliest HSI date (the start dates are compared to pick a common cut-off below)
df_hsi['date'].min()

Timestamp('2000-01-03 00:00:00')

In [30]:
# Earliest SSE date
df_sse['date'].min()

Timestamp('1997-07-02 00:00:00')

In [31]:
# Earliest JEG date; this is the latest start of the five series
df_jeg['date'].min()

Timestamp('2008-09-05 00:00:00')

In [32]:
# Earliest NASDAQ date
df_nasdaq['date'].min()

Timestamp('2000-01-03 00:00:00')

In [33]:
# Earliest NYSE date
df_nyse['date'].min()

Timestamp('2000-01-03 00:00:00')

In [34]:
# Outer-join every index frame on 'date', starting from df_sse (the series with
# the earliest start date) and folding the remaining frames in one at a time.
df_stock = df_sse
for df_next_index in (df_hsi, df_nyse, df_nasdaq, df_jeg):
    df_stock = pd.merge(df_stock, df_next_index, how = 'outer', on = ['date'])

In [35]:
# drop all values before 2008-09-05 - because we have no JEG data prior to then
# (.copy() gives an independent frame, so the in-place drop in a later cell
# does not operate on a slice of the merged frame)

df_stock = df_stock[df_stock['date']>'2008-09-04'].copy()

In [36]:
# Inspect column names and non-null counts after the merge and date filter
df_stock.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3051 entries, 2871 to 5963
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   date               3051 non-null   datetime64[ns]
 1   sse_close          2845 non-null   float64       
 2   sse_close_lag1     2845 non-null   float64       
 3   sse_change%        2844 non-null   float64       
 4   sse_dif            2844 non-null   float64       
 5   hsi_close          2881 non-null   float64       
 6   hsi_close_lag1     2881 non-null   float64       
 7   hsi_change%        2868 non-null   float64       
 8   hsi_dif            2868 non-null   float64       
 9   nyse_close         2950 non-null   float64       
 10  nyse_close_lag1    2950 non-null   float64       
 11  nyse_change%       2950 non-null   float64       
 12  nyse_dif           2950 non-null   float64       
 13  nasdaq_close       2950 non-null   float64       
 14  nasda

In [37]:
# Collect the lag columns that came with the source frames; they are dropped
# next and rebuilt further down as lag1..lag10.
to_drop = [name for name in df_stock.columns if 'lag' in name]

In [38]:
# Remove the source lag columns collected in to_drop
df_stock = df_stock.drop(columns=to_drop)

In [39]:
# Order rows chronologically so the row-based shift() lags below follow time order
df_stock = df_stock.sort_values('date')

In [40]:
# Use the date as the index; the later merges join on it
df_stock = df_stock.set_index(keys='date')

In [41]:
# Preview the date-indexed stock frame
df_stock.head()

Unnamed: 0_level_0,sse_close,sse_change%,sse_dif,hsi_close,hsi_change%,hsi_dif,nyse_close,nyse_change%,nyse_dif,nasdaq_close,nasdaq_change%,nasdaq_dif,jeg_close,jeg_change%,jeg_dif
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2008-09-05,2202.446045,-3.29167,-74.964844,19933.279297,-2.237434,-456.201172,8033.759766,0.318544,25.509766,2255.879883,-0.139889,-3.160156,335.0,,
2008-09-08,2143.420898,-2.679982,-59.025147,20794.269531,4.319361,860.990234,8168.620117,1.67867,134.860351,2269.76001,0.615287,13.880127,343.0,2.38806,8.0
2008-09-09,2145.779053,0.110018,2.358155,20491.109375,-1.457902,-303.160156,7871.149902,-3.641621,-297.470215,2209.810059,-2.641246,-59.949951,337.86499,-1.497087,-5.13501
2008-09-10,2150.759033,0.232083,4.97998,19999.779297,-2.397772,-491.330078,7957.259766,1.093993,86.109864,2228.699951,0.85482,18.889892,319.382996,-5.470231,-18.481994
2008-09-11,2078.980957,-3.337337,-71.778076,19388.720703,-3.055327,-611.058594,8011.25,0.678503,53.990234,2258.219971,1.32454,29.52002,290.799988,-8.949446,-28.583008


In [42]:
# Non-null counts per index before lagging; gaps are dates where a series has no value
df_stock.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3051 entries, 2008-09-05 to 2020-05-27
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sse_close       2845 non-null   float64
 1   sse_change%     2844 non-null   float64
 2   sse_dif         2844 non-null   float64
 3   hsi_close       2881 non-null   float64
 4   hsi_change%     2868 non-null   float64
 5   hsi_dif         2868 non-null   float64
 6   nyse_close      2950 non-null   float64
 7   nyse_change%    2950 non-null   float64
 8   nyse_dif        2950 non-null   float64
 9   nasdaq_close    2950 non-null   float64
 10  nasdaq_change%  2950 non-null   float64
 11  nasdaq_dif      2950 non-null   float64
 12  jeg_close       2869 non-null   float64
 13  jeg_change%     2856 non-null   float64
 14  jeg_dif         2856 non-null   float64
dtypes: float64(15)
memory usage: 381.4 KB


In [43]:
# add up to 10 day lag for all stock indices and changes

# Lag only the base columns so that re-running this cell cannot create
# "<col>_lag1_lag1"-style features from columns added by a previous run.
base_cols = [col for col in df_stock.columns if '_lag' not in col]

# Build every lagged series first and attach them with a single concat:
# inserting 150 columns one at a time fragments the frame and can trigger a
# pandas PerformanceWarning. Column order matches the original nested loop
# (lag1..lag10 for the first column, then the next column, ...).
# shift(i) moves values down by i rows, so a "day" here is one row of df_stock.
lagged = pd.DataFrame({
    f'{col}_lag{i}': df_stock[col].shift(i)
    for col in base_cols
    for i in range(1, 11)
})
df_stock = pd.concat([df_stock[base_cols], lagged], axis=1)

In [44]:
# Confirm the frame now holds the base columns plus their lag1..lag10 columns
df_stock.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3051 entries, 2008-09-05 to 2020-05-27
Columns: 165 entries, sse_close to jeg_dif_lag10
dtypes: float64(165)
memory usage: 3.9 MB


In [45]:
# address NaN values

# d_non_NaN maps each forward-filled column to the row position of its first
# non-NaN value (printed in the next cell as a sanity check)
d_non_NaN = dict()
for col in df_stock.columns:

    # we can simply fill NaNs in the 'change' columns with zeros, to represent no change
    if 'change' in col:
        # assign the result back instead of calling fillna(inplace=True) on
        # df_stock[col]: that is a chained in-place update and does not write
        # through to df_stock under pandas Copy-on-Write
        df_stock[col] = df_stock[col].fillna(0)
        continue

    # otherwise, we'll need to fill NaNs with the most recent non-NaN value
    has_value = df_stock[col].notna().to_numpy()
    if not has_value.any():
        # an all-NaN column has nothing to carry forward; leave it untouched
        # rather than failing on a missing d_non_NaN entry
        continue
    d_non_NaN[col] = int(has_value.argmax())  # argmax of a bool array = first True

    # ffill() keeps the leading NaNs (rows before the first observation) and
    # carries each observation forward to the following NaN rows
    # NOTE(review): this branch also covers the '_dif' columns, so the previous
    # row's difference is repeated wherever an index has no value, while the
    # matching 'change%' column is filled with 0 - confirm this is intended
    df_stock[col] = df_stock[col].ffill()

In [46]:
# Row position of the first non-NaN value for each forward-filled column
print(d_non_NaN)

{'sse_close': 0, 'sse_dif': 0, 'hsi_close': 0, 'hsi_dif': 0, 'nyse_close': 0, 'nyse_dif': 0, 'nasdaq_close': 0, 'nasdaq_dif': 0, 'jeg_close': 0, 'jeg_dif': 1, 'sse_close_lag1': 1, 'sse_close_lag2': 2, 'sse_close_lag3': 3, 'sse_close_lag4': 4, 'sse_close_lag5': 5, 'sse_close_lag6': 6, 'sse_close_lag7': 7, 'sse_close_lag8': 8, 'sse_close_lag9': 9, 'sse_close_lag10': 10, 'sse_dif_lag1': 1, 'sse_dif_lag2': 2, 'sse_dif_lag3': 3, 'sse_dif_lag4': 4, 'sse_dif_lag5': 5, 'sse_dif_lag6': 6, 'sse_dif_lag7': 7, 'sse_dif_lag8': 8, 'sse_dif_lag9': 9, 'sse_dif_lag10': 10, 'hsi_close_lag1': 1, 'hsi_close_lag2': 2, 'hsi_close_lag3': 3, 'hsi_close_lag4': 4, 'hsi_close_lag5': 5, 'hsi_close_lag6': 6, 'hsi_close_lag7': 7, 'hsi_close_lag8': 8, 'hsi_close_lag9': 9, 'hsi_close_lag10': 10, 'hsi_dif_lag1': 1, 'hsi_dif_lag2': 2, 'hsi_dif_lag3': 3, 'hsi_dif_lag4': 4, 'hsi_dif_lag5': 5, 'hsi_dif_lag6': 6, 'hsi_dif_lag7': 7, 'hsi_dif_lag8': 8, 'hsi_dif_lag9': 9, 'hsi_dif_lag10': 10, 'nyse_close_lag1': 1, 'nyse_close

In [47]:
# Spot-check rows 9..13 (by position) of the 10-row lag: the first value appears at position 10
df_stock['jeg_close_lag10'].iloc[9:14]

date
2008-09-18           NaN
2008-09-19    335.000000
2008-09-22    343.000000
2008-09-23    337.864990
2008-09-24    319.382996
Name: jeg_close_lag10, dtype: float64

In [48]:
# Preview after NaN handling; leading NaNs in the lag columns are left as-is
df_stock.head()

Unnamed: 0_level_0,sse_close,sse_change%,sse_dif,hsi_close,hsi_change%,hsi_dif,nyse_close,nyse_change%,nyse_dif,nasdaq_close,...,jeg_dif_lag1,jeg_dif_lag2,jeg_dif_lag3,jeg_dif_lag4,jeg_dif_lag5,jeg_dif_lag6,jeg_dif_lag7,jeg_dif_lag8,jeg_dif_lag9,jeg_dif_lag10
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2008-09-05,2202.446045,-3.29167,-74.964844,19933.279297,-2.237434,-456.201172,8033.759766,0.318544,25.509766,2255.879883,...,,,,,,,,,,
2008-09-08,2143.420898,-2.679982,-59.025147,20794.269531,4.319361,860.990234,8168.620117,1.67867,134.860351,2269.76001,...,,,,,,,,,,
2008-09-09,2145.779053,0.110018,2.358155,20491.109375,-1.457902,-303.160156,7871.149902,-3.641621,-297.470215,2209.810059,...,8.0,,,,,,,,,
2008-09-10,2150.759033,0.232083,4.97998,19999.779297,-2.397772,-491.330078,7957.259766,1.093993,86.109864,2228.699951,...,-5.13501,8.0,,,,,,,,
2008-09-11,2078.980957,-3.337337,-71.778076,19388.720703,-3.055327,-611.058594,8011.25,0.678503,53.990234,2258.219971,...,-18.481994,-5.13501,8.0,,,,,,,


In [49]:
# Persist the consolidated stock frame with IPython's %store
%store df_stock

Stored 'df_stock' (DataFrame)


In [50]:
# Combine rate and stock features, keeping only dates present in both frames
df_feat = df_rates.merge(df_stock, how='inner', on=['date'])

In [51]:
# Check the row count and date range that survive the inner join
df_feat.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3051 entries, 2008-09-05 to 2020-05-27
Columns: 330 entries, gbp_rate to jeg_dif_lag10
dtypes: float64(330)
memory usage: 7.7 MB


In [52]:
# Preview the combined feature frame
df_feat.head()

Unnamed: 0_level_0,gbp_rate,gbp_rate_dif,gbp_rate_change%,usd_rate,usd_rate_dif,usd_rate_change%,eur_rate,eur_rate_dif,eur_rate_change%,aud_rate_dif,...,jeg_dif_lag1,jeg_dif_lag2,jeg_dif_lag3,jeg_dif_lag4,jeg_dif_lag5,jeg_dif_lag6,jeg_dif_lag7,jeg_dif_lag8,jeg_dif_lag9,jeg_dif_lag10
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2008-09-05,5.0,-0.25,0.0,2.0,0.0,0.0,3.25,0.25,0.0,-0.25,...,,,,,,,,,,
2008-09-08,5.0,-0.25,0.0,2.0,0.0,0.0,3.25,0.25,0.0,-0.25,...,,,,,,,,,,
2008-09-09,5.0,-0.25,0.0,2.0,0.0,0.0,3.25,0.25,0.0,-0.25,...,8.0,,,,,,,,,
2008-09-10,5.0,-0.25,0.0,2.0,0.0,0.0,3.25,0.25,0.0,-0.25,...,-5.13501,8.0,,,,,,,,
2008-09-11,5.0,-0.25,0.0,2.0,0.0,0.0,3.25,0.25,0.0,-0.25,...,-18.481994,-5.13501,8.0,,,,,,,


In [53]:
# Persist the combined feature frame with IPython's %store
%store df_feat

Stored 'df_feat' (DataFrame)


## Merge in gold data and create final Xy dataframe for model

In [54]:
# Restore the gold price frame that was saved earlier with %store
%store -r df_gold

In [55]:
# Inspect the gold frame's columns and non-null counts
df_gold.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7845 entries, 261 to 8311
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   date          7845 non-null   datetime64[ns]
 1   gold          7845 non-null   float64       
 2   gold_lag1     7844 non-null   float64       
 3   gold_change%  7844 non-null   float64       
 4   gold_dif      7844 non-null   float64       
dtypes: datetime64[ns](1), float64(4)
memory usage: 367.7 KB


In [56]:
# Order gold prices chronologically and index them by date in one chained step
df_gold = df_gold.sort_values('date').set_index(keys='date')

In [57]:
# Collect the lag column(s) that came with the source frame; they are dropped
# next and rebuilt further down as lag1..lag10.
to_drop = [name for name in df_gold.columns if 'lag' in name]

In [58]:
# Remove the source lag column(s) collected in to_drop
df_gold = df_gold.drop(columns=to_drop)

In [59]:
# add up to 10 day lag for all gold prices and daily changes

# Lag only the base columns so that re-running this cell cannot create
# "<col>_lag1_lag1"-style features from columns added by a previous run.
base_cols = [col for col in df_gold.columns if '_lag' not in col]

# Build every lagged series first and attach them with a single concat.
# Column order matches the original nested loop (lag1..lag10 for the first
# column, then the next column, ...).
# shift(i) moves values down by i rows, so a "day" here is one row of df_gold.
lagged = pd.DataFrame({
    f'{col}_lag{i}': df_gold[col].shift(i)
    for col in base_cols
    for i in range(1, 11)
})
df_gold = pd.concat([df_gold[base_cols], lagged], axis=1)

In [60]:
# Preview the gold frame with its lag columns; the first rows are NaN where no earlier row exists
df_gold.head()

Unnamed: 0_level_0,gold,gold_change%,gold_dif,gold_lag1,gold_lag2,gold_lag3,gold_lag4,gold_lag5,gold_lag6,gold_lag7,...,gold_dif_lag1,gold_dif_lag2,gold_dif_lag3,gold_dif_lag4,gold_dif_lag5,gold_dif_lag6,gold_dif_lag7,gold_dif_lag8,gold_dif_lag9,gold_dif_lag10
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1990-01-03,394.5,,,,,,,,,,...,,,,,,,,,,
1990-01-04,398.8,1.089987,4.3,394.5,,,,,,,...,,,,,,,,,,
1990-01-05,406.1,1.830491,7.3,398.8,394.5,,,,,,...,4.3,,,,,,,,,
1990-01-08,401.85,-1.04654,-4.25,406.1,398.8,394.5,,,,,...,7.3,4.3,,,,,,,,
1990-01-09,405.5,0.908299,3.65,401.85,406.1,398.8,394.5,,,,...,-4.25,7.3,4.3,,,,,,,


In [61]:
# Attach the gold columns to the feature frame, keeping only dates present in both
df_Xy = df_feat.merge(df_gold, how='inner', on=['date'])

In [62]:
# Preview the final modelling frame
df_Xy.head()

Unnamed: 0_level_0,gbp_rate,gbp_rate_dif,gbp_rate_change%,usd_rate,usd_rate_dif,usd_rate_change%,eur_rate,eur_rate_dif,eur_rate_change%,aud_rate_dif,...,gold_dif_lag1,gold_dif_lag2,gold_dif_lag3,gold_dif_lag4,gold_dif_lag5,gold_dif_lag6,gold_dif_lag7,gold_dif_lag8,gold_dif_lag9,gold_dif_lag10
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2008-09-05,5.0,-0.25,0.0,2.0,0.0,0.0,3.25,0.25,0.0,-0.25,...,-1.82,-5.75,-9.17,-13.33,-0.81,4.96,1.8,3.52,-6.65,-6.77
2008-09-08,5.0,-0.25,0.0,2.0,0.0,0.0,3.25,0.25,0.0,-0.25,...,0.82,-1.82,-5.75,-9.17,-13.33,-0.81,4.96,1.8,3.52,-6.65
2008-09-09,5.0,-0.25,0.0,2.0,0.0,0.0,3.25,0.25,0.0,-0.25,...,-0.51,0.82,-1.82,-5.75,-9.17,-13.33,-0.81,4.96,1.8,3.52
2008-09-10,5.0,-0.25,0.0,2.0,0.0,0.0,3.25,0.25,0.0,-0.25,...,-15.94,-0.51,0.82,-1.82,-5.75,-9.17,-13.33,-0.81,4.96,1.8
2008-09-11,5.0,-0.25,0.0,2.0,0.0,0.0,3.25,0.25,0.0,-0.25,...,-23.35,-15.94,-0.51,0.82,-1.82,-5.75,-9.17,-13.33,-0.81,4.96


In [63]:
# Check the row count after the inner join with gold (rows without a gold date are dropped)
df_Xy.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3032 entries, 2008-09-05 to 2020-05-27
Columns: 363 entries, gbp_rate to gold_dif_lag10
dtypes: float64(363)
memory usage: 8.4 MB


In [64]:
# Persist the final modelling frame with IPython's %store
%store df_Xy

Stored 'df_Xy' (DataFrame)


In [65]:
# Also write the final frame to a CSV file in the current working directory
df_Xy.to_csv('dfXy.csv')