In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats

%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

# We are investigating how well central bank policy rates and stock market indices predict the price of gold.

## Consolidate Central Bank Rates

In [2]:
# Start by consolidating central bank rate data

%store -r df_aud
%store -r df_eur
%store -r df_gbp
%store -r df_usd
%store -r df_yen

In [3]:
df_aud['date'].min()

Timestamp('1990-01-23 00:00:00')

In [4]:
df_eur['date'].min()

Timestamp('1999-01-01 00:00:00')

In [5]:
df_gbp['date'].min()

Timestamp('1975-01-20 00:00:00')

In [6]:
df_usd['date'].min()

Timestamp('1990-01-01 00:00:00')

In [7]:
df_yen['date'].min()

Timestamp('2000-01-01 00:00:00')

In [8]:
# Outer-join every central bank rate frame on 'date' so that no observation date is lost.
# df_gbp seeds the merge because its history starts earliest (1975).
df_rates = df_gbp
for rate_frame in (df_usd, df_eur, df_aud, df_yen):
    df_rates = pd.merge(df_rates, rate_frame, how = 'outer', on = ['date'])

In [9]:
df_rates.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11273 entries, 0 to 11272
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   date             11273 non-null  datetime64[ns]
 1   gbp_rate         238 non-null    float64       
 2   gbp_rate_lag1    237 non-null    float64       
 3   gbp_rate_change  237 non-null    float64       
 4   usd_rate         11109 non-null  float64       
 5   usd_rate_lag1    11108 non-null  float64       
 6   usd_rate_change  11108 non-null  float64       
 7   eur_rate         49 non-null     float64       
 8   eur_rate_lag1    48 non-null     float64       
 9   eur_rate_change  48 non-null     float64       
 10  aud_rate_change  340 non-null    object        
 11  AUD_rate_tgt%    340 non-null    float64       
 12  yen_rate         3839 non-null   float64       
 13  yen_rate_lag1    3838 non-null   float64       
 14  yen_rate_change  3682 non-null   float

In [10]:
# Give the AUD target-rate column the same '<currency>_rate' name as the other currencies.
df_rates = df_rates.rename(columns = {'AUD_rate_tgt%': 'aud_rate'})

In [11]:
# Names of the lag columns carried over from the per-currency frames.
to_drop = [name for name in df_rates.columns if 'lag' in name]

In [12]:
# Remove the inherited lag columns; lags are rebuilt uniformly for every column further down.
df_rates = df_rates.drop(columns = to_drop)

In [13]:
df_rates.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11273 entries, 0 to 11272
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   date             11273 non-null  datetime64[ns]
 1   gbp_rate         238 non-null    float64       
 2   gbp_rate_change  237 non-null    float64       
 3   usd_rate         11109 non-null  float64       
 4   usd_rate_change  11108 non-null  float64       
 5   eur_rate         49 non-null     float64       
 6   eur_rate_change  48 non-null     float64       
 7   aud_rate_change  340 non-null    object        
 8   aud_rate         340 non-null    float64       
 9   yen_rate         3839 non-null   float64       
 10  yen_rate_change  3682 non-null   float64       
dtypes: datetime64[ns](1), float64(9), object(1)
memory usage: 1.0+ MB


In [14]:
# Order rows chronologically; the row-based lags created later assume this order.
df_rates = df_rates.sort_values('date')

In [15]:
# The latest start date across the rate series is 2000-01-01 (df_yen), so drop all earlier rows
df_rates = df_rates[df_rates['date']>'1999-12-31']

In [16]:
# Index by date so the frame can later be joined to the other frames on 'date'.
df_rates = df_rates.set_index(keys = 'date')

In [17]:
df_rates.head()

Unnamed: 0_level_0,gbp_rate,gbp_rate_change,usd_rate,usd_rate_change,eur_rate,eur_rate_change,aud_rate_change,aud_rate,yen_rate,yen_rate_change
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2000-01-01,,,5.5,0.0,,,,,,
2000-01-02,,,5.5,0.0,,,,,,
2000-01-03,,,5.5,0.0,,,,,,
2000-01-04,,,5.5,0.0,,,,,,
2000-01-05,,,5.5,0.0,,,,,,


In [18]:
# Add lags of 1 to 10 rows for every rate column, named '<column>_lag<i>'.
# All lag columns are built first and attached with a single concat: inserting
# 100 columns one at a time fragments the frame and makes pandas emit a
# PerformanceWarning.
# NOTE(review): df_rates appears to hold one row per calendar day here (weekends
# included), so these are calendar-day lags, whereas the stock and gold lags are
# shifts over trading-day rows. Confirm that this mix is intended.
rate_lags = [
    df_rates[col].shift(lag).rename(f'{col}_lag{lag}')
    for col in df_rates.columns
    for lag in range(1, 11)
]
df_rates = pd.concat([df_rates, *rate_lags], axis = 1)

In [19]:
df_rates.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 7457 entries, 2000-01-01 to 2020-05-31
Columns: 110 entries, gbp_rate to yen_rate_change_lag10
dtypes: float64(99), object(11)
memory usage: 6.3+ MB


In [20]:
# aud_rate_change and its lags are object dtype at this point; cast the whole
# frame to float in one vectorised call instead of re-assigning column by column.
df_rates = df_rates.astype('float')

In [21]:
df_rates.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 7457 entries, 2000-01-01 to 2020-05-31
Columns: 110 entries, gbp_rate to yen_rate_change_lag10
dtypes: float64(110)
memory usage: 6.3 MB


In [22]:
df_rates.head()

Unnamed: 0_level_0,gbp_rate,gbp_rate_change,usd_rate,usd_rate_change,eur_rate,eur_rate_change,aud_rate_change,aud_rate,yen_rate,yen_rate_change,...,yen_rate_change_lag1,yen_rate_change_lag2,yen_rate_change_lag3,yen_rate_change_lag4,yen_rate_change_lag5,yen_rate_change_lag6,yen_rate_change_lag7,yen_rate_change_lag8,yen_rate_change_lag9,yen_rate_change_lag10
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-01-01,,,5.5,0.0,,,,,,,...,,,,,,,,,,
2000-01-02,,,5.5,0.0,,,,,,,...,,,,,,,,,,
2000-01-03,,,5.5,0.0,,,,,,,...,,,,,,,,,,
2000-01-04,,,5.5,0.0,,,,,,,...,,,,,,,,,,
2000-01-05,,,5.5,0.0,,,,,,,...,,,,,,,,,,


In [23]:
%store df_rates

Stored 'df_rates' (DataFrame)


## Consolidate stock indices

In [24]:
%store -r df_hsi
%store -r df_sse
%store -r df_jeg
%store -r df_nasdaq
%store -r df_nyse

In [25]:
df_hsi['date'].min()

Timestamp('2000-01-03 00:00:00')

In [26]:
df_sse['date'].min()

Timestamp('1997-07-02 00:00:00')

In [27]:
df_jeg['date'].min()

Timestamp('2008-09-05 00:00:00')

In [28]:
df_nasdaq['date'].min()

Timestamp('2000-01-03 00:00:00')

In [29]:
df_nyse['date'].min()

Timestamp('2000-01-03 00:00:00')

In [30]:
# Outer-join every stock index frame on 'date' so that no trading date is lost.
# df_sse, the frame with the most observations, seeds the merge as the left object.
df_stock = df_sse
for index_frame in (df_hsi, df_nyse, df_nasdaq, df_jeg):
    df_stock = pd.merge(df_stock, index_frame, how = 'outer', on = ['date'])

In [31]:
df_stock.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5964 entries, 0 to 5963
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   date               5964 non-null   datetime64[ns]
 1   sse_close          5714 non-null   float64       
 2   sse_close_lag1     5713 non-null   float64       
 3   sse_change%        5711 non-null   float64       
 4   hsi_close          5023 non-null   float64       
 5   hsi_close_lag1     5022 non-null   float64       
 6   hsi_change%        4956 non-null   float64       
 7   nyse_close         5131 non-null   float64       
 8   nyse_close_lag1    5130 non-null   float64       
 9   nyse_change%       5130 non-null   float64       
 10  nasdaq_close       5131 non-null   float64       
 11  nasdaq_close_lag1  5130 non-null   float64       
 12  nasdaq_change%     5130 non-null   float64       
 13  jeg_close          2869 non-null   float64       
 14  jeg_clos

In [32]:
# Names of the lag columns carried over from the per-index frames.
to_drop = [name for name in df_stock.columns if 'lag' in name]

In [33]:
# Remove the inherited lag columns; lags are rebuilt uniformly for every column further down.
df_stock = df_stock.drop(columns = to_drop)

In [34]:
# Order rows chronologically; the row-based lags created later assume this order.
df_stock = df_stock.sort_values('date')

In [35]:
# The latest start date across the stock series is 2008-09-05 (df_jeg), so drop all earlier rows
df_stock = df_stock[df_stock['date']>'2008-09-04']

In [36]:
# Index by date so the frame can later be joined to the other frames on 'date'.
df_stock = df_stock.set_index(keys = 'date')

In [37]:
df_stock.head()

Unnamed: 0_level_0,sse_close,sse_change%,hsi_close,hsi_change%,nyse_close,nyse_change%,nasdaq_close,nasdaq_change%,jeg_close,jeg_change%
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2008-09-05,2202.446045,3.403709,19933.279297,2.288641,8033.759766,-0.317532,2255.879883,0.140085,335.0,
2008-09-08,2143.420898,2.753782,20794.269531,-4.140517,8168.620117,-1.650956,2269.76001,-0.611524,343.0,-2.332362
2008-09-09,2145.779053,-0.109897,20491.109375,1.479472,7871.149902,3.779247,2209.810059,2.712901,337.86499,1.519841
2008-09-10,2150.759033,-0.231545,19999.779297,2.456677,7957.259766,-1.082155,2228.699951,-0.847574,319.382996,5.786781
2008-09-11,2078.980957,3.452561,19388.720703,3.151619,8011.25,-0.67393,2258.219971,-1.307225,290.799988,9.829095


In [38]:
df_stock.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3051 entries, 2008-09-05 to 2020-05-27
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sse_close       2845 non-null   float64
 1   sse_change%     2844 non-null   float64
 2   hsi_close       2881 non-null   float64
 3   hsi_change%     2868 non-null   float64
 4   nyse_close      2950 non-null   float64
 5   nyse_change%    2950 non-null   float64
 6   nasdaq_close    2950 non-null   float64
 7   nasdaq_change%  2950 non-null   float64
 8   jeg_close       2869 non-null   float64
 9   jeg_change%     2856 non-null   float64
dtypes: float64(10)
memory usage: 262.2 KB


In [39]:
# Add lags of 1 to 10 rows for every stock index close and change column,
# named '<column>_lag<i>'.
# All lag columns are built first and attached with a single concat: inserting
# 100 columns one at a time fragments the frame and makes pandas emit a
# PerformanceWarning.
stock_lags = [
    df_stock[col].shift(lag).rename(f'{col}_lag{lag}')
    for col in df_stock.columns
    for lag in range(1, 11)
]
df_stock = pd.concat([df_stock, *stock_lags], axis = 1)

In [40]:
df_stock.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3051 entries, 2008-09-05 to 2020-05-27
Columns: 110 entries, sse_close to jeg_change%_lag10
dtypes: float64(110)
memory usage: 2.6 MB


In [41]:
df_stock.head()

Unnamed: 0_level_0,sse_close,sse_change%,hsi_close,hsi_change%,nyse_close,nyse_change%,nasdaq_close,nasdaq_change%,jeg_close,jeg_change%,...,jeg_change%_lag1,jeg_change%_lag2,jeg_change%_lag3,jeg_change%_lag4,jeg_change%_lag5,jeg_change%_lag6,jeg_change%_lag7,jeg_change%_lag8,jeg_change%_lag9,jeg_change%_lag10
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2008-09-05,2202.446045,3.403709,19933.279297,2.288641,8033.759766,-0.317532,2255.879883,0.140085,335.0,,...,,,,,,,,,,
2008-09-08,2143.420898,2.753782,20794.269531,-4.140517,8168.620117,-1.650956,2269.76001,-0.611524,343.0,-2.332362,...,,,,,,,,,,
2008-09-09,2145.779053,-0.109897,20491.109375,1.479472,7871.149902,3.779247,2209.810059,2.712901,337.86499,1.519841,...,-2.332362,,,,,,,,,
2008-09-10,2150.759033,-0.231545,19999.779297,2.456677,7957.259766,-1.082155,2228.699951,-0.847574,319.382996,5.786781,...,1.519841,-2.332362,,,,,,,,
2008-09-11,2078.980957,3.452561,19388.720703,3.151619,8011.25,-0.67393,2258.219971,-1.307225,290.799988,9.829095,...,5.786781,1.519841,-2.332362,,,,,,,


In [42]:
%store df_stock

Stored 'df_stock' (DataFrame)


In [43]:
# 'date' is the index name of both frames; the inner join keeps only dates present in both.
df_feat = df_rates.merge(df_stock, how = 'inner', on = ['date'])

In [44]:
df_feat.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3051 entries, 2008-09-05 to 2020-05-27
Columns: 220 entries, gbp_rate to jeg_change%_lag10
dtypes: float64(220)
memory usage: 5.1 MB


In [45]:
%store df_feat

Stored 'df_feat' (DataFrame)


## Merge in gold data and create final Xy dataframe for model

In [46]:
%store -r df_gold

In [47]:
df_gold.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7845 entries, 3 to 8417
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   date          7845 non-null   datetime64[ns]
 1   gold          7845 non-null   float64       
 2   gold_lag1     7844 non-null   float64       
 3   gold_change%  7844 non-null   float64       
dtypes: datetime64[ns](1), float64(3)
memory usage: 306.4 KB


In [48]:
# Chronological order first, then index by date so gold can be joined to the feature frame.
df_gold = df_gold.sort_values('date').set_index('date')

In [49]:
# Names of the lag columns that came with the stored gold frame.
to_drop = [name for name in df_gold.columns if 'lag' in name]

In [50]:
# Remove the inherited lag column; lags are rebuilt uniformly in the next step.
df_gold = df_gold.drop(columns = to_drop)

In [51]:
# Add lags of 1 to 10 rows for the gold price and its daily change,
# named '<column>_lag<i>'.
# The lag columns are built first and attached with a single concat rather than
# inserted one at a time, matching how the rate and stock lags should be built.
gold_lags = [
    df_gold[col].shift(lag).rename(f'{col}_lag{lag}')
    for col in df_gold.columns
    for lag in range(1, 11)
]
df_gold = pd.concat([df_gold, *gold_lags], axis = 1)

In [52]:
df_gold.head()

Unnamed: 0_level_0,gold,gold_change%,gold_lag1,gold_lag2,gold_lag3,gold_lag4,gold_lag5,gold_lag6,gold_lag7,gold_lag8,...,gold_change%_lag1,gold_change%_lag2,gold_change%_lag3,gold_change%_lag4,gold_change%_lag5,gold_change%_lag6,gold_change%_lag7,gold_change%_lag8,gold_change%_lag9,gold_change%_lag10
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1990-01-03,394.5,1.089987,,,,,,,,,...,,,,,,,,,,
1990-01-04,398.8,1.830491,394.5,,,,,,,,...,1.089987,,,,,,,,,
1990-01-05,406.1,-1.04654,398.8,394.5,,,,,,,...,1.830491,1.089987,,,,,,,,
1990-01-08,401.85,0.908299,406.1,398.8,394.5,,,,,,...,-1.04654,1.830491,1.089987,,,,,,,
1990-01-09,405.5,1.405672,401.85,406.1,398.8,394.5,,,,,...,0.908299,-1.04654,1.830491,1.089987,,,,,,


In [53]:
df_gold.head()

Unnamed: 0_level_0,gold,gold_change%,gold_lag1,gold_lag2,gold_lag3,gold_lag4,gold_lag5,gold_lag6,gold_lag7,gold_lag8,...,gold_change%_lag1,gold_change%_lag2,gold_change%_lag3,gold_change%_lag4,gold_change%_lag5,gold_change%_lag6,gold_change%_lag7,gold_change%_lag8,gold_change%_lag9,gold_change%_lag10
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1990-01-03,394.5,1.089987,,,,,,,,,...,,,,,,,,,,
1990-01-04,398.8,1.830491,394.5,,,,,,,,...,1.089987,,,,,,,,,
1990-01-05,406.1,-1.04654,398.8,394.5,,,,,,,...,1.830491,1.089987,,,,,,,,
1990-01-08,401.85,0.908299,406.1,398.8,394.5,,,,,,...,-1.04654,1.830491,1.089987,,,,,,,
1990-01-09,405.5,1.405672,401.85,406.1,398.8,394.5,,,,,...,0.908299,-1.04654,1.830491,1.089987,,,,,,


In [54]:
# Join the gold target and its lags onto the features; 'date' is the index name of
# both frames, and the inner join keeps only dates present in both.
df_Xy = df_feat.merge(df_gold, how = 'inner', on = ['date'])

In [55]:
df_Xy.head()

Unnamed: 0_level_0,gbp_rate,gbp_rate_change,usd_rate,usd_rate_change,eur_rate,eur_rate_change,aud_rate_change,aud_rate,yen_rate,yen_rate_change,...,gold_change%_lag1,gold_change%_lag2,gold_change%_lag3,gold_change%_lag4,gold_change%_lag5,gold_change%_lag6,gold_change%_lag7,gold_change%_lag8,gold_change%_lag9,gold_change%_lag10
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2008-09-05,,,2.0,0.0,,,,,0.5,0.0,...,0.102477,-0.226933,-0.711854,-1.122509,-1.60554,-0.097466,0.600412,0.218367,0.42886,-0.803693
2008-09-08,,,2.0,0.0,,,,,0.5,0.0,...,-0.06367,0.102477,-0.226933,-0.711854,-1.122509,-1.60554,-0.097466,0.600412,0.218367,0.42886
2008-09-09,,,2.0,0.0,,,,,0.5,0.0,...,-1.99128,-0.06367,0.102477,-0.226933,-0.711854,-1.122509,-1.60554,-0.097466,0.600412,0.218367
2008-09-10,,,2.0,0.0,,,,,0.5,0.0,...,-2.976228,-1.99128,-0.06367,0.102477,-0.226933,-0.711854,-1.122509,-1.60554,-0.097466,0.600412
2008-09-11,,,2.0,0.0,,,,,0.5,0.0,...,-2.146611,-2.976228,-1.99128,-0.06367,0.102477,-0.226933,-0.711854,-1.122509,-1.60554,-0.097466


In [57]:
df_Xy.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3032 entries, 2008-09-05 to 2020-05-27
Columns: 242 entries, gbp_rate to gold_change%_lag10
dtypes: float64(242)
memory usage: 5.6 MB


In [56]:
%store df_Xy

Stored 'df_Xy' (DataFrame)
