In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats

%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

# We are investigating how well central bank policy rates and stock market indices predict the price of gold.

## Consolidate Central Bank Rates

In [2]:
# Start by consolidating central bank rate data

%store -r df_aud
%store -r df_eur
%store -r df_gbp
%store -r df_usd
%store -r df_yen

In [3]:
df_aud['date'].min()

Timestamp('1990-01-23 00:00:00')

In [4]:
df_eur['date'].min()

Timestamp('1999-01-01 00:00:00')

In [5]:
df_gbp['date'].min()

Timestamp('1975-01-20 00:00:00')

In [6]:
df_usd['date'].min()

Timestamp('1990-01-01 00:00:00')

In [7]:
df_yen['date'].min()

Timestamp('2000-01-01 00:00:00')

In [8]:
# Outer-join all central bank rate tables on 'date'. df_gbp (the table whose
# history starts earliest) is the left-most frame; the others are folded in
# one after another so that every date seen in any table is kept.
df_rates = (
    df_gbp
    .merge(df_usd, how = 'outer', on = 'date')
    .merge(df_eur, how = 'outer', on = 'date')
    .merge(df_aud, how = 'outer', on = 'date')
    .merge(df_yen, how = 'outer', on = 'date')
)

In [9]:
df_rates.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11273 entries, 0 to 11272
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   date             11273 non-null  datetime64[ns]
 1   gbp_rate         238 non-null    float64       
 2   gbp_rate_lag1    237 non-null    float64       
 3   gbp_rate_change  237 non-null    float64       
 4   usd_rate         11109 non-null  float64       
 5   usd_rate_lag1    11108 non-null  float64       
 6   usd_rate_change  11108 non-null  float64       
 7   eur_rate         49 non-null     float64       
 8   eur_rate_lag1    48 non-null     float64       
 9   eur_rate_change  48 non-null     float64       
 10  aud_rate_change  340 non-null    object        
 11  AUD_rate_tgt%    340 non-null    float64       
 12  yen_rate         3839 non-null   float64       
 13  yen_rate_lag1    3838 non-null   float64       
 14  yen_rate_change  3682 non-null   float

In [10]:
# Give the AUD target-rate column the same '<currency>_rate' name as the other currencies.
df_rates = df_rates.rename(columns = {'AUD_rate_tgt%': 'aud_rate'})

In [11]:
# Names of every column that contains 'lag' (the pre-computed lag columns
# carried in from the individual rate tables).
to_drop = [name for name in df_rates.columns if 'lag' in name]

In [12]:
# Remove the collected lag columns from the merged rates table.
df_rates = df_rates.drop(columns = to_drop)

In [13]:
df_rates.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11273 entries, 0 to 11272
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   date             11273 non-null  datetime64[ns]
 1   gbp_rate         238 non-null    float64       
 2   gbp_rate_change  237 non-null    float64       
 3   usd_rate         11109 non-null  float64       
 4   usd_rate_change  11108 non-null  float64       
 5   eur_rate         49 non-null     float64       
 6   eur_rate_change  48 non-null     float64       
 7   aud_rate_change  340 non-null    object        
 8   aud_rate         340 non-null    float64       
 9   yen_rate         3839 non-null   float64       
 10  yen_rate_change  3682 non-null   float64       
dtypes: datetime64[ns](1), float64(9), object(1)
memory usage: 1.0+ MB


In [14]:
# Put the rows in chronological order (the outer joins do not guarantee it).
df_rates = df_rates.sort_values('date')

In [15]:
# Use the observation date as the row index.
df_rates = df_rates.set_index(keys = 'date')

In [16]:
df_rates.head()

Unnamed: 0_level_0,gbp_rate,gbp_rate_change,usd_rate,usd_rate_change,eur_rate,eur_rate_change,aud_rate_change,aud_rate,yen_rate,yen_rate_change
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1975-01-20,11.25,,,,,,,,,
1975-01-27,11.0,-0.25,,,,,,,,
1975-02-10,10.75,-0.25,,,,,,,,
1975-02-17,10.5,-0.25,,,,,,,,
1975-03-10,10.25,-0.25,,,,,,,,


In [17]:
# add up to 10 day lag for all rates
# NOTE: shift(i) moves values down by i rows, so one "day" here is one row of
# the merged, date-sorted table rather than one calendar day.

# Only lag the base columns, so re-running this cell cannot create lags of lags.
base_cols = [col for col in df_rates.columns if 'lag' not in col]

# Build every lagged Series first and attach them with a single concat:
# inserting 100 columns one at a time fragments the frame and triggers a
# PerformanceWarning. Column order (base columns, then lag1..lag10 per base
# column) is the same as with column-by-column insertion.
lagged = [
    df_rates[col].shift(i).rename(f'{col}_lag{i}')
    for col in base_cols
    for i in range(1, 11)
]
df_rates = pd.concat([df_rates[base_cols], *lagged], axis = 1)

In [18]:
df_rates.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 11273 entries, 1975-01-20 to 2020-05-31
Columns: 110 entries, gbp_rate to yen_rate_change_lag10
dtypes: float64(99), object(11)
memory usage: 9.5+ MB


In [19]:
# Cast the whole frame to float in one call; the AUD change column and its
# lags are still object dtype at this point.
df_rates = df_rates.astype('float')

In [20]:
df_rates.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 11273 entries, 1975-01-20 to 2020-05-31
Columns: 110 entries, gbp_rate to yen_rate_change_lag10
dtypes: float64(110)
memory usage: 9.5 MB


In [21]:
df_rates.head()

Unnamed: 0_level_0,gbp_rate,gbp_rate_change,usd_rate,usd_rate_change,eur_rate,eur_rate_change,aud_rate_change,aud_rate,yen_rate,yen_rate_change,...,yen_rate_change_lag1,yen_rate_change_lag2,yen_rate_change_lag3,yen_rate_change_lag4,yen_rate_change_lag5,yen_rate_change_lag6,yen_rate_change_lag7,yen_rate_change_lag8,yen_rate_change_lag9,yen_rate_change_lag10
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1975-01-20,11.25,,,,,,,,,,...,,,,,,,,,,
1975-01-27,11.0,-0.25,,,,,,,,,...,,,,,,,,,,
1975-02-10,10.75,-0.25,,,,,,,,,...,,,,,,,,,,
1975-02-17,10.5,-0.25,,,,,,,,,...,,,,,,,,,,
1975-03-10,10.25,-0.25,,,,,,,,,...,,,,,,,,,,


In [22]:
# address NaN values

# For each non-'change' column: row position of its first non-NaN observation.
d_non_NaN = dict()
for col in df_rates.columns:

    # we can simply fill NaNs in the 'change' columns with zeros, to represent no change
    if 'change' in col:
        # Assign the result back to the frame: calling fillna(inplace=True) on
        # df[col] only modifies a temporary Series under pandas Copy-on-Write.
        df_rates[col] = df_rates[col].fillna(0)
        continue

    # otherwise, carry the most recent non-NaN value forward
    observed = df_rates[col].notna().to_numpy()
    if observed.any():
        # argmax on a boolean array gives the position of the first True
        d_non_NaN[col] = int(observed.argmax())

    # ffill leaves the NaNs that precede the first observation untouched, and
    # is a no-op for a column with no observations at all.
    df_rates[col] = df_rates[col].ffill()
            
        
        
        

In [23]:
print(d_non_NaN)

{'gbp_rate': 0, 'usd_rate': 164, 'eur_rate': 3451, 'aud_rate': 186, 'yen_rate': 4039, 'gbp_rate_lag1': 1, 'gbp_rate_lag2': 2, 'gbp_rate_lag3': 3, 'gbp_rate_lag4': 4, 'gbp_rate_lag5': 5, 'gbp_rate_lag6': 6, 'gbp_rate_lag7': 7, 'gbp_rate_lag8': 8, 'gbp_rate_lag9': 9, 'gbp_rate_lag10': 10, 'usd_rate_lag1': 165, 'usd_rate_lag2': 166, 'usd_rate_lag3': 167, 'usd_rate_lag4': 168, 'usd_rate_lag5': 169, 'usd_rate_lag6': 170, 'usd_rate_lag7': 171, 'usd_rate_lag8': 172, 'usd_rate_lag9': 173, 'usd_rate_lag10': 174, 'eur_rate_lag1': 3452, 'eur_rate_lag2': 3453, 'eur_rate_lag3': 3454, 'eur_rate_lag4': 3455, 'eur_rate_lag5': 3456, 'eur_rate_lag6': 3457, 'eur_rate_lag7': 3458, 'eur_rate_lag8': 3459, 'eur_rate_lag9': 3460, 'eur_rate_lag10': 3461, 'aud_rate_lag1': 187, 'aud_rate_lag2': 188, 'aud_rate_lag3': 189, 'aud_rate_lag4': 190, 'aud_rate_lag5': 191, 'aud_rate_lag6': 192, 'aud_rate_lag7': 193, 'aud_rate_lag8': 194, 'aud_rate_lag9': 195, 'aud_rate_lag10': 196, 'yen_rate_lag1': 4040, 'yen_rate_lag2':

In [24]:
df_rates['gbp_rate'][33:36]

date
1977-03-31    9.50
1977-04-12    9.25
1977-04-18    9.00
Name: gbp_rate, dtype: float64

In [25]:
df_rates.head()

Unnamed: 0_level_0,gbp_rate,gbp_rate_change,usd_rate,usd_rate_change,eur_rate,eur_rate_change,aud_rate_change,aud_rate,yen_rate,yen_rate_change,...,yen_rate_change_lag1,yen_rate_change_lag2,yen_rate_change_lag3,yen_rate_change_lag4,yen_rate_change_lag5,yen_rate_change_lag6,yen_rate_change_lag7,yen_rate_change_lag8,yen_rate_change_lag9,yen_rate_change_lag10
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1975-01-20,11.25,0.0,,0.0,,0.0,0.0,,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1975-01-27,11.0,-0.25,,0.0,,0.0,0.0,,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1975-02-10,10.75,-0.25,,0.0,,0.0,0.0,,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1975-02-17,10.5,-0.25,,0.0,,0.0,0.0,,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1975-03-10,10.25,-0.25,,0.0,,0.0,0.0,,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
%store df_rates

Stored 'df_rates' (DataFrame)


## Consolidate stock indices

In [27]:
%store -r df_hsi
%store -r df_sse
%store -r df_jeg
%store -r df_nasdaq
%store -r df_nyse

In [28]:
df_hsi['date'].min()

Timestamp('2000-01-03 00:00:00')

In [29]:
df_sse['date'].min()

Timestamp('1997-07-02 00:00:00')

In [30]:
df_jeg['date'].min()

Timestamp('2008-09-05 00:00:00')

In [31]:
df_nasdaq['date'].min()

Timestamp('2000-01-03 00:00:00')

In [32]:
df_nyse['date'].min()

Timestamp('2000-01-03 00:00:00')

In [33]:
# Outer-join all stock index tables on 'date'. df_sse (the table whose history
# starts earliest) is the left-most frame; the others are folded in one after
# another so that every date seen in any table is kept.
df_stock = (
    df_sse
    .merge(df_hsi, how = 'outer', on = 'date')
    .merge(df_nyse, how = 'outer', on = 'date')
    .merge(df_nasdaq, how = 'outer', on = 'date')
    .merge(df_jeg, how = 'outer', on = 'date')
)

In [34]:
df_stock.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5964 entries, 0 to 5963
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   date               5964 non-null   datetime64[ns]
 1   sse_close          5714 non-null   float64       
 2   sse_close_lag1     5713 non-null   float64       
 3   sse_change%        5711 non-null   float64       
 4   sse_change         5711 non-null   float64       
 5   hsi_close          5023 non-null   float64       
 6   hsi_close_lag1     5022 non-null   float64       
 7   hsi_change%        4956 non-null   float64       
 8   hsi_change         4956 non-null   float64       
 9   nyse_close         5131 non-null   float64       
 10  nyse_close_lag1    5130 non-null   float64       
 11  nyse_change%       5130 non-null   float64       
 12  nyse_change        5130 non-null   float64       
 13  nasdaq_close       5131 non-null   float64       
 14  nasdaq_c

In [35]:
# Names of every column that contains 'lag' (the pre-computed lag columns
# carried in from the individual index tables).
to_drop = [name for name in df_stock.columns if 'lag' in name]

In [36]:
# Remove the collected lag columns from the merged stock table.
df_stock = df_stock.drop(columns = to_drop)

In [37]:
# Put the rows in chronological order (the outer joins do not guarantee it).
df_stock = df_stock.sort_values('date')

In [38]:
# Use the trading date as the row index.
df_stock = df_stock.set_index(keys = 'date')

In [39]:
df_stock.head()

Unnamed: 0_level_0,sse_close,sse_change%,sse_change,hsi_close,hsi_change%,hsi_change,nyse_close,nyse_change%,nyse_change,nasdaq_close,nasdaq_change%,nasdaq_change,jeg_close,jeg_change%,jeg_change
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1997-07-02,1199.061035,,,,,,,,,,,,,,
1997-07-03,1150.623047,-4.03966,-48.437988,,,,,,,,,,,,
1997-07-04,1159.342041,0.757763,8.718994,,,,,,,,,,,,
1997-07-07,1096.81897,-5.392979,-62.523071,,,,,,,,,,,,
1997-07-08,1109.666016,1.1713,12.847046,,,,,,,,,,,,


In [40]:
df_stock.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5964 entries, 1997-07-02 to 2020-05-27
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sse_close       5714 non-null   float64
 1   sse_change%     5711 non-null   float64
 2   sse_change      5711 non-null   float64
 3   hsi_close       5023 non-null   float64
 4   hsi_change%     4956 non-null   float64
 5   hsi_change      4956 non-null   float64
 6   nyse_close      5131 non-null   float64
 7   nyse_change%    5130 non-null   float64
 8   nyse_change     5130 non-null   float64
 9   nasdaq_close    5131 non-null   float64
 10  nasdaq_change%  5130 non-null   float64
 11  nasdaq_change   5130 non-null   float64
 12  jeg_close       2869 non-null   float64
 13  jeg_change%     2856 non-null   float64
 14  jeg_change      2856 non-null   float64
dtypes: float64(15)
memory usage: 745.5 KB


In [41]:
# add up to 10 day lag for all stock indices and changes
# NOTE: shift(i) moves values down by i rows, so one "day" here is one row of
# the merged, date-sorted table rather than one calendar day.

# Only lag the base columns, so re-running this cell cannot create lags of lags.
base_cols = [col for col in df_stock.columns if 'lag' not in col]

# Build every lagged Series first and attach them with a single concat:
# inserting 150 columns one at a time fragments the frame and triggers a
# PerformanceWarning. Column order (base columns, then lag1..lag10 per base
# column) is the same as with column-by-column insertion.
lagged = [
    df_stock[col].shift(i).rename(f'{col}_lag{i}')
    for col in base_cols
    for i in range(1, 11)
]
df_stock = pd.concat([df_stock[base_cols], *lagged], axis = 1)

In [42]:
df_stock.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5964 entries, 1997-07-02 to 2020-05-27
Columns: 165 entries, sse_close to jeg_change_lag10
dtypes: float64(165)
memory usage: 7.6 MB


In [43]:
# address NaN values

# For each non-'change' column: row position of its first non-NaN observation.
d_non_NaN = dict()
for col in df_stock.columns:

    # we can simply fill NaNs in the 'change' columns with zeros, to represent no change
    if 'change' in col:
        # Assign the result back to the frame: calling fillna(inplace=True) on
        # df[col] only modifies a temporary Series under pandas Copy-on-Write.
        df_stock[col] = df_stock[col].fillna(0)
        continue

    # otherwise, carry the most recent non-NaN value forward
    observed = df_stock[col].notna().to_numpy()
    if observed.any():
        # argmax on a boolean array gives the position of the first True
        d_non_NaN[col] = int(observed.argmax())

    # ffill leaves the NaNs that precede the first observation untouched, and
    # is a no-op for a column with no observations at all.
    df_stock[col] = df_stock[col].ffill()
        

In [44]:
print(d_non_NaN)

{'sse_close': 0, 'hsi_close': 653, 'nyse_close': 653, 'nasdaq_close': 653, 'jeg_close': 2913, 'sse_close_lag1': 1, 'sse_close_lag2': 2, 'sse_close_lag3': 3, 'sse_close_lag4': 4, 'sse_close_lag5': 5, 'sse_close_lag6': 6, 'sse_close_lag7': 7, 'sse_close_lag8': 8, 'sse_close_lag9': 9, 'sse_close_lag10': 10, 'hsi_close_lag1': 654, 'hsi_close_lag2': 655, 'hsi_close_lag3': 656, 'hsi_close_lag4': 657, 'hsi_close_lag5': 658, 'hsi_close_lag6': 659, 'hsi_close_lag7': 660, 'hsi_close_lag8': 661, 'hsi_close_lag9': 662, 'hsi_close_lag10': 663, 'nyse_close_lag1': 654, 'nyse_close_lag2': 655, 'nyse_close_lag3': 656, 'nyse_close_lag4': 657, 'nyse_close_lag5': 658, 'nyse_close_lag6': 659, 'nyse_close_lag7': 660, 'nyse_close_lag8': 661, 'nyse_close_lag9': 662, 'nyse_close_lag10': 663, 'nasdaq_close_lag1': 654, 'nasdaq_close_lag2': 655, 'nasdaq_close_lag3': 656, 'nasdaq_close_lag4': 657, 'nasdaq_close_lag5': 658, 'nasdaq_close_lag6': 659, 'nasdaq_close_lag7': 660, 'nasdaq_close_lag8': 661, 'nasdaq_close_

In [45]:
df_stock['jeg_close_lag10'][9:14]

date
1997-07-15   NaN
1997-07-16   NaN
1997-07-17   NaN
1997-07-18   NaN
1997-07-21   NaN
Name: jeg_close_lag10, dtype: float64

In [46]:
df_stock.head()

Unnamed: 0_level_0,sse_close,sse_change%,sse_change,hsi_close,hsi_change%,hsi_change,nyse_close,nyse_change%,nyse_change,nasdaq_close,...,jeg_change_lag1,jeg_change_lag2,jeg_change_lag3,jeg_change_lag4,jeg_change_lag5,jeg_change_lag6,jeg_change_lag7,jeg_change_lag8,jeg_change_lag9,jeg_change_lag10
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1997-07-02,1199.061035,0.0,0.0,,0.0,0.0,,0.0,0.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1997-07-03,1150.623047,-4.03966,-48.437988,,0.0,0.0,,0.0,0.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1997-07-04,1159.342041,0.757763,8.718994,,0.0,0.0,,0.0,0.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1997-07-07,1096.81897,-5.392979,-62.523071,,0.0,0.0,,0.0,0.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1997-07-08,1109.666016,1.1713,12.847046,,0.0,0.0,,0.0,0.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
%store df_stock

Stored 'df_stock' (DataFrame)


In [48]:
# Combine rate and stock features, keeping only the dates present in both tables.
df_feat = df_rates.merge(df_stock, how = 'inner', on = 'date')

In [49]:
df_feat.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5964 entries, 1997-07-02 to 2020-05-27
Columns: 275 entries, gbp_rate to jeg_change_lag10
dtypes: float64(275)
memory usage: 12.6 MB


In [50]:
df_feat.head()

Unnamed: 0_level_0,gbp_rate,gbp_rate_change,usd_rate,usd_rate_change,eur_rate,eur_rate_change,aud_rate_change,aud_rate,yen_rate,yen_rate_change,...,jeg_change_lag1,jeg_change_lag2,jeg_change_lag3,jeg_change_lag4,jeg_change_lag5,jeg_change_lag6,jeg_change_lag7,jeg_change_lag8,jeg_change_lag9,jeg_change_lag10
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1997-07-02,6.5,0.0,5.5,0.0,,0.0,0.0,5.5,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1997-07-03,6.5,0.0,5.5,0.0,,0.0,0.0,5.5,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1997-07-04,6.5,0.0,5.5,0.0,,0.0,0.0,5.5,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1997-07-07,6.5,0.0,5.5,0.0,,0.0,0.0,5.5,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1997-07-08,6.5,0.0,5.5,0.0,,0.0,0.0,5.5,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [51]:
%store df_feat

Stored 'df_feat' (DataFrame)


## Merge in gold data and create final Xy dataframe for model

In [52]:
%store -r df_gold

In [53]:
df_gold.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7845 entries, 261 to 8311
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   date          7845 non-null   datetime64[ns]
 1   gold          7845 non-null   float64       
 2   gold_lag1     7844 non-null   float64       
 3   gold_change%  7844 non-null   float64       
 4   gold_change   7844 non-null   float64       
dtypes: datetime64[ns](1), float64(4)
memory usage: 367.7 KB


In [54]:
# Order the gold prices chronologically, then index the rows by date.
df_gold = df_gold.sort_values('date').set_index(keys = 'date')

In [55]:
# Names of every gold column that contains 'lag' (the pre-computed gold_lag1).
to_drop = [name for name in df_gold.columns if 'lag' in name]

In [56]:
# Remove the collected lag columns from the gold table.
df_gold = df_gold.drop(columns = to_drop)

In [57]:
# add up to 10 day lag for all gold prices and daily changes
# NOTE: shift(i) moves values down by i rows, so one "day" here is one row of
# the date-sorted gold table rather than one calendar day.

# Only lag the base columns, so re-running this cell cannot create lags of lags.
base_cols = [col for col in df_gold.columns if 'lag' not in col]

# Build every lagged Series first and attach them with a single concat. Column
# order (base columns, then lag1..lag10 per base column) is the same as with
# column-by-column insertion.
lagged = [
    df_gold[col].shift(i).rename(f'{col}_lag{i}')
    for col in base_cols
    for i in range(1, 11)
]
df_gold = pd.concat([df_gold[base_cols], *lagged], axis = 1)

In [58]:
df_gold.head()

Unnamed: 0_level_0,gold,gold_change%,gold_change,gold_lag1,gold_lag2,gold_lag3,gold_lag4,gold_lag5,gold_lag6,gold_lag7,...,gold_change_lag1,gold_change_lag2,gold_change_lag3,gold_change_lag4,gold_change_lag5,gold_change_lag6,gold_change_lag7,gold_change_lag8,gold_change_lag9,gold_change_lag10
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1990-01-03,394.5,,,,,,,,,,...,,,,,,,,,,
1990-01-04,398.8,1.089987,4.3,394.5,,,,,,,...,,,,,,,,,,
1990-01-05,406.1,1.830491,7.3,398.8,394.5,,,,,,...,4.3,,,,,,,,,
1990-01-08,401.85,-1.04654,-4.25,406.1,398.8,394.5,,,,,...,7.3,4.3,,,,,,,,
1990-01-09,405.5,0.908299,3.65,401.85,406.1,398.8,394.5,,,,...,-4.25,7.3,4.3,,,,,,,


In [59]:
df_gold.head()

Unnamed: 0_level_0,gold,gold_change%,gold_change,gold_lag1,gold_lag2,gold_lag3,gold_lag4,gold_lag5,gold_lag6,gold_lag7,...,gold_change_lag1,gold_change_lag2,gold_change_lag3,gold_change_lag4,gold_change_lag5,gold_change_lag6,gold_change_lag7,gold_change_lag8,gold_change_lag9,gold_change_lag10
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1990-01-03,394.5,,,,,,,,,,...,,,,,,,,,,
1990-01-04,398.8,1.089987,4.3,394.5,,,,,,,...,,,,,,,,,,
1990-01-05,406.1,1.830491,7.3,398.8,394.5,,,,,,...,4.3,,,,,,,,,
1990-01-08,401.85,-1.04654,-4.25,406.1,398.8,394.5,,,,,...,7.3,4.3,,,,,,,,
1990-01-09,405.5,0.908299,3.65,401.85,406.1,398.8,394.5,,,,...,-4.25,7.3,4.3,,,,,,,


In [60]:
# Attach the gold columns to the feature table, keeping only the dates present in both.
df_Xy = df_feat.merge(df_gold, how = 'inner', on = 'date')

In [61]:
df_Xy.head()

Unnamed: 0_level_0,gbp_rate,gbp_rate_change,usd_rate,usd_rate_change,eur_rate,eur_rate_change,aud_rate_change,aud_rate,yen_rate,yen_rate_change,...,gold_change_lag1,gold_change_lag2,gold_change_lag3,gold_change_lag4,gold_change_lag5,gold_change_lag6,gold_change_lag7,gold_change_lag8,gold_change_lag9,gold_change_lag10
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1997-07-02,6.5,0.0,5.5,0.0,,0.0,0.0,5.5,,0.0,...,0.3,-0.7,-3.6,0.9,-0.4,-2.0,2.1,-2.7,1.1,-2.5
1997-07-03,6.5,0.0,5.5,0.0,,0.0,0.0,5.5,,0.0,...,-2.9,0.3,-0.7,-3.6,0.9,-0.4,-2.0,2.1,-2.7,1.1
1997-07-04,6.5,0.0,5.5,0.0,,0.0,0.0,5.5,,0.0,...,-5.6,-2.9,0.3,-0.7,-3.6,0.9,-0.4,-2.0,2.1,-2.7
1997-07-07,6.5,0.0,5.5,0.0,,0.0,0.0,5.5,,0.0,...,-1.25,-5.6,-2.9,0.3,-0.7,-3.6,0.9,-0.4,-2.0,2.1
1997-07-08,6.5,0.0,5.5,0.0,,0.0,0.0,5.5,,0.0,...,-6.65,-1.25,-5.6,-2.9,0.3,-0.7,-3.6,0.9,-0.4,-2.0


In [64]:
df_Xy.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5894 entries, 1997-07-02 to 2020-05-27
Columns: 308 entries, gbp_rate to gold_change_lag10
dtypes: float64(308)
memory usage: 13.9 MB


In [63]:
%store df_Xy

Stored 'df_Xy' (DataFrame)
