In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats

%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

# We are investigating how well central bank policy rates and stock market indices predict the price of gold.

## Consolidate Central Bank Rates

In [2]:
# Start by consolidating central bank rate data.
# The five rate DataFrames are not built in this notebook; they are restored from
# IPython's %store persistence (presumably saved by earlier notebooks — verify).

%store -r df_aud
%store -r df_eur
%store -r df_gbp
%store -r df_usd
%store -r df_yen

In [3]:
# Earliest value in df_aud's 'date' column (used to pick a common start date).
df_aud['date'].min()

Timestamp('1990-01-23 00:00:00')

In [4]:
# Earliest value in df_eur's 'date' column.
df_eur['date'].min()

Timestamp('1999-01-01 00:00:00')

In [5]:
# Earliest value in df_gbp's 'date' column.
df_gbp['date'].min()

Timestamp('1975-01-20 00:00:00')

In [6]:
# Earliest value in df_usd's 'date' column.
df_usd['date'].min()

Timestamp('1990-01-01 00:00:00')

In [7]:
# Earliest value in df_yen's 'date' column.
df_yen['date'].min()

Timestamp('2000-01-01 00:00:00')

In [8]:
# Outer-join the five rate tables on 'date' so that every date present in any
# source is kept. df_gbp seeds the join and the other frames are folded in one
# at a time, which fixes the resulting column order (gbp, usd, eur, aud, yen).
df_rates = df_gbp
for rate_frame in (df_usd, df_eur, df_aud, df_yen):
    df_rates = pd.merge(df_rates, rate_frame, how='outer', on=['date'])

In [9]:
# Summarise the merged frame: column names, dtypes and non-null counts per column.
df_rates.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11273 entries, 0 to 11272
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           11273 non-null  datetime64[ns]
 1   Rate           238 non-null    float64       
 2   usd_rate_tgt   11109 non-null  float64       
 3   eur_rate       49 non-null     float64       
 4   AUD_rate_tgt%  340 non-null    float64       
 5   yen_rate       3839 non-null   float64       
dtypes: datetime64[ns](1), float64(5)
memory usage: 616.5 KB


In [10]:
# Bring the source-specific headers onto a uniform '<currency>_rate' naming scheme.
rate_column_names = {
    'AUD_rate_tgt%': 'aud_rate',
    'Rate': 'gbp_rate',
    'usd_rate_tgt': 'usd_rate',
}
df_rates = df_rates.rename(columns=rate_column_names)

In [11]:
# Re-inspect the frame after renaming the columns.
df_rates.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11273 entries, 0 to 11272
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   date      11273 non-null  datetime64[ns]
 1   gbp_rate  238 non-null    float64       
 2   usd_rate  11109 non-null  float64       
 3   eur_rate  49 non-null     float64       
 4   aud_rate  340 non-null    float64       
 5   yen_rate  3839 non-null   float64       
dtypes: datetime64[ns](1), float64(5)
memory usage: 616.5 KB


In [12]:
# Put the rows in chronological order; the gap-filling further down walks the
# rows positionally and relies on that order.
df_rates = df_rates.sort_values('date')

In [13]:
# df_yen has the latest start date of the five sources (2000-01-01), so rows
# dated before 2000 are discarded.
from_2000_onward = df_rates['date'] > '1999-12-31'
df_rates = df_rates.loc[from_2000_onward]

In [14]:
# Make 'date' the index; after this it is no longer a regular column, so this
# cell cannot be re-run on its own output.
df_rates = df_rates.set_index('date')

In [15]:
# Preview the first rows of the date-indexed rate table.
df_rates.head()

Unnamed: 0_level_0,gbp_rate,usd_rate,eur_rate,aud_rate,yen_rate
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2000-01-01,,5.5,,,
2000-01-02,,5.5,,,
2000-01-03,,5.5,,,
2000-01-04,,5.5,,,
2000-01-05,,5.5,,,


In [16]:
# Mark every missing rate with the '~' placeholder so the gaps can be located
# by the cells that follow.
# NOTE: a column that receives the string placeholder becomes object dtype.
df_rates = df_rates.fillna('~')

In [17]:
# For each column, record the row position of its first real (non-'~') value.
# A column consisting only of placeholders gets no entry.
d = {}
for col in df_rates.columns:
    first_pos = next(
        (pos for pos, value in enumerate(df_rates[col]) if value != '~'),
        None,
    )
    if first_pos is not None:
        d[col] = first_pos

print(d)

{'gbp_rate': 12, 'usd_rate': 0, 'eur_rate': 34, 'aud_rate': 32, 'yen_rate': 223}


In [18]:
"""The for loop above tells us that we'll only have data for all interest rate features 
if we go from the 230th row and below. Let's fill out the rest of our dataframe accordingly"""

# Carry each policy rate forward from its first observation onward.
#
# `d` maps every column to the row position of its first real value. From that
# position on, each '~' placeholder is replaced by the most recent real value.
# Rows before that position keep their '~' so they can still be identified and
# dropped afterwards.
#
# The write-back uses a single positional `.iloc` assignment on df_rates itself.
# The former chained form `df_rates[col][start:] = ...` triggers
# SettingWithCopyWarning and does not modify df_rates under pandas Copy-on-Write.
d_rates = dict()
for col_pos, col in enumerate(df_rates.columns):
    start = d[col]
    observed = df_rates[col].iloc[start:]

    # '~' -> NaN (numeric values pass through), then forward-fill the gaps.
    # The first element is a real value by construction of `d`, so no NaN remains.
    carried = pd.to_numeric(observed, errors='coerce').ffill()

    d_rates[col] = carried.tolist()
    df_rates.iloc[start:, col_pos] = carried.to_numpy()
            

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [19]:
# List the columns for which filled value lists were collected in d_rates.
d_rates.keys()

dict_keys(['gbp_rate', 'usd_rate', 'eur_rate', 'aud_rate', 'yen_rate'])

In [20]:
# Preview the last rows of the rate table after the fill.
df_rates.tail()

Unnamed: 0_level_0,gbp_rate,usd_rate,eur_rate,aud_rate,yen_rate
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-05-27,0.1,0.25,-0.5,0.25,-0.1
2020-05-28,0.1,0.25,-0.5,0.25,-0.1
2020-05-29,0.1,0.25,-0.5,0.25,-0.1
2020-05-30,0.1,0.25,-0.5,0.25,-0.1
2020-05-31,0.1,0.25,-0.5,0.25,-0.1


We know the yen rates will be our limiting factor in terms of date ranges, so we'll drop all rows for which there is no data on yen policy rates.

In [21]:
# Index labels of the rows whose yen_rate still holds the '~' placeholder,
# i.e. rows with no yen data.
missing_yen = (df_rates['yen_rate'] == '~').to_numpy()
indexNames = df_rates.index[missing_yen]
# Remove those rows (by index label) from the rate table.
df_rates = df_rates.drop(index=indexNames)


In [22]:
# Check row count, date range and dtypes after dropping the rows without yen data.
df_rates.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 7234 entries, 2000-08-11 to 2020-05-31
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   gbp_rate  7234 non-null   object 
 1   usd_rate  7234 non-null   float64
 2   eur_rate  7234 non-null   object 
 3   aud_rate  7234 non-null   object 
 4   yen_rate  7234 non-null   object 
dtypes: float64(1), object(4)
memory usage: 339.1+ KB


In [23]:
# Cast every column tracked in `d` to float in one pass, now that the rows
# holding '~' placeholders have been removed.
df_rates = df_rates.astype({col: 'float' for col in d})

In [24]:
# Re-check the dtypes after the cast to float.
df_rates.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 7234 entries, 2000-08-11 to 2020-05-31
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   gbp_rate  7234 non-null   float64
 1   usd_rate  7234 non-null   float64
 2   eur_rate  7234 non-null   float64
 3   aud_rate  7234 non-null   float64
 4   yen_rate  7234 non-null   float64
dtypes: float64(5)
memory usage: 339.1 KB


In [25]:
# Preview the first rows of the cleaned rate table.
df_rates.head()

Unnamed: 0_level_0,gbp_rate,usd_rate,eur_rate,aud_rate,yen_rate
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2000-08-11,6.0,6.5,3.25,6.25,0.25
2000-08-12,6.0,6.5,3.25,6.25,0.25
2000-08-13,6.0,6.5,3.25,6.25,0.25
2000-08-14,6.0,6.5,3.25,6.25,0.25
2000-08-15,6.0,6.5,3.25,6.25,0.25


In [26]:
# Persist df_rates via IPython's %store so it can be restored elsewhere with `%store -r df_rates`.
%store df_rates

Stored 'df_rates' (DataFrame)


## Consolidate stock indices

In [27]:
# Restore the five stock-index DataFrames from IPython's %store persistence.
# They are not built in this notebook (presumably saved by earlier notebooks — verify).
%store -r df_hsi
%store -r df_sse
%store -r df_jeg
%store -r df_nasdaq
%store -r df_nyse

In [28]:
# Earliest value in df_hsi's 'date' column (used to pick a common start date).
df_hsi['date'].min()

Timestamp('2000-01-03 00:00:00')

In [29]:
# Earliest value in df_sse's 'date' column.
df_sse['date'].min()

Timestamp('1997-07-02 00:00:00')

In [30]:
# Earliest value in df_jeg's 'date' column.
df_jeg['date'].min()

Timestamp('2008-09-05 00:00:00')

In [31]:
# Earliest value in df_nasdaq's 'date' column.
df_nasdaq['date'].min()

Timestamp('2000-01-03 00:00:00')

In [32]:
# Earliest value in df_nyse's 'date' column.
df_nyse['date'].min()

Timestamp('2000-01-03 00:00:00')

In [33]:
# Outer-join the five index tables on 'date' so that every trading date seen in
# any market is kept. df_sse seeds the join and the other frames are folded in
# one at a time, which fixes the column order (sse, hsi, nyse, nasdaq, jeg).
df_stock = df_sse
for index_frame in (df_hsi, df_nyse, df_nasdaq, df_jeg):
    df_stock = pd.merge(df_stock, index_frame, how='outer', on=['date'])

In [34]:
# Summarise the merged frame: column names, dtypes and non-null counts per column.
df_stock.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5964 entries, 0 to 5963
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   date          5964 non-null   datetime64[ns]
 1   sse_close     5714 non-null   float64       
 2   hsi_close     5023 non-null   float64       
 3   nyse_close    5131 non-null   float64       
 4   nasdaq_close  5131 non-null   float64       
 5   jeg_close     2869 non-null   float64       
dtypes: datetime64[ns](1), float64(5)
memory usage: 326.2 KB


In [35]:
# Put the rows in chronological order; the gap-filling further down carries the
# previous row's value forward and relies on that order.
df_stock = df_stock.sort_values('date')

In [36]:
# df_jeg has the latest start date of the five sources (2008-09-05), so rows
# dated before that day are discarded.
from_jeg_start = df_stock['date'] > '2008-09-04'
df_stock = df_stock.loc[from_jeg_start]

In [37]:
# Make 'date' the index; after this it is no longer a regular column, so this
# cell cannot be re-run on its own output.
df_stock = df_stock.set_index('date')

In [38]:
# Preview the first rows of the date-indexed stock table.
df_stock.head()

Unnamed: 0_level_0,sse_close,hsi_close,nyse_close,nasdaq_close,jeg_close
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2008-09-05,2202.446045,19933.279297,8033.759766,2255.879883,335.0
2008-09-08,2143.420898,20794.269531,8168.620117,2269.76001,343.0
2008-09-09,2145.779053,20491.109375,7871.149902,2209.810059,337.86499
2008-09-10,2150.759033,19999.779297,7957.259766,2228.699951,319.382996
2008-09-11,2078.980957,19388.720703,8011.25,2258.219971,290.799988


In [39]:
# Non-null counts below the total row count show how many gaps each index has.
df_stock.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3051 entries, 2008-09-05 to 2020-05-27
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sse_close     2845 non-null   float64
 1   hsi_close     2881 non-null   float64
 2   nyse_close    2950 non-null   float64
 3   nasdaq_close  2950 non-null   float64
 4   jeg_close     2869 non-null   float64
dtypes: float64(5)
memory usage: 143.0 KB


In [40]:
# Row positions (1-D integer array) at which sse_close is missing.
# NOTE(review): no later cell visible here reads this value — confirm it is still needed.
sse_is_missing = df_stock['sse_close'].isna().to_numpy()
idx = np.argwhere(sse_is_missing).ravel()

In [42]:
# Index values of df_stock as a NumPy array.
# NOTE(review): `date` is not read by any later cell visible here — confirm it is still needed.
date = df_stock.index.to_numpy()

In [43]:
"""Account for NaN values (stock holidays in this case) by filling missing values with most recent close price,
to indicate no change"""

# Fill every gap with the most recent earlier close, so a date without a quote
# shows "no change". The built-in forward-fill keeps the columns as float64
# throughout; no string placeholder or object-dtype round trip is needed.
# A gap at the very start of a column has nothing to carry and stays NaN
# (the hand-rolled loop this replaces raised KeyError in that case).
df_stock = df_stock.ffill()

# Per-column lists of the filled values, kept for any later cell that reads d_stock.
d_stock = {col: df_stock[col].tolist() for col in df_stock.columns}

In [44]:
# Re-check non-null counts and dtypes after the fill.
df_stock.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3051 entries, 2008-09-05 to 2020-05-27
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sse_close     3051 non-null   float64
 1   hsi_close     3051 non-null   float64
 2   nyse_close    3051 non-null   float64
 3   nasdaq_close  3051 non-null   float64
 4   jeg_close     3051 non-null   float64
dtypes: float64(5)
memory usage: 143.0 KB


In [45]:
# Preview the first rows of the filled stock table.
df_stock.head()

Unnamed: 0_level_0,sse_close,hsi_close,nyse_close,nasdaq_close,jeg_close
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2008-09-05,2202.446045,19933.279297,8033.759766,2255.879883,335.0
2008-09-08,2143.420898,20794.269531,8168.620117,2269.76001,343.0
2008-09-09,2145.779053,20491.109375,7871.149902,2209.810059,337.86499
2008-09-10,2150.759033,19999.779297,7957.259766,2228.699951,319.382996
2008-09-11,2078.980957,19388.720703,8011.25,2258.219971,290.799988


In [46]:
# Persist df_stock via IPython's %store so it can be restored elsewhere with `%store -r df_stock`.
%store df_stock

Stored 'df_stock' (DataFrame)
