In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats

%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [2]:
import sklearn
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.inspection import permutation_importance
from sklearn.model_selection import cross_val_score

In [3]:
from scipy.stats import pearsonr

In [4]:
%store -r df_Xy

In [5]:
df_Xy.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5894 entries, 1997-07-02 to 2020-05-27
Columns: 242 entries, gbp_rate to gold_change%_lag10
dtypes: float64(242)
memory usage: 10.9 MB


# We want to investigate the predictivty of central bank policy rates and stock market index performance as they relate to the price of gold. 
### We have the most recent 10 years of daily data from the following sources:
### Features:
#### Top 5 equity market indices by market capitalization, per Statista: https://www.statista.com/statistics/270126/largest-stock-exchange-operators-by-market-capitalization-of-listed-companies/, and their corresponding daily performance and percent change
NYSE Composite Index history: https://finance.yahoo.com/quote/%5ENYA?p=^NYA&.tsrc=fin-srch
NASDAQ Composite Index history: https://finance.yahoo.com/quote/%5EIXIC?p=^IXIC&.tsrc=fin-srch
Japan Exchange Group Composite Index history: https://finance.yahoo.com/quote/8697.T?p=8697.T&.tsrc=fin-srch
Shanghai Stock Exchange Composite Index history: https://finance.yahoo.com/quote/%5ESSEC?p=^SSEC&.tsrc=fin-srch
Hang Seng (Hong Kong) Index History: https://finance.yahoo.com/quote/%5EHSI?p=^HSI&.tsrc=fin-srch
#### Top 5 Currencies Globally in Forex Trading (https://www.ig.com/us/trading-strategies/top-10-most-traded-currency-pairs-191206), and their corresponding daily central bank policy rates and percent change
The Fed (USD): https://fred.stlouisfed.org/series/FEDFUNDS
ECB (EUR): https://www.ecb.europa.eu/stats/policy_and_exchange_rates/key_ecb_interest_rates/html/index.en.html
BoJ (JPY): https://fred.stlouisfed.org/series/IRSTCI01JPM156N
United Kingdom (GBP): https://fred.stlouisfed.org/series/GBPONTD156N
Australia (AUD): https://www.rba.gov.au/statistics/cash-rate/
#### Daily price of gold  and percent change (up to 10-day lag): https://www.usagold.com/reference/prices/goldhistory.php



### Targets:
#### Daily price of gold, daily percent change price of gold: https://www.usagold.com/reference/prices/goldhistory.php

## As the data sits, we currently have both daily policy rates and index scores - each with up to a ten day lag as the data. This translates to 242 features in our would-be model.

### For sake of dimensionality reduction, let's first get just the raw values into one data frame, and the percent change values into another. It will be interesting to see if gold's price or gold's percent price change is 'easier' to predict. 

In [6]:
# store 'change' column titles to create separate dataframes

col_change = []
col_raw = []
for col in df_Xy.columns:
    if 'change' in col:
        col_change.append(col)
    else:
        col_raw.append(col)

col_change,col_raw

(['gbp_rate_change',
  'usd_rate_change',
  'eur_rate_change',
  'aud_rate_change',
  'yen_rate_change',
  'gbp_rate_change_lag1',
  'gbp_rate_change_lag2',
  'gbp_rate_change_lag3',
  'gbp_rate_change_lag4',
  'gbp_rate_change_lag5',
  'gbp_rate_change_lag6',
  'gbp_rate_change_lag7',
  'gbp_rate_change_lag8',
  'gbp_rate_change_lag9',
  'gbp_rate_change_lag10',
  'usd_rate_change_lag1',
  'usd_rate_change_lag2',
  'usd_rate_change_lag3',
  'usd_rate_change_lag4',
  'usd_rate_change_lag5',
  'usd_rate_change_lag6',
  'usd_rate_change_lag7',
  'usd_rate_change_lag8',
  'usd_rate_change_lag9',
  'usd_rate_change_lag10',
  'eur_rate_change_lag1',
  'eur_rate_change_lag2',
  'eur_rate_change_lag3',
  'eur_rate_change_lag4',
  'eur_rate_change_lag5',
  'eur_rate_change_lag6',
  'eur_rate_change_lag7',
  'eur_rate_change_lag8',
  'eur_rate_change_lag9',
  'eur_rate_change_lag10',
  'aud_rate_change_lag1',
  'aud_rate_change_lag2',
  'aud_rate_change_lag3',
  'aud_rate_change_lag4',
  'aud_r

In [7]:
# create two separate dataframes, one with raw values, one with percent change

df_Xy_raw = df_Xy.drop(columns = col_change).dropna()

df_Xy_change = df_Xy.drop(columns = col_raw).dropna()

In [8]:
df_Xy_raw.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3022 entries, 2008-09-19 to 2020-05-27
Columns: 121 entries, gbp_rate to gold_lag10
dtypes: float64(121)
memory usage: 2.8 MB


In [9]:
df_Xy_change.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5894 entries, 1997-07-02 to 2020-05-27
Columns: 121 entries, gbp_rate_change to gold_change%_lag10
dtypes: float64(121)
memory usage: 5.5 MB


#### To compare models on a level playing field, we need to ensure the same date range is being used for both the raw and the change data.

In [10]:
to_drop = df_Xy_change.index[df_Xy_change.index < '2008-09-19']

df_Xy_change.drop(to_drop, inplace = True)

In [11]:
df_Xy_change.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3022 entries, 2008-09-19 to 2020-05-27
Columns: 121 entries, gbp_rate_change to gold_change%_lag10
dtypes: float64(121)
memory usage: 2.8 MB


In [12]:
# create X,y pairs for both raw and percent change dataframes

X_raw = df_Xy_raw.drop(columns = ['gold'])
y_raw = df_Xy_raw['gold']

X_change = df_Xy_change.drop(columns = ['gold_change%'])
y_change = df_Xy_change['gold_change%']

### Now, to inform our feature selection for our models, let's take a look at the linear relationships between our X features and y targets in both datasets.

In [13]:
def get_r(df_X, y):
    
    """
    PARAMETERS:
    
    df_X: pandas dataframe - all colmuns of float64 type
    y: arrary, target variable
    
    
    RETURNS:
    Dictionary of df_X features and their pearson r and p-values with respect to y
    """
    
    d = dict()
    for col in df_X.columns:
        r, p = pearsonr(df_X[col],y)
        
        d[col] = [r,p]
    
    return d
    

In [14]:
d_r_raw = get_r(X_raw,y_raw)
d_r_change = get_r(X_change,y_change)



### Correlational Study: Raw Data

In [15]:
df_r_raw = pd.DataFrame.from_dict(d_r_raw)

In [16]:
df_r_raw.head()

Unnamed: 0,gbp_rate,usd_rate,eur_rate,aud_rate,yen_rate,gbp_rate_lag1,gbp_rate_lag2,gbp_rate_lag3,gbp_rate_lag4,gbp_rate_lag5,...,gold_lag1,gold_lag2,gold_lag3,gold_lag4,gold_lag5,gold_lag6,gold_lag7,gold_lag8,gold_lag9,gold_lag10
0,-0.3521374,-0.028212,-0.2695844,0.009097,-0.3209888,-0.3537976,-0.3555973,-0.3551246,-0.3551246,-0.3575017,...,0.998007,0.995986,0.994038,0.992122,0.990176,0.988189,0.986311,0.984483,0.982726,0.980902
1,6.235009e-89,0.121003,1.77481e-51,0.617169,2.2184450000000002e-73,8.219253e-90,9.013476e-91,1.613117e-90,1.613117e-90,8.557312e-92,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
df_r_raw.rename(index = {0:'Pearson r', 1:'p-value'}, inplace = True)

In [18]:
df_r_raw = df_r_raw.T
df_r_raw

Unnamed: 0,Pearson r,p-value
gbp_rate,-0.352137,6.235009e-89
usd_rate,-0.028212,1.210027e-01
eur_rate,-0.269584,1.774810e-51
aud_rate,0.009097,6.171689e-01
yen_rate,-0.320989,2.218445e-73
...,...,...
gold_lag6,0.988189,0.000000e+00
gold_lag7,0.986311,0.000000e+00
gold_lag8,0.984483,0.000000e+00
gold_lag9,0.982726,0.000000e+00


In [19]:
df_r_raw.info()

<class 'pandas.core.frame.DataFrame'>
Index: 120 entries, gbp_rate to gold_lag10
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Pearson r  120 non-null    float64
 1   p-value    120 non-null    float64
dtypes: float64(2)
memory usage: 2.8+ KB


In [20]:
sig_r_raw = df_r_raw.index[df_r_raw['p-value'] < 0.05]

In [21]:
sig_r_raw.shape

(98,)

In [22]:
insig_r_raw = df_r_raw.index[df_r_raw['p-value'] >= 0.05]

### Correlational Study: Raw Data Results
On a purely corerlational basis, 98 of our 120 features in the raw dataset would make the cut. Let's take a close look and see if we can glean any further insight.

In [23]:
sig_r_raw

Index(['gbp_rate', 'eur_rate', 'yen_rate', 'gbp_rate_lag1', 'gbp_rate_lag2',
       'gbp_rate_lag3', 'gbp_rate_lag4', 'gbp_rate_lag5', 'gbp_rate_lag6',
       'gbp_rate_lag7', 'gbp_rate_lag8', 'gbp_rate_lag9', 'gbp_rate_lag10',
       'eur_rate_lag1', 'eur_rate_lag2', 'eur_rate_lag3', 'eur_rate_lag4',
       'eur_rate_lag5', 'eur_rate_lag6', 'eur_rate_lag7', 'eur_rate_lag8',
       'eur_rate_lag9', 'eur_rate_lag10', 'yen_rate_lag1', 'yen_rate_lag2',
       'yen_rate_lag3', 'yen_rate_lag4', 'yen_rate_lag5', 'yen_rate_lag6',
       'yen_rate_lag7', 'yen_rate_lag8', 'yen_rate_lag9', 'yen_rate_lag10',
       'sse_close', 'hsi_close', 'nyse_close', 'nasdaq_close', 'jeg_close',
       'sse_close_lag1', 'sse_close_lag2', 'sse_close_lag3', 'sse_close_lag4',
       'sse_close_lag5', 'sse_close_lag6', 'sse_close_lag7', 'sse_close_lag8',
       'sse_close_lag9', 'sse_close_lag10', 'hsi_close_lag1', 'hsi_close_lag2',
       'hsi_close_lag3', 'hsi_close_lag4', 'hsi_close_lag5', 'hsi_close_lag6',
  

In [24]:
insig_r_raw

Index(['usd_rate', 'aud_rate', 'usd_rate_lag1', 'usd_rate_lag2',
       'usd_rate_lag3', 'usd_rate_lag4', 'usd_rate_lag5', 'usd_rate_lag6',
       'usd_rate_lag7', 'usd_rate_lag8', 'usd_rate_lag9', 'usd_rate_lag10',
       'aud_rate_lag1', 'aud_rate_lag2', 'aud_rate_lag3', 'aud_rate_lag4',
       'aud_rate_lag5', 'aud_rate_lag6', 'aud_rate_lag7', 'aud_rate_lag8',
       'aud_rate_lag9', 'aud_rate_lag10'],
      dtype='object')

#### Interesting, only the US and Australian cenral bank rates have relatively weak linear correlations with the price of gold. Perhaps some polynomial transformations would demonstrate some linear relationships, but let's move on for now



In [41]:
X_raw_fit = X_raw.drop(columns = insig_r_raw)


In [42]:
model_change = GradientBoostingRegressor()
cross_val_raw = np.mean(cross_val_score(model_change,X_raw_fit,y_raw))
cross_val_raw

0.8183706813366799

### Correlational Study: Change Data

In [27]:
df_r_change = pd.DataFrame.from_dict(d_r_change)

In [28]:
df_r_change.head()

Unnamed: 0,gbp_rate_change,usd_rate_change,eur_rate_change,aud_rate_change,yen_rate_change,gbp_rate_change_lag1,gbp_rate_change_lag2,gbp_rate_change_lag3,gbp_rate_change_lag4,gbp_rate_change_lag5,...,gold_change%_lag1,gold_change%_lag2,gold_change%_lag3,gold_change%_lag4,gold_change%_lag5,gold_change%_lag6,gold_change%_lag7,gold_change%_lag8,gold_change%_lag9,gold_change%_lag10
0,0.011241,-0.007794,-0.006218,0.007991,0.004945,0.038677,0.007459,,0.007226,-0.00918,...,-0.00522,0.002276,-0.004002,0.004893,0.015728,-0.013646,-0.010099,0.015999,-0.009028,-0.01509
1,0.536771,0.668424,0.732572,0.660563,0.78583,0.033494,0.681902,,0.691331,0.613938,...,0.774231,0.900481,0.825923,0.788025,0.387425,0.453323,0.578927,0.379284,0.619824,0.40697


In [29]:
df_r_change.rename(index = {0:'Pearson r', 1:'p-value'}, inplace = True)

In [30]:
df_r_change = df_r_change.T
df_r_change

Unnamed: 0,Pearson r,p-value
gbp_rate_change,0.011241,0.536771
usd_rate_change,-0.007794,0.668424
eur_rate_change,-0.006218,0.732572
aud_rate_change,0.007991,0.660563
yen_rate_change,0.004945,0.785830
...,...,...
gold_change%_lag6,-0.013646,0.453323
gold_change%_lag7,-0.010099,0.578927
gold_change%_lag8,0.015999,0.379284
gold_change%_lag9,-0.009028,0.619824


In [31]:
df_r_change.info()

<class 'pandas.core.frame.DataFrame'>
Index: 120 entries, gbp_rate_change to gold_change%_lag10
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Pearson r  115 non-null    float64
 1   p-value    115 non-null    float64
dtypes: float64(2)
memory usage: 2.8+ KB


In [32]:
sig_r_change = df_r_change.index[df_r_change['p-value'] < 0.05]
insig_r_change = df_r_change.index[df_r_change['p-value'] >= 0.05]

In [33]:
sig_r_change.shape

(11,)

In [34]:
insig_r_change.shape

(104,)

### Correlational Study: Change Data Results
In stark contrast to the raw data, only 11 of the change dataset features have a significantly linear relationship the change in gold's price

In [35]:
sig_r_change

Index(['gbp_rate_change_lag1', 'usd_rate_change_lag5', 'eur_rate_change_lag1',
       'aud_rate_change_lag3', 'sse_change%_lag4', 'hsi_change%_lag5',
       'hsi_change%_lag10', 'nyse_change%_lag10', 'nasdaq_change%_lag10',
       'jeg_change%_lag9', 'jeg_change%_lag10'],
      dtype='object')

In [36]:
insig_r_change

Index(['gbp_rate_change', 'usd_rate_change', 'eur_rate_change',
       'aud_rate_change', 'yen_rate_change', 'gbp_rate_change_lag2',
       'gbp_rate_change_lag4', 'gbp_rate_change_lag5', 'gbp_rate_change_lag6',
       'gbp_rate_change_lag7',
       ...
       'gold_change%_lag1', 'gold_change%_lag2', 'gold_change%_lag3',
       'gold_change%_lag4', 'gold_change%_lag5', 'gold_change%_lag6',
       'gold_change%_lag7', 'gold_change%_lag8', 'gold_change%_lag9',
       'gold_change%_lag10'],
      dtype='object', length=104)

#### Furthermore, there isn't a very distinct trend among the significant features. It includes both stock market index values and central bank rates, all at varying lags.

In [43]:
# X_change_fit = X_change.drop(columns = insig_r_change)
X_change_fit = X_change

In [None]:
model_change = GradientBoostingRegressor()
cross_val_change = np.mean(cross_val_score(model_change,X_change_fit,y_change))

In [None]:
cross_val_change