In [None]:
import pandas as pd
import numpy as np
import scipy.stats as stats

%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [None]:
import sklearn
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.inspection import permutation_importance
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, r2_score

In [None]:
import sys

sys.path.append('src/')

from funcs import *

In [None]:
import statsmodels.api as sm

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#read-in data frame from data/consolidate_v2.ipynb
df_Xy = pd.read_csv('data/dfXy.csv')

In [None]:
df_Xy.info()

In [None]:
df_Xy['date'] = pd.to_datetime(df_Xy['date'], yearfirst = True)

In [None]:
df_Xy.set_index('date', inplace = True)

# Background
#### Gold is often said to be a store of value against inflation. Gold is also often said to be a hedge against volatility in equity markets. Leveraging some classical economic thought, this makes sense at surface level: lower interest rates cause inflation and devalue the currency, so people flock to gold. As for equity markets, nobody likes to lose money. When equity markets have satiated an investor's appetite for risk, some of those funds will flow into gold. But this would assume a perfectly rational investor, and I don't think I've ever met one in real life. Let's see how these taken-for-granted relationships hold up alongside the data.


# We want to investigate the predictive power of central bank policy rates and stock market index performance as they relate to the price of gold.

## We'll attempt to use time series modeling to predict the daily price, daily price difference, and daily percent change in price of gold.
### We have the most recent 11 years of daily data from the following sources:
### Features:
#### Top 5 equity market indices by market capitalization, per Statista: https://www.statista.com/statistics/270126/largest-stock-exchange-operators-by-market-capitalization-of-listed-companies/, and their corresponding daily performance and percent change
- NYSE Composite Index history: https://finance.yahoo.com/quote/%5ENYA?p=^NYA&.tsrc=fin-srch
- NASDAQ Composite Index history: https://finance.yahoo.com/quote/%5EIXIC?p=^IXIC&.tsrc=fin-srch
- Japan Exchange Group Composite Index history: https://finance.yahoo.com/quote/8697.T?p=8697.T&.tsrc=fin-srch
    - This was one of the limiting factors in our timeframe. The JEG as we know it today didn't come into existence until 2008 
- Shanghai Stock Exchange Composite Index history: https://finance.yahoo.com/quote/%5ESSEC?p=^SSEC&.tsrc=fin-srch
- Hang Seng (Hong Kong) Index History: https://finance.yahoo.com/quote/%5EHSI?p=^HSI&.tsrc=fin-srch
#### Top 5 Currencies Globally in Forex Trading (https://www.ig.com/us/trading-strategies/top-10-most-traded-currency-pairs-191206), and their corresponding daily central bank policy rates and percent change
- The Fed (USD): https://fred.stlouisfed.org/series/FEDFUNDS
- ECB (EUR): https://www.ecb.europa.eu/stats/policy_and_exchange_rates/key_ecb_interest_rates/html/index.en.html
- BoJ (JPY): https://fred.stlouisfed.org/series/IRSTCI01JPM156N
- United Kingdom (GBP): https://fred.stlouisfed.org/series/GBPONTD156N
- Australia (AUD): https://www.rba.gov.au/statistics/cash-rate/

### Targets:
#### Daily price of gold, daily difference in gold price, daily percent change price of gold: https://www.usagold.com/reference/prices/goldhistory.php

## As the data sits, we currently have both daily policy rates and index scores (along with differences and percent changes) - each with up to a ten day lag on the data. This translates to  121 features each in our 3 would-be models (daily price, daily price difference, daily percent price change).

### Let's start by separating our data

In [None]:
# Bucket every column name by the kind of value it holds. A name containing
# 'change%' is a percent-change column; otherwise a name containing 'dif' is a
# difference column; everything else is treated as a raw value.

col_change = [name for name in df_Xy.columns if 'change%' in name]
col_dif = [name for name in df_Xy.columns if 'change%' not in name and 'dif' in name]
col_raw = [name for name in df_Xy.columns if 'change%' not in name and 'dif' not in name]

In [None]:
# Build one frame per value kind (raw, difference, percent change), discarding
# any row that has a missing value in that frame's columns.

df_Xy_raw, df_Xy_dif, df_Xy_change = (
    df_Xy[cols].dropna() for cols in (col_raw, col_dif, col_change)
)

In [None]:
df_Xy_raw.info()

In [None]:
df_Xy_dif.info()

In [None]:
df_Xy_change.info()

#### To compare models on a level playing field, we need to ensure the same date range is being used for both the raw and the change data. Let's take a look at the date ranges in all of our datasets. It looks like our difference dataframe is the limiting factor here with '2008-09-22' being the earliest date in the dataset.

In [None]:
# Keep only rows dated on or after 2008-09-22 so the raw frame starts on the
# same date used for the other frames.
raw_on_or_after_start = df_Xy_raw.index >= '2008-09-22'
df_Xy_raw = df_Xy_raw.loc[raw_on_or_after_start]
df_Xy_raw.info()

In [None]:
# Keep only rows dated on or after 2008-09-22 so the percent-change frame starts
# on the same date used for the other frames.
change_on_or_after_start = df_Xy_change.index >= '2008-09-22'
df_Xy_change = df_Xy_change.loc[change_on_or_after_start]
df_Xy_change.info()

In [None]:
# Split each frame into a target series (the gold column) and a feature matrix
# (every remaining column).

y_raw = df_Xy_raw['gold']
X_raw = df_Xy_raw.drop(columns='gold')

y_dif = df_Xy_dif['gold_dif']
X_dif = df_Xy_dif.drop(columns='gold_dif')

y_change = df_Xy_change['gold_change%']
X_change = df_Xy_change.drop(columns='gold_change%')

## EDA
### Historical Gold Data Trends and Moving Averages

In [None]:
# We default the window here to 365 as we have daily data, and we want to average-out seasonal patterns, if they exist
def fit_moving_average_trend(series, window=365):
    """Return the centered rolling mean of ``series``.

    Parameters
    ----------
    series : pandas.Series
        Values to smooth.
    window : int, default 365
        Number of observations in each rolling window.

    Returns
    -------
    pandas.Series
        Same index as ``series``; positions without a full centered window
        are NaN.
    """
    centered_windows = series.rolling(window=window, center=True)
    return centered_windows.mean()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.cbook as cbook

# Tick helpers for date x-axes: yearly ticks, monthly ticks, and a 4-digit-year label format.
# NOTE(review): matplotlib's ticker docs say a Locator instance should not be used with
# more than one Axis -- create fresh instances per axis if several subplots need them.
years = mdates.YearLocator()   # every year
months = mdates.MonthLocator()  # every month
years_fmt = mdates.DateFormatter('%Y')

In [None]:
fig, axs = plt.subplots(3, 1, figsize=(15, 12))

# (title, y-axis label, series) for each of the three gold targets
panels = [
    ('Daily Price of Gold', 'USD ($)', y_raw),
    ('Daily Difference, Price of Gold', 'USD ($)', y_dif),
    ('Daily Percent Change, Price of Gold', '% Change', y_change),
]

for ax, (title, ylabel, series) in zip(axs, panels):
    # Overlay the centered moving average on top of the daily series
    ma = fit_moving_average_trend(series)
    ax.plot(series.index, series)
    ax.plot(series.index, ma, label='365 Day Moving Average')
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.legend()

    # Yearly major ticks labelled with the year, monthly minor ticks.
    # Fresh locator/formatter objects are created for every axis because
    # matplotlib ticker instances must not be shared between axes.
    ax.xaxis.set_major_locator(mdates.YearLocator())
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
    ax.xaxis.set_minor_locator(mdates.MonthLocator())

plt.tight_layout()
#plt.savefig('gold_history')

## EDA
### Historical Gold Data Trends Alongside All Features

In [None]:
# raw data: drop the gold target and every lagged column, leaving only the
# same-day feature columns

col_lag = ['gold'] + [name for name in df_Xy_raw.columns if 'lag' in name]

X_nolag_raw = df_Xy_raw.drop(columns = col_lag)


In [None]:
plot_trends(X_nolag_raw,y_raw)
#plt.savefig('raw_overlap_final')

### Raw Data Trends - Price of Gold, Central Bank Policy Rates, and Global Index Summary:
While there are periods of time within the data where gold moves in tandem with our features, those trends are not consistent throughout our entire time window of the most recent ten years.

- Rates that stick out to me: Euro, Australian Dollar
- Stock indices that stick out to me: Nasdaq, NYSE


### So perhaps our features aren't as predictive as we have been led to believe. Let's first try to predict gold's price based on its own lagged price, then we can see if any of our remaining features can positively contribute to our model.

## EDA
### Testing Gold for Stationarity with the Augmented Dickey-Fuller Test

In [None]:
# Run the Augmented Dickey-Fuller test on each gold target and report the p-value
adf_inputs = (
    ('Daily Price of Gold', y_raw),
    ('Daily Difference, Price of Gold', y_dif),
    ('Daily Percent Change, Price of Gold', y_change),
)
for title, series in adf_inputs:
    test = sm.tsa.stattools.adfuller(series)
    p_value = test[1]  # second element of the result is reported as the p-value
    print(f'{title} ADF p-value: {round(p_value,3)}')

#### So we now know that both gold's Daily Price Difference and Daily Percent Change are stationary. Let's take a look at some autocorrelation and partial autocorrelation plots to see if an AR (auto-regressive) model  would make sense for our baselines. This will also help inform what sort of lag we should use in our model

In [None]:
fig, axs = plt.subplots(2, figsize=(15, 12))

# One autocorrelation panel per gold target series. The ADF test that used to be
# recomputed here on every iteration was never read, so it has been dropped.
for ax, title, series in zip(axs, ['Daily Difference, Price of Gold','Daily Percent Change, Price of Gold'],[y_dif,y_change]):
    sm.graphics.tsa.plot_acf(series, lags=100, ax=ax)
    ax.set_title(f'Autocorrelation: {title}')

    # The zeroth value in these graphs is pointless (a dataset is perfectly correlated to itself)
    ax.set_xlim(left = 1)
    # Cap the y-axis at 0.2 to zoom in on the plotted values
    ax.set_ylim(top = 0.2)
    ax.set_xlabel('Lag (Days)')
    ax.set_ylabel('Autocorrelation')

plt.tight_layout()
plt.savefig('auto_gold_final')

In [None]:
fig, axs = plt.subplots(2, figsize=(15, 12))

# One partial-autocorrelation panel per gold target series. The ADF test that used
# to be recomputed here on every iteration was never read, so it has been dropped.
for ax, title, series in zip(axs, ['Daily Difference, Price of Gold','Daily Percent Change, Price of Gold'],[y_dif,y_change]):
    sm.graphics.tsa.plot_pacf(series, lags=100, ax=ax)
    ax.set_title(f'Partial Autocorrelation: {title}')

    # The zeroth value in these graphs is pointless (a dataset is perfectly correlated to itself)
    ax.set_xlim(left = 1)
    # Cap the y-axis at 0.2 to zoom in on the plotted values
    ax.set_ylim(top = 0.2)
    ax.set_xlabel('Lag (Days)')
    ax.set_ylabel('Partial Autocorrelation')

plt.tight_layout()
plt.savefig('pauto_gold_final')

#### While we've proven stationarity in daily price difference and daily percent change, our Autocorrelation plots  aren't encouraging. An AR based model doesn't seem like the proper approach, but linear regression actually assumes an absence of autocorrelation. 
#### To get the simplest of baselines, let's fit a linear model with gold's price difference and price change with the features being a 1 through 10 day lag. Given the dynamic nature of markets, a 10 day lag will help mitigate an extinct trend from a previous timeframe over-influencing our models.

## Linear Model Assumptions:
We may be breaking the rules a bit here. We're really most concerned with generating a baseline model. Who knows? Perhaps the model will pick up something our eyes missed during visual EDA.
- Linear relationship - if this doesn't exist, we'll know because our model will score poorly
- Errors are normally distributed
- Homoscedasticity of errors (or, equal variance around the line) - fit a model and visualize
- Independence of the observations - mostly rule breaking here

In [None]:
# Move the index back into an ordinary column so both feature frames are
# addressed by a plain positional index from here on
X_dif = X_dif.reset_index()
X_change = X_change.reset_index()

In [None]:
# generate gold difference and gold % change dfs with lag

#Difference: keep only the feature columns whose name mentions gold
col_gold_dif = [name for name in X_dif.columns if 'gold' in name]

X_dif_gold = X_dif[col_gold_dif]
X_dif_gold.info()
        

In [None]:
#Percent Change: keep only the feature columns whose name mentions gold
col_gold_change = [name for name in X_change.columns if 'gold' in name]

X_change_gold = X_change[col_gold_change]
X_change_gold.info()

## Linear Regression Model: Daily Gold Price Difference (USD), 10 day lag

In [None]:
from sklearn.linear_model import Ridge

Because we're using time series based data, our train-test split will need to ensure that our data stays chronological.

In [None]:
from sklearn.model_selection import TimeSeriesSplit

In [None]:
y_dif = df_Xy_dif.reset_index()['gold_dif']

In [None]:
X_dif_gold.info()

In [None]:
# Chronological 70/30 split: the first 70% of rows train, the remainder tests
split_at = int(X_dif_gold.shape[0]*0.7)
train_X_dif_gold, test_X_dif_gold = X_dif_gold.iloc[:split_at], X_dif_gold.iloc[split_at:]
train_y_dif, test_y_dif = y_dif.iloc[:split_at], y_dif.iloc[split_at:]

In [None]:
# Sweep the Ridge penalty over np.arange(0, 20.1, 0.1) and remember which alpha
# produced each score (score -> alpha; a repeated score keeps the later alpha).
# NOTE(review): the last entry of the returned scores is used for selection; if
# that entry is the test-set score, alpha is being tuned on the test split -- confirm.
d = {}
for alpha in np.arange(0,20.1,0.1):
    scores, _ = cross_val_and_score(Ridge(alpha = alpha),train_X_dif_gold, test_X_dif_gold, train_y_dif, test_y_dif)
    d[scores[-1]] = alpha

In [None]:
# Highest score recorded during the sweep, printed after the alpha stored under it
max_ = max(d)
best_alpha = d[max_]
print(best_alpha,max_)

#### Cross Validation: Linear Regression of Daily Price Difference (USD), 10 day lag

In [None]:
# Instantiate dictionary to store all model scores
d_score = dict()

In [None]:
dif_gold_score, dif_gold_model = cross_val_and_score(Ridge(alpha = d[max_]),train_X_dif_gold, test_X_dif_gold, train_y_dif, test_y_dif)

In [None]:
d_score['Linear Regression of Daily Price Difference (USD), 10 day lag'] = dif_gold_score

In [None]:
plot_model(dif_gold_model,test_X_dif_gold,test_y_dif)
#plt.savefig('dif_gold_linear')

### Linear Regression Model: Daily Gold Price Percent Change, 10 day lag

In [None]:
y_change = df_Xy_change.reset_index()['gold_change%']

In [None]:
# Chronological 70/30 split: the first 70% of rows train, the remainder tests
split_at = int(X_change_gold.shape[0]*0.7)
train_X_change_gold, test_X_change_gold = X_change_gold.iloc[:split_at], X_change_gold.iloc[split_at:]
train_y_change, test_y_change = y_change.iloc[:split_at], y_change.iloc[split_at:]

In [None]:
# Sweep the Ridge penalty over np.arange(0, 20.1, 0.1) and remember which alpha
# produced each score (score -> alpha; a repeated score keeps the later alpha).
# NOTE(review): the last entry of the returned scores is used for selection; if
# that entry is the test-set score, alpha is being tuned on the test split -- confirm.
d = {}
for alpha in np.arange(0,20.1,0.1):
    scores, _ = cross_val_and_score(Ridge(alpha = alpha),train_X_change_gold, test_X_change_gold, train_y_change, test_y_change)
    d[scores[-1]] = alpha

In [None]:
# Highest score recorded during the sweep, printed after the alpha stored under it
max_ = max(d)

best_alpha = d[max_]
print(best_alpha,max_)

#### Cross Validation: Linear Regression of Daily Gold Price Percent Change, 10 day lag

In [None]:
change_gold_score, change_gold_model = cross_val_and_score(Ridge(alpha = d[max_]),train_X_change_gold, test_X_change_gold, train_y_change, test_y_change)

In [None]:
d_score['Linear Regression Model: Daily Gold Price Percent Change, 10 day lag'] = change_gold_score

In [None]:
plot_model(change_gold_model,test_X_change_gold,test_y_change)
#plt.savefig('change_gold_linear')

### Linear Regression for Gold Price 10 Day Lag Summary:
A simple linear model is doing more harm than good when using a ten day lag to predict both the Daily Price Difference (R^2 Score = -0.008) as well as the Daily Percent Change in Price (R^2 Score = -0.019).

#### Idea: Let's try out a different model, again using just the 10 day lag of gold price data. A moving average feels like a reasonable next step, given its prominence as a technical indicator in the world of online trading.

## 10 Day Moving Average Regressor: Daily Gold Price Difference (USD)

In [None]:
y_dif_ma = X_dif_gold.mean(axis = 1)

In [None]:
# A simple moving average has nothing to fit, so no feature matrix is split here:
# the averaged predictions and the actual values are cut at the same 70% row mark.
split_at = int(X_dif_gold.shape[0]*0.7)

train_y_dif_ma, test_y_dif_ma = y_dif_ma.iloc[:split_at], y_dif_ma.iloc[split_at:]

train_y_dif, test_y_dif = y_dif.iloc[:split_at], y_dif.iloc[split_at:]

#### Cross Validation: 10 Day Moving Average Regression Daily Gold Price Difference (USD)

In [None]:
ma_dif_gold_score = cross_val_and_score_ma(train_y_dif_ma,train_y_dif,test_y_dif_ma,test_y_dif)

In [None]:
d_score['10 Day Moving Average Regressor: Daily Gold Price Difference (USD)'] = ma_dif_gold_score

In [None]:
plot_ma_model(y_dif_ma,y_dif)
#plt.savefig('dif_gold_ma')

## 10 Day Moving Average Regressor: Daily Percent Change Gold Price

In [None]:
X_change_gold.info()

In [None]:
y_change_ma = X_change_gold.mean(axis = 1)

In [None]:
# A simple moving average has nothing to fit, so no feature matrix is split here:
# the averaged predictions and the actual values are cut at the same 70% row mark.
# The mark is derived from the percent-change frame itself (the previous version
# sized it from X_dif_gold, a copy-paste slip from the price-difference section),
# which keeps this split consistent with the other percent-change splits.
split_at = int(X_change_gold.shape[0]*0.7)

train_y_change_ma = y_change_ma[:split_at]
test_y_change_ma = y_change_ma[split_at:]

train_y_change = y_change[:split_at]
test_y_change = y_change[split_at:]

#### Cross Validation: 10 Day Moving Average Regression Daily Percent Change Gold Price

In [None]:
ma_change_gold_score = cross_val_and_score_ma(train_y_change_ma,train_y_change,test_y_change_ma,test_y_change)

In [None]:
d_score['10 Day Moving Average Regressor: Daily Percent Change Gold Price'] = ma_change_gold_score

In [None]:
plot_ma_model(y_change_ma,y_change)
#plt.savefig('change_gold_ma')

### 10 Day Moving Average Regressor for Gold Price Summary:
A 10 Day Moving Average Regressor also does a poor job in explaining variation in the Daily Price Difference (R^2 Score = -0.13) as well as the Daily Percent Change in Price (R^2 Score = -0.13).

#### Idea: A gradient boosting regressor comes to mind, because all of our target and feature variables are continuous, and it's evident both linear regression and moving-average models do a poor job of explaining gold's variation in price. 

## Gradient Boosting Regressor: Daily Gold Price Difference, 10 day lag

In [None]:
# Chronological 70/30 split of the gold-lag features and the difference target
n_train = int(len(X_dif_gold)*0.7)
train_X_dif_gold = X_dif_gold.iloc[:n_train]
test_X_dif_gold = X_dif_gold.iloc[n_train:]
train_y_dif = y_dif.iloc[:n_train]
test_y_dif = y_dif.iloc[n_train:]

#### Cross Validation: Gradient Boosting Regressor of Daily Gold Price Difference, 10 day lag

In [None]:
dif_gold_score, dif_gold_model = cross_val_and_score(GradientBoostingRegressor(learning_rate = .01),train_X_dif_gold, test_X_dif_gold, train_y_dif, test_y_dif)

In [None]:
d_score['Gradient Boosting Regressor: Daily Gold Price Difference, 10 day lag'] = dif_gold_score

In [None]:
plot_model(dif_gold_model,test_X_dif_gold,test_y_dif)
#plt.savefig('dif_gold_gb')

## Gradient Boosting Regressor: Daily Percent Change Gold Price, 10 day lag

In [None]:
# Chronological 70/30 split of the gold-lag features and the percent-change target
n_train = int(len(X_change_gold)*0.7)
train_X_change_gold = X_change_gold.iloc[:n_train]
test_X_change_gold = X_change_gold.iloc[n_train:]
train_y_change = y_change.iloc[:n_train]
test_y_change = y_change.iloc[n_train:]

#### Cross Validation: Gradient Boosting Regressor of Daily Percent Change Gold Price, 10 day lag

In [None]:
change_gold_score, change_gold_model = cross_val_and_score(GradientBoostingRegressor(learning_rate = .01),train_X_change_gold, test_X_change_gold, train_y_change, test_y_change)

In [None]:
d_score['Gradient Boosting Regressor: Daily Percent Change Gold Price, 10 day lag'] = change_gold_score

In [None]:
plot_model(change_gold_model,test_X_change_gold,test_y_change)
#plt.savefig('change_gold_gb')

### Gradient Boosting Regression for Gold Price, 10 Day Lag Summary:
We only made an improvement from our Linear Model with respect to Daily Percent Change in Price (R^2 Score = -0.005).
The R^2 score for our Daily Price Difference model actually dropped a bit (R^2 Score = -0.011).
#### Idea: Let's introduce our other features (stock market index performance, central bank policy rates, and their 10 day lags) to see if any of these metrics can help inform our model. We can use permutation importance to determine which features, if any, are most informative.

## Gradient Boosting Regressor: Daily Gold Price Difference, All Features

In [None]:
y_dif = df_Xy_dif.reset_index()['gold_dif']

In [None]:
# Columns to exclude from the feature matrix: the target, the date, and every
# column that is not a lagged value
to_drop = ['gold_dif','date'] + [name for name in df_Xy_dif.columns if 'lag' not in name]

In [None]:
X_dif = df_Xy_dif.reset_index().drop(columns = to_drop)

In [None]:
X_dif.info()

In [None]:
# Chronological 70/30 split of the full lagged feature set and the difference target
split_at = int(X_dif.shape[0]*0.7)
train_X_dif, test_X_dif = X_dif.iloc[:split_at], X_dif.iloc[split_at:]
train_y_dif, test_y_dif = y_dif.iloc[:split_at], y_dif.iloc[split_at:]

#### Cross Validation Gradient Boosting Regressor of Daily Gold Price Difference, All Features

In [None]:
dif_score, dif_model = cross_val_and_score(GradientBoostingRegressor(learning_rate = .01),train_X_dif, test_X_dif, train_y_dif, test_y_dif)

In [None]:
d_score['Gradient Boosting Regressor: Daily Gold Price Difference, All Features'] = dif_score

In [None]:
plot_model(dif_model,test_X_dif,test_y_dif)
#plt.savefig('dif_gb')

## Gradient Boosting Regressor: Daily Gold Price Percent Change, All Features

In [None]:
y_change = df_Xy_change.reset_index()['gold_change%']

In [None]:
# Columns to exclude from the feature matrix: the target, the date, and every
# column that is not a lagged value
to_drop = ['gold_change%','date'] + [name for name in df_Xy_change.columns if 'lag' not in name]

In [None]:
X_change = df_Xy_change.reset_index().drop(columns = to_drop)

In [None]:
X_change.info()

In [None]:
# Chronological 70/30 split of the full lagged feature set and the percent-change target
split_at = int(X_change.shape[0]*0.7)
train_X_change, test_X_change = X_change.iloc[:split_at], X_change.iloc[split_at:]
train_y_change, test_y_change = y_change.iloc[:split_at], y_change.iloc[split_at:]

#### Cross Validation Gradient Boosting Regressor of Daily Gold Price Percent Change, All Features

In [None]:
change_score, change_model = cross_val_and_score(GradientBoostingRegressor(learning_rate = .01),train_X_change, test_X_change, train_y_change, test_y_change)

In [None]:
d_score['Gradient Boosting Regressor: Daily Gold Price Percent Change, All Features'] = change_score

In [None]:
plot_model(change_model,test_X_change,test_y_change)
#plt.savefig('change_gb')

### Gradient Boosting Regression for Gold Price - Stock Indices, Central Bank Policy Rates, and Gold Pricing
Unfortunately, it looks like the introduction of more features to our models actually decreased their accuracy, making them further underperform in contrast to a constant prediction: Daily Price Difference (R^2 Score = -0.024), Daily Percent Change in Price (R^2 Score = -0.016).
#### Idea: Thus far, our models have left much to be desired in terms of accuracy. They aren't explaining gold's variation in price difference or price change. However, from an investor's perspective, what is ultimately most important is whether or not the price will increase or decrease within a given time frame. Let's pivot to a classification framework and see if we can't get a better score. 


## Gradient Boosting Classifier: Daily Gold Price Difference, All Features

In [None]:
# Create boolean increase feature

y_dif = df_Xy_dif.reset_index()['gold_dif'] > 0

In [None]:
X_dif.info()

In [None]:
# Chronological 70/30 split of the lagged features and the boolean "price rose" target
n_train = int(len(X_dif)*0.7)
train_X_dif = X_dif.iloc[:n_train]
test_X_dif = X_dif.iloc[n_train:]
train_y_dif = y_dif.iloc[:n_train]
test_y_dif = y_dif.iloc[n_train:]

#### Cross Validation: Gradient Boosting Classifier of Daily Gold Price Difference, All Features

In [None]:
dif_score, dif_model = cross_val_and_score(GradientBoostingClassifier(learning_rate = .01),train_X_dif, test_X_dif, train_y_dif, test_y_dif)

In [None]:
d_score['Gradient Boosting Classifier: Daily Gold Price Difference, All Features'] = dif_score

#### Feature Importance: Gradient Boosting Classifier of Daily Gold Price Difference, All Features

In [None]:
# Permutation importance shuffles each feature at random, so the shuffling is
# seeded here to make the reported importances reproducible from run to run.
dif_imp = permutation_importance(dif_model,test_X_dif,test_y_dif, random_state = 0)

# Keep only the features whose mean importance exceeds 0.01
d_dif_imp = {
    col: imp
    for col, imp in zip(test_X_dif.columns, dif_imp['importances_mean'])
    if imp > 0.01
}


In [None]:
d_dif_imp

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
#Create confusion matrix
gb_cm_dif = print_confusion_matrix(dif_model,test_X_dif,test_y_dif)

In [None]:
gb_cm_dif

## Gradient Boosting Classifier: Daily Gold Price Percent Change, All Features

In [None]:
y_change = df_Xy_change.reset_index()['gold_change%'] > 0

In [None]:
X_change.info()

In [None]:
# Chronological 70/30 split of the lagged features and the boolean "price rose" target
n_train = int(len(X_change)*0.7)
train_X_change = X_change.iloc[:n_train]
test_X_change = X_change.iloc[n_train:]
train_y_change = y_change.iloc[:n_train]
test_y_change = y_change.iloc[n_train:]

#### Cross Validation Gradient Boosting Classifier of Daily Gold Price Percent Change, All Features

In [None]:
change_score, change_model = cross_val_and_score(GradientBoostingClassifier(learning_rate = .01),train_X_change, test_X_change, train_y_change, test_y_change)

In [None]:
d_score['Gradient Boosting Classifier: Daily Gold Price Percent Change, All Features'] = change_score

In [None]:
# Permutation importance shuffles each feature at random, so the shuffling is
# seeded here to make the reported importances reproducible from run to run.
change_imp = permutation_importance(change_model,test_X_change,test_y_change, random_state = 0)

# Keep only the features whose mean importance exceeds 0.01
d_change_imp = {
    col: imp
    for col, imp in zip(test_X_change.columns, change_imp['importances_mean'])
    if imp > 0.01
}

In [None]:
d_change_imp

In [None]:
#Create confusion matrix for the percent-change classifier, using its own test
#split (the previous version passed the price-difference model and data here,
#so it just repeated the price-difference confusion matrix)
gb_cm_change = print_confusion_matrix(change_model,test_X_change,test_y_change)

In [None]:
gb_cm_change

### Gradient Boosting Classification for Gold Price - Stock Indices, Central Bank Policy Rates, and Gold Pricing
You may have gotten excited by the score outputs above growing to be at least positive numbers, but it's important we understand the nuances between scoring classifiers versus regressors. While an R^2 score of 0 for regression implies a model that does no better than guessing a constant, we scored our classifiers using the accuracy metric. In this instance, a score of 0.5 (50%) is actually the threshold for positively contributing to the prediction of a value.

#### Good news: our classifiers' accuracy scores for Daily Gold Price Difference (accuracy = 0.55) and Daily Gold Price Percent Change (accuracy = 0.53) are positively contributing to the prediction of gold's price movement.

#### Bad news: the classifiers are doing so at a rate only slightly better than a coin flip. Of all of the features in our models, only one had a permutation importance greater than 0.01. It was the Nasdaq 9 day lag re: Gradient Boosting Classifier - Daily Difference in Gold Price. Perhaps a relationship does exist there, but I'm not convinced.

# Conclusion
### If a relationship between interest rates, stock market indices, and the price of gold does exist, it doesn't appear to be properly captured with the features we have chosen and a 10 day lag.

### Further study
- Investigate longer lag
- Investigate polynomial and interaction transformations of features with current data