In [5]:
import pandas as pd, numpy as np
import matplotlib, seaborn as sns
matplotlib.use('Qt5Agg')
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (8,3.5)
plt.rcParams['figure.constrained_layout.use'] = True

In [3]:
# load data and crop to earliest_all and latest_all
data_list = {}
earliest_all = '2015-08-07'
latest_all = '2020-06-26'
mydateparser = lambda x: pd.datetime.strptime(str(x), '%Y-%m-%d %H:%M:%S')
mydateparser1 = lambda x: pd.datetime.strptime(str(x), '%Y-%m-%d')
btc_series = pd.read_excel('Data/BTC_closing.xlsx',squeeze=True, parse_dates=[0], 
                           index_col=0, date_parser=mydateparser)[earliest_all:latest_all]
data_list['btc_series'] = btc_series
eth_series = pd.read_excel('Data/ETH.xlsx',squeeze=True, parse_dates=[0], 
                           index_col=0, date_parser=mydateparser, usecols='A,E')[earliest_all:latest_all]
data_list['eth_series'] = eth_series

oil_WTI = pd.read_excel('Data/DCOILWTICO.xls', squeeze=True, parse_dates=[0], 
                        index_col=0, skiprows=10, date_parser=mydateparser)[earliest_all:latest_all]
data_list['oil_WTI'] = oil_WTI
oil_BRENT = pd.read_excel('Data/DCOILBRENTEU.xls', squeeze=True, parse_dates=[0], 
                          index_col=0, skiprows=10, date_parser=mydateparser)[earliest_all:latest_all]
data_list['oil_BRENT'] = oil_BRENT

ex_JPN_USD = pd.read_excel('Data/DEXJPUS.xls', squeeze=True, parse_dates=[0], 
                           index_col=0, skiprows=10, date_parser=mydateparser)[earliest_all:latest_all]
data_list['ex_JPN_USD'] = ex_JPN_USD
ex_EUR_USD = 1 / pd.read_excel('Data/DEXUSEU.xls', squeeze=True, parse_dates=[0], 
                               index_col=0, skiprows=10, date_parser=mydateparser)[earliest_all:latest_all]
data_list['ex_EUR_USD'] = ex_EUR_USD
ex_GBP_USD = 1 / pd.read_excel('Data/DEXUSUK.xls', squeeze=True, parse_dates=[0], 
                               index_col=0, skiprows=10, date_parser=mydateparser)[earliest_all:latest_all]
data_list['ex_GBP_USD'] = ex_GBP_USD

gold_10am = pd.read_excel('Data/GOLDAMGBD228NLBM10AM.xls', squeeze=True, parse_dates=[0], 
                          index_col=0, skiprows=10, date_parser=mydateparser)[earliest_all:latest_all]
data_list['gold_10am'] = gold_10am
gold_03pm = pd.read_excel('Data/GOLDPMGBD228NLBM3PM.xls', squeeze=True, parse_dates=[0], 
                          index_col=0, skiprows=10, date_parser=mydateparser)[earliest_all:latest_all]
data_list['gold_03pm'] = gold_03pm

dax = pd.read_excel('Data/SDAXI.xlsx', squeeze=True, parse_dates=[0], 
                    index_col=0, date_parser=mydateparser1, usecols=[0,5])[earliest_all:latest_all]
data_list['dax'] = dax
sp500 = pd.read_excel('Data/SP500.xls', squeeze=True, parse_dates=[0], 
                      index_col=0, skiprows=10, date_parser=mydateparser)[earliest_all:latest_all]
data_list['sp500'] = sp500

In [37]:
# get series overview: same start and end but different lengths, because some days are missing
def print_data_list():
    for i in data_list:
        if i == 'dax' or i == 'sp500': 
            t = 4
        else: 
            t = 3
        print(i, '\t'*t, 'n =' , len(data_list[i]), '\nFirst:\t', 
              data_list[i].index[0], '\nLast:\t',
              data_list[i].index[-1], '\n')

print_data_list()

btc_series 			 n = 1786 
First:	 2015-08-07 00:00:00 
Last:	 2020-06-26 00:00:00 

eth_series 			 n = 1786 
First:	 2015-08-07 00:00:00 
Last:	 2020-06-26 00:00:00 

oil_WTI 			 n = 1276 
First:	 2015-08-07 00:00:00 
Last:	 2020-06-26 00:00:00 

oil_BRENT 			 n = 1276 
First:	 2015-08-07 00:00:00 
Last:	 2020-06-26 00:00:00 

ex_JPN_USD 			 n = 1276 
First:	 2015-08-07 00:00:00 
Last:	 2020-06-26 00:00:00 

ex_EUR_USD 			 n = 1276 
First:	 2015-08-07 00:00:00 
Last:	 2020-06-26 00:00:00 

ex_GBP_USD 			 n = 1276 
First:	 2015-08-07 00:00:00 
Last:	 2020-06-26 00:00:00 

gold_10am 			 n = 1276 
First:	 2015-08-07 00:00:00 
Last:	 2020-06-26 00:00:00 

gold_03pm 			 n = 1276 
First:	 2015-08-07 00:00:00 
Last:	 2020-06-26 00:00:00 

dax 				 n = 1239 
First:	 2015-08-07 00:00:00 
Last:	 2020-06-26 00:00:00 

sp500 				 n = 1276 
First:	 2015-08-07 00:00:00 
Last:	 2020-06-26 00:00:00 



In [14]:
# for each series, resample to get every day between start and end period
# for each series, fill newly added days with NaN, then interpolate (method=time)

def fill_in_missing_data(data_list):
    for i in data_list:
        # resample
        new_i = data_list[i].resample('D').sum()
        # replace 0 with NaN
        new_i.replace(0, np.nan, inplace=True)
        new_i.replace(np.inf, np.nan, inplace=True)
        # interpolate (time)
        new_i.interpolate(method='time', inplace=True)
        data_list[i] = new_i

#fill_in_missing_data(data_list)
#print_data_list()

# from now on, only data_list contains the right values

In [50]:
# create plot to show how interpolation removed gaps in data

ex_EUR_USD_interpolated  = ex_EUR_USD.resample('D').sum()
ex_EUR_USD_interpolated .replace(0, np.nan, inplace=True)
ex_EUR_USD_interpolated .replace(np.inf, np.nan, inplace=True)
ex_EUR_USD_interpolated .interpolate(method='time', inplace=True)

fig, axes = plt.subplots(1, 2, sharex=True, sharey=True)
# plot normal series with gaps
axes[0].plot(data_list['ex_EUR_USD']); axes[0].set_title('ex_EUR_USD')
# plot interpolated series without gaps
axes[1].plot(ex_EUR_USD_interpolated); axes[1].set_title('ex_EUR_USD_interpolated')

Text(0.5, 1.0, 'ex_EUR_USD_interpolated')

In [35]:
lista = [data_list['btc_series'], data_list['eth_series']]
type(lista)

fill_in_missing_data(lista)

TypeError: list indices must be integers or slices, not Series

In [12]:
# visualize data
fig, axes = plt.subplots(4, 3, sharex=True)
data_list_listed = list(data_list)
for i, ax in enumerate(axes.flatten()):
   if i is 11: continue
   curr_key = data_list_listed[i]
   curr_data = data_list[curr_key]
   ax.plot(curr_data)
   ax.set_title(curr_key)

In [None]:
# plot correlation matrix

In [154]:
# test Granger causality
from statsmodels.tsa.stattools import grangercausalitytests
def granger_causation_matrix(data, variables, verbose=False):
    matrix = pd.DataFrame(np.zeros((len(variables), len(variables))), columns=variables, index=variables)
    for c in matrix.columns:
        for r in matrix.index:
            # data = [y,x] where x causes y
            result = grangercausalitytests(x=data[[r,c]], maxlag=maxlag, verbose=False)
            p_values = [round(result[i+1][0][test][1],4) for i in range(maxlag)]
            if verbose: print(f'Y = {r}, X = {c}, P Values = {p_values}')
            min_p_value = np.min(p_values)
            matrix.loc[r,c] = min_p_value
    matrix.columns = [var + '_x' for var in variables]
    matrix.index = [var + '_y' for var in variables]
    return matrix

maxlag=12
test = 'ssr_chi2test'
data = pd.DataFrame(data_list)
granger_matrix = granger_causation_matrix(data, variables=list(data_list.keys()))
granger_matrix

In [183]:
# cointegration test
from statsmodels.tsa.vector_ar.vecm import coint_johansen
def cointegration_test(data, alpha=0.05):
    result = coint_johansen(data, -1, 10)
    d = {'0.90':0, '0.95':1, '0.99':2}
    trace_stat = result.lr1
    crit_val_trace_stat = result.cvt[:, d[str(1-alpha)]]
    
    def adjust(val, length = 6): 
        return str(val).ljust(length)
    
    print('Name   \t\t ::  Test Stat > C(95%)    =>   Signif  \n', '--'*20)
    for col, trace, cvt in zip(data.columns, trace_stat, crit_val_trace_stat):
        if col is 'dax' or col is 'sp500':
            tab = 2
        else:
            tab = 1
        print(adjust(col), '\t'*tab ,':: ', adjust(round(trace,2), 9), ">", adjust(cvt, 8), ' =>  ' , trace > cvt)
    
data = pd.DataFrame(data_list)
cointegration_test(data)