In [2]:
# set up imports
from datetime import datetime

import pandas as pd, numpy as np, seaborn as sns
from tabulate import tabulate
import statsmodels
import arch
import matplotlib
# choose the Qt5 backend (done here, ahead of the pyplot import below)
matplotlib.use('qt5agg')

# configure plot style: STIX fonts, fixed figure size/dpi, thin lines
import matplotlib.pyplot as plt
plt.rcParams.update({
    'mathtext.fontset': 'stix',
    'font.family': 'STIXGeneral',
    'figure.figsize': (9.5, 4.15),
    'figure.constrained_layout.use': False,
    'figure.dpi': 100,
    'savefig.dpi': 100,
    'lines.linewidth': 0.8,
})
# output folder; used as a prefix for the LaTeX report written by stationarity_tests
save_plot_to =  r'C:\\Users\joche\OneDrive\03 TUM - TUM-BWL\Semester 8\01 Bachelorarbeit\04 Results\Plots/'

In [3]:
# load data and crop to start_date:end_date
start_date = '2015-08-07'
end_date = '2020-06-26'
# full calendar-day range; the sparser series are reindexed onto it before interpolation
idx = pd.date_range(start_date, end_date)
index_name = 'date'


def mydateparser(x):
    """Parse a value whose string form is 'YYYY-MM-DD HH:MM:SS' into a datetime.

    Raises ValueError if ``str(x)`` does not match that format.
    """
    # stdlib datetime replaces the `pd.datetime` alias, which newer pandas no longer provides
    return datetime.strptime(str(x), '%Y-%m-%d %H:%M:%S')


def mydateparser1(x):
    """Parse a value whose string form is 'YYYY-MM-DD' into a datetime.

    Raises ValueError if ``str(x)`` does not match that format.
    """
    return datetime.strptime(str(x), '%Y-%m-%d')

# btc: read the workbook, crop to the sample window, then label index and column
btc = pd.read_excel(
    'Data/BTC_closing.xlsx',
    index_col=0,
    parse_dates=[0],
    date_parser=mydateparser,
)
btc = btc[start_date:end_date]
btc.columns = ['btc']
btc.index.name = index_name

# usd_eur: first 10 rows of the sheet are skipped (presumably a file header - confirm)
usd_eur = pd.read_excel('Data/DEXUSEU.xls', parse_dates=[0], index_col=0, skiprows=10, date_parser=mydateparser)[start_date:end_date]
usd_eur.index.name = index_name
usd_eur.columns = ['usd_eur']
# drop rows whose value is 0 (presumably placeholders for days without a quote - confirm);
# the axis is passed by keyword because newer pandas rejects it as a positional argument
usd_eur = usd_eur.loc[(usd_eur != 0).any(axis=1)]

# tot_btc: only weekly data - missing values interpolated
# NOTE(review): the crop runs on the raw CSV index before it is converted to dates - confirm the bounds behave as intended
tot_btc = pd.read_csv('Data/total-bitcoins', index_col=0)[start_date:end_date]
# .normalize() already truncates to midnight; the constructor's `normalize`
# keyword was redundant and is deprecated in recent pandas, so it is dropped
tot_btc.index = pd.DatetimeIndex(tot_btc.index).normalize()
tot_btc.index.name = index_name
# expand to the daily calendar, then fill the gaps by time-based interpolation
# NOTE(review): reindexing onto `idx` likely resets the index name set above - confirm
tot_btc = tot_btc.reindex(idx, fill_value=None)
tot_btc = tot_btc.interpolate(method='time', limit_direction='both')
tot_btc.columns = ['tot_btc']

# hs_rate: only weekly data - missing values interpolated
# NOTE(review): the crop runs on the raw CSV index before it is converted to dates - confirm the bounds behave as intended
hs_rate = pd.read_csv('Data/hash-rate', index_col=0)[start_date:end_date]
# .normalize() already truncates to midnight; the constructor's `normalize`
# keyword was redundant and is deprecated in recent pandas, so it is dropped
hs_rate.index = pd.DatetimeIndex(hs_rate.index).normalize()
hs_rate.index.name = index_name
# expand to the daily calendar, then fill the gaps by time-based interpolation
# NOTE(review): reindexing onto `idx` likely resets the index name set above - confirm
hs_rate = hs_rate.reindex(idx, fill_value=None)
hs_rate = hs_rate.interpolate(method='time', limit_direction='both')
hs_rate.columns = ['hs_rate']

# eth: only sheet columns A and E are read (A becomes the date index)
eth = pd.read_excel(
    'Data/ETH.xlsx',
    usecols='A,E',
    index_col=0,
    parse_dates=[0],
    date_parser=mydateparser,
)
eth = eth[start_date:end_date]
eth.columns = ['eth']
eth.index.name = index_name

# ggl_trends: only weekly data - missing values interpolated
# ggl_trends: weighted average of 15 countries
# NOTE(review): the code takes the row sum divided by 15, so any weighting is presumably already in the file - confirm
ggl_trends = pd.read_csv('Data/googletrends.txt', index_col=0)[start_date:end_date]
# .normalize() already truncates to midnight; the constructor's `normalize`
# keyword was redundant and is deprecated in recent pandas, so it is dropped
ggl_trends.index = pd.DatetimeIndex(ggl_trends.index).normalize()
ggl_trends.index.name = index_name
# expand to the daily calendar, then fill the gaps by time-based interpolation
# NOTE(review): reindexing onto `idx` likely resets the index name set above - confirm
ggl_trends = ggl_trends.reindex(idx, fill_value=None)
ggl_trends = ggl_trends.interpolate(method='time', limit_direction='both')
# collapse the per-country columns into one rounded average and keep only that column
ggl_trends['btc-average'] = np.round(ggl_trends.sum(axis=1)/15)
ggl_trends = ggl_trends[['btc-average']].copy()
ggl_trends.columns = ['ggl_trends']

# wiki_views: sum of 99 countries
wiki_views = pd.read_excel('Data/wikipedia.xlsx', parse_dates=[0], index_col=0, date_parser=mydateparser1)[start_date:end_date]
# .normalize() already truncates to midnight; the constructor's `normalize`
# keyword was redundant and is deprecated in recent pandas, so it is dropped
wiki_views.index = pd.DatetimeIndex(wiki_views.index).normalize()
wiki_views.index.name = index_name
# collapse all columns into one rounded row total and keep only that column
wiki_views['wiki-total'] = np.round(wiki_views.sum(axis=1))
wiki_views = wiki_views[['wiki-total']].copy()
wiki_views.columns = ['wiki_views']

# wti_oil: first 10 rows of the sheet are skipped (presumably a file header - confirm)
oil_wti = pd.read_excel('Data/DCOILWTICO.xls', parse_dates=[0], index_col=0, skiprows=10, date_parser=mydateparser)[start_date:end_date]
# .normalize() already truncates to midnight; the constructor's `normalize`
# keyword was redundant and is deprecated in recent pandas, so it is dropped
oil_wti.index = pd.DatetimeIndex(oil_wti.index).normalize()
oil_wti.index.name = index_name
oil_wti.columns = ['oil_wti']
# drop rows whose value is 0 (presumably placeholders for days without a quote - confirm);
# the axis is passed by keyword because newer pandas rejects it as a positional argument
oil_wti = oil_wti.loc[(oil_wti != 0).any(axis=1)]

# gold: first 10 rows of the sheet are skipped (presumably a file header - confirm)
gold = pd.read_excel('Data/GOLDAMGBD228NLBM10AM.xls', parse_dates=[0], index_col=0, skiprows=10, date_parser=mydateparser)[start_date:end_date]
# .normalize() already truncates to midnight; the constructor's `normalize`
# keyword was redundant and is deprecated in recent pandas, so it is dropped
gold.index = pd.DatetimeIndex(gold.index).normalize()
gold.index.name = index_name
gold.columns = ['gold']
# drop rows whose value is 0 (presumably placeholders for days without a quote - confirm);
# the axis is passed by keyword because newer pandas rejects it as a positional argument
gold = gold.loc[(gold != 0).any(axis=1)]

# sp500: first 10 rows of the sheet are skipped (presumably a file header - confirm)
sp500 = pd.read_excel('Data/SP500.xls', parse_dates=[0], index_col=0, skiprows=10, date_parser=mydateparser)[start_date:end_date]
# .normalize() already truncates to midnight; the constructor's `normalize`
# keyword was redundant and is deprecated in recent pandas, so it is dropped
sp500.index = pd.DatetimeIndex(sp500.index).normalize()
sp500.index.name = index_name
sp500.columns = ['sp500']
# drop rows whose value is 0 (presumably placeholders for days without a quote - confirm);
# the axis is passed by keyword because newer pandas rejects it as a positional argument
sp500 = sp500.loc[(sp500 != 0).any(axis=1)]

# sse
sse = pd.read_excel('Data/SSEcomposite.xlsx', parse_dates=[0], index_col=0, header=0, date_parser=mydateparser)[start_date:end_date]
# .normalize() already truncates to midnight; the constructor's `normalize`
# keyword was redundant and is deprecated in recent pandas, so it is dropped
sse.index = pd.DatetimeIndex(sse.index).normalize()
sse.index.name = index_name
# keep only the 'Zuletzt' column (German for "last"; presumably the closing quote - confirm)
sse = sse[['Zuletzt']].copy()
sse.columns = ['sse']

# ffd_rate: first 10 rows of the sheet are skipped (presumably a file header - confirm)
ffd_rate = pd.read_excel('Data/DFF.xls', parse_dates=[0], index_col=0, skiprows=10, date_parser=mydateparser)[start_date:end_date]
# .normalize() already truncates to midnight; the constructor's `normalize`
# keyword was redundant and is deprecated in recent pandas, so it is dropped
ffd_rate.index = pd.DatetimeIndex(ffd_rate.index).normalize()
ffd_rate.index.name = index_name
ffd_rate.columns = ['ffd_rate']

In [4]:
# merge data to one df: successive inner joins on the date index,
# so only dates present in every series survive
from functools import reduce


def _join_on_index(left, right):
    """Merge two frames on their indexes (pd.merge default, i.e. inner join)."""
    return pd.merge(left, right, left_index=True, right_index=True)


temp = [
    btc, tot_btc, hs_rate, eth, ggl_trends, wiki_views,
    usd_eur, oil_wti, gold, sp500, sse, ffd_rate,
]
df = reduce(_join_on_index, temp)
df

Unnamed: 0_level_0,btc,tot_btc,hs_rate,eth,ggl_trends,wiki_views,usd_eur,oil_wti,gold,sp500,sse,ffd_rate
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2015-08-07,279.58,1.447762e+07,3.775640e+05,2.770000,2.0,15560,1.0958,43.87,1091.35,2077.57,3744.20,0.14
2015-08-10,264.47,1.448130e+07,3.687855e+05,0.708448,2.0,15113,1.0994,44.94,1094.80,2104.18,3928.42,0.14
2015-08-11,270.39,1.448498e+07,3.754390e+05,1.070000,2.0,15320,1.1042,43.11,1113.25,2084.07,3927.91,0.15
2015-08-12,266.38,1.448865e+07,3.820926e+05,1.220000,2.0,15255,1.1198,43.22,1116.80,2086.05,3886.32,0.15
2015-08-13,264.08,1.449229e+07,3.887461e+05,1.830000,2.0,23711,1.1144,42.27,1117.35,2083.39,3954.56,0.15
...,...,...,...,...,...,...,...,...,...,...,...,...
2020-06-18,9411.84,1.840797e+07,1.090229e+08,232.100000,10.0,16234,1.1216,38.79,1732.65,3115.34,2939.32,0.09
2020-06-19,9288.02,1.840862e+07,1.074749e+08,227.140000,10.0,15721,1.1189,39.72,1728.55,3097.74,2967.63,0.09
2020-06-22,9648.72,1.841192e+07,1.074657e+08,242.530000,10.0,16482,1.1260,40.60,1745.45,3117.86,2965.27,0.08
2020-06-23,9629.66,1.841279e+07,1.082351e+08,244.140000,10.0,16216,1.1322,40.40,1756.60,3131.29,2970.62,0.08


In [7]:
# define plotting function
def plot_df(df, nrows=4, ncols=3, figsize=(9.5, 4.15)):
    """Plot each column of ``df`` as a black line in its own panel of one figure.

    Parameters
    ----------
    df : pandas.DataFrame
        One series per column; each is plotted against the frame's index.
    nrows, ncols : int, optional
        Shape of the panel grid. Defaults to the original 4 x 3 layout.
    figsize : tuple of float, optional
        Figure size in inches. Defaults to the original (9.5, 4.15).

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding all panels.

    Notes
    -----
    Columns beyond ``nrows * ncols`` are not drawn (unchanged behaviour).
    Frames with fewer columns than panels used to raise IndexError; the
    surplus panels are now hidden instead.
    """
    # squeeze=False keeps `axs` two-dimensional even for a 1 x 1 grid
    fig_i, axs = plt.subplots(nrows, ncols, figsize=figsize, squeeze=False)
    panels = axs.flatten()
    # zip stops at the shorter of (panels, columns)
    for ax, col in zip(panels, df.columns):
        ax.plot(df[col], color='black')
        ax.set_title(col)
        # no tick marks on either axis; tick labels stay, in a small font
        ax.xaxis.set_ticks_position('none')
        ax.yaxis.set_ticks_position('none')
        ax.tick_params(labelsize=8)
    # hide panels that received no series
    for ax in panels[len(df.columns):]:
        ax.set_visible(False)
    fig_i.tight_layout()
    return fig_i

In [93]:
# plot raw time series: each column of the merged frame in its own panel;
# the figure handle is kept in fig_1
fig_1 = plot_df(df)

In [6]:
# natural log of every series; column names get a '_log' suffix
df_log = np.log(df)
new_cols = [name + '_log' for name in df_log.columns]
df_log.columns = new_cols

# plot log time series (drawn before the NaN fill below, so gaps stay visible)
fig_2 = plot_df(df_log)

# fill na value of negative oil price on 2020-04-20 with 0
# NOTE(review): a log value of 0 corresponds to a price of 1 - confirm this is the intended substitute
df_log = df_log.fillna(value=0)

  

To register the converters:
	>>> from pandas.plotting import register_matplotlib_converters
	>>> register_matplotlib_converters()


btc_log           0
tot_btc_log       0
hs_rate_log       0
eth_log           0
ggl_trends_log    0
wiki_views_log    0
usd_eur_log       0
oil_wti_log       0
gold_log          0
sp500_log         0
sse_log           0
ffd_rate_log      0
dtype: int64

In [100]:
# plot correlation heatmap
corr = df.corr()
# dedicated figure/axes so the heatmap can never be drawn onto a leftover current figure
fig_3, ax = plt.subplots(figsize=(9.5, 4.15))
# more html colors here: https://www.w3schools.com/colors/colors_names.asp
sns.heatmap(corr, xticklabels=corr.columns.values, yticklabels=corr.columns.values, annot=True,
            annot_kws={'size':7}, vmin=-1, center=0, vmax=1, cmap="YlGnBu", ax=ax)
# pin the y-range to the full matrix (n rows, inverted axis). The former
# "ylim +/- 0.5" shift presumably worked around the matplotlib release that clipped
# the first and last heatmap row; on other releases that shift adds empty half rows.
ax.set_ylim(len(corr), 0)
# column labels on top, no tick marks
ax.tick_params(axis='both', which='major', labelsize=7.5, labelbottom=False, bottom=False, top=False, labeltop=True)
# NOTE(review): rotation=1 is kept from the original - confirm 1 degree is intended
plt.xticks(fontsize=7.5, rotation=1)
plt.yticks(fontsize=7.5);

(array([ 0.5,  1.5,  2.5,  3.5,  4.5,  5.5,  6.5,  7.5,  8.5,  9.5, 10.5,
        11.5]), <a list of 12 Text yticklabel objects>)

In [8]:
# adf- and pp-test of all time series
from arch.unitroot import ADF, PhillipsPerron
def stationarity_tests(df, latex):
    """Run ADF and Phillips-Perron unit-root tests on every column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        One time series per column.
    latex : bool
        Falsy: print the plain-text summaries to stdout.
        Truthy: append the LaTeX summaries to
        ``save_plot_to + 'Stationarity_Tests_LaTeX.txt'``.

    Returns
    -------
    None

    Notes
    -----
    The flag is evaluated by truthiness. The former ``latex is False`` identity
    check sent falsy non-bool values (``0``, ``None``) to the file branch.
    """
    if not latex:
        for col in df:
            adf = ADF(df[col])
            pp = PhillipsPerron(df[col])
            # write summary as plain text to std.out
            print('Timeseries:\t',col,'\n',
                  adf.summary(),'\n\n',pp.summary(),'\n\n\n')
        return

    # build the whole report first so the file is opened once, not once per column
    report = []
    for col in df:
        adf = ADF(df[col])
        pp = PhillipsPerron(df[col])
        report.append(f'Timeseries:\t{col}\n'
                      + adf.summary().as_latex() + '\n\n'
                      + pp.summary().as_latex() + '\n\n\n')

    if report:
        # append mode: repeated calls accumulate in the same file
        with open(save_plot_to + 'Stationarity_Tests_LaTeX.txt', 'a') as myfile:
            myfile.write(''.join(report))

In [103]:
# ADF and Phillips-Perron tests on the raw (level) series;
# latex=False prints the summaries to stdout instead of appending them to the LaTeX file
stationarity_tests(df, latex=False)

In [125]:
# ADF and Phillips-Perron tests on the log-transformed series;
# latex=False prints the summaries to stdout instead of appending them to the LaTeX file
stationarity_tests(df_log, latex=False)

Timeseries:	 btc 
transformation:	 log 
    Augmented Dickey-Fuller Results   
Test Statistic                 -1.502
P-value                         0.533
Lags                                0
-------------------------------------

Trend: Constant
Critical Values: -3.44 (1%), -2.86 (5%), -2.57 (10%)
Null Hypothesis: The process contains a unit root.
Alternative Hypothesis: The process is weakly stationary. 

      Phillips-Perron Test (Z-tau)    
Test Statistic                 -1.488
P-value                         0.540
Lags                               22
-------------------------------------

Trend: Constant
Critical Values: -3.44 (1%), -2.86 (5%), -2.57 (10%)
Null Hypothesis: The process contains a unit root.
Alternative Hypothesis: The process is weakly stationary. 



Timeseries:	 tot_btc 
transformation:	 log 
    Augmented Dickey-Fuller Results   
Test Statistic                 -3.067
P-value                         0.029
Lags                               19
-----------------

In [9]:
# first differences of the log series; rows containing NaN (at least the
# first one) are dropped and the column names get a '_diff' suffix
df_log_diff1 = df_log.diff().dropna()
new_cols = [name + '_diff' for name in df_log_diff1.columns]
df_log_diff1.columns = new_cols

# plot the differenced log series
fig_4 = plot_df(df_log_diff1)

In [157]:
# ADF and Phillips-Perron tests on the first differences of the log series;
# latex=False prints the summaries to stdout instead of appending them to the LaTeX file
stationarity_tests(df_log_diff1, latex=False)

In [None]:
# write all ADF and Phillips-Perron results (raw, log, log first differences) as LaTeX to file.
# The report file is opened in append mode, so re-running this cell adds the same entries again.
stationarity_tests(df, latex=True)
stationarity_tests(df_log, latex=True)
stationarity_tests(df_log_diff1, latex=True)

In [49]:
# chronological split of the raw frame: leading 80 % for training, remainder for testing
# NOTE(review): the cut-off is computed from len(df_log_diff1) but applied to df - confirm this is intended
train_size = int(0.8 * len(df_log_diff1))
X_train = df.iloc[:train_size]
X_test = df.iloc[train_size:]
# sanity check: the two parts must add up to the full frame
print('df:\t\t',len(df))
print('X_train:\t',len(X_train))
print('X_test:\t\t',len(X_test))
print('Sum train+test:\t', len(X_train)+len(X_test))

df:		 1112
X_train:	 888
X_test:		 224
Sum train+test:	 1112


In [50]:
# make X_train stationary: log and first differences - do *not* drop 1st NA-value
# Derived from the raw frame rather than from X_train itself, so re-running this
# cell cannot apply log/diff a second time to already transformed data.
X_train = np.log(df.iloc[:train_size]).diff()
X_train

Unnamed: 0_level_0,btc,tot_btc,hs_rate,eth,ggl_trends,wiki_views,usd_eur,oil_wti,gold,sp500,sse,ffd_rate
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2015-08-07,,,,,,,,,,,,
2015-08-10,-0.055561,0.000254,-0.023525,-1.363526,0.000000,-0.029148,0.003280,0.024098,0.003156,0.012727,0.048029,0.000000
2015-08-11,0.022138,0.000254,0.017881,0.412337,0.000000,0.013604,0.004357,-0.041573,0.016712,-0.009603,-0.000130,0.068993
2015-08-12,-0.014942,0.000254,0.017567,0.131192,0.000000,-0.004252,0.014029,0.002548,0.003184,0.000950,-0.010645,0.000000
2015-08-13,-0.008672,0.000251,0.017264,0.405465,0.000000,0.441032,-0.004834,-0.022226,0.000492,-0.001276,0.017407,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
2019-06-24,0.081966,0.000296,-0.006393,0.051256,0.083382,0.256728,0.005722,0.006604,0.012242,-0.001733,0.002053,0.000000
2019-06-25,0.068426,0.000099,0.024530,0.024534,-0.083382,-0.007638,-0.000615,-0.001734,0.017002,-0.009542,-0.008708,0.000000
2019-06-26,0.098867,0.000098,0.023942,0.056881,-0.044452,0.413610,0.000351,0.026371,-0.016078,-0.001235,-0.001943,0.000000
2019-06-27,-0.151819,0.000098,0.010964,-0.134843,-0.046520,-0.173980,-0.001582,0.000169,-0.003204,0.003816,0.006868,0.000000


In [53]:
# granger causality test with transformed, stationary time series
# code taken from: https://stackoverflow.com/questions/58005681/is-it-possible-to-run-a-vector-autoregression-analysis-on-a-large-gdp-data-with
from statsmodels.tsa.stattools import grangercausalitytests
# highest lag order tried for every variable pair; read by grangers_causality_matrix
maxlag=12
# key of the statistic in the statsmodels result dict; the key uses an
# underscore ('ssr_chi2test'), the former hyphenated spelling would raise KeyError
test = 'ssr_chi2test'
def grangers_causality_matrix(X_train, variables, test = 'ssr_chi2test', verbose=False, lags=None):
    """Build a matrix of minimum Granger-causality p-values for all variable pairs.

    For every ordered pair the test is run on ``X_train[[r, c]]`` for lags
    1..max lag, and the smallest (4-decimal rounded) p-value is stored at
    row ``r``, column ``c``. Per the statsmodels convention the second column
    is tested as the cause of the first, hence rows are relabelled ``*_y``
    (response) and columns ``*_x`` (predictor).

    Parameters
    ----------
    X_train : pandas.DataFrame
        Series to test, one per column; must contain every name in ``variables``.
    variables : iterable of str
        Column names to include.
    test : str, optional
        Key of the statistic to read from the statsmodels result.
    verbose : bool, optional
        If True, print the p-values of every pair.
    lags : int, optional
        Highest lag order to test. Defaults to the module-level ``maxlag``,
        which was previously the only way to set it.

    Returns
    -------
    pandas.DataFrame
        Square frame of minimum p-values.
    """
    n_lags = maxlag if lags is None else lags
    variables = list(variables)
    dataset = pd.DataFrame(np.zeros((len(variables), len(variables))), columns=variables, index=variables)
    for c in variables:
        for r in variables:
            # statsmodels' own printing stays off; `verbose` only controls the line below
            test_result = grangercausalitytests(X_train[[r, c]], maxlag=n_lags, verbose=False)
            # the result dict is keyed by lag (1-based); [0][test][1] is the p-value
            p_values = [round(test_result[lag][0][test][1], 4) for lag in range(1, n_lags + 1)]
            if verbose: print(f'Y = {r}, X = {c}, P Values = {p_values}')
            dataset.loc[r, c] = np.min(p_values)
    dataset.columns = [var + '_x' for var in variables]
    dataset.index = [var + '_y' for var in variables]
    return dataset

In [56]:
# pairwise Granger causality tests on the training data;
# the first row is skipped because differencing left it NaN
granger_causality_matrix = grangers_causality_matrix(
    X_train.iloc[1:],
    variables=X_train.columns,
)
granger_causality_matrix

Unnamed: 0,btc_x,tot_btc_x,hs_rate_x,eth_x,ggl_trends_x,wiki_views_x,usd_eur_x,oil_wti_x,gold_x,sp500_x,sse_x,ffd_rate_x
btc_y,1.0,0.4635,0.0284,0.0631,0.0078,0.188,0.2399,0.4428,0.5524,0.1332,0.5943,0.4883
tot_btc_y,0.7201,1.0,0.2426,0.0519,0.1707,0.0,0.0356,0.0009,0.4137,0.0162,0.0011,0.0
hs_rate_y,0.0772,0.0,1.0,0.022,0.049,0.081,0.0149,0.003,0.0243,0.1404,0.4574,0.1592
eth_y,0.004,0.1467,0.1161,1.0,0.4315,0.1459,0.092,0.5324,0.2972,0.1555,0.0183,0.5316
ggl_trends_y,0.0831,0.2016,0.1148,0.2777,1.0,0.0037,0.0423,0.1252,0.1352,0.1298,0.3962,0.0079
wiki_views_y,0.0001,0.1851,0.181,0.4785,0.0,1.0,0.4707,0.0313,0.152,0.4444,0.0802,0.0207
usd_eur_y,0.2893,0.1617,0.5632,0.0337,0.1223,0.542,1.0,0.0199,0.1484,0.27,0.0641,0.0024
oil_wti_y,0.0327,0.1134,0.1947,0.0011,0.0261,0.8407,0.2087,1.0,0.5383,0.0052,0.0389,0.3267
gold_y,0.5947,0.0779,0.2189,0.0894,0.3782,0.7473,0.0,0.0249,1.0,0.0561,0.0995,0.1232
sp500_y,0.1279,0.4391,0.0203,0.0004,0.0216,0.0589,0.223,0.4059,0.1693,1.0,0.0002,0.0002


In [63]:
# plot granger causality heatmap
# dedicated figure/axes so the heatmap can never be drawn onto a leftover current figure
fig_5, ax = plt.subplots(figsize=(9.5, 4.15))
sns.heatmap(granger_causality_matrix, xticklabels=granger_causality_matrix.columns.values, yticklabels=granger_causality_matrix.index.values,
            annot=True, annot_kws={'size':7}, vmin=0, vmax=1, cmap="YlGnBu_r", ax=ax)
# pin the y-range to the full matrix (n rows, inverted axis). The former
# "ylim +/- 0.5" shift presumably worked around the matplotlib release that clipped
# the first and last heatmap row; on other releases that shift adds empty half rows.
ax.set_ylim(len(granger_causality_matrix), 0)
# column labels on top, no tick marks
ax.tick_params(axis='both', which='major', labelsize=7, labelbottom=False, bottom=False, top=False, labeltop=True)
# NOTE(review): rotation=1 is kept from the original - confirm 1 degree is intended
plt.xticks(fontsize=6, rotation=1)
plt.yticks(fontsize=6);

(array([ 0.5,  1.5,  2.5,  3.5,  4.5,  5.5,  6.5,  7.5,  8.5,  9.5, 10.5,
        11.5]), <a list of 12 Text yticklabel objects>)

In [None]:
# VAR(p): Fit on X_train

