In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
import scipy.stats as stats
import scipy.integrate as integrate
import statsmodels.graphics.gofplots as sgg
from sklearn.grid_search import GridSearchCV
from sklearn.neighbors import KernelDensity
%matplotlib inline
#%matplotlib tk

  from pandas.core import datetools


In [2]:
# Read the Ripple price table and index it by its parsed 'Date' column.
my_data = (
    pd.read_csv('/home/lin/教材/Financial_data/data/Ripple.csv')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('Date')
)
# Reverse the row order of the closing prices
# (presumably the file is stored newest-first -- TODO confirm against the CSV).
close = my_data['Close'].iloc[::-1]
volume = my_data['Volume'].dropna()
# Log returns: first difference of log prices; the first difference is NaN,
# so the entry carrying the first index label is dropped.
log_diff = np.log(close).diff()
logR = log_diff.drop(log_diff.index[0])
logR.describe()

count    1618.000000
mean        0.003731
std         0.079982
min        -0.616273
25%        -0.021178
50%        -0.002303
75%         0.021177
max         1.027356
Name: Close, dtype: float64

In [3]:
# Number of non-NaN log-return observations.
total_obs = logR.count()

In [4]:
# Rolling-window risk estimation.
# For every position after the first `start_test` observations, the previous
# `start_test` log returns are used to estimate two VaR levels and one ES level
# under three models: normal, Student-t fitted to mirrored losses, and
# historical simulation. All measures are stored as lower-tail *returns*.
start_test = 180      # training-window length; also the position of the first estimate
alpha_one = 0.95      # first VaR confidence level
alpha_two = 0.99      # second VaR confidence level
alpha_es = 0.975      # expected-shortfall confidence level
report_size = total_obs - start_test

# One slot per estimation date.
norm_var_ones = np.zeros(report_size)
norm_var_twos = np.zeros(report_size)
norm_ES = np.zeros(report_size)
t_var_ones = np.zeros(report_size)
t_var_twos = np.zeros(report_size)
t_ES = np.zeros(report_size)
HS_var_ones = np.zeros(report_size)
HS_var_twos = np.zeros(report_size)
HS_ES = np.zeros(report_size)


def _student_t_lower_tail_mean(alpha, df, loc, scale):
    """Mean of a Student-t variable conditional on its lower (1 - alpha) tail.

    Uses the closed form
    ``loc - scale * f(z) * (df + z**2) / ((df - 1) * (1 - alpha))``
    where ``z = t.ppf(alpha, df)`` and ``f`` is the standard t density.
    This integrates the whole tail down to -infinity instead of stopping at
    the smallest observed return.

    Returns ``-inf`` when ``df <= 1`` because the tail mean is not finite there.
    """
    if df <= 1:
        return -np.inf
    z = stats.t.ppf(alpha, df)
    tail_factor = stats.t.pdf(z, df) * (df + z ** 2) / ((df - 1) * (1 - alpha))
    return loc - scale * tail_factor


def _lower_tail_index(n_obs, alpha):
    """Position of the empirical (1 - alpha) quantile in an ascending sort of n_obs values.

    Clamped at 0 so that a window shorter than 1 / (1 - alpha) observations
    selects the minimum instead of wrapping around to the maximum via index -1.
    """
    return max(int(n_obs * (1 - alpha)) - 1, 0)


# Standard-normal constants do not depend on the window, so compute them once.
norm_z_one = stats.norm.ppf(alpha_one)
norm_z_two = stats.norm.ppf(alpha_two)
norm_es_ratio = stats.norm.pdf(stats.norm.ppf(alpha_es)) / (1 - alpha_es)

for i in range(start_test, total_obs):
    slot = i - start_test
    training_set = logR.iloc[slot:i]   # the `start_test` observations preceding position i

    # --- Normal model: VaR = mu - sigma * z, ES = mu - sigma * phi(z) / (1 - alpha)
    mu_log = np.mean(training_set)
    sigma_log = np.std(training_set)
    norm_var_ones[slot] = mu_log - sigma_log * norm_z_one
    norm_var_twos[slot] = mu_log - sigma_log * norm_z_two
    # mu enters with the same sign as in the VaR lines above.
    norm_ES[slot] = mu_log - sigma_log * norm_es_ratio

    # --- Student-t model fitted to the non-positive returns mirrored around zero
    negaR = training_set[training_set <= 0].dropna()
    sortedNegaR = (-negaR).sort_values()
    # pd.concat replaces Series.append, which no longer exists in pandas >= 2.0.
    mirrorNegative = pd.concat([sortedNegaR, -sortedNegaR])
    nega_t_para = stats.t.fit(mirrorNegative)
    t_df, t_loc, t_scale = nega_t_para
    # The fitted sample is symmetric by construction, so the negated upper
    # quantile is used as the lower-tail quantile.
    t_var_ones[slot] = -stats.t.ppf(alpha_one, df=t_df, loc=t_loc, scale=t_scale)
    t_var_twos[slot] = -stats.t.ppf(alpha_two, df=t_df, loc=t_loc, scale=t_scale)
    t_ES[slot] = _student_t_lower_tail_mean(alpha_es, t_df, t_loc, t_scale)

    # --- Historical simulation on the sorted window
    hs_prices = training_set.sort_values()
    ind_var_one = _lower_tail_index(hs_prices.size, alpha_one)
    ind_var_two = _lower_tail_index(hs_prices.size, alpha_two)
    ind_es = _lower_tail_index(hs_prices.size, alpha_es)
    HS_var_ones[slot] = hs_prices.iloc[ind_var_one]
    HS_var_twos[slot] = hs_prices.iloc[ind_var_two]
    # ES is the mean of all sorted returns up to and including the ES quantile position.
    HS_ES[slot] = hs_prices.iloc[:ind_es + 1].mean()

In [5]:
%matplotlib tk
plt.plot(logR[start_test:].values, 'r',  linewidth = 0.5)
plt.plot(norm_var_ones, 'b')
plt.plot(norm_var_twos, 'g')
plt.plot(norm_ES, 'm')
plt.plot(t_var_ones, '--b')
plt.plot(t_var_twos, '--g')
plt.plot(t_ES, '--m')
plt.title('180 days training window parametric estimation of risk measures')
plt.xlabel('time')
plt.ylabel('log return')
plt.legend(['log return', 'normal var 95%', 'normal var 99%', 'normal ES 97.5%', 't var 95%', 't var 99%', 't ES 97.5%' ])

<matplotlib.legend.Legend at 0x7f41adac5518>

In [6]:
#It makes sense to use a shorter training window for Ripple. The absolute values
#of extreme moves in Ripple are much larger
#than those of the common moves. And big movements cluster, which means that as
#soon as you observe a big move you need to quickly prepare to deal with more.
#Using long training windows means the estimated distribution cannot react
#fast enough to the new change, because a few new data points will not change
#much of the whole distribution. One may argue that with a long training window
#the distribution will remember the last period of big movements, but I believe
#there is no guarantee how long it will be between two groups of big movements,
#and it is more reliable to get information from the new change in the market.
#Also, even if the distribution remembers the big movements, the information will
#be diluted by many more small movements.
#However, a short window makes the risk measures change a lot over time,
#especially for the historical estimation. Because if the window is too small,
#the historical VaR might be the lowest return in the whole training window,
#and as soon as that lowest observation moves out, the estimation will change
#dramatically.
#With a short training window, the ES is smoother and might be higher than the
#99% VaR because it is a mean of a few extreme values, while the 99% VaR might be
#the lowest value.

In [8]:
# Historical-simulation VaR / ES with a longer, 350-observation training window,
# plotted against the realised log returns.
# NOTE: this rebinds start_test, report_size and the HS_* arrays, so any cell
# that reads those names afterwards sees the 350-observation values.
start_test = 350
report_size = total_obs - start_test

HS_var_ones = np.zeros(report_size)
HS_var_twos = np.zeros(report_size)
HS_ES = np.zeros(report_size)
for i in range(start_test, logR.count()):
    training_set = logR[i - start_test : i]   # the start_test observations preceding position i
    hs_prices = training_set.sort_values()    # ascending: worst returns first
    # Position of each empirical lower-tail quantile in the sorted window.
    # Clamped at 0 so that a window too short for the confidence level picks
    # the minimum instead of wrapping around to the maximum via index -1.
    ind_var_one = max(int( hs_prices.size * (1-alpha_one))-1, 0)
    ind_var_two = max(int( hs_prices.size * (1-alpha_two))-1, 0)
    ind_es = max(int( hs_prices.size * (1-alpha_es))-1, 0)
    HS_var_ones[i - start_test] = hs_prices.iloc[ind_var_one]
    HS_var_twos[i - start_test] = hs_prices.iloc[ind_var_two]
    # ES is the mean of all sorted returns up to and including the ES quantile position.
    HS_ES[i - start_test] = hs_prices.iloc[:ind_es+1].mean()

plt.figure()
plt.plot(logR[start_test:].values, 'r', linewidth = 0.5)
plt.plot(HS_var_ones, 'b')
plt.plot(HS_var_twos, 'g')
plt.plot(HS_ES, 'm')
plt.title('350 days training window historical estimation of risk measures')
plt.xlabel('time')
plt.ylabel('log return')
plt.legend(['log return', 'HS 95%', 'HS 99%', 'HS ES'])

<matplotlib.legend.Legend at 0x7f41ad8f6160>