In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
import scipy.stats as stats
import scipy.integrate as integrate
import statsmodels.graphics.gofplots as sgg
from sklearn.grid_search import GridSearchCV
from sklearn.neighbors import KernelDensity
%matplotlib inline
#%matplotlib tk

  from pandas.core import datetools


In [2]:
# Read the Ripple daily price table and index it by its parsed 'Date' column.
my_data = pd.read_csv('/home/lin/教材/Financial_data/data/Ripple.csv')
my_data = my_data.assign(Date = pd.to_datetime(my_data['Date'])).set_index('Date')

# Reverse the row order of the close prices before differencing
# (presumably the file is stored newest-first -- confirm against the CSV).
close = my_data['Close'].iloc[::-1]
volume = my_data['Volume'].dropna()

# Daily log returns: difference of log prices. The first entry has no
# predecessor (NaN), so it is removed by its index label.
logR = np.log(close).diff()
logR = logR.drop(logR.index[0])
logR.describe()

count    1618.000000
mean        0.003731
std         0.079982
min        -0.616273
25%        -0.021178
50%        -0.002303
75%         0.021177
max         1.027356
Name: Close, dtype: float64

In [3]:
# Number of non-missing daily log returns available for the analysis.
logR.count()

1618

In [4]:
# Rolling one-step-ahead VaR / ES estimates on the daily log returns.
#
# The last 252 observations form the test window. For each test day the risk
# measures are re-estimated from all returns strictly before that day
# (expanding window) under three models: normal, Student-t and historical
# simulation (HS). Every measure is stored in return space, i.e. as the
# return level of the lower tail, so it can be plotted against the returns.
start_test = int(logR.count() - 252)
alpha_one = 0.95    # confidence level of the first VaR
alpha_two = 0.99    # confidence level of the second VaR
alpha_es = 0.975    # confidence level of the expected shortfall
norm_var_ones = np.zeros(252)
norm_var_twos = np.zeros(252)
norm_ES = np.zeros(252)
t_var_ones = np.zeros(252)
t_var_twos = np.zeros(252)
t_ES = np.zeros(252)
HS_var_ones = np.zeros(252)
HS_var_twos = np.zeros(252)
HS_ES = np.zeros(252)

for i in range(start_test, logR.count()):
    slot = i - start_test                    # position in the output arrays
    training_set = logR.iloc[: i]            # returns observed before day i
    hs_prices = training_set.sort_values()   # ascending: worst returns first
    negaR = training_set[training_set <= 0].dropna()
    sortedNegaR = (-negaR).sort_values()
    # Mirror the losses around zero so the t fit only sees the loss tail.
    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    mirrorNegative = pd.concat([sortedNegaR, -sortedNegaR])
    mu_log = np.mean(training_set)
    sigma_log = np.std(training_set)
    nega_t_para = stats.t.fit(mirrorNegative)   # (df, loc, scale)
    t_df, t_loc, t_scale = nega_t_para

    # --- normal model: lower-tail quantile and lower-tail conditional mean.
    norm_var_ones[slot] = mu_log - sigma_log * stats.norm.ppf(alpha_one)
    norm_var_twos[slot] = mu_log - sigma_log * stats.norm.ppf(alpha_two)
    # E[X | X <= q] = mu - sigma * phi(z) / (1 - alpha). The mean enters with
    # a plus sign, exactly as in the two VaR lines above.
    z_es = stats.norm.ppf(alpha_es)
    norm_ES[slot] = mu_log - sigma_log * stats.norm.pdf(z_es) / (1 - alpha_es)

    # --- Student-t model fitted to the mirrored losses: the return-space VaR
    # is the negated upper quantile of the fitted loss distribution.
    t_var_ones[slot] = -stats.t.ppf(alpha_one, df = t_df, loc = t_loc, scale = t_scale)
    t_var_twos[slot] = -stats.t.ppf(alpha_two, df = t_df, loc = t_loc, scale = t_scale)

    # Closed-form tail mean of a standard t with nu degrees of freedom:
    #   E[T | T >= q] = f(q) * (nu + q^2) / ((nu - 1) * (1 - alpha)),  nu > 1.
    # This replaces a trapezoid sum over a grid that stopped at the sample
    # minimum and therefore ignored the part of the tail beyond it.
    if t_df > 1:
        q_es = stats.t.ppf(alpha_es, df = t_df)
        tail_mean = stats.t.pdf(q_es, df = t_df) * (t_df + q_es ** 2) / ((t_df - 1) * (1 - alpha_es))
        t_ES[slot] = -(t_loc + t_scale * tail_mean)
    else:
        # The mean of a t distribution does not exist for nu <= 1.
        t_ES[slot] = -np.inf

    # --- historical simulation: empirical lower-tail order statistics.
    # Clamp at 0 so a short training sample can never yield index -1,
    # which would select the largest return instead of the smallest.
    ind_var_one = max(int( hs_prices.size * (1-alpha_one))-1, 0)
    ind_var_two = max(int( hs_prices.size * (1-alpha_two))-1, 0)
    ind_es = max(int( hs_prices.size * (1-alpha_es))-1, 0)
    HS_var_ones[slot] = hs_prices.iloc[ind_var_one]
    HS_var_twos[slot] = hs_prices.iloc[ind_var_two]
    HS_ES[slot] = hs_prices.iloc[:ind_es+1].mean()

In [19]:
%matplotlib tk
plt.figure()
plt.plot(logR[-252 :].values, 'r')
plt.plot(norm_var_ones, '-.b')
plt.plot(norm_var_twos, '-.g')
plt.plot(norm_ES, '-.m')
plt.plot(t_var_ones, '--b')
plt.plot(t_var_twos, '--g')
plt.plot(t_ES, '--m')
plt.plot(HS_var_ones, 'b')
plt.plot(HS_var_twos, 'g')
plt.plot(HS_ES, 'm')
plt.title('risk measures estimated with all historical data')
plt.xlabel('time')
plt.ylabel('log return')
plt.legend(['log return', 'normal var 95%', 'normal var 99%', 'normal ES 97.5%', 't var 95%', 't var 99%', 't ES 97.5%', 'HS 95%', 'HS 99%', 'HS ES' ], loc = 2)

<matplotlib.legend.Legend at 0x7f6b0440f9e8>

In [6]:
# Downsample to one row per 7-day bin, keeping the first non-null value of
# each column within the bin.
weekly_data = my_data.resample('7d').first()

In [7]:
# Weekly log returns from the resampled close prices. The leading entry has
# no predecessor (NaN after differencing), so it is removed by index label.
week_close = weekly_data['Close']
week_logR = np.log(week_close).diff()
week_logR = week_logR.drop(week_logR.index[0])
week_logR.describe()

count    231.000000
mean       0.027505
std        0.253305
min       -0.586528
25%       -0.071184
50%       -0.008649
75%        0.076894
max        1.857460
Name: Close, dtype: float64

In [8]:
# Rolling one-step-ahead VaR / ES estimates on the weekly log returns.
#
# The last 52 observations form the test window. For each test week the
# measures are re-estimated from all weekly returns strictly before it
# (expanding window) under a normal model, a Student-t model fitted to the
# raw training returns, and historical simulation (HS). Every measure is
# stored in return space (return level of the lower tail).
# The confidence levels alpha_one / alpha_two / alpha_es are the ones set in
# the daily-analysis cell.
week_start_test = int(week_logR.count() - 52)
week_norm_var_ones = np.zeros(52)
week_norm_var_twos = np.zeros(52)
week_norm_ES = np.zeros(52)
week_t_var_ones = np.zeros(52)
week_t_var_twos = np.zeros(52)
week_t_ES = np.zeros(52)
week_HS_var_ones = np.zeros(52)
week_HS_var_twos = np.zeros(52)
week_HS_ES = np.zeros(52)

for i in range(week_start_test, week_logR.count()):
    slot = i - week_start_test                    # position in the output arrays
    training_set = week_logR.iloc[: i]            # returns observed before week i
    # x_ticks, mu_log, sigma_log and nega_t_para are left at their values from
    # the final iteration because the histogram cell further down reuses them.
    x_ticks = np.linspace(min(training_set), max(abs(training_set)),2000)
    hs_prices = training_set.sort_values()        # ascending: worst returns first
    mu_log = np.mean(training_set)
    sigma_log = np.std(training_set)
    nega_t_para = stats.t.fit(training_set)       # (df, loc, scale) of the returns
    t_df, t_loc, t_scale = nega_t_para

    # --- normal model: lower-tail quantile and lower-tail conditional mean.
    week_norm_var_ones[slot] = mu_log - sigma_log * stats.norm.ppf(alpha_one)
    week_norm_var_twos[slot] = mu_log - sigma_log * stats.norm.ppf(alpha_two)
    # E[X | X <= q] = mu - sigma * phi(z) / (1 - alpha). The mean enters with
    # a plus sign, exactly as in the two VaR lines above.
    z_es = stats.norm.ppf(alpha_es)
    week_norm_ES[slot] = mu_log - sigma_log * stats.norm.pdf(z_es) / (1 - alpha_es)

    # --- Student-t model fitted to the returns themselves: the VaR is the
    # lower (1 - alpha) quantile. Negating the upper quantile would flip the
    # sign of the fitted location, which is not zero for this fit.
    week_t_var_ones[slot] = stats.t.ppf(1 - alpha_one, df = t_df, loc = t_loc, scale = t_scale)
    week_t_var_twos[slot] = stats.t.ppf(1 - alpha_two, df = t_df, loc = t_loc, scale = t_scale)

    # Closed-form lower-tail mean of a t with nu degrees of freedom:
    #   E[X | X <= q] = loc - scale * f(q) * (nu + q^2) / ((nu - 1) * (1 - alpha)),
    # with q the standard-t quantile at level alpha (f is symmetric), nu > 1.
    # The result goes into week_t_ES, not into the daily t_ES array.
    if t_df > 1:
        q_es = stats.t.ppf(alpha_es, df = t_df)
        tail_mean = stats.t.pdf(q_es, df = t_df) * (t_df + q_es ** 2) / ((t_df - 1) * (1 - alpha_es))
        week_t_ES[slot] = t_loc - t_scale * tail_mean
    else:
        # The mean of a t distribution does not exist for nu <= 1.
        week_t_ES[slot] = -np.inf

    # --- historical simulation: empirical lower-tail order statistics of the
    # weekly training sample. The indices must come from the weekly sample
    # size; clamp at 0 so a short sample can never yield index -1.
    week_ind_var_one = max(int( hs_prices.size * (1-alpha_one))-1, 0)
    week_ind_var_two = max(int( hs_prices.size * (1-alpha_two))-1, 0)
    week_ind_es = max(int( hs_prices.size * (1-alpha_es))-1, 0)
    week_HS_var_ones[slot] = hs_prices.iloc[week_ind_var_one]
    week_HS_var_twos[slot] = hs_prices.iloc[week_ind_var_two]
    week_HS_ES[slot] = hs_prices.iloc[:week_ind_es+1].mean()

In [9]:
# Inspect the Student-t fit left over from the last loop iteration:
# the tuple returned by stats.t.fit is (df, loc, scale).
nega_t_para

(1.7166277984540292, -0.010177199412833283, 0.09342069922797398)

In [16]:
# Overlay the realised weekly log returns of the test window with every
# estimated risk measure. Colour encodes the measure (blue = 95% VaR,
# green = 99% VaR, magenta = 97.5% ES); line style encodes the model
# (dash-dot = normal, dashed = Student-t, solid = historical simulation).
weekly_curves = [
    (week_logR[-52 :].values, 'r', 'log return'),
    (week_norm_var_ones, '-.b', 'normal var 95%'),
    (week_norm_var_twos, '-.g', 'normal var 99%'),
    (week_norm_ES, '-.m', 'normal ES 97.5%'),
    (week_t_var_ones, '--b', 't var 95%'),
    (week_t_var_twos, '--g', 't var 99%'),
    (week_t_ES, '--m', 't ES 97.5%'),
    (week_HS_var_ones, 'b', 'HS 95%'),
    (week_HS_var_twos, 'g', 'HS 99%'),
    (week_HS_ES, 'm', 'HS ES'),
]

plt.figure()
for curve, fmt, name in weekly_curves:
    plt.plot(curve, fmt, label = name)
plt.title('risk measures estimated with all resampled historical data')
plt.xlabel('time in week')
plt.ylabel('log return')
plt.legend()

<matplotlib.legend.Legend at 0x7f6b046120f0>

In [15]:
# Density histogram of the weekly log returns with the fitted Student-t and
# normal densities drawn on top. x_ticks, nega_t_para, mu_log and sigma_log
# are the values left by the last iteration of the weekly estimation loop.
numbins = int(30)
plt.figure()
# `density=True` replaces the `normed` keyword, which matplotlib has removed.
week_logR.hist(bins = numbins, density = True, label = 'histogram')
plt.plot(x_ticks, stats.t.pdf(x_ticks, df = nega_t_para[0] , loc = nega_t_para[1], scale = nega_t_para[2]), '--b', alpha = 0.5, label = 'fitted t')
plt.plot(x_ticks, stats.norm.pdf(x_ticks, loc = mu_log, scale = sigma_log), 'r', alpha = 0.4, label = 'normal')

plt.title('Histogram of log-retunrs and Gaussian with the same mean and variance')
# Labels are attached to the artists themselves so each legend entry names
# the curve it belongs to (a positional label list mislabelled both lines).
plt.legend()

<matplotlib.legend.Legend at 0x7f6b04678e10>

In [12]:
#Resampled weekly, taking the first day of each week as the observation.
#The sample size is reduced to 231. After holding out one year (52 weeks) as
#the test set, the training set is really small. Fitted with t and normal
#distributions. Calculated VaR 95%, 99% and ES 97.5%.

#Overall, these
#risk measures perform worse than those estimated from the daily data,
#mainly because of the lack of training data; in particular, some extreme cases
#might have been lost during the resampling. The historical method seems to suffer the most.

#Among these, the ES estimated from the t distribution performs especially
#badly. The reason is that ES relies heavily on the tail of the
#distribution: not only on a quantile in the tail, but on the shape of the tail.
#All the information lost in resampling, especially the extreme
#observations, severely affects the shape of the estimated tail.

#RippleParaResample  vs  RippleParaAll
