In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import kpss
import scipy.stats as stats
import statsmodels.graphics.gofplots as sgg
from sklearn.grid_search import GridSearchCV
from sklearn.neighbors import KernelDensity
%matplotlib inline
#%matplotlib tk

  from pandas.core import datetools


In [2]:
# Read a single column (file column index 1) from the header-less data file
# and expose it as the 'price' column of `my_data`.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
my_data = pd.read_csv(
    '/home/lin/教材/Financial_data/data/finance_M4/my_fx_data',
    header=None,
    usecols=[1],
    names=['price'],
)

In [3]:
# Preview the first rows of the loaded price series.
my_data.head()

Unnamed: 0,price
0,4915.1
1,4867.6
2,4679.8
3,4595.8
4,4609.2


In [4]:
# Summary statistics (count, mean, std, quantiles) of the raw prices.
my_data.describe()

Unnamed: 0,price
count,4197.0
mean,8665.180105
std,2891.102233
min,3613.4
25%,5867.9
50%,8626.7
75%,10569.8
max,15178.0


In [52]:
# Plot the raw price series against its row index.
print('plot raw data')
# Switches matplotlib to the Tk backend, so figures from here on open in
# separate windows instead of inline.
%matplotlib tk
plt.plot(my_data['price'], linewidth = 0.4)
plt.ylabel('price')
plt.title('financial price')

plot raw data


Text(0.5,1,'financial price')

In [6]:
# Log returns: first difference of the log prices. The first row is NaN
# because it has no predecessor.
logR = np.log(my_data).diff()
logR.head()

Unnamed: 0,price
0,
1,-0.009711
2,-0.039346
3,-0.018113
4,0.002911


In [7]:
# Inspect the underlying array of log returns (one column, NaN in the first row).
logR.values

array([[        nan],
       [-0.0097111 ],
       [-0.03934563],
       ...,
       [ 0.00020826],
       [ 0.00679489],
       [ 0.00697432]])

In [8]:
# Remove the undefined first log return (NaN produced by diff()).
# dropna() is idempotent: re-running this cell does not remove further
# observations, unlike an in-place drop of the first row.
logR = logR.dropna()

In [53]:
# Plot the log-return series against its row index.
print('plot log return')

#%matplotlib tk
logR['price'].plot(linewidth = 0.4)
plt.ylabel('log return')
plt.title('Financial log return')

plot log return


Text(0.5,1,'Financial log return')

In [10]:
# Summary statistics of the log returns.
logR.describe()

Unnamed: 0,price
count,4196.0
mean,0.000267
std,0.012079
min,-0.14934
25%,-0.005606
50%,0.000742
75%,0.006732
max,0.060968


In [11]:
# Sample skewness of the log returns (0 for a symmetric distribution).
stats.skew(logR['price'])

-1.1502129828839347

In [12]:
# Sample kurtosis of the log returns. scipy's default (fisher=True) reports
# excess kurtosis, i.e. 0 for a normal distribution.
stats.kurtosis(logR['price'])

11.260451026283784

In [13]:
# Augmented Dickey-Fuller test. Null hypothesis: the series has a unit root
# (is non-stationary); rejecting the null therefore indicates stationarity.
adfuller(logR['price'])

(-23.262701610140457,
 0.0,
 6,
 4189,
 {'1%': -3.4319120223653132,
  '10%': -2.5671374076308826,
  '5%': -2.8622302155703507},
 -24993.19087817247)

In [14]:
# KPSS test. Null hypothesis: the series is level (or trend) stationary;
# failing to reject the null is consistent with stationarity.
kpss(logR['price'])



(0.04591689237306045,
 0.1,
 31,
 {'1%': 0.739, '10%': 0.347, '2.5%': 0.574, '5%': 0.463})

In [15]:
# Anderson-Darling test of the log returns against the normal distribution
# (scipy's default `dist`); compare the statistic with the critical values.
stats.anderson(logR['price'])

AndersonResult(statistic=44.06294861250717, critical_values=array([0.575, 0.655, 0.786, 0.917, 1.091]), significance_level=array([15. , 10. ,  5. ,  2.5,  1. ]))

In [16]:
print('plot bin number score')
# Score every candidate number of histogram bins from 1 to numSmoothingFactor:
#     score(m) = 2*m/(n-1) - m*(n+1)/(n-1) * sum_j p_j**2
# where n is the number of observations and p_j the fraction of observations
# falling in bin j.
# NOTE(review): this looks like the leave-one-out cross-validation estimate of
# the histogram L^2 risk with the constant 1/(data range) factor dropped --
# confirm.
# TODO: turn this into a function taking the maximum bin count and
# (normalized) data as inputs.
numSmoothingFactor = 900
# The observation count does not depend on the bin count, so compute it once.
numObser = float(logR['price'].count())
scores = np.zeros(numSmoothingFactor)
binNums = range(1, numSmoothingFactor + 1)
for binNum in binNums:
    # Histogram the 1-D return series explicitly rather than the whole frame.
    frequen, binEdges = np.histogram(logR['price'], bins = binNum)
    proEst = frequen / numObser
    sumFreqSqua = np.sum(proEst ** 2)
    score = ((2. * binNum) / (numObser - 1.)) - (sumFreqSqua * (numObser + 1.) * binNum / (numObser - 1.))
    scores[binNum - 1] = score

plt.plot(binNums, scores)
plt.legend(['L^2 loss function'])

plot bin number score


<matplotlib.legend.Legend at 0x7f51600eafd0>

In [54]:
print('histgram and Gaussian')
numbins = 60
# `density=True` normalizes the histogram to unit area; it replaces the
# `normed` keyword, which was removed from matplotlib.
logR.hist(bins = numbins, density=True)

# Gaussian with the sample mean and (population) standard deviation of the
# log returns, evaluated over the observed range.
mu_log = np.mean(logR['price'])
sigma_log = np.std(logR['price'])
x_ticks = np.linspace(min(logR['price']), max(logR['price']),100)
plt.plot(x_ticks, 1./((2.*np.pi)**0.5 *sigma_log)*np.exp(-((x_ticks - mu_log)/sigma_log)**2/2), 'r', alpha = 0.4)
plt.title('Histogram of log-retunrs and Gaussian with the same mean and variance')
plt.legend(['normal', 'histogram'])
plt.xlabel('log return')
plt.ylabel('probability density')

histgram and Gaussian


Text(0,0.5,'probability density')

In [55]:
print('autocorrelations')
# Sample autocorrelation of the returns, their absolute values and their
# squares for lags 1..251.
acf_lags = range(1, 252)
# Build the transformed series once instead of once per lag.
abs_logR = abs(logR['price'])
sq_logR = logR['price'] ** 2
autocorr_logR = [logR['price'].autocorr(lag=lag) for lag in acf_lags]
autocorr_abs_logR = [abs_logR.autocorr(lag=lag) for lag in acf_lags]
autocorr_sqLogR = [sq_logR.autocorr(lag=lag) for lag in acf_lags]
plt.plot(acf_lags, autocorr_logR, acf_lags, autocorr_abs_logR, acf_lags, autocorr_sqLogR)
plt.title('Estimated autocorrelation vs lag for financial data')
# First label corrected from 'lot return' to 'log return'.
plt.legend(['log return', '|log return|', 'log return ^2'], loc = 1)
plt.xlabel('lag')
plt.ylabel('autocorrelation')
plt.grid(True)

autocorrelations


In [19]:
# Split the log returns by sign: strictly positive returns go to posiR,
# zero and negative returns go to negaR. Masked-out rows become NaN and are
# removed by dropna().
posiR = logR[logR > 0].dropna()
negaR = logR[logR <= 0].dropna()

In [20]:
# Sanity check: the only value present should be True (all returns in posiR are > 0).
np.unique(posiR > 0)

array([ True])

In [21]:
# Sanity check: the only value present should be True (all returns in negaR are <= 0).
np.unique(negaR <= 0)

array([ True])

In [22]:
# Ascending positive returns, and ascending magnitudes of the non-positive returns.
sortedPosiR = posiR['price'].sort_values()
sortedNegaR = (-negaR['price']).sort_values()
# Symmetric sample built by reflecting the loss magnitudes about zero.
# pd.concat replaces Series.append, which was removed in pandas 2.0.
mirrorNegative = pd.concat([sortedNegaR, -sortedNegaR])
numObs = logR['price'].count()

In [23]:
# Symmetric sample built by reflecting the positive returns about zero.
# pd.concat replaces Series.append, which was removed in pandas 2.0.
mirrorPosi = pd.concat([sortedPosiR, -sortedPosiR])

In [24]:
print('loglog data and Gaussion')
# Empirical survival function 1 - k/(n+1) of the sorted positive returns and
# of the sorted loss magnitudes, on log-log axes. np.arange makes the rank
# vector an explicit array instead of relying on `range / numpy-int` being
# coerced by NumPy's reflected division.
plt.loglog(sortedPosiR, 1 - np.arange(1, sortedPosiR.count() + 1) / (sortedPosiR.count() + 1), '+b')
plt.loglog(sortedNegaR, 1 - np.arange(1, sortedNegaR.count() + 1) / (sortedNegaR.count() + 1), 'xr')
# Normal reference curve: 1 - 2*(Phi(x) - 0.5) for x from the sample mean up
# to the largest absolute return.
domain = np.linspace(mu_log, max(abs(logR['price'])), 1000)
plt.loglog(domain, 1 - (stats.norm.cdf(domain, mu_log, sigma_log) - 0.5) * 2, 'm')
plt.ylim([1e-4, 1.05])
plt.xlim([1e-5, 1])
plt.legend(['pos return', 'nega return', 'normal'], loc = 3)
plt.grid(True)

loglog data and Gaussion


In [25]:
print('volatility of log return')
window = 252
# Running sums of the returns and of the squared returns.
cumlogR = logR['price'].cumsum()
cumlogRSq = (logR['price'] ** 2).cumsum()
# Trailing-window means obtained as differences of cumulative sums taken
# `window` observations apart, then volatility = sqrt(E[r^2] - E[r]^2).
RuningAvg = (
    cumlogR.iloc[window:].values
    - cumlogR.iloc[:cumlogR.count() - window].values
) / window
sqRunAvg = (
    cumlogRSq.iloc[window:].values
    - cumlogRSq.iloc[:cumlogRSq.count() - window].values
) / window
volatility = np.sqrt(sqRunAvg - RuningAvg ** 2)
plt.figure()
plt.plot(volatility)
plt.legend(['volatility'])

volatility of log return


<matplotlib.legend.Legend at 0x7f51605dbef0>

In [26]:
# Q-Q plot of all log returns against a normal distribution; fit=True lets
# qqplot estimate the distribution parameters from the data.
print('qq fit whole set with Gaussian')
# Not referenced again in the visible cells.
normal_para = [mu_log, sigma_log]
sgg.qqplot(logR['price'], stats.norm, fit = True, line = '45')
plt.title('normal whole set')
plt.xlim([-4, 4])
plt.grid(True)

qq fit whole set with Gaussian


In [27]:
# Fit a Student-t distribution to all log returns and draw the Q-Q plot
# against the fitted distribution.
print('qq fit whole set with t')
# loc/scale here are starting guesses for the optimizer; the result is
# (df, loc, scale).
t_para = stats.t.fit(logR['price'], loc = mu_log, scale = sigma_log)
loc, scale = t_para[1:]
# Shape parameter (degrees of freedom) passed as a 1-tuple, as distargs expects.
fake_distargs = (t_para[0],)
sgg.qqplot(logR['price'], stats.t, fit = False, line = '45',distargs = fake_distargs, loc=loc, scale=scale)
plt.title('t whole set')
plt.grid(True)

qq fit whole set with t


In [28]:
# Fitted Student-t parameters for the whole sample: (df, loc, scale).
t_para 

(3.7159809964036814, 0.0007269058609301606, 0.008281154349273845)

In [29]:
# Fit a Student-t distribution to the mirrored positive returns and draw a
# Q-Q plot against it.
print('qq fit mirrored positive with t')
posi_t_para = stats.t.fit(mirrorPosi)
loc, scale = posi_t_para[1:]
fake_distargs = (posi_t_para[0],)
# NOTE(review): the parameters come from mirrorPosi, but the sample plotted is
# the whole set logR['price'] -- confirm this is intended.
sgg.qqplot(logR['price'], stats.t, fit = False, line = '45',distargs = fake_distargs, loc=loc, scale=scale)
plt.title ('t posi')
plt.grid(True)

qq fit mirrored positive with t


In [30]:
# Fit a Student-t distribution to the mirrored negative returns and draw a
# Q-Q plot against it.
print('qq fit mirrored negative with t')
nega_t_para = stats.t.fit(mirrorNegative)
loc, scale = nega_t_para[1:]
fake_distargs = (nega_t_para[0],)
# NOTE(review): the parameters come from mirrorNegative, but the sample
# plotted is the whole set logR['price'] -- confirm this is intended.
sgg.qqplot(logR['price'], stats.t, fit = False, line = '45',distargs = fake_distargs, loc=loc, scale=scale)
plt.title('t nega')
plt.grid(True)

qq fit mirrored negative with t


In [31]:
# Fit a generalized extreme value distribution to all log returns and draw
# the Q-Q plot against the fitted distribution.
print('qq fit whole set with gen extreme')
# loc/scale are starting guesses; the result is (c, loc, scale).
gen_para = stats.genextreme.fit(logR['price'], loc = mu_log, scale = sigma_log)
# Disabled experiment: flipping the sign of the fitted shape parameter.
#gen_para = list(gen_para)
#gen_para[0] = gen_para[0] * (-1)

# The unmodified fit above is the one that is used.

loc, scale = gen_para[1:]
# Shape parameter passed as a 1-tuple, as distargs expects.
fake_distargs = (gen_para[0],)

pp = sgg.ProbPlot(logR['price'], dist=stats.genextreme, distargs=fake_distargs, fit=False, loc=loc, scale=scale)
fig = pp.qqplot(line='45')
plt.title('gen extreme whole set')
plt.grid(True)

qq fit whole set with gen extreme


In [32]:
# Fit a generalized Pareto distribution to the loss magnitudes (sortedNegaR)
# and draw the Q-Q plot of that half-sample against the fitted distribution.
print('qq fit negative half with gen pareto')
# loc/scale are starting guesses; the result is (c, loc, scale).
nega_gen_para = stats.genpareto.fit(sortedNegaR, loc = mu_log, scale = sigma_log)

loc, scale = nega_gen_para[1:]
# Shape parameter passed as a 1-tuple, as distargs expects.
fake_distargs = (nega_gen_para[0],)

pp = sgg.ProbPlot(sortedNegaR, dist=stats.genpareto, distargs=fake_distargs, fit=False, loc=loc, scale=scale)
fig = pp.qqplot(line='45')
# Title corrected: the plotted sample is the negative half, not the whole set.
plt.title('gen pareto negative half')
plt.grid(True)

qq fit negative half with gen pareto


  val = val + cnk * (-1) ** ki / (1.0 - c * ki)
  mu2 = mu2p - mu * mu
  Lhat = muhat - Shat*mu


In [33]:
# Fitted generalized Pareto parameters for the negative half: (c, loc, scale).
nega_gen_para

(0.06745991821862588, -1.1063684670825943e-10, 0.008160063901092524)

In [34]:
# Fit a generalized Pareto distribution to the positive returns (sortedPosiR)
# and draw the Q-Q plot of that half-sample against the fitted distribution.
print('qq fit positive half with gen pareto')
# loc/scale are starting guesses; the result is (c, loc, scale).
posi_gen_para = stats.genpareto.fit(sortedPosiR, loc = mu_log, scale = sigma_log)

loc, scale = posi_gen_para[1:]
# Shape parameter passed as a 1-tuple, as distargs expects.
fake_distargs = (posi_gen_para[0],)

pp = sgg.ProbPlot(sortedPosiR, dist=stats.genpareto, distargs=fake_distargs, fit=False, loc=loc, scale=scale)
fig = pp.qqplot(line='45')
# Title corrected: the plotted sample is the positive half fitted with a
# generalized Pareto distribution, not the whole set.
plt.title('gen pareto positive half')
plt.grid(True)

qq fit positive half with gen pareto


  val = val + cnk * (-1) ** ki / (1.0 - c * ki)
  mu2 = mu2p - mu * mu
  Lhat = muhat - Shat*mu


In [35]:
# Fitted generalized Pareto parameters for the positive half: (c, loc, scale).
posi_gen_para

(-0.08145170068265845, -5.4005132009578286e-05, 0.00892150742470711)

In [36]:
print('negative return, hist, fitted gen pareto and mirrored negative t')
numbins = 60
# Unit-area histogram of the loss magnitudes. `density=True` replaces the
# `normed` keyword, which was removed from matplotlib.
sortedNegaR.hist(bins = numbins, density=True)
x_ticks = np.linspace(min(sortedNegaR), max(sortedNegaR),100)
# Generalized Pareto density fitted on the loss magnitudes.
plt.plot(x_ticks, stats.genpareto.pdf(x_ticks, c =nega_gen_para[0] , loc = nega_gen_para[1], scale = nega_gen_para[2]),'g', alpha = 0.4)
# The t density was fitted on the mirrored (two-sided) sample, so it is
# doubled to compare against the one-sided histogram.
plt.plot(x_ticks, stats.t.pdf(x_ticks, df =nega_t_para[0] , loc = nega_t_para[1], scale = nega_t_para[2]) * 2, 'r', alpha = 0.4)

negative return, hist, fitted gen pareto and mirrored negative t


[<matplotlib.lines.Line2D at 0x7f51602c2908>]

In [37]:
# Compare the fitted generalized extreme value CDF with the normal CDF
# (sample mean and standard deviation), both evaluated at the sorted log returns.
plt.plot(logR['price'].sort_values(), stats.genextreme.cdf(logR['price'].sort_values(), c =gen_para[0] , loc = gen_para[1], scale = gen_para[2]), linewidth = 0.4)
plt.plot(logR['price'].sort_values(), stats.norm.cdf(logR['price'].sort_values(), mu_log, sigma_log), linewidth = 0.4)

[<matplotlib.lines.Line2D at 0x7f515fd4cba8>]

In [38]:
print('loglog data halves with whole gen extreme, t, normal')
#%matplotlib tk
domain = np.linspace(mu_log, max(abs(logR['price'])), 1000)
plt.figure()
# Empirical survival function 1 - k/(n+1) of each half-sample. np.arange makes
# the rank vector an explicit array instead of relying on `range / numpy-int`
# being coerced by NumPy's reflected division.
plt.loglog(sortedPosiR, 1 - np.arange(1, sortedPosiR.count() + 1) / (sortedPosiR.count() + 1), '+b')
plt.loglog(sortedNegaR, 1 - np.arange(1, sortedNegaR.count() + 1) / (sortedNegaR.count() + 1), 'xr')
# Two-sided tail curves derived from the whole-sample fits.
# NOTE(review): the offsets 0.485 / 0.479 / 0.5 and the extra +0.012 look
# hand-tuned to align the curves with the empirical tails -- confirm their
# derivation.
plt.loglog(domain, 1 - (stats.genextreme.cdf(domain, c =gen_para[0] , loc = gen_para[1], scale = gen_para[2]) - 0.485 + 0.012 ) * 2, 'm')
plt.loglog(domain, 1 - (stats.t.cdf(domain, df = t_para[0] , loc = t_para[1], scale = t_para[2]) - 0.479 + 0.012 ) * 2, '--b')
plt.loglog(domain, 1 - (stats.norm.cdf(domain, mu_log, sigma_log) - 0.5 + 0.012 ) * 2, 'y')
plt.title('posi nega data with parametric distibutinos')
plt.legend(['posi lreturn', 'nega lreturn', 'gen', 't','normal'])
plt.ylim([1e-4, 1.2])

loglog data halves with whole gen extreme, t, normal


(0.0001, 1.2)

In [39]:
# Densities of the three whole-sample fits (generalized extreme value, t,
# normal) over `domain`, each multiplied by 2.
print('pdf of three whole set fitted distribution')
plt.figure()
plt.plot(domain, (stats.genextreme.pdf(domain, c =gen_para[0] , loc = gen_para[1], scale = gen_para[2])) * 2, 'm')
plt.plot(domain, (stats.t.pdf(domain, df = t_para[0] , loc = t_para[1], scale = t_para[2])) * 2, '--b')
plt.plot(domain, (stats.norm.pdf(domain, mu_log, sigma_log)) * 2, 'y')

pdf of three whole set fitted distribution


[<matplotlib.lines.Line2D at 0x7f515fe31908>]

In [40]:
print('histgram of whole set plotted with whole set fitted gen extreme, t, normal')
#%matplotlib inline
#%matplotlib qt
numbins = 60
# Unit-area histogram of all log returns. `density=True` replaces the
# `normed` keyword, which was removed from matplotlib.
logR.hist(bins = numbins, density=True)

# Overlay the normal density (sample mean / standard deviation) and the
# whole-sample generalized extreme value and Student-t fits.
mu_log = np.mean(logR['price'])
sigma_log = np.std(logR['price'])
x_ticks = np.linspace(min(logR['price']), max(logR['price']),100)
plt.plot(x_ticks, 1./((2.*np.pi)**0.5 *sigma_log)*np.exp(-((x_ticks - mu_log)/sigma_log)**2/2), 'r', alpha = 0.4)
plt.plot(x_ticks, (stats.genextreme.pdf(x_ticks, c =gen_para[0] , loc = gen_para[1], scale = gen_para[2])), 'm', alpha = 0.4)
plt.plot(x_ticks, (stats.t.pdf(x_ticks, df = t_para[0] , loc = t_para[1], scale = t_para[2])), '--b', alpha = 0.5)
#plt.plot(logR_grid, pdf, '--r')
plt.legend(['normal', 'gen extreme', 't', 'hist log return'])

histgram of whole set plotted with whole set fitted gen extreme, t, normal


<matplotlib.legend.Legend at 0x7f515faf3400>

In [41]:
print('loglog halves with whole_gen_extreme and gen_pareto and normal')
plt.figure()
domain = np.linspace(max(abs(logR['price']))/1000, max(abs(logR['price'])), 1000)
# Empirical survival function 1 - k/(n+1) of each half-sample. np.arange makes
# the rank vector an explicit array instead of relying on `range / numpy-int`
# being coerced by NumPy's reflected division.
plt.loglog(sortedPosiR, 1 - np.arange(1, sortedPosiR.count() + 1) / (sortedPosiR.count() + 1), '+b', alpha = 0.5)
plt.loglog(sortedNegaR, 1 - np.arange(1, sortedNegaR.count() + 1) / (sortedNegaR.count() + 1), 'xr', alpha = 0.5)
# Whole-sample generalized extreme value fit, folded to a two-sided tail.
# NOTE(review): the 0.48 offset looks hand-tuned -- confirm.
plt.loglog(domain, 1 - (stats.genextreme.cdf(domain, c =gen_para[0] , loc = gen_para[1], scale = gen_para[2]) - 0.48) * 2, 'm')
# Half-sample generalized Pareto fits are one-sided already, so their
# survival functions are used directly.
plt.loglog(domain, 1 - (stats.genpareto.cdf(domain, c =posi_gen_para[0] , loc = posi_gen_para[1], scale = posi_gen_para[2])), '--b')
plt.loglog(domain, 1 - (stats.genpareto.cdf(domain, c =nega_gen_para[0] , loc = nega_gen_para[1], scale = nega_gen_para[2])), '--r')
plt.loglog(domain, 1 - (stats.norm.cdf(domain, mu_log, sigma_log) - 0.5) * 2, 'y')
plt.title('tail genextreme, genpareto positive and negative, normal')
plt.legend(['posi data', 'nega data', 'genextreme whole set', 'genpareto posi', 'genpareto nega', 'normal'], loc = 3)
plt.ylim([4e-4, 1.05])

loglog halves with whole_gen_extreme and gen_pareto and normal


(0.0004, 1.05)

In [42]:
# Fitted generalized Pareto parameters for the positive half: (c, loc, scale).
posi_gen_para

(-0.08145170068265845, -5.4005132009578286e-05, 0.00892150742470711)

In [43]:
# Fitted generalized Pareto parameters for the negative half: (c, loc, scale).
nega_gen_para

(0.06745991821862588, -1.1063684670825943e-10, 0.008160063901092524)

In [44]:
print('loglog halves with whole_t and mirror_fitted_t and normal')

#%matplotlib tk
plt.figure()
domain = np.linspace(max(abs(logR['price']))/1000, max(abs(logR['price'])), 1000)
# Empirical survival function 1 - k/(n+1) of each half-sample. np.arange makes
# the rank vector an explicit array instead of relying on `range / numpy-int`
# being coerced by NumPy's reflected division.
plt.loglog(sortedPosiR, 1 - np.arange(1, sortedPosiR.count() + 1) / (sortedPosiR.count() + 1), '+b', alpha = 0.5)
plt.loglog(sortedNegaR, 1 - np.arange(1, sortedNegaR.count() + 1) / (sortedNegaR.count() + 1), 'xr', alpha = 0.5)
# Two-sided tails of the t fits: whole sample, mirrored positive, mirrored negative.
# NOTE(review): the 0.48 offset on the whole-sample curve looks hand-tuned -- confirm.
plt.loglog(domain, 1 - (stats.t.cdf(domain, df =t_para[0] , loc = t_para[1], scale = t_para[2]) - 0.48) * 2, 'm')
plt.loglog(domain, 1 - (stats.t.cdf(domain, df =posi_t_para[0] , loc = posi_t_para[1], scale = posi_t_para[2]) - 0.5) * 2, '--b')
plt.loglog(domain, 1 - (stats.t.cdf(domain, df =nega_t_para[0] , loc = nega_t_para[1], scale = nega_t_para[2]) - 0.5) * 2, '--r')

plt.loglog(domain, 1 - (stats.norm.cdf(domain, mu_log, sigma_log) - 0.5) * 2, 'y')
plt.title('compare t fited with whole set with fited with mirrored halves')
plt.legend(['posi data', 'nega data', 't whole set', 't posi', 't nega', 'normal'])
plt.ylim([1e-4, 1.05])

loglog halves with whole_t and mirror_fitted_t and normal


(0.0001, 1.05)

In [45]:
print('histgram of whole set plotted with whole set fitted t and mirror_fitted_t')

numbins = 60
# Unit-area histogram of all log returns. `density=True` replaces the
# `normed` keyword, which was removed from matplotlib.
logR.hist(bins = numbins, density=True, alpha = 0.5)
# Densities of the three t fits evaluated on `domain`.
# NOTE(review): `domain` is whatever an earlier cell left behind; if it only
# spans positive values the curves cover just the right half -- confirm.
plt.plot(domain, (stats.t.pdf(domain, df = nega_t_para[0] , loc = nega_t_para[1], scale = nega_t_para[2])), '--g')
plt.plot(domain, (stats.t.pdf(domain, df = posi_t_para[0] , loc = posi_t_para[1], scale = posi_t_para[2])), '--b')
plt.plot(domain, (stats.t.pdf(domain, df = t_para[0] , loc = t_para[1], scale = t_para[2])), 'r')

plt.legend(['negetive_t', 'positive_t', 'whole set t', 'hist log return'])

histgram of whole set plotted with whole set fitted t and mirror_fitted_t


<matplotlib.legend.Legend at 0x7f515f62e518>

In [46]:
print('histgram of whole set plotted with halves fitted gen pareto')

numbins = 60
# Unit-area histogram of all log returns. `density=True` replaces the
# `normed` keyword, which was removed from matplotlib.
logR.hist(bins = numbins, density=True, alpha = 0.5)
x_ticks = np.linspace(min(logR['price']), max(abs(logR['price'])),1000)
# Each half-sample generalized Pareto density is scaled by 0.5 so that the
# two halves together integrate to one; the negative-half curve is drawn
# mirrored onto the negative axis.
plt.plot(domain, 0.5 *(stats.genpareto.pdf(domain, c =posi_gen_para[0] , loc = posi_gen_para[1], scale = posi_gen_para[2])), '--b')
plt.plot(-domain, 0.5 *(stats.genpareto.pdf(domain, c =nega_gen_para[0] , loc = nega_gen_para[1], scale = nega_gen_para[2])), '--r')
plt.plot(x_ticks, (stats.norm.pdf(x_ticks, mu_log, sigma_log)), 'y')

# Third label corrected from 'naomal' to 'normal'.
plt.legend(['positive gen pareto', 'negative gen pareto', 'normal', 'hist log return'])

histgram of whole set plotted with halves fitted gen pareto


<matplotlib.legend.Legend at 0x7f515f7587b8>

In [47]:
print('histgram of whole set plotted with whole set fitted t, mirror_fitted_t, and halves with gen pareto')

numbins = 60
# Unit-area histogram of all log returns. `density=True` replaces the
# `normed` keyword, which was removed from matplotlib.
logR.hist(bins = numbins, density=True, alpha = 0.5)
x_ticks = np.linspace(min(logR['price']), max(abs(logR['price'])),1000)
# Student-t densities: mirrored-negative fit, mirrored-positive fit, whole-sample fit.
plt.plot(x_ticks, (stats.t.pdf(x_ticks, df = nega_t_para[0] , loc = nega_t_para[1], scale = nega_t_para[2])), '--g')
plt.plot(x_ticks, (stats.t.pdf(x_ticks, df = posi_t_para[0] , loc = posi_t_para[1], scale = posi_t_para[2])), '--b')
plt.plot(x_ticks, (stats.t.pdf(x_ticks, df = t_para[0] , loc = t_para[1], scale = t_para[2])), 'r')
#plt.plot(logR_grid, pdf, '--r')
# Half-sample generalized Pareto densities, each scaled by 0.5; the
# negative-half curve is drawn mirrored onto the negative axis.
plt.plot(domain, 0.5 *(stats.genpareto.pdf(domain, c =posi_gen_para[0] , loc = posi_gen_para[1], scale = posi_gen_para[2])), '--m')
plt.plot(-domain, 0.5 *(stats.genpareto.pdf(domain, c =nega_gen_para[0] , loc = nega_gen_para[1], scale = nega_gen_para[2])), '--k')
plt.legend(['negetive_t', 'positive_t', 'whole set t', 'posi gen', 'negative gen', 'hist log return'])


histgram of whole set plotted with whole set fitted t, mirror_fitted_t, and halves with gen pareto


<matplotlib.legend.Legend at 0x7f515f424748>

In [48]:
# Open questions:
# - Why is the factor of 2 needed? Should it be applied to every distribution fitted on the whole
#   data set whenever it is plotted against the half-samples?
# - Why do distributions fitted on the halves match worse than those fitted on the whole set?
#   Too few data points? There are still 2000+.
# - TODO: decide on the layout of the report.

In [49]:
# Choose the kernel-density bandwidth by 10-fold cross-validation over 50
# evenly spaced candidates in [0.0001, 0.002].
grid = GridSearchCV(KernelDensity(),
                    {'bandwidth': np.linspace(0.0001, 0.002, 50)},
                    cv=10)
# scikit-learn expects a 2-D (n_samples, n_features) array. Index the NumPy
# array rather than the Series: `Series[:, None]` multi-dimensional indexing
# is no longer supported by pandas.
grid.fit(logR['price'].values[:, None])
kde = grid.best_estimator_
# NOTE(review): the recorded optimum (0.002) lies on the upper edge of the
# grid, so the true optimum may be larger -- consider widening the range.
print (grid.best_params_)

{'bandwidth': 0.002}


In [50]:
# Evaluation grid: the observed return range padded by 0.05 on each side.
logR_grid = np.linspace(
    min(logR['price']) - 0.05,
    max(logR['price']) + 0.05,
    1000,
)
# score_samples returns log-densities for a column vector of points;
# exponentiate to obtain the estimated density.
pdf = np.exp(kde.score_samples(logR_grid.reshape(-1, 1)))

In [51]:
#%matplotlib tk
# Keep the density estimated with the selected bandwidth under its own name.
pdf_002 = pdf
numbins = 60
# Unit-area histogram of all log returns with the kernel density estimate on
# top. `density=True` replaces the `normed` keyword, which was removed from
# matplotlib.
logR.hist(bins = numbins, density=True)
plt.plot(logR_grid, pdf_002)

[<matplotlib.lines.Line2D at 0x7f515fb5fa58>]