In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import kpss
import scipy.stats as stats
import powerlaw

%matplotlib inline
#%matplotlib tk

In [4]:
# Load the FX price series: the file has no header row, only the column at
# position 1 is read, and it is exposed under the name 'price'.
my_data = pd.read_csv(
    '/home/lin/教材/Financial_data/data/finance_M4/my_fx_data',
    header=None,
    usecols=[1],
    names=['price'],
)

# Log returns are the first difference of the log prices. The first row of a
# difference is NaN by construction, so it is dropped.
logR = np.log(my_data).diff()
logR = logR.drop(logR.index[0])

In [190]:
# Load the Ripple quotes and index them by their parsed 'Date' column.
ripple = pd.read_csv('/home/lin/教材/Financial_data/data/Ripple.csv')
ripple['Date'] = pd.to_datetime(ripple['Date'])
ripple = ripple.set_index('Date')

# Reverse the row order of the closing prices before differencing
# (presumably the file is stored newest-first -- TODO confirm).
close = np.flip(ripple['Close'], 0)

# Log returns: first difference of the log closes, without the leading NaN.
logRR = np.log(close).diff()
logRR = logRR.drop(logRR.index[0])

In [200]:
def stationarity_tests(data, print_info=True):
    """Run the ADF and KPSS tests on ``data`` and return their p-values.

    Parameters
    ----------
    data : array-like
        One-dimensional series passed unchanged to ``adfuller`` and ``kpss``.
    print_info : bool, default True
        When true, the complete result tuple of each test is printed.

    Returns
    -------
    tuple
        ``(adf_p, KPSS_p)`` -- element ``[1]`` of each test's result tuple,
        which statsmodels documents as the p-value.
    """
    # Each test is executed exactly once; the same result object feeds both
    # the optional printout and the returned p-value.
    adf_result = adfuller(data)
    kpss_result = kpss(data)

    if print_info:
        print('adfuller test')
        print(adf_result)
        print('KPSS test')
        print(kpss_result)

    return adf_result[1], kpss_result[1]


In [207]:
#%matplotlib tk
def plot_autocorrelations(data, financial=True, n_lags=251):
    """Plot the autocorrelation of the returns, their absolute value and their square.

    The three curves are drawn on the current matplotlib figure for lags
    ``1 .. n_lags``.

    Parameters
    ----------
    data : pandas.Series
        Return series; must provide ``autocorr(lag=...)``.
    financial : bool, default True
        Selects the title: 'financial data' when true, 'ripple data' otherwise.
    n_lags : int, default 251
        Largest lag to evaluate. The default reproduces the lags 1..251 that
        were previously hard-coded.

    Returns
    -------
    None
    """
    lags = range(1, n_lags + 1)

    # The transformed series do not depend on the lag, so build them once
    # instead of once per lag.
    abs_data = abs(data)
    squared_data = data ** 2

    autocorr_logR = [data.autocorr(lag=lag) for lag in lags]
    autocorr_abs_logR = [abs_data.autocorr(lag=lag) for lag in lags]
    autocorr_sqLogR = [squared_data.autocorr(lag=lag) for lag in lags]

    plt.plot(lags, autocorr_logR, lags, autocorr_abs_logR, lags, autocorr_sqLogR)
    if financial:
        plt.title('Estimated autocorrelation vs lag for financial data', fontsize=20)
    else:
        plt.title('Estimated autocorrelation vs lag for ripple data', fontsize=20)

    # Labels follow the order of the curves passed to plt.plot above.
    plt.legend(['log return', '|log return|', 'log return ^2'], loc=1)
    plt.xlabel('lag', fontsize=18)
    plt.ylabel('autocorrelation', fontsize=18)
    plt.grid(True)

In [205]:
def fit_a_dist(data, distType, locAndScale=True):
    """Fit ``distType`` to ``data`` and return whatever ``distType.fit`` returns.

    Parameters
    ----------
    data : array-like
        Sample handed to ``distType.fit``.
    distType : object exposing ``fit``
        Distribution to fit (the notebook passes ``scipy.stats`` distributions).
    locAndScale : bool, default True
        When true, the sample mean and standard deviation are forwarded to
        ``fit`` as the ``loc`` and ``scale`` keyword arguments; when false,
        ``fit`` is called with the data only.
    """
    if not locAndScale:
        return distType.fit(data)

    # Forward the sample moments as the loc/scale keywords of fit().
    return distType.fit(data, loc=np.mean(data), scale=np.std(data))



In [217]:
def sep_and_mirror(data):
    """Split returns by sign and build sign-mirrored copies of each half.

    Parameters
    ----------
    data : pandas.Series
        Return series. Values equal to zero belong to neither half.

    Returns
    -------
    tuple of pandas.Series
        ``(sortedPosiR, sortedNegaR, mirrorNegative, mirrorPosi)`` where

        * ``sortedPosiR``    -- the positive values, ascending;
        * ``sortedNegaR``    -- the magnitudes of the negative values, ascending;
        * ``mirrorNegative`` -- ``sortedNegaR`` followed by its negation;
        * ``mirrorPosi``     -- ``sortedPosiR`` followed by its negation.

        Note the order: the mirrored negative half comes before the mirrored
        positive half. The mirrored series keep the original index labels, so
        every label appears twice.
    """
    posiR = data[data > 0].dropna()
    negaR = data[data < 0].dropna()
    sortedPosiR = posiR.sort_values()
    sortedNegaR = (-negaR).sort_values()

    # Series.append was removed in pandas 2.0; pd.concat is the supported
    # equivalent and preserves the same values and index labels.
    mirrorNegative = pd.concat([sortedNegaR, -sortedNegaR])
    mirrorPosi = pd.concat([sortedPosiR, -sortedPosiR])
    return sortedPosiR, sortedNegaR, mirrorNegative, mirrorPosi


In [103]:
def get_alpha_and_start(data, plot=False):
    """Fit a power law to ``data`` and return where it starts and its exponent.

    Parameters
    ----------
    data : array-like
        Sample passed to ``powerlaw.Fit``.
    plot : bool, default False
        When true, draw the CCDF of the data and of the fitted power law on
        the same axes. The default draws nothing, which matches the previous
        behaviour: the old body only referenced the ``plot_ccdf`` methods
        without calling them, so nothing was ever plotted.

    Returns
    -------
    tuple
        ``(start_x, alpha_expo)`` -- ``fit.xmin`` and ``fit.power_law.alpha``.
        Note the order: start point first, exponent second.
    """
    fit = powerlaw.Fit(data)
    alpha_expo = fit.power_law.alpha
    start_x = fit.xmin

    if plot:
        # Reuse the axes returned by the data CCDF so that the fitted curve
        # is overlaid on the same plot.
        ax = fit.plot_ccdf()
        fit.power_law.plot_ccdf(ax=ax)

    return start_x, alpha_expo


In [208]:
# Stationarity checks first, FX series then Ripple: each call prints the full
# test output (print_info defaults to True) and the returned p-value pair is
# printed afterwards.
for returns in (logR['price'], logRR):
    print(stationarity_tests(returns))

# Then one new figure per series with its autocorrelation curves; the flag
# selects the 'financial' or 'ripple' title.
for returns, is_financial in ((logR['price'], True), (logRR, False)):
    plt.figure()
    plot_autocorrelations(returns, is_financial)



adfuller test
(-23.262701610140457, 0.0, 6, 4189, {'1%': -3.4319120223653132, '5%': -2.8622302155703507, '10%': -2.5671374076308826}, -24993.19087817247)
KPSS test
(0.04591689237306045, 0.1, 31, {'10%': 0.347, '5%': 0.463, '2.5%': 0.574, '1%': 0.739})
(0.0, 0.1)
adfuller test
(-10.924169213029211, 1.0201212291578786e-19, 11, 1606, {'1%': -3.4344283205802606, '5%': -2.8633413399051144, '10%': -2.5677289969277726}, -3606.664273929593)
KPSS test
(0.42111321712474087, 0.06805464779105998, 25, {'10%': 0.347, '5%': 0.463, '2.5%': 0.574, '1%': 0.739})
(1.0201212291578786e-19, 0.06805464779105998)


In [267]:
# --- FX log returns: split the tails and fit them -------------------------
posiTail, negaTail, mirrorNega, mirrorPosi = sep_and_mirror(logR['price'])
# Student-t fitted to each sign-mirrored half (no loc/scale keywords forwarded).
nega_t_para = fit_a_dist(mirrorNega, stats.t, False)
posi_t_para = fit_a_dist(mirrorPosi, stats.t, False)
# Power-law start point and exponent for each tail.
posiStart, posiAlpha = get_alpha_and_start(posiTail)
negaStart, negaAlpha = get_alpha_and_start(negaTail)

plt.figure()
largest_move = max(abs(logR['price']))
domain = np.linspace(largest_move/1000, largest_move, 1000)

# Empirical survival function of each (ascending) tail: 1 - rank/(n + 1).
for tail, marker in ((posiTail, '+b'), (negaTail, 'xr')):
    n_obs = tail.count()
    plt.loglog(tail, 1 - (range(1, n_obs + 1))/(n_obs + 1), marker, alpha = 0.5)

# Fitted t curves, drawn as 1 - 2 * (cdf - 0.5) over the positive domain.
for t_para, style in ((posi_t_para, '--b'), (nega_t_para, '--r')):
    t_cdf = stats.t.cdf(domain, df = t_para[0], loc = t_para[1], scale = t_para[2])
    plt.loglog(domain, 1 - (t_cdf - 0.5) * 2, style)

# Power-law reference lines with hard-coded scale and exponent (these are not
# the fitted posiAlpha / negaAlpha values).
for scale, exponent, style in ((115, -3.3, '-.b'), (130, -2.55, '-.r')):
    plt.loglog(domain, (domain*scale) ** exponent, style)

# Vertical markers at the power-law start points returned by the fits.
for start, style in ((posiStart, '--b'), (negaStart, '--r')):
    plt.loglog(start*np.ones(50), np.linspace(1e-4, 1.05), style)

plt.title('compare t fited with mirrored halves and power law', fontsize=20)
plt.legend(['positive data', 'negative data', 't positive', 't negative', 'power law positive', 'power law negative'])
plt.ylim([1e-4, 1.05])
plt.xlim([1e-2, 0.2])
plt.xlabel('log return', fontsize=18)
plt.ylabel('rank/cdf', fontsize=18)

Calculating best minimal value for power law fit
Calculating best minimal value for power law fit


Text(0,0.5,'rank/cdf')

In [189]:
# How the power-law start point (xmin) is chosen:
# the method creates a power-law fit starting from each unique value
# in the dataset, then selects the one that results in the minimal Kolmogorov-Smirnov distance
# between the data and the fit.

# I believe the selection of alpha is also based on the same principle, since the fit seems to match the
# very extreme values well but not the majority of the data.

# I selected some alpha values myself and plotted them together with the t distribution fitted to the mirrored data.
# The t distribution seems to fit better anyway, and alpha is greater than 2, which means the attractor is
# the normal distribution.

In [268]:
# --- Ripple log returns: split the tails and fit them ---------------------
posiTail, negaTail, mirrorNega, mirrorPosi = sep_and_mirror(logRR)
# Generalized Pareto fitted directly to each tail (no mirroring here).
nega_gen_para = fit_a_dist(negaTail, stats.genpareto, False)
posi_gen_para = fit_a_dist(posiTail, stats.genpareto, False)
# Power-law start point and exponent for each tail.
posiStart, posiAlpha = get_alpha_and_start(posiTail)
negaStart, negaAlpha = get_alpha_and_start(negaTail)

plt.figure()
largest_move = max(abs(logRR))
domain = np.linspace(largest_move/1000, largest_move, 1000)

# Empirical survival function of each (ascending) tail: 1 - rank/(n + 1).
for tail, marker in ((posiTail, '+b'), (negaTail, 'xr')):
    n_obs = tail.count()
    plt.loglog(tail, 1 - (range(1, n_obs + 1))/(n_obs + 1), marker, alpha = 0.5)

# Survival function of each fitted generalized Pareto.
for gen_para, style in ((posi_gen_para, '--b'), (nega_gen_para, '--r')):
    gen_cdf = stats.genpareto.cdf(domain, c = gen_para[0], loc = gen_para[1], scale = gen_para[2])
    plt.loglog(domain, 1 - gen_cdf, style)

# Power-law reference lines with hard-coded scale and exponent (these are not
# the fitted posiAlpha / negaAlpha values).
for scale, exponent, style in ((21, -2.15, '-.b'), (27, -2.4, '-.r')):
    plt.loglog(domain, (domain*scale) ** exponent, style)

# Vertical markers at the power-law start points returned by the fits.
for start, style in ((posiStart, '--b'), (negaStart, '--r')):
    plt.loglog(start*np.ones(50), np.linspace(1e-4, 1.05), style)

plt.title('compare gen_pareto fited with halves and power law', fontsize=20)
plt.legend(['positive data', 'negative data', 'gen_pareto positive', 'gen_pareto negative', 'power law positive', 'power law negative'])
plt.ylim([1e-4, 1.1])
plt.xlim([10**(-1.2), 1.2])
plt.xlabel('log return', fontsize=18)
plt.ylabel('rank/cdf', fontsize=18)

Calculating best minimal value for power law fit
Calculating best minimal value for power law fit


Text(0,0.5,'rank/cdf')

In [None]:
# The same holds for Ripple: we cannot say the power law fits better than the generalized Pareto, and alpha is again higher than 2.
# Both alphas are lower than those for the financial data, indicating fatter tails, but they are still higher than 2.
# Ripple's positive tail is fatter than its negative tail, which is the opposite of the financial data.

In [120]:
# Ad-hoc inspection of a single power-law fit.
# NOTE(review): this cell's execution count (120) is older than the cells that
# assign posiTail above, so which series was fitted here cannot be determined
# from the notebook -- re-run top to bottom to confirm.
# The line magic below switches matplotlib to the Tk backend from here on.
%matplotlib tk   
# Fit a power law to the current posiTail and keep its exponent and start point.
fit = powerlaw.Fit(posiTail)
alpha_expo = fit.power_law.alpha
start_x = fit.xmin
# Draw the fitted power law's CCDF, then the CCDF of the data itself.
fit.power_law.plot_ccdf()
fit.plot_ccdf()


Calculating best minimal value for power law fit


<matplotlib.axes._subplots.AxesSubplot at 0x7fde2d3c5a20>