In [19]:
import pandas as pd
import numpy as np
import vaex as vx
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.stattools import adfuller, kpss,coint
from statsmodels.tsa.vector_ar import vecm
import time
from multiprocessing import Process
import gc
import warnings
warnings.filterwarnings('ignore')

plt.rcParams['font.sans-serif'] =['SimHei']
plt.rcParams['axes.unicode_minus'] = False
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)
pd.set_option("display.max_colwidth", 9999)
vx.settings.display.max_columns = 200
vx.settings.display.max_rows = 100
vx.settings.max_colwidth=60000
pd.set_option('display.float_format',lambda x : '%.4f' % x)
np.set_printoptions(suppress=True) #
pd.options.display.float_format = '{:.4f}'.format
%matplotlib inline

In [2]:
path = '../data/'
csv_name_a = 'a_series.csv'
csv_name_u = 'u_series.csv'

In [3]:
dfa = pd.read_csv(path+csv_name_a,header=None)
dfu = pd.read_csv(path+csv_name_u,header=None)
print(dfa.head(1))
print(dfu.head(1))

       0
0 0.6374
       0
0 1.3679


In [4]:
dfa.columns = ['A']
dfu.columns = ['U']

In [8]:
print(dfa.shape)
print(dfa.info())
print(dfa.dtypes)
print(dfa.isnull().sum())
print(dfa.duplicated().sum())
print(dfa.describe())

(19999, 1)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19999 entries, 0 to 19998
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       19999 non-null  float64
dtypes: float64(1)
memory usage: 156.4 KB
None
A    float64
dtype: object
A    0
dtype: int64
15541
               A
count 19999.0000
mean      0.6607
std       0.0109
min       0.6275
25%       0.6539
50%       0.6586
75%       0.6696
max       0.6870


In [9]:
print(dfu.shape)
print(dfu.info())
print(dfu.dtypes)
print(dfu.isnull().sum())
print(dfu.duplicated().sum())
print(dfu.describe())

(19999, 1)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19999 entries, 0 to 19998
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   U       19999 non-null  float64
dtypes: float64(1)
memory usage: 156.4 KB
None
U    float64
dtype: object
U    0
dtype: int64
14965
               U
count 19999.0000
mean      1.3482
std       0.0133
min       1.3180
25%       1.3386
50%       1.3476
75%       1.3572
max       1.3888


In [10]:

df = pd.concat([dfa['A'],dfu['U']],axis=1)
print(df.shape)
df.head()

(19999, 2)


Unnamed: 0,A,U
0,0.6374,1.3679
1,0.6378,1.3682
2,0.6376,1.3684
3,0.6378,1.3683
4,0.6382,1.368


In [12]:
result_A_adf = adfuller(df['A'])
result_U_adf = adfuller(df['U'])
result_A_kpss = kpss(df['A'])
result_U_kpss = kpss(df['U'])
# p-value  <  0.05 
print('ADF Test - Stock A: p-value =', result_A_adf[1])
print('ADF Test - Stock B: p-value =', result_U_adf[1])
print('KPSS Test - Stock A: p-value =', result_A_kpss[1])
print('KPSS Test - Stock B: p-value =', result_U_kpss[1])

ADF Test - Stock A: p-value = 0.18544305479437267
ADF Test - Stock B: p-value = 0.25592247220325454
KPSS Test - Stock A: p-value = 0.01
KPSS Test - Stock B: p-value = 0.01


In [15]:
df_diff = df.diff().dropna() # delet first line 
result_diff_A_adf = adfuller(df_diff['A'])
result_diff_U_adf = adfuller(df_diff['U'])
result_diff_A_kpss = kpss(df_diff['A'])
result_diff_U_kpss = kpss(df_diff['U'])

# p-value < 0.05
print('ADF Test (Diff) - Stock A: p-value =', result_diff_A_adf[1])
print('ADF Test (Diff) - Stock U: p-value =', result_diff_U_adf[1])
print('KPSS Test (Diff) - Stock A: p-value =', result_diff_A_kpss[1])
print('KPSS Test (Diff) - Stock U: p-value =', result_diff_U_kpss[1])

ADF Test (Diff) - Stock A: p-value = 0.0
ADF Test (Diff) - Stock U: p-value = 0.0
KPSS Test (Diff) - Stock A: p-value = 0.1
KPSS Test (Diff) - Stock U: p-value = 0.1


In [17]:
# coint
result_coint = coint(df_diff['A'], df_diff['U'])
print('Cointegration Test - p-value =', result_coint[1])

Cointegration Test - p-value = 0.0


In [18]:
'''
corr
df['A'].corr(df['U'])
'''
r_s = df['A'].corr(df['U'],method='spearman')
r_p = df['A'].corr(df['U'],method='pearson')
r_k = df['A'].corr(df['U'],method='kendall')
print(r_s)
print(r_p)
print(r_k)

-0.6949698212295361
-0.7726499707403791
-0.5295993984245311


In [25]:
'''
Covariance
Cov(X,Y) = E[(X - E[X])(Y - E[Y])]
         = E[XY] - 2E[Y]E[X] + E[X]E[Y]
         =E[XY] - E[X]E[Y]
'''
print(df['A'].cov(df['U']))

-0.00011229174872951324


In [20]:
# cointegration
model = vecm.VECM(df[:19999].values, k_ar_diff = 1, coint_rank = 2, deterministic='co')
res = model.fit()
print(res.summary())

Det. terms outside the coint. relation & lagged endog. parameters for equation y1
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0003      0.001     -0.520      0.603      -0.001       0.001
L1.y1          0.0074      0.007      0.996      0.319      -0.007       0.022
L1.y2         -0.0112      0.006     -1.805      0.071      -0.023       0.001
Det. terms outside the coint. relation & lagged endog. parameters for equation y2
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0028      0.001      4.407      0.000       0.002       0.004
L1.y1         -0.0355      0.009     -3.980      0.000      -0.053      -0.018
L1.y2         -0.0238      0.007     -3.210      0.001      -0.038      -0.009
                 Loading coefficients (alpha) 

In [21]:
model = vecm.VECM(df[:19999].values, k_ar_diff = 1, coint_rank = 2, deterministic='ci')
res = model.fit()
print(res.summary())

Det. terms outside the coint. relation & lagged endog. parameters for equation y1
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
L1.y1          0.0074      0.007      0.996      0.319      -0.007       0.022
L1.y2         -0.0112      0.006     -1.805      0.071      -0.023       0.001
Det. terms outside the coint. relation & lagged endog. parameters for equation y2
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
L1.y1         -0.0355      0.009     -3.980      0.000      -0.053      -0.018
L1.y2         -0.0238      0.007     -3.210      0.001      -0.038      -0.009
                 Loading coefficients (alpha) for equation y1                 
                 coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------