In [4]:
from statsmodels.tsa.stattools import grangercausalitytests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import statsmodels.api as sm
from scipy.stats import chi2_contingency

In [5]:
df = pd.read_csv('granger_test_select.csv')

In [6]:
#变量转为分类变量
df['classification'] = df['classification'].astype('category')
df.head()

Unnamed: 0,start_date,end_date,Clsindex_s,Clsindex_e,r,LAI,bins,classification
0,2014/12/2,2015/3/4,2763.545,3279.533,0.360167,-1.840243,"(-2, -1]",1
1,2014/12/9,2015/3/11,2856.269,3290.9,0.164,-2.96238,"(-3, -2]",2
2,2014/12/16,2015/3/18,3021.518,3577.301,0.339167,-2.633004,"(-3, -2]",2
3,2014/12/23,2015/3/25,3032.612,3660.727,0.281667,-4.35589,"(-5, -4]",3
4,2014/12/30,2015/4/1,3165.815,3810.294,0.325,-5.787868,"(-35, -5]",3


各列含义为：滑动开始日期、滑动结束日期、滑动开始日期的收盘指数、滑动结束日期的收盘指数、区间收盘指数涨跌幅、动态LAI、动态LAI分组、分组类别

In [7]:
df["r"] = df['r'].abs()
df[['r', 'classification']].to_csv('r_abs_classification.csv')
df_train = pd.read_csv("r_abs_classification.csv")

In [8]:
#ADF检验（时间序列平稳性检验）
from statsmodels.tsa.stattools import adfuller

def adf_test(df):
    result = adfuller(df.values)
    print('ADF Statistics: %f' % result[0])
    print('p-value: %f' % result[1])
    print('Critical values:')
    for key, value in result[4].items():
        print('\t%s: %.3f' % (key, value))
        
print('ADF Test: r')
adf_test(df_train['r'])
print('ADF Test: classfication')
adf_test(df_train['classification'])

ADF Test: r
ADF Statistics: -2.553113
p-value: 0.103080
Critical values:
	1%: -3.563
	5%: -2.919
	10%: -2.597
ADF Test: classfication
ADF Statistics: -3.437758
p-value: 0.009737
Critical values:
	1%: -3.563
	5%: -2.919
	10%: -2.597


In [9]:
#r列不平稳，尝试对r列进行差分
df_train['r_diff'] = df_train['r']-df_train['r'].shift(1)
df_train.head()

Unnamed: 0.1,Unnamed: 0,r,classification,r_diff
0,0,0.360167,1,
1,1,0.164,2,-0.196167
2,2,0.339167,2,0.175167
3,3,0.281667,3,-0.0575
4,4,0.325,3,0.043333


In [10]:
#删除缺失值
df_train = df_train.dropna()

In [11]:
#对差分后的r列再次进行ADF检验
print('ADF : classfication')
adf_test(df_train['r_diff'])

ADF : classfication
ADF Statistics: -5.620049
p-value: 0.000001
Critical values:
	1%: -3.571
	5%: -2.923
	10%: -2.599


In [13]:
#检验通过，保存两列数据至data_train.csv文件中
data_train = df_train[["r_diff", "classification"]]

In [14]:
#判断滞后阶数
from statsmodels.tsa.api import VAR

model = VAR(data_train)
for i in [1,2,3,4,5,6,7,8,9,10]:
    result = model.fit(i)
    print('Lag Order =', i)
    print('AIC : ', result.aic)
    print('BIC : ', result.bic)
    print('FPE : ', result.fpe)
    print('HQIC: ', result.hqic, '\n')

Lag Order = 1
AIC :  -5.07498207188981
BIC :  -4.847708468039889
FPE :  0.006252898778336265
HQIC:  -4.988134105864899 

Lag Order = 2
AIC :  -4.987095094085215
BIC :  -4.604690492999587
FPE :  0.006834624807383774
HQIC:  -4.8414732409298376 

Lag Order = 3
AIC :  -5.113939272737865
BIC :  -4.573419187563401
FPE :  0.006036060594583399
HQIC:  -4.908866706373167 

Lag Order = 4
AIC :  -5.046779347772586
BIC :  -4.3450789686821265
FPE :  0.0064880055654493165
HQIC:  -4.78160574954823 

Lag Order = 5
AIC :  -4.988545331305429
BIC :  -4.122518794334763
FPE :  0.0069371281600959355
HQIC:  -4.66265372769824 

Lag Order = 6
AIC :  -4.869417532401553
BIC :  -3.835837612646847
FPE :  0.007924527756085514
HQIC:  -4.482232299752902 

Lag Order = 7
AIC :  -4.811289900551152
BIC :  -3.606848240704273
FPE :  0.008579935587759394
HQIC:  -4.362286142816596 

Lag Order = 8
AIC :  -4.699252663537832
BIC :  -3.3205606736919027
FPE :  0.009905816068701192
HQIC:  -4.187967209147083 

Lag Order = 9
AIC :  -



In [15]:
results = model.fit(maxlags=3, ic='aic')
results.summary()

  Summary of Regression Results   
Model:                         VAR
Method:                        OLS
Date:           Sat, 22, Apr, 2023
Time:                     11:13:58
--------------------------------------------------------------------
No. of Equations:         2.00000    BIC:                   -4.57342
Nobs:                     49.0000    HQIC:                  -4.90887
Log likelihood:          0.235536    FPE:                 0.00603606
AIC:                     -5.11394    Det(Omega_mle):      0.00462136
--------------------------------------------------------------------
Results for equation r_diff
                       coefficient       std. error           t-stat            prob
------------------------------------------------------------------------------------
const                    -0.042585         0.039515           -1.078           0.281
L1.r_diff                 0.024932         0.142848            0.175           0.861
L1.classification        -0.025227         

In [18]:
#DW--检验残差自相关性
#DW值可以在0到4之间变化。它越接近值2，则没有明显的序列相关性。接近0时，存在正序列相关，而接近4时，则具有负序列相关
from statsmodels.stats.stattools import durbin_watson

out = durbin_watson(results.resid)

for col, val in zip(data_train.columns, out):
    print(col, ':', round(val, 2))

r_diff : 1.75
classification : 2.07


In [19]:
from statsmodels.tsa.stattools import grangercausalitytests

# maxlag=10
test = 'ssr_chi2test'

def grangers_causation_matrix(data, variables, maxlag, test='ssr_chi2test', verbose=False):    
   
    df = pd.DataFrame(np.zeros((len(variables), len(variables))), columns=variables, index=variables)
    for c in df.columns:
        for r in df.index:
            test_result = grangercausalitytests(data[[r, c]], maxlag=maxlag, verbose=False)
            p_values = [round(test_result[i+1][0][test][1],4) for i in range(maxlag)]
            if verbose: print(f'Y = {r}, X = {c}, P Values = {p_values}')
            min_p_value = np.min(p_values)
            df.loc[r, c] = min_p_value
    df.columns = [var + '_x' for var in variables]
    df.index = [var + '_y' for var in variables]
    return df

for i in range(1, 9):
    print(grangers_causation_matrix(data_train, variables = data_train.columns, maxlag=i))

                  r_diff_x  classification_x
r_diff_y            1.0000            0.3901
classification_y    0.1745            1.0000
                  r_diff_x  classification_x
r_diff_y            1.0000            0.3639
classification_y    0.1745            1.0000
                  r_diff_x  classification_x
r_diff_y            1.0000            0.0601
classification_y    0.1745            1.0000
                  r_diff_x  classification_x
r_diff_y            1.0000            0.0107
classification_y    0.1745            1.0000
                  r_diff_x  classification_x
r_diff_y            1.0000            0.0042
classification_y    0.1745            1.0000
                  r_diff_x  classification_x
r_diff_y            1.0000            0.0042
classification_y    0.1745            1.0000
                  r_diff_x  classification_x
r_diff_y            1.0000            0.0042
classification_y    0.0997            1.0000
                  r_diff_x  classification_x
r_diff_y  