In [118]:
import numpy as np
import pandas as pd

In [119]:
from d2c.descriptors.loader import DataLoader
from d2c.descriptors.estimators import MutualInformationEstimator

In [120]:
# Seed NumPy's legacy global RNG so the np.random.normal draws in the
# simulation cells are repeatable (those cells also reseed with the same value).
np.random.seed(42)

Simple Case

In [121]:
# Simple case: z_i depends on its own previous value, z_j depends on its own
# previous value plus the previous value of z_i.

# Coefficients of the data-generating process
alpha_i = 0.8  # weight of z_i[t-1] in z_i[t]
alpha_j = 0.5  # weight of z_j[t-1] in z_j[t]
beta_j = 0.3   # weight of z_i[t-1] in z_j[t] (cross-lagged term)

# Sample size and zero-filled containers for the two series
n = 100
zi, zj = np.zeros(n), np.zeros(n)

# Reseed immediately before the first draw so this cell is reproducible by itself
np.random.seed(42)

# Starting values are standard-normal draws (z_i first, then z_j)
zi[0] = np.random.normal(0, 1)
zj[0] = np.random.normal(0, 1)

# Roll the recursion forward. Both noise terms come from the same global RNG
# stream, so the draw order (z_i noise, then z_j noise) is part of the result.
for t in range(1, n):
    zi_prev, zj_prev = zi[t - 1], zj[t - 1]
    zi[t] = alpha_i * zi_prev + np.random.normal(0, 1)
    zj[t] = alpha_j * zj_prev + beta_j * zi_prev + np.random.normal(0, 1)

# Turn both series into (n, 1) column vectors before lagging
zi, zj = zi.reshape(-1, 1), zj.reshape(-1, 1)

# Build the lagged views (second argument 4 -> five columns per series).
# NOTE(review): judging by the displayed output, the columns run from t+2
# ('...t2') down to t-2 ('...t_2') -- confirm against DataLoader.
lagged_zi = DataLoader._create_lagged_single_ts(zi, 4)
lagged_zj = DataLoader._create_lagged_single_ts(zj, 4)
df_lagged_zi = pd.DataFrame(lagged_zi, columns=['zit2', 'zit1', 'zit', 'zit_1', 'zit_2'])
df_lagged_zj = pd.DataFrame(lagged_zj, columns=['zjt2', 'zjt1', 'zjt', 'zjt_1', 'zjt_2'])

# Side-by-side frame holding the lags of both series
df_lagged = pd.concat([df_lagged_zi, df_lagged_zj], axis=1)
df_lagged

Unnamed: 0,zit2,zit1,zit,zit_1,zit_2,zjt2,zjt1,zjt,zjt_1,zjt_2
0,1.179108,2.060728,0.601895,1.045060,0.496714,1.854989,1.388422,0.880837,1.602912,-0.138264
1,0.479869,1.179108,2.060728,0.601895,1.045060,0.815497,1.854989,1.388422,0.880837,1.602912
2,0.625857,0.479869,1.179108,2.060728,0.601895,-1.361571,0.815497,1.854989,1.388422,0.880837
3,-1.224232,0.625857,0.479869,1.179108,2.060728,-1.055316,-1.361571,0.815497,1.854989,1.388422
4,-1.992217,-1.224232,0.625857,0.479869,1.179108,-0.580680,-1.055316,-1.361571,0.815497,1.854989
...,...,...,...,...,...,...,...,...,...,...
91,-1.196723,-0.937760,-0.113708,-0.733681,-0.637773,-0.135596,-1.421333,0.255253,1.096373,1.147409
92,-0.743284,-1.196723,-0.937760,-0.113708,-0.733681,-1.672553,-0.135596,-1.421333,0.255253,1.096373
93,-0.421447,-0.743284,-1.196723,-0.937760,-0.113708,-0.673945,-1.672553,-0.135596,-1.421333,0.255253
94,-1.221015,-0.421447,-0.743284,-1.196723,-0.937760,-0.309681,-0.673945,-1.672553,-0.135596,-1.421333


In [122]:
# Triples (a, b, c) of lagged-column names evaluated by the MI cells below:
# the columns of `a` and `b` are passed to the estimator together with the
# column of `c`, or None when there is no third variable
# (presumably I(a; b | c) -- confirm against MutualInformationEstimator).
#
# Every triple appears exactly once. The MI results are stored in dicts keyed
# by the triple, so a repeated triple would collapse into a single entry and
# only cost an extra estimator run; keeping the list free of repeats also makes
# its length (26) equal to the number of results, which the hand-entered
# per-relation lists are aligned with by position.
conditional_relations = [
    ('zit_2', 'zjt1', None),
    ('zjt_1', 'zit', None),
    ('zit_2', 'zjt1', 'zjt_1'),
    ('zjt_1', 'zit', 'zit_2'),
    ('zit_2', 'zjt1', 'zit'),
    ('zjt_1', 'zit', 'zjt1'),
    ('zit', 'zjt_1', 'zit_1'),
    ('zjt1', 'zit_2', 'zjt'),
    ('zit', 'zjt_1', 'zjt'),
    ('zjt1', 'zit_2', 'zit_1'),
    ('zit_1', 'zjt', 'zjt_1'),
    ('zjt', 'zit_1', 'zit_2'),
    ('zit_2', 'zjt', 'zit'),
    ('zjt_1', 'zit_1', 'zjt1'),
    ('zit_1', 'zjt', 'zjt1'),
    ('zjt', 'zit_1', 'zit'),
    ('zit_1', 'zjt_1', None),
    ('zjt', 'zit_2', None),
    ('zit', 'zjt', None),
    ('zjt1', 'zit_1', None),
    ('zit_2', 'zjt_1', 'zjt'),
    ('zjt_1', 'zit_2', 'zit_1'),
    ('zit', 'zjt1', 'zit_1'),
    ('zjt1', 'zit', 'zjt'),
    ('zit1', 'zjt', 'zit_1'),
    ('zjt2', 'zit_1', 'zjt'),
]

In [123]:
# One estimate per relation on the simple-case frame, keyed by the
# (a, b, c) triple itself. Entries keep the order of conditional_relations.
mi_dict_simple = {}
for var_a, var_b, given in conditional_relations:
    estimator = MutualInformationEstimator()  # fresh estimator for every relation
    column_of = df_lagged.columns.get_loc     # column name -> positional index
    estimate = estimator.estimate_original(
        df_lagged.values,
        column_of(var_a),
        column_of(var_b),
        column_of(given) if given else None,  # None means no third column
    )
    mi_dict_simple[(var_a, var_b, given)] = estimate

Complex Case

In [124]:
# Complex case: same structure as the simple case, but each series also
# depends on its own value two steps back.

# Coefficients of the data-generating process
alpha_i_1 = 0.8  # weight of z_i[t-1] in z_i[t]
alpha_i_2 = 0.2  # weight of z_i[t-2] in z_i[t]
# NOTE(review): alpha_i_1 + alpha_i_2 = 1.0, so the z_i recursion has a unit
# root (non-stationary); the tail values around -9/-10 in the displayed frame
# are consistent with that drift. Confirm this is intended.

alpha_j_1 = 0.5  # weight of z_j[t-1] in z_j[t]
alpha_j_2 = 0.2  # weight of z_j[t-2] in z_j[t]

beta_j = 0.3     # weight of z_i[t-1] in z_j[t] (cross-lagged term)

# Sample size and zero-filled containers for the two series
n = 100
zi, zj = np.zeros(n), np.zeros(n)

# Reseed immediately before the first draw so this cell is reproducible by itself
np.random.seed(42)

# The first two values of each series are standard-normal draws, taken in the
# order z_i[0], z_j[0], z_i[1], z_j[1]
zi[0] = np.random.normal(0, 1)
zj[0] = np.random.normal(0, 1)
zi[1] = np.random.normal(0, 1)
zj[1] = np.random.normal(0, 1)

# Roll the recursion forward from t = 2. Both noise terms come from the same
# global RNG stream, so the draw order (z_i noise, then z_j noise) matters.
for t in range(2, n):
    zi_lag1, zi_lag2 = zi[t - 1], zi[t - 2]
    zj_lag1, zj_lag2 = zj[t - 1], zj[t - 2]
    zi[t] = alpha_i_1 * zi_lag1 + alpha_i_2 * zi_lag2 + np.random.normal(0, 1)
    zj[t] = alpha_j_1 * zj_lag1 + alpha_j_2 * zj_lag2 + beta_j * zi_lag1 + np.random.normal(0, 1)

# Turn both series into (n, 1) column vectors before lagging
zi, zj = zi.reshape(-1, 1), zj.reshape(-1, 1)

# Build the lagged views (second argument 4 -> five columns per series)
lagged_zi = DataLoader._create_lagged_single_ts(zi, 4)
lagged_zj = DataLoader._create_lagged_single_ts(zj, 4)
df_lagged_zi = pd.DataFrame(lagged_zi, columns=['zit2', 'zit1', 'zit', 'zit_1', 'zit_2'])
df_lagged_zj = pd.DataFrame(lagged_zj, columns=['zjt2', 'zjt1', 'zjt', 'zjt_1', 'zjt_2'])

# Side-by-side frame holding the lags of both series
df_complex_lagged = pd.concat([df_lagged_zi, df_lagged_zj], axis=1)

In [125]:
# Display the complex-case lagged frame (last expression of the cell is rendered).
df_complex_lagged

Unnamed: 0,zit2,zit1,zit,zit_1,zit_2,zjt2,zjt1,zjt,zjt_1,zjt_2
0,1.219532,2.015423,0.383340,0.647689,0.496714,2.053023,1.534059,0.694032,1.523030,-0.138264
1,0.915292,1.219532,2.015423,0.383340,0.647689,1.233453,2.053023,1.534059,0.694032,1.523030
2,1.218103,0.915292,1.219532,2.015423,0.383340,-0.611362,1.233453,2.053023,1.534059,0.694032
3,-0.567377,1.218103,0.915292,1.219532,2.015423,-0.255847,-0.611362,1.233453,2.053023,1.534059
4,-1.223112,-0.567377,1.218103,0.915292,1.219532,-0.106162,-0.255847,-0.611362,1.233453,2.053023
...,...,...,...,...,...,...,...,...,...,...
91,-9.639247,-9.380878,-8.440149,-8.909827,-8.927624,-7.830238,-8.888606,-7.140354,-6.357688,-6.078664
92,-9.373480,-9.639247,-9.380878,-8.440149,-8.909827,-9.830353,-7.830238,-8.888606,-7.140354,-6.357688
93,-9.253452,-9.373480,-9.639247,-9.380878,-8.440149,-8.907951,-9.830353,-7.830238,-8.888606,-7.140354
94,-10.161315,-9.253452,-9.373480,-9.639247,-9.380878,-9.042357,-8.907951,-9.830353,-7.830238,-8.888606


In [126]:
# One estimate per relation on the complex-case frame, keyed by the
# (a, b, c) triple itself. Entries keep the order of conditional_relations.
mi_dict_complex = {}
for var_a, var_b, given in conditional_relations:
    estimator = MutualInformationEstimator()       # fresh estimator for every relation
    column_of = df_complex_lagged.columns.get_loc  # column name -> positional index
    estimate = estimator.estimate_original(
        df_complex_lagged.values,
        column_of(var_a),
        column_of(var_b),
        column_of(given) if given else None,       # None means no third column
    )
    mi_dict_complex[(var_a, var_b, given)] = estimate

In [127]:
# Hand-entered per-relation values, 26 per list. They are later placed next to
# the MI estimates by position, so entry k belongs to the k-th stored estimate.
# The open_* lists become the '#Open' regression column and the l_* lists the
# 'L' column.
# NOTE(review): how these numbers were derived is not recorded in the
# notebook -- document their source.
open_complex = [
    8, 4, 0, 0, 7, 21, 2, 11, 11, 6, 3, 1, 5,
    22, 20, 7, 2, 4, 8, 8, 5, 2, 6, 12, 3, 19,
]
open_simple = [
    4, 2, 0, 0, 3, 6, 0, 0, 3, 2, 1, 1, 3,
    4, 4, 3, 2, 3, 3, 4, 3, 2, 1, 1, 0, 0,
]
l_complex = [
    2.875, 3, 0, 0, 3.142857, 4.142857, 2.5, 4.090909, 3.818182,
    2.833333, 2.666667, 1, 2.4, 4.272727, 4.3, 3.428571, 2.5,
    2.25, 3.5, 3.375, 2.6, 1.5, 3.5, 4.5, 4.333333, 5.157895,
]
l_simple = [
    3.25, 3.5, 0, 0, 3.333333, 4, 0, 0, 3.333333, 3.5, 1, 1, 2.333333,
    2.75, 2.75, 2.666667, 2.5, 2.333333, 3.666667, 3.25, 2, 1.5, 1, 1, 0, 0,
]

In [128]:
# The three columns of each frame are matched purely by position (default
# integer index). pd.concat(axis=1) would pad any length mismatch with NaN
# instead of failing, silently pairing MI estimates with the wrong hand-entered
# values, so check the lengths explicitly first.
for _case, _mi, _n_open, _l in (
    ("complex", mi_dict_complex, open_complex, l_complex),
    ("simple", mi_dict_simple, open_simple, l_simple),
):
    if not len(_mi) == len(_n_open) == len(_l):
        raise ValueError(
            f"{_case} case: {len(_mi)} MI estimates vs {len(_n_open)} '#Open' "
            f"and {len(_l)} 'L' values; the columns are joined by position and must match"
        )

# Column order: MI estimate, then the open_* list, then the l_* list.
xy_complex = pd.concat([pd.DataFrame(mi_dict_complex.values()),pd.Series(open_complex),pd.Series(l_complex)],axis=1)
xy_simple = pd.concat([pd.DataFrame(mi_dict_simple.values()),pd.Series(open_simple),pd.Series(l_simple)],axis=1)

In [129]:
import statsmodels.api as sm

# Stack the complex-case rows on top of the simple-case rows and label the
# three columns.
xy_concat = pd.concat([xy_complex, xy_simple], axis=0)
xy_concat.columns = ['MI', '#Open', 'L']

# Keep only the rows whose '#Open' value is non-zero.
has_open = xy_concat['#Open'].ne(0)
xy_concat = xy_concat.loc[has_open]

# Ordinary least squares of MI on an intercept, '#Open' and 'L'; the summary
# table gives the coefficient p-values.
y = xy_concat['MI']
X = sm.add_constant(xy_concat[['#Open', 'L']])
model = sm.OLS(y, X).fit()
predictions = model.predict(X)
model.summary()

0,1,2,3
Dep. Variable:,MI,R-squared:,0.168
Model:,OLS,Adj. R-squared:,0.127
Method:,Least Squares,F-statistic:,4.128
Date:,"Wed, 14 Aug 2024",Prob (F-statistic):,0.0233
Time:,12:34:05,Log-Likelihood:,11.344
No. Observations:,44,AIC:,-16.69
Df Residuals:,41,BIC:,-11.34
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.2170,0.095,2.274,0.028,0.024,0.410
#Open,0.0193,0.008,2.565,0.014,0.004,0.035
L,-0.0342,0.040,-0.863,0.393,-0.114,0.046

0,1,2,3
Omnibus:,8.106,Durbin-Watson:,1.752
Prob(Omnibus):,0.017,Jarque-Bera (JB):,7.133
Skew:,0.936,Prob(JB):,0.0282
Kurtosis:,3.619,Cond. No.,29.2


In [117]:
# Show the dimensions of the OLS design matrix X (rows, columns).
# NOTE(review): this cell's execution count (117) is lower than the first
# cell's (118), so its output comes from an earlier run -- re-run the notebook
# top to bottom before relying on it.
X.shape

(44, 3)