In [1]:
import pandas as pd
import numpy as np
import scipy as sp

from generate_datasets import *
import sys
sys.path.append('..')
from utils import *
import matplotlib.pyplot as plt

from tqdm.auto import tqdm

import statsmodels as sm
import statsmodels.api as sma
import statsmodels.tsa.api as smt

import json

In [2]:
# Fixed seed so that Restart Kernel -> Run All reproduces the same draws.
# 123 mirrors the base seed this notebook hands to DataGenerator.
RNG_SEED = 123
rng = np.random.default_rng(RNG_SEED)

In [16]:
def _scenario(name, shape, x_var, y_var, autocorr, obs_coef, bad_features=None):
    """Build one scenario spec in the key order DataGenerator.from_dict is given.

    Every scenario shares the same coefficient settings; a fresh dict is
    created on each call so the specs never alias each other.
    'gen_bad_features' is only added when ``bad_features`` is supplied.
    """
    spec = {}
    spec['name'] = name
    spec['shape'] = list(shape)
    spec['gen_true_X'] = {'variance': x_var}
    spec['gen_coefs'] = {'magnitude': 100, 'spread': 0.5, 'prop_neg': 0.6}
    spec['gen_Y'] = {'variance': y_var, 'autocorr': autocorr}
    spec['gen_obs_X'] = {'coef': obs_coef}
    if bad_features is not None:
        spec['gen_bad_features'] = bad_features
    return spec


# Noise variance shared by most scenarios.
_NOISE_VAR = 100**2

# Scenario specs, in the order the cells below index them (d[0] .. d[6]).
d = [
    # No noise at all: Y is an exact linear function of the observed X.
    _scenario('perfect', (100, 5), 3, 0, 0, 1),
    # Noise variance given as a list of 100 values rising from 20**2 to 60**2.
    _scenario('heteroschedastic', (100, 5), 1,
              list(np.linspace(20, 60, 100)**2), 0, 1),
    # Constant noise variance with autocorr = 0.5.
    _scenario('autocorrelated', (100, 5), 3, _NOISE_VAR, 0.5, 1),
    # Five true features plus three extra "bad" ones.
    _scenario('bad_features', (100, 5), 3, _NOISE_VAR, 0, 0.9,
              bad_features={'n': 3, 'prop': [0.05, 0.01, 1]}),
    # A single true feature plus five extra "bad" ones.
    _scenario('bad_dataset', (100, 1), 3, _NOISE_VAR, 0, 0.9,
              bad_features={'n': 5, 'prop': [0.05, 0.01, 0.95, 0.8, 0.0001]}),
    # Large shapes, used by the timing cells.
    _scenario('big_dataset', (50000, 1000), 3, _NOISE_VAR, 0.3, 0.9),
    _scenario('bigger_dataset', (100000, 5000), 3, _NOISE_VAR, 0.3, 0.95),
]

In [9]:
def test_index(d, i):
    dg = DataGenerator(123 + i)
    print(d[i]['name'])
    dg.from_dict(d[i])
    model = sma.OLS(dg.Y, dg.obs_X).fit()
    print('True coefs:  ', dg.coefs)
    print('autocorrelation:  ', np.corrcoef(np.c_[model.resid, np.roll(model.resid, 1)], rowvar=False)[1, 0])
    print('autocorrelation test:  ', sm.stats.diagnostic.acorr_ljungbox(model.resid))
    print('heteroschedasticity:  ', sm.stats.diagnostic.het_goldfeldquandt(dg.Y, dg.obs_X))
    if hasattr(dg, 'choice'):
        print('features chosen:  ', dg.choice)
    return model.summary()

In [10]:
# Scenario 0 ('perfect'): gen_Y variance and autocorr are both 0.
test_index(d, 0)

perfect
True coefs:   [ -18.25759264 -112.72330311  136.90067447 -148.49882635  177.95308097]
autocorrelation:   -0.16477285723893703
autocorrelation test:        lb_stat  lb_pvalue
1   2.663571   0.102670
2   2.764937   0.250958
3   2.767793   0.428829
4   2.815276   0.589199
5   3.552281   0.615492
6   3.951768   0.683204
7   3.959867   0.784388
8   4.026751   0.854702
9   5.549069   0.784062
10  5.555411   0.851131
heteroschedasticity:   (np.float64(3.99455854730819), np.float64(4.225677157524891e-06), 'increasing')


0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,1.0
Model:,OLS,Adj. R-squared (uncentered):,1.0
Method:,Least Squares,F-statistic:,5.297e+31
Date:,"Wed, 18 Sep 2024",Prob (F-statistic):,0.0
Time:,23:17:19,Log-Likelihood:,2738.2
No. Observations:,100,AIC:,-5466.0
Df Residuals:,95,BIC:,-5453.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,-18.2576,1.83e-14,-9.97e+14,0.000,-18.258,-18.258
x2,-112.7233,2e-14,-5.64e+15,0.000,-112.723,-112.723
x3,136.9007,1.82e-14,7.52e+15,0.000,136.901,136.901
x4,-148.4988,1.94e-14,-7.64e+15,0.000,-148.499,-148.499
x5,177.9531,1.86e-14,9.55e+15,0.000,177.953,177.953

0,1,2,3
Omnibus:,0.676,Durbin-Watson:,2.276
Prob(Omnibus):,0.713,Jarque-Bera (JB):,0.764
Skew:,-0.057,Prob(JB):,0.682
Kurtosis:,2.587,Cond. No.,1.34


In [11]:
# Scenario 1 ('heteroschedastic'): gen_Y variance is a list of 100 values
# rising from 20**2 to 60**2, with autocorr 0.
test_index(d, 1)

heteroschedastic
True coefs:   [ -49.19950134  -82.02719789  -17.96891469 -129.73463085 -119.47792316]
autocorrelation:   -0.0822047359631501
autocorrelation test:        lb_stat  lb_pvalue
1   0.707423   0.400300
2   2.048463   0.359072
3   4.466527   0.215296
4   4.877548   0.300094
5   4.897892   0.428468
6   4.898298   0.556922
7   4.898963   0.672292
8   4.952221   0.762670
9   5.127382   0.823066
10  5.292464   0.870805
heteroschedasticity:   (np.float64(2.9346182449779508), np.float64(0.00022411263163931062), 'increasing')


0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.971
Model:,OLS,Adj. R-squared (uncentered):,0.97
Method:,Least Squares,F-statistic:,644.0
Date:,"Wed, 18 Sep 2024",Prob (F-statistic):,1.28e-71
Time:,23:17:22,Log-Likelihood:,-500.87
No. Observations:,100,AIC:,1012.0
Df Residuals:,95,BIC:,1025.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,-59.2188,3.797,-15.594,0.000,-66.758,-51.680
x2,-87.9922,3.940,-22.331,0.000,-95.815,-80.170
x3,-10.1515,3.501,-2.899,0.005,-17.103,-3.200
x4,-129.5400,4.054,-31.957,0.000,-137.587,-121.493
x5,-125.9228,3.570,-35.273,0.000,-133.010,-118.836

0,1,2,3
Omnibus:,1.048,Durbin-Watson:,2.153
Prob(Omnibus):,0.592,Jarque-Bera (JB):,0.623
Skew:,-0.164,Prob(JB):,0.732
Kurtosis:,3.205,Cond. No.,1.31


In [12]:
# Scenario 2 ('autocorrelated'): constant gen_Y variance 100**2, autocorr 0.5.
test_index(d, 2)

autocorrelated
True coefs:   [  89.68677425 -189.14869075 -153.17657197   50.52686567  -36.95900448]
autocorrelation:   0.4729386971227496
autocorrelation test:         lb_stat     lb_pvalue
1   23.276796  1.402823e-06
2   35.151290  2.328061e-08
3   35.682830  8.738520e-08
4   35.704325  3.328629e-07
5   35.712547  1.084219e-06
6   36.940398  1.808628e-06
7   37.134103  4.424111e-06
8   37.160310  1.075321e-05
9   37.245762  2.378883e-05
10  37.633761  4.394527e-05
heteroschedasticity:   (np.float64(1.2055020789717878), np.float64(0.26666005756068095), 'increasing')


0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.958
Model:,OLS,Adj. R-squared (uncentered):,0.955
Method:,Least Squares,F-statistic:,430.2
Date:,"Wed, 18 Sep 2024",Prob (F-statistic):,1.35e-63
Time:,23:17:22,Log-Likelihood:,-609.97
No. Observations:,100,AIC:,1230.0
Df Residuals:,95,BIC:,1243.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,88.3811,6.417,13.772,0.000,75.641,101.121
x2,-189.6056,6.640,-28.554,0.000,-202.788,-176.423
x3,-166.4395,7.039,-23.645,0.000,-180.414,-152.465
x4,35.7811,6.615,5.409,0.000,22.649,48.913
x5,-50.5104,7.713,-6.549,0.000,-65.823,-35.198

0,1,2,3
Omnibus:,2.855,Durbin-Watson:,0.901
Prob(Omnibus):,0.24,Jarque-Bera (JB):,2.778
Skew:,0.351,Prob(JB):,0.249
Kurtosis:,2.583,Cond. No.,1.63


In [13]:
# Scenario 3 ('bad_features'): shape [100, 5], gen_obs_X coef 0.9 and
# gen_bad_features with n = 3.
test_index(d, 3)

bad_features
True coefs:   [  37.21772633 -133.4409771   -32.18618059  180.09437254  -45.51913261]
autocorrelation:   -0.052881114001968106
autocorrelation test:         lb_stat  lb_pvalue
1    0.304744   0.580924
2    3.739859   0.154135
3    4.488258   0.213340
4    5.443094   0.244777
5    5.750334   0.331275
6    7.614156   0.267755
7    8.522000   0.288819
8    8.534298   0.383090
9   10.983651   0.276831
10  12.005543   0.284686
heteroschedasticity:   (np.float64(0.9673963963663528), np.float64(0.5425137079125605), 'increasing')
features chosen:   [1 1 2]


0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.721
Model:,OLS,Adj. R-squared (uncentered):,0.697
Method:,Least Squares,F-statistic:,29.74
Date:,"Wed, 18 Sep 2024",Prob (F-statistic):,2.18e-22
Time:,23:17:22,Log-Likelihood:,-683.05
No. Observations:,100,AIC:,1382.0
Df Residuals:,92,BIC:,1403.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,27.9664,11.001,2.542,0.013,6.117,49.816
x2,-110.9475,12.072,-9.191,0.000,-134.923,-86.972
x3,-30.7142,28.947,-1.061,0.291,-88.205,26.777
x4,138.2265,11.711,11.803,0.000,114.968,161.485
x5,-48.3218,12.308,-3.926,0.000,-72.766,-23.878
x6,-15.9181,13.407,-1.187,0.238,-42.546,10.709
x7,-14.9222,12.824,-1.164,0.248,-40.392,10.548
x8,-17.2674,30.598,-0.564,0.574,-78.037,43.502

0,1,2,3
Omnibus:,0.628,Durbin-Watson:,2.101
Prob(Omnibus):,0.73,Jarque-Bera (JB):,0.616
Skew:,-0.183,Prob(JB):,0.735
Kurtosis:,2.884,Cond. No.,4.64


In [17]:
# Scenario 4 ('bad_dataset'): shape [100, 1], gen_obs_X coef 0.9 and
# gen_bad_features with n = 5.
test_index(d, 4)

bad_dataset
True coefs:   [-130.38565166]
autocorrelation:   -0.026267595758805693
autocorrelation test:        lb_stat  lb_pvalue
1   0.082061   0.774523
2   2.182104   0.335863
3   3.512869   0.319097
4   3.667513   0.452869
5   4.070151   0.539361
6   4.710871   0.581394
7   6.134680   0.524115
8   7.618254   0.471619
9   8.699133   0.465498
10  8.728658   0.558034
heteroschedasticity:   (np.float64(1.4981907473203928), np.float64(0.09197264693895577), 'increasing')
features chosen:   [0 0 0 0 0]


0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.848
Model:,OLS,Adj. R-squared (uncentered):,0.839
Method:,Least Squares,F-statistic:,87.65
Date:,"Wed, 18 Sep 2024",Prob (F-statistic):,2.68e-36
Time:,23:44:16,Log-Likelihood:,-597.88
No. Observations:,100,AIC:,1208.0
Df Residuals:,94,BIC:,1223.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,13.7829,12.942,1.065,0.290,-11.913,39.479
x2,-6.7172,6.734,-0.998,0.321,-20.087,6.653
x3,-11.0700,5.220,-2.121,0.037,-21.435,-0.705
x4,-112.6656,28.828,-3.908,0.000,-169.903,-55.428
x5,-44.2405,31.917,-1.386,0.169,-107.612,19.131
x6,-1.9513,6.449,-0.303,0.763,-14.757,10.854

0,1,2,3
Omnibus:,1.637,Durbin-Watson:,2.007
Prob(Omnibus):,0.441,Jarque-Bera (JB):,1.234
Skew:,-0.264,Prob(JB):,0.539
Kurtosis:,3.131,Cond. No.,12.0


In [18]:
# Time DataGenerator.from_dict on d[5] ('big_dataset', shape [50000, 1000]).
# %timeit repeats the call, so this cell runs far longer than one generation
# (the recorded run reports 7 runs of 1 loop each).
dg = DataGenerator(123)
print(d[5]['name'])
%timeit dg.from_dict(d[5])

big_dataset
1.78 s ± 19.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [19]:
# Time DataGenerator.from_dict on d[6] ('bigger_dataset', shape [100000, 5000]).
# NOTE: slow cell - the recorded run took ~47 s per loop over 7 runs,
# i.e. several minutes in total.
dg = DataGenerator(123)
print(d[6]['name'])
%timeit dg.from_dict(d[6])

bigger_dataset
47.4 s ± 4.06 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
