In [1]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

import fastreg as fr
from fastreg import I, R, C

### Generate Data

In [2]:
# Simulate two 1M-row benchmark frames that differ only in K2 (100 vs 10,000)
# and in their seed; every other setting is shared.
models = ['linear', 'poisson', 'logit']
shared_settings = dict(N=1_000_000, K1=10, models=models)

data = fr.dataset(K2=100, seed=89320432, **shared_settings)
data_wide = fr.dataset(K2=10_000, seed=89320433, **shared_settings)

# Preview the narrow frame.
data.head()

Unnamed: 0,x1,x2,yhat0,yhat,id1,id2,y0,y,Eb0,Eb,b0,b,Ep0,Ep,p0,p
0,-1.429225,1.830295,0.76941,2.14941,H,68,1.791671,1.58551,0.683393,0.895614,1,1,2.158492,8.579792,3,5
1,0.687153,0.795373,0.783369,1.403369,C,42,0.295552,3.163529,0.686406,0.802718,1,0,2.188835,4.068886,2,0
2,0.764353,-1.060225,-0.306829,-0.036829,C,7,0.381637,-0.639362,0.423889,0.490794,0,1,0.735777,0.963841,0,2
3,-0.309887,-0.777701,-0.459587,0.800413,D,96,0.143218,1.139854,0.387084,0.690063,1,1,0.631545,2.226461,0,1
4,-0.616042,-0.752266,-0.536172,0.623828,H,46,-0.468395,1.462211,0.369078,0.651089,0,0,0.584983,1.866057,1,2


In [3]:
# Separate frame for the statsmodels runs: identical to `data` except that
# id2 is stored as strings. `astype` with a column mapping returns a new
# frame, so `data` itself is left untouched.
data1 = data.astype({'id2': str})

### Normal OLS

In [4]:
# Baseline timing: fastreg OLS of y0 on an intercept (I) plus the two
# continuous columns x1 and x2 (wrapped in R), with no categorical terms.
%time fr.ols(y=R.y0, x=I+R.x1+R.x2, data=data)

CPU times: user 44.1 ms, sys: 83.6 ms, total: 128 ms
Wall time: 22.6 ms


y0,coeff,stderr,low95,high95,pvalue
I,0.101859,0.001,0.0999,0.103818,0.0
x1,0.301647,0.001,0.299688,0.303606,0.0
x2,0.599408,0.000999,0.59745,0.601366,0.0


In [5]:
# Reference timing: the same y0 ~ intercept + x1 + x2 specification fitted with
# the statsmodels formula API. Only the coefficient vector (.params) is shown.
%time smf.ols('y0 ~ 1 + x1 + x2', data=data).fit().params

CPU times: user 377 ms, sys: 1.18 s, total: 1.56 s
Wall time: 131 ms


Intercept    0.101859
x1           0.301647
x2           0.599408
dtype: float64

In [6]:
# Add both categorical columns (id1, id2 wrapped in C) to the design; the
# result table reports one row per non-baseline level (e.g. id1=B, id2=98).
%time fr.ols(y=R.y, x=I+R.x1+R.x2+C.id1+C.id2, data=data)

CPU times: user 402 ms, sys: 710 ms, total: 1.11 s
Wall time: 315 ms


y,coeff,stderr,low95,high95,pvalue
I,0.112463,0.010413,0.092054,0.132872,0.0
x1,0.299557,0.001001,0.297595,0.301518,0.0
x2,0.601840,0.001000,0.599880,0.603800,0.0
id1=B,0.104734,0.004476,0.095962,0.113507,0.0
id1=C,0.201965,0.004474,0.193197,0.210734,0.0
...,...,...,...,...,...
id2=95,0.937461,0.014102,0.909821,0.965101,0.0
id2=96,0.956624,0.014109,0.928970,0.984278,0.0
id2=97,0.960951,0.014099,0.933318,0.988584,0.0
id2=98,0.993398,0.014132,0.965699,1.021097,0.0


In [7]:
# statsmodels counterpart of the categorical regression. It is fitted on
# `data1`, where id2 was cast to str, presumably so the formula parser expands
# it into level dummies (id2[T.*]) instead of treating it as numeric.
%time smf.ols('y ~ 1 + x1 + x2 + id1 + id2', data=data1).fit().params

CPU times: user 1min 35s, sys: 21.5 s, total: 1min 57s
Wall time: 11.1 s


Intercept    0.112463
id1[T.B]     0.104734
id1[T.C]     0.201965
id1[T.D]     0.300212
id1[T.E]     0.403094
               ...   
id2[T.97]    0.960951
id2[T.98]    0.993398
id2[T.99]    0.988670
x1           0.299557
x2           0.601840
Length: 111, dtype: float64

### High Dimensional

In [8]:
# Same specification on `data_wide` (generated with K2=10_000), with id2
# entered as an ordinary categorical term in x.
%time fr.ols(y=R.y, x=I+R.x1+R.x2+C.id1+C.id2, data=data_wide)

CPU times: user 3.98 s, sys: 2.25 s, total: 6.23 s
Wall time: 4.46 s


y,coeff,stderr,low95,high95,pvalue
I,0.143877,0.103699,-0.059369,0.347123,1.653041e-01
x1,0.301630,0.001004,0.299662,0.303597,0.000000e+00
x2,0.599284,0.001006,0.597313,0.601255,0.000000e+00
id1=B,0.097642,0.004499,0.088824,0.106461,0.000000e+00
id1=C,0.189008,0.004491,0.180207,0.197810,0.000000e+00
...,...,...,...,...,...
id2=9995,1.045182,0.146196,0.758643,1.331722,8.730794e-13
id2=9996,1.029893,0.146587,0.742588,1.317198,2.128075e-12
id2=9997,0.892911,0.142021,0.614554,1.171267,3.233234e-10
id2=9998,0.961372,0.144705,0.677755,1.244989,3.059886e-11


In [9]:
# Move id2 out of x and pass it through hdfe= instead. The recorded table still
# lists the id2 levels with the same estimates as the all-in-x fit on
# data_wide, while the recorded wall time is roughly an order of magnitude lower.
# NOTE(review): the exact algorithm behind hdfe= is not visible here; see the fastreg docs.
%time fr.ols(y=R.y, x=I+R.x1+R.x2+C.id1, hdfe=C.id2, data=data_wide)

CPU times: user 375 ms, sys: 42.3 ms, total: 418 ms
Wall time: 370 ms


y,coeff,stderr,low95,high95,pvalue
I,0.143877,0.103699,-0.059369,0.347123,1.653041e-01
x1,0.301630,0.001004,0.299662,0.303597,0.000000e+00
x2,0.599284,0.001006,0.597313,0.601255,0.000000e+00
id1=B,0.097642,0.004499,0.088824,0.106461,0.000000e+00
id1=C,0.189008,0.004491,0.180207,0.197810,0.000000e+00
...,...,...,...,...,...
id2=9995,1.045182,0.146196,0.758643,1.331722,8.730794e-13
id2=9996,1.029893,0.146587,0.742588,1.317198,2.128075e-12
id2=9997,0.892911,0.142021,0.614554,1.171267,3.233234e-10
id2=9998,0.961372,0.144705,0.677755,1.244989,3.059886e-11


In [10]:
# Pass id2 through absorb= instead: the recorded table has no id2 rows at all,
# only I, x1, x2 and the id1 levels. The slope estimates match the hdfe= run,
# but the intercept and standard errors differ.
# NOTE(review): the cause of the differing stderr (e.g. a different covariance
# estimator under absorb=) is not visible here; confirm in the fastreg docs.
%time fr.ols(y=R.y, x=I+R.x1+R.x2+C.id1, absorb=C.id2, data=data_wide)

CPU times: user 766 ms, sys: 1.8 s, total: 2.57 s
Wall time: 358 ms


y,coeff,stderr,low95,high95,pvalue
I,0.605524,0.003035,0.599576,0.611471,0.0
x1,0.30163,0.000998,0.299674,0.303585,0.0
x2,0.599284,0.001014,0.597297,0.60127,0.0
id1=B,0.097642,0.004531,0.088762,0.106523,0.0
id1=C,0.189008,0.004545,0.1801,0.197917,0.0
id1=D,0.29516,0.004506,0.286328,0.303991,0.0
id1=E,0.402802,0.00451,0.393963,0.411641,0.0
id1=F,0.495409,0.004493,0.486602,0.504215,0.0
id1=G,0.59691,0.004467,0.588155,0.605665,0.0
id1=H,0.694326,0.00449,0.685526,0.703125,0.0


### Poisson

In [11]:
# Poisson regression of the count column p0 on an intercept, x1 and x2.
# The fit is iterative and prints a progress line per logged step.
%time fr.poisson(y=R.p0, x=I+R.x1+R.x2, data=data)

[  0] ℓ=-0.89794, g=0.75832, Δβ=0.12965, Δℓ=inf, μR=0.12830, μC=nan
[  7] ℓ=-0.62108, g=0.00013, Δβ=0.00000, Δℓ=0.00000, μR=0.33567, μC=nan
CPU times: user 1.04 s, sys: 151 ms, total: 1.19 s
Wall time: 1.1 s


p0,coeff,stderr,low95,high95,pvalue
I,0.103618,0.001024,0.101611,0.105625,0.0
x1,0.300842,0.000853,0.299171,0.302513,0.0
x2,0.602552,0.000847,0.600892,0.604213,0.0


In [12]:
# Poisson regression of p with the categorical id1 added to the design.
%time fr.poisson(y=R.p, x=I+R.x1+R.x2+C.id1, data=data)

[  0] ℓ=-0.39264, g=2.72065, Δβ=0.13072, Δℓ=inf, μR=0.13035, μC=0.12990
[ 10] ℓ=2.39599, g=0.00131, Δβ=0.00722, Δℓ=0.00006, μR=0.52002, μC=0.47746
[ 18] ℓ=2.39607, g=0.00032, Δβ=0.00092, Δℓ=0.00000, μR=0.51334, μC=0.49867
CPU times: user 2.9 s, sys: 261 ms, total: 3.16 s
Wall time: 2.54 s


p,coeff,stderr,low95,high95,pvalue
I,0.638103,0.001829,0.634518,0.641689,0.0
x1,0.300814,0.000408,0.300015,0.301614,0.0
x2,0.601101,0.000394,0.600329,0.601873,0.0
id1=B,0.099735,0.002459,0.094915,0.104555,0.0
id1=C,0.197611,0.002395,0.192916,0.202305,0.0
id1=D,0.298519,0.002334,0.293945,0.303092,0.0
id1=E,0.397953,0.00228,0.393485,0.402421,0.0
id1=F,0.496801,0.002224,0.492442,0.50116,0.0
id1=G,0.596155,0.002177,0.591889,0.600422,0.0
id1=H,0.70274,0.002128,0.69857,0.70691,0.0


### Logit

In [13]:
# Logit regression of the 0/1 column b0 on an intercept, x1 and x2.
# The fit is iterative and prints a progress line per logged step.
%time fr.logit(y=R.b0, x=I+R.x1+R.x2, data=data)

[  0] ℓ=-0.67709, g=0.18025, Δβ=0.12969, Δℓ=inf, μR=0.11553, μC=nan
[  9] ℓ=-0.60641, g=0.00003, Δβ=0.00001, Δℓ=0.00000, μR=0.39733, μC=nan
CPU times: user 1.02 s, sys: 114 ms, total: 1.13 s
Wall time: 814 ms


b0,coeff,stderr,low95,high95,pvalue
I,0.846877,0.002203,0.84256,0.851194,0.0
x1,0.120391,0.002197,0.116085,0.124698,0.0
x2,0.22471,0.002211,0.220377,0.229043,0.0


In [14]:
# Logit regression of b with both categoricals (id1 and id2) in the design.
%time fr.logit(y=R.b, x=I+R.x1+R.x2+C.id1+C.id2, data=data)

[  0] ℓ=-0.64660, g=0.21251, Δβ=0.12810, Δℓ=inf, μR=0.10345, μC=0.12493
[ 10] ℓ=-0.54531, g=0.00046, Δβ=0.01270, Δℓ=0.00003, μR=0.28754, μC=0.31096
[ 20] ℓ=-0.54521, g=0.00017, Δβ=0.00376, Δℓ=0.00000, μR=0.30825, μC=0.27702
[ 30] ℓ=-0.54520, g=0.00009, Δβ=0.00134, Δℓ=0.00000, μR=0.31536, μC=0.26547
[ 34] ℓ=-0.54520, g=0.00009, Δβ=0.00098, Δℓ=0.00000, μR=0.31683, μC=0.26311
CPU times: user 7.89 s, sys: 1.83 s, total: 9.72 s
Wall time: 5.36 s


b,coeff,stderr,low95,high95,pvalue
I,0.669476,0.023546,0.623327,0.715626,0.000000e+00
x1,0.095070,0.002355,0.090455,0.099685,0.000000e+00
x2,0.185934,0.002349,0.181330,0.190539,0.000000e+00
id1=B,0.065680,0.010242,0.045606,0.085754,1.428291e-10
id1=C,0.088547,0.010252,0.068455,0.108640,0.000000e+00
...,...,...,...,...,...
id2=95,0.476568,0.033170,0.411557,0.541580,0.000000e+00
id2=96,0.452166,0.033020,0.387449,0.516884,0.000000e+00
id2=97,0.476293,0.033138,0.411343,0.541243,0.000000e+00
id2=98,0.530590,0.033530,0.464872,0.596308,0.000000e+00


### Ultra Wide

In [15]:
# Hand-built "ultra wide" sample: two uniform regressors, a 10-level block id
# (id1) and a 10,000-level random id (id2).
# All draws come from one seeded generator so that Restart & Run All rebuilds
# exactly the same frame (the global np.random state used before was unseeded).
SEED = 89320434
rng = np.random.default_rng(SEED)

N = 2_000_000
df = pd.DataFrame({
    'x1': rng.random(N),
    'x2': rng.random(N),
    # Contiguous blocks of N/10 rows labelled 1..10; the 1e-7 nudge makes
    # row 0 round up to 1 instead of staying at 0.
    'id1': np.ceil(10*np.arange(N)/N+1e-7).astype(int),
    # Upper bound is exclusive, so labels are drawn from 1..10000.
    'id2': rng.integers(1, 10001, size=N)
})
# Outcome: linear in x1 and x2, level effects equal to log10 of each id label,
# plus standard-normal noise.
df['y'] = 1 + 2*df['x1'] + 3*df['x2'] + np.log10(df['id1']) + np.log10(df['id2']) + rng.standard_normal(N)
# Sanity check: number of distinct levels per id column.
print(df[['id1', 'id2']].nunique())

id1       10
id2    10000
dtype: int64


In [16]:
# Fit the hand-built frame with id2 passed through hdfe=. The data were
# generated with slopes 2 (x1) and 3 (x2), so the recorded estimates
# (about 2.00 and 3.01) serve as a correctness check.
%time fr.ols(y=R.y, x=I+R.x1+R.x2+C.id1, hdfe=C.id2, data=df)

CPU times: user 636 ms, sys: 17.4 ms, total: 653 ms
Wall time: 596 ms


y,coeff,stderr,low95,high95,pvalue
I,1.063118,0.074093,0.917897,1.208338,0.0
x1,2.003521,0.002453,1.998714,2.008329,0.0
x2,3.006054,0.002452,3.001248,3.010859,0.0
id1=2,0.297162,0.003167,0.290955,0.303368,0.0
id1=3,0.477778,0.003167,0.471572,0.483985,0.0
...,...,...,...,...,...
id2=9996,3.843339,0.101050,3.645285,4.041393,0.0
id2=9997,3.781365,0.105299,3.574983,3.987746,0.0
id2=9998,3.913102,0.104287,3.708703,4.117500,0.0
id2=9999,4.037430,0.103209,3.835143,4.239716,0.0
