In [1]:
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import fastreg.linear as frl
import fastreg.general as frg
import fastreg.testing as frt
%matplotlib inline

### Generate Data

In [2]:
# Build the test dataset with fastreg's testing helper; the explicit seed
# keeps the generated frame reproducible across runs. Every later cell reads
# its columns (x1, x2, id1, id2, y0, y, p) from `data`.
data = frt.dataset(
    N=5_000_000,
    K1=10,
    K2=100,
    seed=89320432,
)
# Preview the first rows to show the available columns.
data.head()

Unnamed: 0,one,id1,id2,x1,x2,yhat0,yhat,y0,y,Ep0,Ep,p0,p,pz,Eb0,Eb,b0,b
0,1,4,15,-0.373497,0.847802,-0.054538,0.028462,-0.643401,-0.948844,0.946922,1.028871,2,0,0,0.486369,0.507115,1,1
1,1,8,30,0.533385,-0.505209,0.218989,0.384989,1.245903,1.12213,1.244818,1.469598,1,2,2,0.55453,0.595076,1,1
2,1,8,24,-1.403897,0.738269,-0.694684,-0.558684,-0.406558,-0.621899,0.499232,0.571961,0,2,2,0.332992,0.363852,0,0
3,1,8,74,-1.150902,0.978757,-0.49479,-0.10879,0.129366,-1.258796,0.609699,0.896919,0,1,1,0.378766,0.472829,0,1
4,1,6,45,1.456634,2.104154,1.294811,1.531811,2.023731,2.275766,3.650306,4.626548,4,0,0,0.78496,0.822271,1,1


### Normal OLS

In [4]:
# Baseline: regress y0 on x1 and x2 with the statsmodels formula API and show
# only the fitted coefficients; %time reports how long the fit takes.
%time smf.ols('y0 ~ x1 + x2', data=data).fit().params

CPU times: user 2.85 s, sys: 1.88 s, total: 4.74 s
Wall time: 1.64 s


Intercept    0.000533
x1           0.599798
x2           0.199904
dtype: float64

In [5]:
# Same regression (y0 on x1, x2) through fastreg's linear OLS, timed for
# comparison with the statsmodels baseline. The result is a coefficient table.
%time frl.ols(y='y0', x=['x1', 'x2'], data=data)

CPU times: user 551 ms, sys: 538 ms, total: 1.09 s
Wall time: 457 ms


Unnamed: 0,coeff,stderr,low95,high95,pvalue
one,0.000533,0.000447,-0.000344,0.00141,0.116722
x1,0.599798,0.000447,0.598921,0.600675,0.0
x2,0.199904,0.000447,0.199027,0.20078,0.0


### Sparse OLS

In [6]:
# Regress y on x1, x2 with id1 and id2 supplied through `fe`; the result table
# then carries one coefficient row per id1/id2 level alongside x1 and x2.
%time frl.ols(y='y', x=['x1', 'x2'], fe=['id1', 'id2'], data=data)

CPU times: user 5.59 s, sys: 1.59 s, total: 7.18 s
Wall time: 6.79 s


Unnamed: 0,coeff,stderr,low95,high95,pvalue
one,0.004118,0.004655,-0.005007,0.013242,0.188213
x1,0.600189,0.000447,0.599312,0.601065,0.000000
x2,0.199648,0.000447,0.198772,0.200524,0.000000
id1=1,-0.001416,0.002001,-0.005337,0.002505,0.239536
id1=2,0.001605,0.001999,-0.002313,0.005524,0.211000
...,...,...,...,...,...
id2=95,0.477899,0.006317,0.465517,0.490282,0.000000
id2=96,0.473080,0.006309,0.460715,0.485445,0.000000
id2=97,0.482564,0.006305,0.470207,0.494921,0.000000
id2=98,0.481532,0.006305,0.469174,0.493891,0.000000


In [7]:
# Same outcome and regressors, but id1/id2 go through `absorb` instead of `fe`,
# so the result table lists only `one`, x1 and x2.
# NOTE(review): a tuple is passed here whereas `fe` received a list; confirm
# against the fastreg docs whether the tuple form has a different meaning.
%time frl.ols(y='y', x=['x1', 'x2'], absorb=('id1', 'id2'), data=data)

CPU times: user 12.9 s, sys: 8 s, total: 20.9 s
Wall time: 6.87 s


Unnamed: 0,coeff,stderr,low95,high95,pvalue
one,0.25654,3.148679e-07,0.256539,0.256541,0.0
x1,0.600186,0.0004517726,0.599301,0.601072,0.0
x2,0.199648,0.0004703627,0.198726,0.20057,0.0


### Poisson

In [None]:
# Poisson model of p on x1 and x2 via fastreg.general, timed with %time.
# The fit prints its loss at each iteration.
%time frg.poisson(y='p', x=['x1', 'x2'], data=data)

  0: loss = 0.5313539505004883
  1: loss = 0.5309544801712036
  2: loss = 0.5309544801712036


In [None]:
# Poisson model of p on x1 and x2 with id1 and id2 supplied through `fe`,
# timed with %time. The fit prints its loss at each iteration.
%time frg.poisson(y='p', x=['x1', 'x2'], fe=['id1', 'id2'], data=data)

  0: loss = 0.516063928604126
  1: loss = 0.5147619843482971
  2: loss = 0.5147106647491455
