In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

import fastreg.general as frg
import fastreg.linear as frl
import fastreg.testing as frt
%matplotlib inline

### Generate Data

In [3]:
# Build the benchmark dataset with fastreg's testing helper. The seed is fixed,
# so re-running this cell is expected to regenerate the same data.
# NOTE(review): K1/K2 presumably set the number of id1/id2 levels — confirm
# against the fastreg.testing.dataset signature.
data = frt.dataset(
    N=5_000_000,
    K1=10,
    K2=100,
    seed=89320432,
)

# Preview the first rows (rich display of the last expression).
data.head()

Unnamed: 0,id1,id2,x1,x2,yhat0,yhat,y0,y,Eb0,Eb,...,Ep0,Ep,p0,p,pz0,pz,nb0,nb,nbz0,nbz
0,4,15,-0.373497,0.847802,-0.054538,0.028462,-0.643401,-0.948844,0.486369,0.507115,...,0.946922,1.028871,2,2,2,2,1,0,1,0
1,8,30,0.533385,-0.505209,0.218989,0.384989,1.245903,1.12213,0.55453,0.595076,...,1.244818,1.469598,0,2,0,2,3,2,0,2
2,8,24,-1.403897,0.738269,-0.694684,-0.558684,-0.406558,-0.621899,0.332992,0.363852,...,0.499232,0.571961,2,1,0,1,0,0,0,0
3,8,74,-1.150902,0.978757,-0.49479,-0.10879,0.129366,-1.258796,0.378766,0.472829,...,0.609699,0.896919,2,1,2,1,0,2,0,2
4,6,45,1.456634,2.104154,1.294811,1.531811,2.023731,2.275766,0.78496,0.822271,...,3.650306,4.626548,5,3,5,3,2,4,2,4


### Normal OLS

In [4]:
# Baseline: statsmodels formula-API OLS of y0 on x1 and x2. %time covers the
# whole expression (model construction + fit); .params displays only the
# estimated coefficients.
%time smf.ols('y0 ~ x1 + x2', data=data).fit().params

  data_klasses = (pandas.Series, pandas.DataFrame, pandas.Panel)


CPU times: user 2.86 s, sys: 1.92 s, total: 4.78 s
Wall time: 1.67 s


Intercept    0.000533
x1           0.599798
x2           0.199904
dtype: float64

In [5]:
# Same specification (y0 on x1 and x2) through fastreg's linear OLS, timed for
# comparison with the statsmodels run. The result is a table with coeff,
# stderr, 95% bounds and p-value per regressor.
%time frl.ols(y='y0', x=['x1', 'x2'], data=data)

CPU times: user 575 ms, sys: 501 ms, total: 1.08 s
Wall time: 452 ms


Unnamed: 0,coeff,stderr,low95,high95,pvalue
one,0.000533,0.000447,-0.000344,0.00141,0.233444
x1,0.599798,0.000447,0.598921,0.600675,0.0
x2,0.199904,0.000447,0.199027,0.20078,0.0


### Sparse OLS

In [6]:
# OLS of y on x1 and x2 with id1 and id2 passed as `fe`. The result table
# reports a separate coefficient row for individual id1/id2 levels
# (e.g. id1=1, id2=95) alongside the x1/x2 estimates.
%time frl.ols(y='y', x=['x1', 'x2'], fe=['id1', 'id2'], data=data)

CPU times: user 5.52 s, sys: 1.4 s, total: 6.92 s
Wall time: 6.59 s


Unnamed: 0,coeff,stderr,low95,high95,pvalue
one,0.004118,0.004655,-0.005007,0.013242,0.376426
x1,0.600189,0.000447,0.599312,0.601065,0.000000
x2,0.199648,0.000447,0.198772,0.200524,0.000000
id1=1,-0.001416,0.002001,-0.005337,0.002505,0.479072
id1=2,0.001605,0.001999,-0.002313,0.005524,0.421999
...,...,...,...,...,...
id2=95,0.477899,0.006317,0.465517,0.490282,0.000000
id2=96,0.473080,0.006309,0.460715,0.485445,0.000000
id2=97,0.482564,0.006305,0.470207,0.494921,0.000000
id2=98,0.481532,0.006305,0.469174,0.493891,0.000000


In [8]:
# Same regressors, but id1 and id2 are passed via `absorb` instead of `fe`;
# only the rows one, x1 and x2 appear in the result table.
# NOTE(review): presumably `absorb` partials out the group effects rather than
# estimating them, and the much larger standard errors here (~0.017 vs ~0.0004
# in the `fe` run) suggest a different error structure — confirm against the
# fastreg documentation.
%time frl.ols(y='y', x=['x1', 'x2'], absorb=('id1', 'id2'), data=data)

CPU times: user 3.42 s, sys: 1.17 s, total: 4.6 s
Wall time: 3.64 s


Unnamed: 0,coeff,stderr,low95,high95,pvalue
one,0.337992,0.02734,0.284407,0.391577,0.0
x1,0.621514,0.016955,0.588282,0.654746,0.0
x2,0.221667,0.016877,0.188588,0.254746,0.0


### Poisson

In [9]:
# Poisson regression of the count column p on x1 and x2 using fastreg's general
# module. The call prints a loss value per iteration before the timing line.
%time frg.poisson(y='p', x=['x1', 'x2'], data=data)

  0: loss = 0.5313790440559387
  1: loss = 0.5309708714485168
  2: loss = 0.5309708714485168
CPU times: user 7.79 s, sys: 2.24 s, total: 10 s
Wall time: 8.24 s


Unnamed: 0,coeff,stderr,low95,high95,pvalue
one,0.266192,0.000419,0.26537,0.267014,0.0
x1,0.607038,0.000354,0.606344,0.607732,0.0
x2,0.17685,0.000354,0.176156,0.177544,0.0


In [10]:
# Poisson regression of p on x1 and x2 with id1 and id2 passed as `fe`; the
# result table includes a coefficient row for individual id1/id2 levels.
%time frg.poisson(y='p', x=['x1', 'x2'], fe=['id1', 'id2'], data=data)

  0: loss = 0.5161331295967102
  1: loss = 0.5148211717605591
  2: loss = 0.5147688984870911
CPU times: user 16.5 s, sys: 3.03 s, total: 19.5 s
Wall time: 16.1 s


Unnamed: 0,coeff,stderr,low95,high95,pvalue
one,0.189902,0.003804,0.182446,0.197358,0.000000e+00
x1,0.607719,0.000354,0.607026,0.608413,0.000000e+00
x2,0.180563,0.000354,0.179870,0.181257,0.000000e+00
id1=1,-0.009688,0.001594,-0.012812,-0.006563,1.226917e-09
id1=2,-0.004432,0.001591,-0.007550,-0.001313,5.346953e-03
...,...,...,...,...,...
id2=95,0.296423,0.004821,0.286973,0.305872,0.000000e+00
id2=96,0.285234,0.004832,0.275764,0.294704,0.000000e+00
id2=97,0.295199,0.004819,0.285754,0.304643,0.000000e+00
id2=98,0.295738,0.004819,0.286292,0.305183,0.000000e+00


### Ultra Wide

In [33]:
# Synthetic data for the "ultra wide" benchmark: two regressors, two
# high-cardinality group identifiers, and an outcome built from known
# coefficients (1 on x1, 2 on x2) plus group-dependent shifts and unit noise.
N = 2_000_000

# Fixed seed so the draws below (and therefore the estimates) are reproducible
# under Restart & Run All.
np.random.seed(89320432)

df = pd.DataFrame({
    'x1': np.random.rand(N),
    'x2': np.random.rand(N),
    # Consecutive runs of 200 rows share an id: rows 0-199 -> 1, 200-399 -> 2, ...
    # The 1e-7 nudge pushes exact multiples of 200 up to the next integer.
    # Builtin `int` replaces the `np.int` alias (deprecated in NumPy 1.20,
    # removed in 1.24); the resulting dtype is the same.
    'id1': np.ceil(np.arange(N)/200+1e-7).astype(int),
    # Uniform integer ids in 1..10000 (randint's upper bound is exclusive).
    'id2': np.random.randint(1, 10001, size=N)
})

# Outcome: linear in x1/x2, plus log(id1) and log10(id2) group shifts, plus
# standard-normal noise.
df['y'] = df['x1'] + 2*df['x2'] + np.log(df['id1']) + np.log10(df['id2']) + np.random.randn(N)

In [37]:
# OLS of y on x1 and x2 on the wide synthetic frame, with id1 and id2 passed
# via `absorb`. The data were generated with coefficients 1 (x1) and 2 (x2),
# which the estimates in the result table can be checked against.
%time frl.ols(y='y', x=['x1', 'x2'], absorb=['id1', 'id2'], data=df)

CPU times: user 4.44 s, sys: 2.14 s, total: 6.58 s
Wall time: 5.67 s


Unnamed: 0,coeff,stderr,low95,high95,pvalue
one,11.775893,0.00187,11.772228,11.779559,0.0
x1,1.000463,0.002449,0.995662,1.005264,0.0
x2,2.001758,0.002451,1.996954,2.006562,0.0
