In [1]:
import random
import pandas as pd
from statsmodels.formula.api import ols

# 模拟数据

In [2]:
def generate_sample_data(n:int, p_c:float, delta:float, seed:int=20200820) -> pd.DataFrame:
    """Simulate a two-arm experiment with a binary outcome.

    Each arm receives ``n`` independent Bernoulli draws: the control arm
    succeeds with probability ``p_c`` and the treatment arm with
    ``p_c + delta``.

    Args:
        n: Number of observations per arm (the result has ``2 * n`` rows).
        p_c: Success probability in the control arm.
        delta: Lift added to ``p_c`` to obtain the treatment-arm probability.
        seed: Seed passed to ``random.seed``. The default is the fixed seed
            this function has always used, so existing three-argument calls
            keep producing the same draws; pass another value to obtain an
            independent replicate.

    Returns:
        A DataFrame with columns ``d`` (0 = control, 1 = treatment) and
        ``y`` (0/1 outcome). Control rows come first and the index runs
        0..2n-1 without duplicates.

    Note:
        The module-level ``random`` generator is reseeded on every call, so
        any other code drawing from ``random`` afterwards continues from
        this seeded state.
    """
    random.seed(seed)
    p_t = p_c + delta

    def draw(p: float) -> list:
        # random.random() is uniform on [0.0, 1.0); the strict `<` means
        # p = 0 never yields a success and p = 1 always does.
        return [int(random.random() < p) for _ in range(n)]

    # Control is drawn before treatment so the random stream is consumed in
    # a fixed order for a given seed.
    control_group = pd.DataFrame({'d': [0] * n, 'y': draw(p_c)})
    treatment_group = pd.DataFrame({'d': [1] * n, 'y': draw(p_t)})

    # ignore_index avoids the duplicated 0..n-1 labels a plain concat yields.
    return pd.concat([control_group, treatment_group], ignore_index=True)

# 回归分析

In [11]:
# Baseline: 5100 observations per group, control rate 0.15, true lift 0.01.
ols(
    formula='y~d',
    data=generate_sample_data(n=5100, p_c=0.15, delta=0.01),
).fit().summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.001
Model:,OLS,Adj. R-squared:,0.001
Method:,Least Squares,F-statistic:,7.772
Date:,"Fri, 04 Sep 2020",Prob (F-statistic):,0.00531
Time:,11:02:04,Log-Likelihood:,-3810.7
No. Observations:,10200,AIC:,7625.0
Df Residuals:,10198,BIC:,7640.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.1349,0.005,27.400,0.000,0.125,0.145
d,0.0194,0.007,2.788,0.005,0.006,0.033

0,1,2,3
Omnibus:,3540.138,Durbin-Watson:,1.98
Prob(Omnibus):,0.0,Jarque-Bera (JB):,8768.158
Skew:,2.019,Prob(JB):,0.0
Kurtosis:,5.081,Cond. No.,2.62


In [4]:
# With a smaller sample (1000 per group) the point estimate changes little,
# but it is no longer statistically significant.
ols(
    formula='y~d',
    data=generate_sample_data(n=1000, p_c=0.15, delta=0.01),
).fit().summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.0
Model:,OLS,Adj. R-squared:,-0.0
Method:,Least Squares,F-statistic:,0.3962
Date:,"Fri, 04 Sep 2020",Prob (F-statistic):,0.529
Time:,11:00:51,Log-Likelihood:,-766.97
No. Observations:,2000,AIC:,1538.0
Df Residuals:,1998,BIC:,1549.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.1430,0.011,12.729,0.000,0.121,0.165
d,0.0100,0.016,0.629,0.529,-0.021,0.041

0,1,2,3
Omnibus:,682.452,Durbin-Watson:,1.983
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1619.688
Skew:,1.982,Prob(JB):,0.0
Kurtosis:,4.93,Cond. No.,2.62


In [14]:
# Holding the sample size fixed, shrinking the expected lift we want to
# estimate (delta = 0.001) leaves the estimate non-significant.
ols(
    formula='y~d',
    data=generate_sample_data(n=5100, p_c=0.15, delta=0.001),
).fit().summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.0
Model:,OLS,Adj. R-squared:,0.0
Method:,Least Squares,F-statistic:,2.372
Date:,"Fri, 04 Sep 2020",Prob (F-statistic):,0.124
Time:,11:02:46,Log-Likelihood:,-3681.6
No. Observations:,10200,AIC:,7367.0
Df Residuals:,10198,BIC:,7382.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.1349,0.005,27.749,0.000,0.125,0.144
d,0.0106,0.007,1.540,0.124,-0.003,0.024

0,1,2,3
Omnibus:,3686.258,Durbin-Watson:,1.983
Prob(Omnibus):,0.0,Jarque-Bera (JB):,9536.175
Skew:,2.072,Prob(JB):,0.0
Kurtosis:,5.295,Cond. No.,2.62
