# Chap 2.1, 2.2

効果検証入門のRコードをPythonで再現

## 2.1.5 Rによるメールマーケティングデータの分析 (回帰編)

In [1]:
df = pd.read_csv('./Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv')

In [2]:
df_filtered = df.query('segment != "Womens E-Mail"').assign(
    treatment=lambda d: (d['segment'] == 'Mens E-Mail').astype(int)
)

In [3]:
_df = df_filtered
df_biased = _df.assign(
    obs_rate_c=np.where((_df['history'] > 300) | (_df['recency'] < 6) | (_df['channel'] == 'Multichannel'), 0.5, 1),
    obs_rate_t=np.where((_df['history'] > 300) | (_df['recency'] < 6) | (_df['channel'] == 'Multichannel'), 1, 0.5),
    random_number=np.random.random(len(_df))
).query('(treatment == 0 and random_number < obs_rate_c) or (treatment == 1 and random_number < obs_rate_t)')

In [4]:
df_biased.groupby('treatment').agg({
    'conversion': ['count', 'sum', 'mean'],
    'spend': 'mean'
})

Unnamed: 0_level_0,conversion,conversion,conversion,spend
Unnamed: 0_level_1,count,sum,mean,mean
treatment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,14855,72,0.004847,0.617095
1,17197,221,0.012851,1.484236


In [5]:
import statsmodels.api as sm

In [6]:
_df = df_biased.copy()
_df['intercept'] = 1
results = sm.OLS(exog=_df[['treatment', 'history', 'intercept']], endog=_df['spend']).fit()

In [7]:
results.summary()

0,1,2,3
Dep. Variable:,spend,R-squared:,0.001
Model:,OLS,Adj. R-squared:,0.001
Method:,Least Squares,F-statistic:,18.0
Date:,"Tue, 11 Feb 2020",Prob (F-statistic):,1.54e-08
Time:,02:58:59,Log-Likelihood:,-133430.0
No. Observations:,32052,AIC:,266900.0
Df Residuals:,32049,BIC:,266900.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
treatment,0.7863,0.176,4.472,0.000,0.442,1.131
history,0.0011,0.000,3.348,0.001,0.000,0.002
intercept,0.3844,0.145,2.647,0.008,0.100,0.669

0,1,2,3
Omnibus:,70598.759,Durbin-Watson:,2.002
Prob(Omnibus):,0.0,Jarque-Bera (JB):,335678874.247
Skew:,20.512,Prob(JB):,0.0
Kurtosis:,502.668,Cond. No.,829.0


## 2.2.1 共変量の追加による影響

In [8]:
# RCTデータ
_df = df_filtered.copy()
_df['intercept'] = 1
rct_results = sm.OLS(exog=_df[['treatment', 'intercept']], endog=_df['spend']).fit()

In [9]:
rct_results.summary()

0,1,2,3
Dep. Variable:,spend,R-squared:,0.001
Model:,OLS,Adj. R-squared:,0.001
Method:,Least Squares,F-statistic:,28.09
Date:,"Tue, 11 Feb 2020",Prob (F-statistic):,1.16e-07
Time:,02:58:59,Log-Likelihood:,-175840.0
No. Observations:,42613,AIC:,351700.0
Df Residuals:,42611,BIC:,351700.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
treatment,0.7698,0.145,5.300,0.000,0.485,1.055
intercept,0.6528,0.103,6.356,0.000,0.451,0.854

0,1,2,3
Omnibus:,94877.86,Durbin-Watson:,2.0
Prob(Omnibus):,0.0,Jarque-Bera (JB):,502270597.344
Skew:,21.023,Prob(JB):,0.0
Kurtosis:,533.203,Cond. No.,2.62


In [10]:
# BIasedデータ
_df = df_biased.copy()
_df['intercept'] = 1
norct_results = sm.OLS(exog=_df[['treatment', 'intercept']], endog=_df['spend']).fit()

In [11]:
norct_results.summary()

0,1,2,3
Dep. Variable:,spend,R-squared:,0.001
Model:,OLS,Adj. R-squared:,0.001
Method:,Least Squares,F-statistic:,24.79
Date:,"Tue, 11 Feb 2020",Prob (F-statistic):,6.44e-07
Time:,02:58:59,Log-Likelihood:,-133430.0
No. Observations:,32052,AIC:,266900.0
Df Residuals:,32050,BIC:,266900.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
treatment,0.8671,0.174,4.978,0.000,0.526,1.209
intercept,0.6171,0.128,4.837,0.000,0.367,0.867

0,1,2,3
Omnibus:,70601.378,Durbin-Watson:,2.001
Prob(Omnibus):,0.0,Jarque-Bera (JB):,335622603.755
Skew:,20.514,Prob(JB):,0.0
Kurtosis:,502.625,Cond. No.,2.71


In [14]:
# BIasedデータ
_df = df_biased.copy()[['treatment', 'channel', 'recency', 'history']]
_df['intercept'] = 1
_df = _df.join(pd.get_dummies(_df['channel'])).drop('channel', axis=1)
norct_with_covariate_results = sm.OLS(exog=_df, endog=df_biased['spend']).fit()

In [15]:
norct_with_covariate_results.summary()

0,1,2,3
Dep. Variable:,spend,R-squared:,0.001
Model:,OLS,Adj. R-squared:,0.001
Method:,Least Squares,F-statistic:,8.942
Date:,"Tue, 11 Feb 2020",Prob (F-statistic):,1.68e-08
Time:,03:01:59,Log-Likelihood:,-133420.0
No. Observations:,32052,AIC:,266900.0
Df Residuals:,32046,BIC:,266900.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
treatment,0.6872,0.180,3.822,0.000,0.335,1.040
recency,-0.0699,0.026,-2.677,0.007,-0.121,-0.019
history,0.0010,0.000,2.517,0.012,0.000,0.002
intercept,0.6579,0.194,3.390,0.001,0.278,1.038
Multichannel,0.1979,0.226,0.874,0.382,-0.246,0.642
Phone,0.1139,0.131,0.868,0.385,-0.143,0.371
Web,0.3460,0.131,2.640,0.008,0.089,0.603

0,1,2,3
Omnibus:,70589.809,Durbin-Watson:,2.002
Prob(Omnibus):,0.0,Jarque-Bera (JB):,335431500.801
Skew:,20.506,Prob(JB):,0.0
Kurtosis:,502.483,Cond. No.,1.16e+18


## 2.2.7 Post treatment bias

In [16]:
# BIasedデータ
_df = df_biased.copy()[['treatment', 'visit', 'channel', 'recency', 'history']]
_df['intercept'] = 1
_df = _df.join(pd.get_dummies(_df['channel'])).drop('channel', axis=1)
post_treatment_bias_results = sm.OLS(exog=_df, endog=df_biased['spend']).fit()

In [17]:
post_treatment_bias_results.summary()

0,1,2,3
Dep. Variable:,spend,R-squared:,0.028
Model:,OLS,Adj. R-squared:,0.028
Method:,Least Squares,F-statistic:,153.4
Date:,"Tue, 11 Feb 2020",Prob (F-statistic):,9.06e-193
Time:,21:20:01,Log-Likelihood:,-132990.0
No. Observations:,32052,AIC:,266000.0
Df Residuals:,32045,BIC:,266100.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
treatment,0.1119,0.178,0.627,0.531,-0.238,0.462
visit,7.2298,0.244,29.571,0.000,6.751,7.709
recency,-0.0231,0.026,-0.895,0.371,-0.074,0.027
history,0.0004,0.000,1.088,0.276,-0.000,0.001
intercept,-0.0131,0.193,-0.068,0.946,-0.391,0.365
Multichannel,-0.0103,0.224,-0.046,0.963,-0.448,0.428
Phone,-0.0032,0.130,-0.025,0.980,-0.257,0.251
Web,0.0004,0.130,0.003,0.998,-0.254,0.255

0,1,2,3
Omnibus:,70134.185,Durbin-Watson:,2.002
Prob(Omnibus):,0.0,Jarque-Bera (JB):,329697223.059
Skew:,20.184,Prob(JB):,0.0
Kurtosis:,498.219,Cond. No.,5.31e+17
