In [1]:
import numpy as np
import pandas as pd
from statsmodels.stats.weightstats import ttest_ind

In [2]:
# Load the Kevin Hillstrom MineThatData e-mail analytics challenge CSV directly from its URL.
DATA_URL = "http://minethatdata.com/Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv"
df = pd.read_csv(DATA_URL)

In [3]:
# Preview the first five rows to check the available columns and value formats.
df.head(n=5)

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend
0,10,2) $100 - $200,142.44,1,0,Surburban,0,Phone,Womens E-Mail,0,0,0.0
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,No E-Mail,0,0,0.0
2,7,2) $100 - $200,180.65,0,1,Surburban,1,Web,Womens E-Mail,0,0,0.0
3,9,5) $500 - $750,675.83,1,0,Rural,1,Web,Mens E-Mail,0,0,0.0
4,2,1) $0 - $100,45.34,1,0,Urban,0,Web,Womens E-Mail,0,0,0.0


In [4]:
# List the distinct campaign arms found in the `segment` column (np.unique returns them sorted).
segment_values = df["segment"].to_numpy()
np.unique(segment_values)

array(['Mens E-Mail', 'No E-Mail', 'Womens E-Mail'], dtype=object)

In [5]:
# Total number of rows (one per customer record) in the raw data.
df.shape[0]

64000

### RCTデータに対するウェルチのt検定

In [6]:
# Drop the women's e-mail arm, keeping both the men's e-mail recipients and the
# no-e-mail control group. `.copy()` gives an independent frame so adding columns
# below does not touch `df`.
not_womens_mail = df["segment"] != "Womens E-Mail"
male_df = df.loc[not_womens_mail].copy()

# Binary treatment indicator: 1 = men's e-mail was sent, 0 = no e-mail.
got_mens_mail = male_df["segment"] == "Mens E-Mail"
male_df["treatment"] = got_mens_mail.astype(int)

In [7]:
# Preview the filtered frame, including the new `treatment` column.
male_df.head(n=5)

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend,treatment
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,No E-Mail,0,0,0.0,0
3,9,5) $500 - $750,675.83,1,0,Rural,1,Web,Mens E-Mail,0,0,0.0,1
8,9,5) $500 - $750,675.07,1,1,Rural,1,Phone,Mens E-Mail,0,0,0.0,1
13,2,2) $100 - $200,101.64,0,1,Urban,0,Web,Mens E-Mail,1,0,0.0,1
14,4,3) $200 - $350,241.42,0,1,Rural,1,Multichannel,No E-Mail,0,0,0.0,0


In [8]:
# Conversion rate in the treated and untreated groups
# (`conversion` is 1 when a sale occurred and 0 otherwise, so the mean is the rate).
male_df.groupby("treatment")["conversion"].mean()

treatment
0    0.005726
1    0.012531
Name: conversion, dtype: float64

In [9]:
# Average sales amount (`spend`) in the treated and untreated groups.
male_df.groupby("treatment")["spend"].mean()

treatment
0    0.652789
1    1.422617
Name: spend, dtype: float64

In [10]:
# Number of rows in each treatment group, to check how balanced the two arms are.
arm_groups = male_df.groupby("treatment")
arm_groups.size()

treatment
0    21306
1    21307
dtype: int64

In [11]:
# Spend values for the treated (y1) and control (y0) groups as NumPy arrays.
# (`ttest_ind` is imported in the notebook's top import cell rather than here,
# so every dependency is visible in one place.)
y1 = male_df.query("treatment==1").spend.to_numpy()
y0 = male_df.query("treatment==0").spend.to_numpy()

# Compare the two groups' variances (ndarray.var uses ddof=0 by default).
# A large gap between them is the reason to use Welch's unequal-variance t-test
# instead of the pooled-variance version.
print(y1.var(), y0.var())

315.19700505136865 134.2800691039438


In [12]:
# Welch's t-test (unequal variances) on spend: treated vs. control in the randomized data.
# The third return value is not used and is discarded.
t_statistics, p_value, _ = ttest_ind(x1=y1, x2=y0, usevar="unequal")
print(f"t-statistics：{t_statistics}", f"p-value：{p_value}", sep="\n")

t-statistics：5.300140358411662
p-value：1.1638149682255265e-07


### バイアスあり
- 去年の購入額(history)が300より高い、最後の購入からの経過月数(recency)が6より小さい、接触チャンネルが複数ある(channelがMultichannel)、のいずれかを満たすユーザを購入意欲のあるユーザとする
- 購入意欲のあるユーザに介入をしたという状況を作り出す
  - 介入を受けていないユーザ(treatment==0)の中から購入意欲のあるユーザを50%の確率でランダムに削除
  - 介入を受けたユーザ(treatment==1)の中から購入意欲のないユーザを50%の確率でランダムに削除

In [13]:
# Users regarded as having purchase intent: recency below 6, last-year purchase
# amount (history) above 300, or contacted through multiple channels.
is_engaged = (
    (male_df["recency"] < 6)
    | (male_df["history"] > 300)
    | (male_df["channel"] == "Multichannel")
)

# Keep-probability applied to control rows (treatment==0): users with purchase
# intent are subject to the lottery (kept with probability 0.5), the rest always stay.
male_df["obs_rate_c"] = np.where(is_engaged, 0.5, 1)
# Keep-probability applied to treated rows (treatment==1): the mirror image —
# users without purchase intent are subject to the lottery, the rest always stay.
male_df["obs_rate_t"] = np.where(is_engaged, 1, 0.5)

# Fixed seed so the same rows are dropped on every run.
np.random.seed(2)
n = len(male_df)
male_df["random_number"] = np.random.rand(n)

# A row survives when its uniform draw falls below the rate that applies to its arm.
biased_data = male_df.query(" (treatment==0 & random_number<obs_rate_c) | (treatment==1 & random_number<obs_rate_t)")

In [14]:
# Number of rows left after the biased sampling, followed by a preview of the first rows.
print(biased_data.shape[0])
biased_data.head(n=5)

31896


Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend,treatment,obs_rate_c,obs_rate_t,random_number
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,No E-Mail,0,0,0.0,0,0.5,1.0,0.435995
3,9,5) $500 - $750,675.83,1,0,Rural,1,Web,Mens E-Mail,0,0,0.0,1,0.5,1.0,0.025926
8,9,5) $500 - $750,675.07,1,1,Rural,1,Phone,Mens E-Mail,0,0,0.0,1,0.5,1.0,0.549662
13,2,2) $100 - $200,101.64,0,1,Urban,0,Web,Mens E-Mail,1,0,0.0,1,0.5,1.0,0.435322
14,4,3) $200 - $350,241.42,0,1,Rural,1,Multichannel,No E-Mail,0,0,0.0,0,0.5,1.0,0.420368


In [15]:
# Conversion rate per treatment group in the biased sample.
biased_data.groupby("treatment")["conversion"].mean()

treatment
0    0.005051
1    0.013278
Name: conversion, dtype: float64

In [16]:
# Average spend per treatment group in the biased sample.
biased_data.groupby("treatment")["spend"].mean()

treatment
0    0.586214
1    1.517082
Name: spend, dtype: float64

In [17]:
# Row count per treatment group in the biased sample (the arms are no longer equal in size).
biased_arm_groups = biased_data.groupby("treatment")
biased_arm_groups.size()

treatment
0    14650
1    17246
dtype: int64

In [18]:
# Spend values for the treated and control groups of the biased sample.
y1_biased = biased_data.loc[biased_data["treatment"] == 1, "spend"].to_numpy()
y0_biased = biased_data.loc[biased_data["treatment"] == 0, "spend"].to_numpy()

# Same Welch (unequal-variance) t-test as on the randomized data, now on the biased sample.
t_statistics, p_value, _ = ttest_ind(x1=y1_biased, x2=y0_biased, usevar="unequal")
print(f"t-statistics：{t_statistics}", f"p-value：{p_value}", sep="\n")

t-statistics：5.548968955129425
p-value：2.899122164482464e-08
