In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import warnings
# Suppress every warning category for the rest of the session, so warnings
# raised by library calls in the cells below are not displayed.
warnings.filterwarnings("ignore")

In [3]:
# Download the Kevin Hillstrom MineThatData e-mail analytics CSV and preview
# the first rows.
DATA_URL = "http://minethatdata.com/Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv"
df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend
0,10,2) $100 - $200,142.44,1,0,Surburban,0,Phone,Womens E-Mail,0,0,0.0
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,No E-Mail,0,0,0.0
2,7,2) $100 - $200,180.65,0,1,Surburban,1,Web,Womens E-Mail,0,0,0.0
3,9,5) $500 - $750,675.83,1,0,Rural,1,Web,Mens E-Mail,0,0,0.0
4,2,1) $0 - $100,45.34,1,0,Urban,0,Web,Womens E-Mail,0,0,0.0


In [4]:
# One-hot encode `channel` for the later analysis. drop_first=True omits the
# first level, so only the remaining levels become indicator columns.
# dtype=int pins the indicators to 0/1 integers regardless of the pandas
# version's default dummy dtype.
channel_dummies = pd.get_dummies(df["channel"], drop_first=True, dtype=int)

# Drop indicator columns left behind by an earlier run of this cell before
# concatenating, so re-running the cell never appends duplicate columns.
df = pd.concat(
    [df.drop(columns=channel_dummies.columns, errors="ignore"), channel_dummies],
    axis=1,
)
df.head()

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend,Phone,Web
0,10,2) $100 - $200,142.44,1,0,Surburban,0,Phone,Womens E-Mail,0,0,0.0,1,0
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,No E-Mail,0,0,0.0,0,1
2,7,2) $100 - $200,180.65,0,1,Surburban,1,Web,Womens E-Mail,0,0,0.0,0,1
3,9,5) $500 - $750,675.83,1,0,Rural,1,Web,Mens E-Mail,0,0,0.0,0,1
4,2,1) $0 - $100,45.34,1,0,Urban,0,Web,Womens E-Mail,0,0,0.0,0,1


In [5]:
# Build a selection-biased sample from the experiment data.

# Drop the women's e-mail group; among the remaining rows, receiving the
# men's e-mail is the treatment (1) and every other segment is control (0).
male_df = df.query(" segment!= 'Womens E-Mail' ").copy()
male_df["treatment"] = (male_df["segment"] == "Mens E-Mail").astype(int)

# Users regarded as likely buyers: recent purchase, high purchase history,
# or multichannel shoppers.
likely_buyer = (
    (male_df["recency"] < 6)
    | (male_df["history"] > 300)
    | (male_df["channel"] == "Multichannel")
)

# Control rows: likely buyers are kept with probability 0.5, all others always.
male_df["obs_rate_c"] = np.where(likely_buyer, 0.5, 1.0)
# Treated rows: likely buyers are always kept, all others with probability 0.5.
male_df["obs_rate_t"] = np.where(likely_buyer, 1.0, 0.5)

# Fixed seed so the same rows are sampled on every run.
np.random.seed(2)
n = len(male_df)
male_df["random_number"] = np.random.rand(n)

# Keep a row when its uniform draw falls below the observation rate of its
# own group (obs_rate_t for treated rows, obs_rate_c for control rows).
is_treated = male_df["treatment"] == 1
obs_rate = male_df["obs_rate_t"].where(is_treated, male_df["obs_rate_c"])

# .copy() makes biased_data an independent frame, so later cells can add
# columns to it without writing into a slice of male_df.
biased_data = male_df.loc[male_df["random_number"] < obs_rate].copy()

In [6]:
# Target: the 0/1 treatment flag, selected as a 1-D Series. scikit-learn
# expects a 1-D y; a single-column DataFrame is only accepted after an
# implicit reshape that raises a DataConversionWarning.
y = biased_data["treatment"]

# Covariates used to predict treatment assignment.
X = biased_data[["history", "recency", "Phone", "Web"]]
# NOTE(review): LogisticRegression fits its own intercept by default, so this
# constant column looks redundant. It is kept so the design matrix is
# unchanged -- confirm before removing.
X = sm.add_constant(X)  # with intercept column

# Fit the classifier that produces the propensity scores.
model = LogisticRegression(random_state=2).fit(X, y)
ps_score = model.predict_proba(X)[:, 1]  # predicted probability of treatment == 1

In [7]:
# Attach the propensity scores via assign(), which returns a new frame.
# This avoids writing a column into a filter-derived DataFrame (chained
# assignment) and keeps the cell safe to re-run.
biased_data = biased_data.assign(ps_score=ps_score)

# treatment only takes the values 0 and 1, so ~treated selects the control rows.
treated = biased_data["treatment"] == 1

# Inverse-probability weights:
#   treated rows -> 1 / ps_score
#   control rows -> 1 / (1 - ps_score)
weightA = 1 / biased_data.loc[treated, "ps_score"]
weightB = 1 / (1 - biased_data.loc[~treated, "ps_score"])

In [8]:
# Stack the treated and control weights, put them back in index order, and
# convert to a plain NumPy array.
# NOTE(review): this assumes biased_data's rows are themselves in ascending
# index order, so the sorted weights line up with its rows -- verify if
# biased_data is ever reordered.
weights = (
    pd.concat([weightA, weightB])
    .sort_index()
    .to_numpy()
)
weights

array([2.1830264 , 1.98699512, 1.99163407, ..., 2.59923368, 1.89957871,
       1.41485679])

In [9]:
# Weighted least squares of spend on the treatment flag, using the
# inverse-probability weights computed above.
X = sm.add_constant(biased_data[["treatment"]])  # with intercept column
y = biased_data[["spend"]]

wls_model = sm.WLS(endog=y, exog=X, weights=weights).fit()

In [10]:
# Display the fitted WLS regression table; the `treatment` coefficient is the
# weighted estimate of the effect of treatment on spend.
wls_model.summary()

0,1,2,3
Dep. Variable:,spend,R-squared:,0.001
Model:,WLS,Adj. R-squared:,0.001
Method:,Least Squares,F-statistic:,22.31
Date:,"Sat, 13 Aug 2022",Prob (F-statistic):,2.33e-06
Time:,16:06:53,Log-Likelihood:,-132280.0
No. Observations:,31896,AIC:,264600.0
Df Residuals:,31894,BIC:,264600.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.6408,0.119,5.405,0.000,0.408,0.873
treatment,0.7945,0.168,4.724,0.000,0.465,1.124

0,1,2,3
Omnibus:,70984.104,Durbin-Watson:,1.999
Prob(Omnibus):,0.0,Jarque-Bera (JB):,379013487.084
Skew:,20.976,Prob(JB):,0.0
Kurtosis:,535.379,Cond. No.,2.61
