In [1]:
import os

import numpy as np
import pandas as pd
import torch
from dowhy import CausalModel
from econml.dml import CausalForestDML
from sklearn.linear_model import LassoCV, LinearRegression
from sklearn.model_selection import train_test_split

from load_data import ArtificialData

In [2]:
# Load the synthetic datasets. Per the original notes, mode=0 selects the RCT
# setting with a constant treatment effect; the two datasets differ only in seed.
dataset_kwargs = dict(mode=0, N=1000)
ad_train_D = ArtificialData(seed=0, **dataset_kwargs)  # training data
ad_test_D = ArtificialData(seed=1, **dataset_kwargs)  # test data

# True ATE on the test data: mean of the per-unit difference mu1 - mu0.
effect_true_test = ad_test_D.mu1 - ad_test_D.mu0
ate_true = torch.mean(effect_true_test)

shape check has been done.
shape check has been done.


### Causal ForestによるITE, ATEの推定

In [3]:
# Train the causal forest.
SEED = 0  # fixed seed so the train/test split and the forest are reproducible

covariates = ['x0', 'x1']
treatment = 'treatment'
outcome = 'yf'
label = covariates + [treatment, outcome]
# Column-stack covariates, treatment and outcome into one frame, in `label` order.
df = pd.DataFrame(np.c_[ad_train_D.X, ad_train_D.t, ad_train_D.yf], columns=label)
model = CausalModel(data=df, treatment=treatment, outcome=outcome, common_causes=covariates, instruments=None, effect_modifiers=None)
# model.view_model()

# Without random_state the split (and therefore every reported metric) changes
# on each Restart & Run All.
train_data, test_data = train_test_split(df, test_size=0.2, random_state=SEED)

X_tr, Z_tr, Y_tr = train_data[covariates], train_data[treatment], train_data[outcome]
causal_forest = CausalForestDML(
    criterion='het',
    n_estimators=10000,
    min_samples_leaf=10,
    max_depth=None,
    max_samples=0.5,  # TODO: open question - what exactly does this control? (presumably the per-tree subsample fraction; confirm in the econml docs)
    discrete_treatment=False,  # TODO: open question - NOTE(review): effects are later evaluated for T0=0 vs T1=1, so the treatment looks binary; check whether True is more appropriate
    honest=True,  # TODO: open question - a causal tree should always be honest, so why is this an explicit option?
    inference=True,  # TODO: open question - what is the estimator used for when inference is disabled?
    cv=10,
    model_t=LassoCV(),
    model_y=LassoCV(),
    random_state=SEED,  # seeds the forest so repeated runs give the same fit
)
causal_forest.fit(Y_tr, Z_tr, X=X_tr, W=None)  # TODO: open question - the role of W is unclear

<econml.dml.causal_forest.CausalForestDML at 0x7ff85477f908>

In [4]:
# Steps shared by the within-sample and out-of-sample evaluations

def loss_pehe(X, mu1, mu0, model=None):
    """Mean squared error between the estimated and the true per-unit effect.

    Args:
        X: Covariates with one row per unit; forwarded to ``model.effect``.
        mu1: Tensor such that ``mu1 - mu0`` is treated as the true effect.
        mu0: See ``mu1``.
        model: Fitted estimator exposing ``effect(X=..., T0=..., T1=...)`` and
            returning a NumPy array. When omitted, the notebook-level
            ``causal_forest`` is used.

    Returns:
        0-dim torch tensor holding mean((estimated effect - true effect) ** 2).
    """
    if model is None:
        model = causal_forest  # fall back to the forest fitted earlier in the notebook
    n_units = X.shape[0]
    # Estimated effect of moving every unit from T=0 to T=1.
    effect_hat = torch.from_numpy(
        model.effect(X=X, T1=torch.ones(n_units), T0=torch.zeros(n_units))
    )
    effect_true = mu1 - mu0
    # Flatten both sides so an (n,) estimate and an (n, 1) truth are compared
    # element-wise instead of silently broadcasting to an (n, n) matrix.
    squared_error = (effect_hat.reshape(-1) - effect_true.reshape(-1)) ** 2
    return torch.mean(squared_error)

def loss_ate(X, mu1, mu0, model=None):
    """Absolute error between the estimated and the true average effect.

    Args:
        X: Covariates with one row per unit; forwarded to ``model.effect``.
        mu1: Tensor such that ``mu1 - mu0`` is treated as the true effect.
        mu0: See ``mu1``.
        model: Fitted estimator exposing ``effect(X=..., T0=..., T1=...)`` and
            returning a NumPy array. When omitted, the notebook-level
            ``causal_forest`` is used.

    Returns:
        0-dim torch tensor holding |mean(estimated effect) - mean(true effect)|.
    """
    if model is None:
        model = causal_forest  # fall back to the forest fitted earlier in the notebook
    n_units = X.shape[0]
    # Estimated effect of moving every unit from T=0 to T=1.
    effect_hat = torch.from_numpy(
        model.effect(X=X, T1=torch.ones(n_units), T0=torch.zeros(n_units))
    )
    ate_hat = torch.mean(effect_hat)  # estimated ATE
    # Named ate_target so the notebook-level `ate_true` is not shadowed.
    ate_target = torch.mean(mu1 - mu0)  # true ATE
    return torch.abs(ate_hat - ate_target)

def evaluation(D, fname):
    """Compute, print and save the PEHE and ATE-error metrics for a dataset.

    Args:
        D: Object exposing ``X``, ``mu1`` and ``mu0``; these are forwarded to
            ``loss_pehe`` and ``loss_ate``.
        fname: Destination handed to ``torch.save``. When it is a filesystem
            path, its parent directory is created if missing.

    Returns:
        None. The metrics are printed and written to ``fname`` as a dict with
        keys ``'X_test'``, ``'pehe'`` and ``'ate_error'``.
    """
    # Compute \epsilon_{PEHE}
    pehe = loss_pehe(D.X, D.mu1, D.mu0)

    # Compute \epsilon_{ATE}
    ate_error = loss_ate(D.X, D.mu1, D.mu0)

    # Show the results
    print('pehe = ', pehe.item())
    print('error of ate =', ate_error.item())

    # torch.save does not create missing directories, so on a fresh checkout
    # the save would fail unless the output directory is prepared first.
    if isinstance(fname, (str, os.PathLike)):
        out_dir = os.path.dirname(os.fspath(fname))
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    # Persist the results
    torch.save({
        'X_test': D.X,
        'pehe': pehe,
        'ate_error': ate_error,
    }, fname)

In [5]:
# Within-sample evaluation: score the forest on the dataset it was trained from.
# NOTE(review): ad_train_D also contains the 20% of rows held out by
# train_test_split, so this is not strictly within-sample - confirm intent.
ws_result_path = 'results/rct_causal_forest_WS.pt'
evaluation(ad_train_D, ws_result_path)

pehe =  1.360328161717765e-06
error of ate = 0.000751482794199454


In [6]:
# Out-of-sample evaluation: score the forest on the separately generated test dataset.
oos_result_path = 'results/rct_causal_forest_OoS.pt'
evaluation(ad_test_D, oos_result_path)

pehe =  1.4130081008890856e-06
error of ate = 0.0007625723996250167


### 疑問
* `causal_forest.effect_inference(X=X_test, T1=Z_test)`をした時に返ってくる`InferenceResults`オブジェクトが謎
    * しかし、今は必要ないのでまた気になったら調べる。
* model_y, model_tにLassoを用いているが, CausalTreeでそんなんあったっけ？
    * CausalTreeの内容がまだわかっていないようだな。理解し直しなさい。
    * ITE予測の研究をしないならいらないかもしれないが、教養として
* `causal_forest.marginal~`とかあるけど、何に関して周辺化しているのか、なぜそうしているのかを理解していない。