# Caml API Usage

In [1]:
import os
import sys

# Make PySpark workers and the PySpark driver use the same interpreter that is
# running this notebook.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

# Names of the synthetic datasets this notebook can build.
datasets = [
    "partially_linear_simple",
    "fully_heterogenous",
    "partially_linear_constant",
    "dowhy_linear",
]
# Dataframe backends this notebook can convert the data to.
backends = ["pandas", "pyspark", "polars"]

# Active selection: index 0 -> "pandas", index 3 -> "dowhy_linear".
df_backend, dataset = backends[0], datasets[3]

## Synthetic Data

In [2]:
from caml.extensions.synthetic_data import (
    make_partially_linear_dataset_simple,
    make_fully_heterogeneous_dataset,
    make_partially_linear_dataset_constant,
    make_dowhy_linear_dataset,
)

# Build the synthetic dataset selected by `dataset`. Every generator call below
# is unpacked into (df, true_cates, true_ate).
if dataset == "partially_linear_simple":
    df, true_cates, true_ate = make_partially_linear_dataset_simple(
        n_obs=5000,
        n_confounders=5,
        dim_heterogeneity=2,
        binary_treatment=True,
        seed=None,
    )
    df["true_cates"] = true_cates
elif dataset == "fully_heterogenous":
    df, true_cates, true_ate = make_fully_heterogeneous_dataset(
        n_obs=5000,
        n_confounders=50,
        theta=4.0,
        seed=None,
    )
    df["true_cates"] = true_cates
elif dataset == "partially_linear_constant":
    df, true_cates, true_ate = make_partially_linear_dataset_constant(
        n_obs=5000,
        ate=4.0,
        n_confounders=5,
        dgp="make_plr_CCDDHNR2018",  # make_plr_turrell2018
        seed=None,
    )
    df["true_cates"] = true_cates
elif dataset == "dowhy_linear":
    df, true_cates, true_ate = make_dowhy_linear_dataset(
        beta=2.0,
        n_obs=5000,
        n_confounders=0,
        n_discrete_confounders=0,
        n_effect_modifiers=4,
        n_discrete_effect_modifiers=4,
        n_treatments=1,
        binary_treatment=True,
        categorical_treatment=False,
        binary_outcome=False,
        seed=0,
    )

    # true_cates is looked up by keys "d1", "d2", ...; a list-valued entry
    # contributes two columns (suffix _1 / _2), any other entry one column.
    for i in range(1, len(true_cates) + 1):
        treatment_cates = true_cates[f"d{i}"]
        if isinstance(treatment_cates, list):
            df[f"true_cate_d{i}_1"] = treatment_cates[0]
            df[f"true_cate_d{i}_2"] = treatment_cates[1]
        else:
            df[f"true_cate_d{i}"] = treatment_cates
else:
    # Fail here with a clear message instead of a NameError on `df` below.
    raise ValueError(f"Unknown dataset {dataset!r}; expected one of {datasets}.")


# Use the row index as the unique identifier column.
df["uuid"] = df.index

In [3]:
# Optional backends are imported independently so that a missing polars install
# does not also skip the SparkSession import (and vice versa).
try:
    import polars as pl
except ImportError:
    pl = None  # polars backend unavailable

try:
    from pyspark.sql import SparkSession
except ImportError:
    SparkSession = None  # pyspark backend unavailable

# Stays None unless the pyspark backend is selected.
spark = None

# Convert `df` to the selected backend.
if df_backend == "polars":
    if pl is None:
        raise ImportError("df_backend='polars' requires the polars package.")
    df = pl.from_pandas(df)
elif df_backend == "pandas":
    pass  # no conversion is performed for the pandas backend
elif df_backend == "pyspark":
    if SparkSession is None:
        raise ImportError("df_backend='pyspark' requires the pyspark package.")
    # Single-core local session with one shuffle partition.
    spark = (
        SparkSession.builder.master("local[1]")
        .appName("local-tests")
        .config("spark.executor.cores", "1")
        .config("spark.executor.instances", "1")
        .config("spark.sql.shuffle.partitions", "1")
        .getOrCreate()
    )
    df = spark.createDataFrame(df)
else:
    raise ValueError(
        f"Unsupported df_backend {df_backend!r}; expected one of {backends}."
    )

In [4]:
# One-hot encode X0, X1, X2 and X3 (pandas backend only), dropping the first
# level of each column.
import pandas as pd

# Encode only the raw columns that are still present. After a first run they
# have been replaced by dummy columns (X0_1, X0_2, ...), so re-running this
# cell would otherwise raise a KeyError.
columns_to_encode = [col for col in ["X0", "X1", "X2", "X3"] if col in df.columns]
if columns_to_encode:
    df = pd.get_dummies(df, columns=columns_to_encode, drop_first=True)

## Core API

### CamlCATE

#### Class Instantiation

In [5]:
from caml import CamlCATE

# Every column whose name contains "X" or "W" is passed as a feature/confounder:
# the "X" matches first, followed by the "W" matches.
x_named_cols = [col for col in df.columns if "X" in col]
w_named_cols = [col for col in df.columns if "W" in col]

caml = CamlCATE(
    df=df,
    Y="y",
    T="d1",
    X=x_named_cols + w_named_cols,
    uuid="uuid",
    discrete_treatment=True,
    discrete_outcome=False,
    seed=0,
    verbose=1,
)

In [6]:
# Print the string representation of the configured CamlCATE instance.
print(caml)

Data Backend: pandas
No. of Observations: 5000
UUID: uuid
Outcome Variable: y
Discrete Outcome: False
Treatment Variable: d1
Discrete Treatment: True
Features/Confounders: ['X0_1', 'X0_2', 'X0_3', 'X1_1', 'X1_2', 'X1_3', 'X2_1', 'X2_2', 'X2_3', 'X3_1', 'X3_2', 'X3_3']
Random Seed: 0



#### Nuisance Function AutoML

In [7]:
# Run the AutoML search for the outcome (Y) and treatment (T) nuisance models,
# each with a FLAML time budget of 30, without Ray or Spark.
nuisance_automl_options = dict(
    flaml_Y_kwargs=dict(time_budget=30),
    flaml_T_kwargs=dict(time_budget=30),
    use_ray=False,
    use_spark=False,
)
caml.auto_nuisance_functions(**nuisance_automl_options)

[flaml.automl.logger: 09-23 15:17:02] {1680} INFO - task = regression
[flaml.automl.logger: 09-23 15:17:02] {1691} INFO - Evaluation method: cv
[flaml.automl.logger: 09-23 15:17:02] {1789} INFO - Minimizing error metric: mse
[flaml.automl.logger: 09-23 15:17:02] {1901} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth']
[flaml.automl.logger: 09-23 15:17:02] {2219} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 09-23 15:17:02] {2345} INFO - Estimated sufficient time budget=631s. Estimated necessary time budget=4s.
[flaml.automl.logger: 09-23 15:17:02] {2392} INFO -  at 0.1s,	estimator lgbm's best error=15.9359,	best estimator lgbm's best error=15.9359
[flaml.automl.logger: 09-23 15:17:02] {2219} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 09-23 15:17:02] {2392} INFO -  at 0.1s,	estimator lgbm's best error=15.8603,	best estimator lgbm's best error=15.8603
[flaml.automl.logger: 09-23 15:17:02] {2219} INFO -

#### Fit and ensemble CATE models

In [8]:
# Only LinearDML is fitted in this run. Other candidate names, kept from the
# commented-out entries of this list (availability not verified here):
#   NonParamDML, DML-Lasso3d, CausalForestDML, XLearner,
#   DomainAdaptationLearner, SLearner, TLearner, DRLearner,
#   LinearDRLearner, ForestDRLearner
cate_models_to_fit = ["LinearDML"]

caml.fit_validator(
    subset_cate_models=cate_models_to_fit,
    rscorer_kwargs={},
    use_ray=False,
    ray_remote_func_options_kwargs={},
    sample_fraction=1,
    n_jobs=-1,
)

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
A column-vector y was passed when a 1d array was expected. Please cha

In [12]:
# NOTE(review): no assignment to `model_final` is visible above this cell; the
# first one appears in the later LinearDML cell. On a fresh kernel run
# top-to-bottom this would raise NameError -- confirm the intended cell order.
model_final.summary()

0,1,2,3,4,5,6
,point_estimate,stderr,zstat,pvalue,ci_lower,ci_upper
X0_1,0.88,0.082,10.723,0.0,0.719,1.04
X0_2,1.602,0.081,19.761,0.0,1.443,1.761
X0_3,2.27,0.091,24.947,0.0,2.092,2.448
X1_1,0.526,0.083,6.361,0.0,0.364,0.688
X1_2,1.247,0.085,14.722,0.0,1.081,1.413
X1_3,1.787,0.087,20.54,0.0,1.616,1.957
X2_1,0.967,0.078,12.446,0.0,0.815,1.12
X2_2,2.122,0.088,24.124,0.0,1.95,2.295
X2_3,3.343,0.083,40.416,0.0,3.181,3.505

0,1,2,3,4,5,6
,point_estimate,stderr,zstat,pvalue,ci_lower,ci_upper
cate_intercept,1.866,0.146,12.801,0.0,1.58,2.152


In [16]:
# NOTE(review): `model_final` is not assigned anywhere above this cell (see the
# later LinearDML cell), so this depends on out-of-order execution.
# NOTE(review): this passes `caml._X` directly, whereas the later cells pass
# `caml._X.execute()` / `df[caml.X]` -- confirm which form is intended.
model_final.ate_inference(caml._X)

mean_point,stderr_mean,zstat,pvalue,ci_mean_lower,ci_mean_upper
7.449,0.029,253.606,0.0,7.391,7.507

std_point,pct_point_lower,pct_point_upper
2.17,3.271,11.712

stderr_point,ci_point_lower,ci_point_upper
2.17,3.281,11.647


In [17]:
# Display the true ATE returned by the synthetic data generator, for comparison
# with the estimate above.
true_ate

{'d1': 7.499120156616214}

In [27]:
from econml.dml import LinearDML

# Fit a standalone LinearDML that reuses the nuisance models stored on `caml`.
model_final = LinearDML(
    model_y=caml.model_Y_X,
    model_t=caml.model_T_X,
    discrete_treatment=True,
    discrete_outcome=False,
    cv=3,
)

# Pull the outcome, treatment and feature data out of `caml`
# (presumably .execute() materializes a lazy table -- TODO confirm).
# The outcome is flattened to a 1-D array.
outcome_values = caml._Y.execute().to_numpy().ravel()
treatment_values = caml._T.execute()
feature_values = caml._X.execute()

model_final.fit(
    Y=outcome_values,
    T=treatment_values,
    X=feature_values,
    cache_values=True,
)

<econml.dml.dml.LinearDML at 0x7fd4543e7cd0>

In [28]:
# Show the coefficient and intercept summary tables of the fitted `model_final`.
model_final.summary()

0,1,2,3,4,5,6
,point_estimate,stderr,zstat,pvalue,ci_lower,ci_upper
X0_1,0.836,0.065,12.927,0.0,0.709,0.962
X0_2,1.599,0.064,24.813,0.0,1.473,1.725
X0_3,2.242,0.071,31.445,0.0,2.102,2.381
X1_1,0.527,0.064,8.234,0.0,0.402,0.653
X1_2,1.144,0.065,17.638,0.0,1.017,1.272
X1_3,1.724,0.069,25.01,0.0,1.589,1.859
X2_1,1.069,0.064,16.661,0.0,0.943,1.195
X2_2,2.163,0.067,32.094,0.0,2.031,2.295
X2_3,3.34,0.064,52.023,0.0,3.214,3.465

0,1,2,3,4,5,6
,point_estimate,stderr,zstat,pvalue,ci_lower,ci_upper
cate_intercept,2.006,0.114,17.636,0.0,1.783,2.229


In [29]:
# ATE inference over all rows, using the feature columns listed in caml.X.
model_final.ate_inference(
    X=df.loc[:, caml.X],
)

mean_point,stderr_mean,zstat,pvalue,ci_mean_lower,ci_mean_upper
7.489,0.023,327.343,0.0,7.444,7.534

std_point,pct_point_lower,pct_point_upper
2.121,3.369,11.666

stderr_point,ci_point_lower,ci_point_upper
2.121,3.405,11.588


In [42]:
# ATE inference restricted to the rows where X2_2 == 1.
model_final.ate_inference(
    X=df.loc[df["X2_2"] == 1, caml.X],
)

mean_point,stderr_mean,zstat,pvalue,ci_mean_lower,ci_mean_upper
7.982,0.048,166.065,0.0,7.888,8.076

std_point,pct_point_lower,pct_point_upper
1.714,4.696,11.132

stderr_point,ci_point_lower,ci_point_upper
1.715,4.597,11.152


In [47]:
# Store the first two entries of the model's residuals_ on df as the outcome
# and treatment residual columns.
fitted_residuals = model_final.residuals_
df["y_resid"] = fitted_residuals[0]
df["T_resid"] = fitted_residuals[1]

In [48]:
from statsmodels.formula.api import ols

# Regress the outcome residuals on the treatment residuals with no intercept
# (the "-1" term), using every row of df.
residual_formula = "y_resid ~ -1 + T_resid"
model = ols(residual_formula, data=df).fit()
model.summary()

0,1,2,3
Dep. Variable:,y_resid,R-squared (uncentered):,0.888
Model:,OLS,Adj. R-squared (uncentered):,0.887
Method:,Least Squares,F-statistic:,39440.0
Date:,"Mon, 23 Sep 2024",Prob (F-statistic):,0.0
Time:,15:28:37,Log-Likelihood:,-8531.3
No. Observations:,5000,AIC:,17060.0
Df Residuals:,4999,BIC:,17070.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
T_resid,7.4880,0.038,198.589,0.000,7.414,7.562

0,1,2,3
Omnibus:,102.629,Durbin-Watson:,1.988
Prob(Omnibus):,0.0,Jarque-Bera (JB):,165.41
Skew:,-0.194,Prob(JB):,1.21e-36
Kurtosis:,3.802,Cond. No.,1.0


In [41]:
from statsmodels.formula.api import ols

# Same residual-on-residual regression, restricted to rows where X2_2 == 1.
x2_2_rows = df["X2_2"] == 1
model = ols(
    "y_resid ~ -1 + T_resid",
    data=df.loc[x2_2_rows, ["y_resid", "T_resid"]],
).fit()
model.summary()

0,1,2,3
Dep. Variable:,y_resid,R-squared (uncentered):,0.919
Model:,OLS,Adj. R-squared (uncentered):,0.919
Method:,Least Squares,F-statistic:,14170.0
Date:,"Mon, 23 Sep 2024",Prob (F-statistic):,0.0
Time:,15:25:47,Log-Likelihood:,-1985.0
No. Observations:,1250,AIC:,3972.0
Df Residuals:,1249,BIC:,3977.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
T_resid,7.9780,0.067,119.022,0.000,7.846,8.110

0,1,2,3
Omnibus:,47.854,Durbin-Watson:,1.797
Prob(Omnibus):,0.0,Jarque-Bera (JB):,129.163
Skew:,-0.093,Prob(JB):,8.97e-29
Kurtosis:,4.564,Cond. No.,1.0


#### CATE Validation

In [None]:
# Run CATE validation with no explicit estimator and print the full report.
validation_results = caml.validate(
    print_full_report=True,
    estimator=None,
)

#### Refit best estimator on full dataset

In [None]:
# Refit the selected estimator (per the section heading: on the full dataset).
caml.fit_final()

In [None]:
# Display the final estimator object held by `caml`.
caml.final_estimator

#### Predict CATEs

In [None]:
## "Out of sample" predictions: score `df` and keep the returned frame.

df_predictions = caml.predict(
    out_of_sample_uuid="uuid",
    out_of_sample_df=df,
    join_predictions=True,
    return_predictions=False,
)

# Spark DataFrames are shown with .show(); every other backend is printed.
if df_backend != "pyspark":
    print(df_predictions)
else:
    df_predictions.show()

In [None]:
## Append to internal dataframe (no out-of-sample frame or uuid is supplied)

caml.predict(
    join_predictions=True,
    return_predictions=False,
    out_of_sample_df=None,
    out_of_sample_uuid=None,
)

caml.dataframe

#### CATE Rank Ordering

In [None]:
## "Out of sample" predictions: rank-order the frame produced by caml.predict

df_rank_ordered = caml.rank_order(
    treatment_category=1,
    out_of_sample_df=df_predictions,
    join_rank_order=True,
    return_rank_order=False,
)

df_rank_ordered

In [None]:
## Append to internal dataframe (no out-of-sample frame is supplied)

caml.rank_order(
    treatment_category=1,
    out_of_sample_df=None,
    join_rank_order=True,
    return_rank_order=False,
)

caml.dataframe

#### CATE Visualization/Summary

In [None]:
# Summarize treatment category 1 on the rank-ordered out-of-sample frame.
cate_summary = caml.summarize(
    treatment_category=1,
    out_of_sample_df=df_rank_ordered,
)

cate_summary

In [None]:
# Summarize treatment category 1 with no out-of-sample frame supplied.
cate_summary = caml.summarize(
    treatment_category=1,
    out_of_sample_df=None,
)

cate_summary

In [None]:
# Display the true ATE from the synthetic data generator for comparison with
# the summary above.
true_ate

#### Access my dataframe, estimator object, and get string representation of class

In [None]:
# Display the dataframe held by `caml`.
caml.dataframe

In [None]:
from econml.score import EnsembleCateEstimator

# This estimator object is the one to pickle for optimized inference.
final_estimator = caml.final_estimator

# A single estimator is printed directly with its input names; for an ensemble,
# each member in _cate_models is printed the same way.
if not isinstance(final_estimator, EnsembleCateEstimator):
    print(final_estimator)
    print(final_estimator._input_names)
else:
    for model in final_estimator._cate_models:
        print(model)
        print(model._input_names)

In [None]:
# Print the string representation of the CamlCATE instance again.
print(caml)

# Plots

In [23]:
from caml.extensions.plots import (cate_histogram_plot, 
                                   cate_true_vs_estimated_plot, 
                                   cate_line_plot)

In [None]:
# Plot the true CATEs for d1 against the predicted CATEs.
cate_true_vs_estimated_plot(
    true_cates=caml.dataframe["true_cate_d1"],
    estimated_cates=caml.dataframe["cate_predictions_1"],
)

In [None]:
# Histogram of the predicted CATEs only.
cate_histogram_plot(
    estimated_cates=caml.dataframe["cate_predictions_1"],
)

In [None]:
# Histogram of the predicted CATEs together with the true CATEs for d1.
cate_histogram_plot(
    estimated_cates=caml.dataframe["cate_predictions_1"],
    true_cates=caml.dataframe["true_cate_d1"],
)

In [None]:
# Line plot of the predicted CATEs with window=30.
cate_line_plot(
    estimated_cates=caml.dataframe["cate_predictions_1"],
    window=30,
)

In [None]:
# Line plot of the predicted and true CATEs for d1 with window=30.
cate_line_plot(
    estimated_cates=caml.dataframe["cate_predictions_1"],
    true_cates=caml.dataframe["true_cate_d1"],
    window=30,
)