In [None]:
# Import standard libraries for data handling and visualization
import numpy as np  # Numerical computing
import pandas as pd  # Data manipulation
import matplotlib.pyplot as plt  # Plotting
from matplotlib.patches import Rectangle  # Used for adding shapes to plots
import seaborn as sns  # Statistical visualization

# Set seaborn style and font scaling for better visualization aesthetics
sns.set(style="whitegrid", font_scale=1.75)

# Import scientific computing and machine learning libraries
import scipy.special as sp  # Special mathematical functions
import sklearn.datasets as datasets  # Example datasets from sklearn
import sklearn.linear_model as lm  # Linear models (e.g., logistic regression)
import sklearn.ensemble as en  # Ensemble models (e.g., random forests, boosting)
import sklearn.tree as tree  # Decision tree models

# Import module handling
import importlib

#### Data generation packages (custom modules)
import strawman_edge  # Likely contains functions for generating edge-case datasets
import strawman_center  # Likely contains functions for generating center-case datasets
import all_linear  # Likely contains functions for generating fully linear datasets
import all_nonlinear  # Likely contains functions for generating fully nonlinear datasets

#### Main analysis packages
import learn_w as learn  # Custom module for learning methods
import black  # Code formatter for Python

# Re-execute the `learn_w` module (aliased as `learn`) so that edits made to it
# since the kernel started take effect without a kernel restart.
importlib.reload(learn)

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides numerical RuntimeWarnings (e.g. a division by
# zero inside the Bias/Stdev tables below) — consider narrowing the filter.
import warnings
warnings.filterwarnings("ignore")

# Load Jupyter Black extension for auto-formatting Python code in Jupyter Notebook
%load_ext jupyter_black


# Box DGP

In [None]:
# Draw the "box" synthetic dataset from `strawman_edge`.
# - `df` is the frame handed to every estimator below; later cells read its
#   covariate columns (names containing "X") and the Yobs / T / S columns.
# - `Y` carries the potential-outcome columns "Y0" and "Y1".
df, Y = strawman_edge.get_data(n=10_000, seed=0)

# Column names passed to every `learn.*_opt` call in this section.
outcome, treatment, sample = "Yobs", "T", "S"

# Ground-truth unit-level effect: Y1 - Y0.
TE = Y["Y1"] - Y["Y0"]

# Independent copy of `df` that additionally carries the true effect, so the
# estimators never see "TE".
df_true = df.copy(deep=True)
df_true["TE"] = TE


In [None]:
importlib.reload(learn)


def bias_stdev_table(joined, post_estimate=None, post_target_w="w_true"):
    """Build the 2x2 Bias/Stdev table (columns Pre, Post) for one method.

    Parameters
    ----------
    joined : pandas.DataFrame
        Method output joined with the ground truth. Must contain the columns
        "v", "vsq", "S" and "w" plus the joined "TE" and "S_true" columns
        (and "w_true" unless `post_estimate` is supplied).
    post_estimate : float, optional
        Post-selection effect estimate. When omitted it is the mean of "v"
        over rows with S_true == 1 and w_true == 1.
    post_target_w : str
        Column whose value 1 marks the rows of the S_true == 0 group that the
        Post estimate is compared against.

    Returns
    -------
    pandas.DataFrame
        Index ["Bias", "Stdev"], columns ["Pre", "Post"].
    """
    trial_true = joined["S_true"] == 1
    target_true = joined["S_true"] == 0

    if post_estimate is None:
        post_estimate = joined.loc[trial_true & (joined["w_true"] == 1), "v"].mean()

    # Bias: estimate on the S_true == 1 rows minus the mean true effect on the
    # S_true == 0 rows (all rows for Pre, selected rows for Post).
    bias_pre = joined.loc[trial_true, "v"].mean() - joined.loc[target_true, "TE"].mean()
    bias_post = (
        post_estimate
        - joined.loc[target_true & (joined[post_target_w] == 1), "TE"].mean()
    )

    # Stdev: sqrt(sum(vsq) / n^2) over the method's own S == 1 rows, before and
    # after restricting to the rows the method keeps (w == 1).
    trial = joined["S"] == 1
    kept = trial & (joined["w"] == 1)
    stdev_pre = np.sqrt(joined.loc[trial, "vsq"].sum() / int(trial.sum()) ** 2)
    stdev_post = np.sqrt(joined.loc[kept, "vsq"].sum() / int(kept.sum()) ** 2)

    return pd.DataFrame(
        [[bias_pre, bias_post], [stdev_pre, stdev_post]],
        index=["Bias", "Stdev"],
        columns=["Pre", "Post"],
    )


# Covariate columns fed to every fitted selector `f` (names containing "X").
covariate_cols = [col for col in df_true.columns if "X" in col]

# ---- K-means based selection -------------------------------------------------
np.random.seed(42)
D_labels, f, testing_data = learn.kmeans_opt(
    data=df,
    outcome=outcome,
    treatment=treatment,
    sample=sample,
)

# Label every row of the full data with the fitted selector, then line the
# method output up with the ground truth (clashing names get a "_true" suffix).
df_true["w"] = f.predict(df_true[covariate_cols])
D_w_true = D_labels.join(df_true[["TE", "S", "w"]], rsuffix="_true", how="outer")
brute_box_r = bias_stdev_table(D_w_true)

# ---- Linear selection --------------------------------------------------------
importlib.reload(learn)
np.random.seed(42)
D_labels, f, testing_data = learn.linear_opt(
    data=df, outcome=outcome, treatment=treatment, sample=sample, seed=42
)
D_labels["w"] = D_labels["w"].astype(int)

df_true["w"] = f.predict(df_true[covariate_cols])
D_w_true = D_labels.join(df_true[["TE", "S", "w"]], rsuffix="_true", how="outer")
linear_box_r = bias_stdev_table(D_w_true)

# ---- Single-tree selection ---------------------------------------------------
importlib.reload(learn)
np.random.seed(42)
D_labels, f, testing_data = learn.tree_opt(
    data=df, outcome=outcome, treatment=treatment, sample=sample, seed=0
)
D_labels["w"] = D_labels["w"].astype(int)

df_true["w"] = f.predict(df_true[covariate_cols])
D_w_true = D_labels.join(df_true[["TE", "S", "w"]], rsuffix="_true", how="outer")
tree_box_r = bias_stdev_table(D_w_true)

# ---- Forest selection --------------------------------------------------------
importlib.reload(learn)
# NOTE(review): the other DGP sections call np.random.seed(42) right before
# forest_opt; here the call runs on whatever RNG state tree_opt left behind.
# Left as-is so the reported numbers do not change — confirm this is intended.
D_rash, D_forest, w_forest, rashomon_set, f, testing_data = learn.forest_opt(
    data=df,
    outcome=outcome,
    treatment=treatment,
    sample=sample,
    leaf_proba=1,
    num_trees=3000,
    vote_threshold=2 / 5,
)

df_true["w"] = f.predict(df_true[covariate_cols])
D_w_true = D_rash.join(df_true[["TE", "S", "w"]], rsuffix="_true", how="outer")

# Keep only the rows with w == 1 and re-estimate the effect on them.
df_refined = df.join(D_w_true[["w"]])
df_refined = df_refined.loc[df_refined["w"] == 1]

np.random.seed(42)
df_v, pi, pi_m, e_m, testing_data = learn.estimate_dml(
    data=df_refined, outcome="Yobs", treatment="T", sample="S", crossfit=5
)

# For the forest, the Post estimate is the mean of the re-estimated "te"
# column, compared against the S_true == 0 rows with w == 1.
forest_box_r = bias_stdev_table(
    D_w_true, post_estimate=df_v["te"].mean(), post_target_w="w"
)

# Side-by-side table (forest, tree, linear, k-means) emitted as LaTeX.
print(
    pd.concat([forest_box_r, tree_box_r, linear_box_r, brute_box_r], axis=1).to_latex()
)

In [None]:
# Pick up any edits to the learning module before refitting.
importlib.reload(learn)

# Refit the forest-based selector on the box data.
# NOTE(review): no seed is set before this call, so if forest_opt draws from
# the global NumPy RNG the plotted model may differ from the one tabulated in
# the previous cell — confirm.
D_rash, D_forest, w_forest, rashomon_set, f, testing_data = learn.forest_opt(
    data=df,
    outcome=outcome,
    treatment=treatment,
    sample=sample,
    leaf_proba=1,  # presumably a leaf-sampling probability — verify in learn_w
    num_trees=3000,
    vote_threshold=2 / 5,
)

# sqrt(sum(vsq) / (number of S == 0 rows)^2) on the forest's working frame.
# Not used further in this cell.
baseline_loss = np.sqrt(np.sum(D_forest["vsq"]) / ((np.sum((1 - D_forest["S"])) ** 2)))

# ---- Figure 1: the fitted tree `f` -------------------------------------------
fig, ax = plt.subplots(figsize=(10, 5), sharex=True, sharey=True, dpi=600)
tree.plot_tree(
    f,
    filled=True,
    # Pass a plain list: some scikit-learn releases validate `feature_names`
    # as a list and reject a pandas Index.
    feature_names=list(df.drop(columns=[outcome, treatment, sample]).columns),
    ax=ax,
)
fig.tight_layout()
fig.savefig("explain_box.pdf", format="pdf", dpi=600)

# ---- Figure 2: w_opt over the (X0, X1) plane ---------------------------------
fig, ax = plt.subplots(figsize=(5, 5), sharex=True, sharey=True, dpi=600)
sns.scatterplot(
    x="X0",
    y="X1",
    hue="w_opt",
    data=D_rash,
    ax=ax,
    hue_order=[1, 0],  # draw/label w_opt == 1 first
)
ax.legend(title="w", loc="lower left")
fig.tight_layout()
fig.savefig("forest_box.pdf", format="pdf", dpi=600)


# Community DGP

In [None]:
# Regenerate the "community" dataset from a freshly reloaded generator module.
importlib.reload(strawman_center)
df, Y = strawman_center.get_data(n=5_000, seed=0)

# Column names passed to every `learn.*_opt` call in this section.
outcome, treatment, sample = "Yobs", "T", "S"

# Ground-truth unit-level effect, stored on a separate copy of the data.
TE = Y["Y1"] - Y["Y0"]
df_true = df.copy(deep=True)
df_true["TE"] = TE

In [None]:
importlib.reload(learn)


def bias_stdev_table(joined, post_estimate=None, post_target_w="w_true"):
    """Build the 2x2 Bias/Stdev table (columns Pre, Post) for one method.

    Parameters
    ----------
    joined : pandas.DataFrame
        Method output joined with the ground truth. Must contain the columns
        "v", "vsq", "S" and "w" plus the joined "TE" and "S_true" columns
        (and "w_true" unless `post_estimate` is supplied).
    post_estimate : float, optional
        Post-selection effect estimate. When omitted it is the mean of "v"
        over rows with S_true == 1 and w_true == 1.
    post_target_w : str
        Column whose value 1 marks the rows of the S_true == 0 group that the
        Post estimate is compared against.

    Returns
    -------
    pandas.DataFrame
        Index ["Bias", "Stdev"], columns ["Pre", "Post"].
    """
    trial_true = joined["S_true"] == 1
    target_true = joined["S_true"] == 0

    if post_estimate is None:
        post_estimate = joined.loc[trial_true & (joined["w_true"] == 1), "v"].mean()

    # Bias: estimate on the S_true == 1 rows minus the mean true effect on the
    # S_true == 0 rows (all rows for Pre, selected rows for Post).
    bias_pre = joined.loc[trial_true, "v"].mean() - joined.loc[target_true, "TE"].mean()
    bias_post = (
        post_estimate
        - joined.loc[target_true & (joined[post_target_w] == 1), "TE"].mean()
    )

    # Stdev: sqrt(sum(vsq) / n^2) over the method's own S == 1 rows, before and
    # after restricting to the rows the method keeps (w == 1).
    trial = joined["S"] == 1
    kept = trial & (joined["w"] == 1)
    stdev_pre = np.sqrt(joined.loc[trial, "vsq"].sum() / int(trial.sum()) ** 2)
    stdev_post = np.sqrt(joined.loc[kept, "vsq"].sum() / int(kept.sum()) ** 2)

    return pd.DataFrame(
        [[bias_pre, bias_post], [stdev_pre, stdev_post]],
        index=["Bias", "Stdev"],
        columns=["Pre", "Post"],
    )


# Covariate columns fed to every fitted selector `f` (names containing "X").
covariate_cols = [col for col in df_true.columns if "X" in col]

# ---- K-means based selection -------------------------------------------------
np.random.seed(42)
D_labels, f, testing_data = learn.kmeans_opt(
    data=df,
    outcome=outcome,
    treatment=treatment,
    sample=sample,
)

# Label every row of the full data with the fitted selector, then line the
# method output up with the ground truth (clashing names get a "_true" suffix).
df_true["w"] = f.predict(df_true[covariate_cols])
D_w_true = D_labels.join(df_true[["TE", "S", "w"]], rsuffix="_true", how="outer")
brute_box_r = bias_stdev_table(D_w_true)

# ---- Linear selection --------------------------------------------------------
importlib.reload(learn)
np.random.seed(42)
D_labels, f, testing_data = learn.linear_opt(
    data=df, outcome=outcome, treatment=treatment, sample=sample, seed=42
)
D_labels["w"] = D_labels["w"].astype(int)

df_true["w"] = f.predict(df_true[covariate_cols])
D_w_true = D_labels.join(df_true[["TE", "S", "w"]], rsuffix="_true", how="outer")
linear_box_r = bias_stdev_table(D_w_true)

# ---- Single-tree selection ---------------------------------------------------
importlib.reload(learn)
np.random.seed(42)
D_labels, f, testing_data = learn.tree_opt(
    data=df, outcome=outcome, treatment=treatment, sample=sample, seed=0
)
D_labels["w"] = D_labels["w"].astype(int)

df_true["w"] = f.predict(df_true[covariate_cols])
D_w_true = D_labels.join(df_true[["TE", "S", "w"]], rsuffix="_true", how="outer")
tree_box_r = bias_stdev_table(D_w_true)

# ---- Forest selection --------------------------------------------------------
importlib.reload(learn)
np.random.seed(42)
D_rash, D_forest, w_forest, rashomon_set, f, testing_data = learn.forest_opt(
    data=df,
    outcome=outcome,
    treatment=treatment,
    sample=sample,
    leaf_proba=1,
    num_trees=3000,
    vote_threshold=1 / 2,
    top_k_trees=True,
    k=10,
)

df_true["w"] = f.predict(df_true[covariate_cols])
D_w_true = D_rash.join(df_true[["TE", "S", "w"]], rsuffix="_true", how="outer")

# Keep only the rows with w == 1 and re-estimate the effect on them.
df_refined = df.join(D_w_true[["w"]])
df_refined = df_refined.loc[df_refined["w"] == 1]

np.random.seed(42)
df_v, pi, pi_m, e_m, testing_data = learn.estimate_dml(
    data=df_refined, outcome="Yobs", treatment="T", sample="S", crossfit=5
)

# For the forest, the Post estimate is the mean of the re-estimated "te"
# column, compared against the S_true == 0 rows with w == 1.
forest_box_r = bias_stdev_table(
    D_w_true, post_estimate=df_v["te"].mean(), post_target_w="w"
)

# Side-by-side table (forest, tree, linear, k-means) emitted as LaTeX.
print(
    pd.concat([forest_box_r, tree_box_r, linear_box_r, brute_box_r], axis=1).to_latex()
)

In [None]:
# Pick up any edits to the learning module before refitting.
importlib.reload(learn)

# Refit the forest-based selector on the community data.
# NOTE(review): this call has no preceding seed and omits the top_k_trees / k
# arguments used for the table in the previous cell, so the plotted model is
# not necessarily the tabulated one — confirm this is intended.
D_rash, D_forest, w_forest, rashomon_set, f, testing_data = learn.forest_opt(
    data=df,
    outcome=outcome,
    treatment=treatment,
    sample=sample,
    leaf_proba=1,
    num_trees=3000,
    vote_threshold=1 / 2,
)

# sqrt(sum(vsq) / (number of S == 0 rows)^2) on the forest's working frame.
# Not used further in this cell.
baseline_loss = np.sqrt(np.sum(D_forest["vsq"]) / ((np.sum((1 - D_forest["S"])) ** 2)))

# ---- Figure 1: the fitted tree `f` -------------------------------------------
fig, ax = plt.subplots(figsize=(10, 5), sharex=True, sharey=True, dpi=600)
tree.plot_tree(
    f,
    filled=True,
    # Pass a plain list: some scikit-learn releases validate `feature_names`
    # as a list and reject a pandas Index.
    feature_names=list(df.drop(columns=[outcome, treatment, sample]).columns),
    ax=ax,
)
fig.tight_layout()
fig.savefig("explain_community.pdf", format="pdf", dpi=600)

# ---- Figure 2: w_opt over the (X0, X1) plane (default seaborn legend) --------
fig, ax = plt.subplots(figsize=(5, 5), sharex=True, sharey=True, dpi=600)
sns.scatterplot(x="X0", y="X1", hue="w_opt", data=D_rash, ax=ax, hue_order=[1, 0])
fig.tight_layout()
fig.savefig("forest_community.pdf", format="pdf", dpi=600)

# ---- Figure 3: the S indicator over the same plane (displayed, not saved) ----
fig, ax = plt.subplots(figsize=(5, 5), sharex=True, sharey=True, dpi=600)
sns.scatterplot(x="X0", y="X1", hue="S", data=df, ax=ax, palette="Set1", legend=False)
fig.tight_layout()

# High-dimensional Linear DGP

In [None]:
# Regenerate the linear dataset from a freshly reloaded generator module.
# `coef` is returned by the generator but not used in this cell.
importlib.reload(all_linear)
df, Y, coef = all_linear.get_data(n=5_000, seed=1)

# Column names passed to every `learn.*_opt` call in this section.
outcome, treatment, sample = "Yobs", "T", "S"

# Ground-truth unit-level effect, stored on a separate copy of the data.
TE = Y["Y1"] - Y["Y0"]
df_true = df.copy(deep=True)
df_true["TE"] = TE

In [None]:
importlib.reload(learn)


def bias_stdev_table(joined, post_estimate=None, post_target_w="w_true"):
    """Build the 2x2 Bias/Stdev table (columns Pre, Post) for one method.

    Parameters
    ----------
    joined : pandas.DataFrame
        Method output joined with the ground truth. Must contain the columns
        "v", "vsq", "S" and "w" plus the joined "TE" and "S_true" columns
        (and "w_true" unless `post_estimate` is supplied).
    post_estimate : float, optional
        Post-selection effect estimate. When omitted it is the mean of "v"
        over rows with S_true == 1 and w_true == 1.
    post_target_w : str
        Column whose value 1 marks the rows of the S_true == 0 group that the
        Post estimate is compared against.

    Returns
    -------
    pandas.DataFrame
        Index ["Bias", "Stdev"], columns ["Pre", "Post"].
    """
    trial_true = joined["S_true"] == 1
    target_true = joined["S_true"] == 0

    if post_estimate is None:
        post_estimate = joined.loc[trial_true & (joined["w_true"] == 1), "v"].mean()

    # Bias: estimate on the S_true == 1 rows minus the mean true effect on the
    # S_true == 0 rows (all rows for Pre, selected rows for Post).
    bias_pre = joined.loc[trial_true, "v"].mean() - joined.loc[target_true, "TE"].mean()
    bias_post = (
        post_estimate
        - joined.loc[target_true & (joined[post_target_w] == 1), "TE"].mean()
    )

    # Stdev: sqrt(sum(vsq) / n^2) over the method's own S == 1 rows, before and
    # after restricting to the rows the method keeps (w == 1).
    trial = joined["S"] == 1
    kept = trial & (joined["w"] == 1)
    stdev_pre = np.sqrt(joined.loc[trial, "vsq"].sum() / int(trial.sum()) ** 2)
    stdev_post = np.sqrt(joined.loc[kept, "vsq"].sum() / int(kept.sum()) ** 2)

    return pd.DataFrame(
        [[bias_pre, bias_post], [stdev_pre, stdev_post]],
        index=["Bias", "Stdev"],
        columns=["Pre", "Post"],
    )


# Covariate columns fed to every fitted selector `f` (names containing "X").
covariate_cols = [col for col in df_true.columns if "X" in col]

# ---- K-means based selection -------------------------------------------------
np.random.seed(42)
D_labels, f, testing_data = learn.kmeans_opt(
    data=df,
    outcome=outcome,
    treatment=treatment,
    sample=sample,
)

# Label every row of the full data with the fitted selector, then line the
# method output up with the ground truth (clashing names get a "_true" suffix).
df_true["w"] = f.predict(df_true[covariate_cols])
D_w_true = D_labels.join(df_true[["TE", "S", "w"]], rsuffix="_true", how="outer")
brute_box_r = bias_stdev_table(D_w_true)

# ---- Linear selection --------------------------------------------------------
importlib.reload(learn)
np.random.seed(42)
D_labels, f, testing_data = learn.linear_opt(
    data=df, outcome=outcome, treatment=treatment, sample=sample, seed=42
)
D_labels["w"] = D_labels["w"].astype(int)

df_true["w"] = f.predict(df_true[covariate_cols])
D_w_true = D_labels.join(df_true[["TE", "S", "w"]], rsuffix="_true", how="outer")
linear_box_r = bias_stdev_table(D_w_true)

# ---- Single-tree selection ---------------------------------------------------
importlib.reload(learn)
np.random.seed(42)
D_labels, f, testing_data = learn.tree_opt(
    data=df, outcome=outcome, treatment=treatment, sample=sample, seed=0
)
D_labels["w"] = D_labels["w"].astype(int)

df_true["w"] = f.predict(df_true[covariate_cols])
D_w_true = D_labels.join(df_true[["TE", "S", "w"]], rsuffix="_true", how="outer")
tree_box_r = bias_stdev_table(D_w_true)

# ---- Forest selection --------------------------------------------------------
importlib.reload(learn)
np.random.seed(42)
D_rash, D_forest, w_forest, rashomon_set, f, testing_data = learn.forest_opt(
    data=df,
    outcome=outcome,
    treatment=treatment,
    sample=sample,
    leaf_proba=0.5,
    num_trees=2000,
    vote_threshold=2 / 3,
    top_k_trees=True,
    k=20,
)

df_true["w"] = f.predict(df_true[covariate_cols])
D_w_true = D_rash.join(df_true[["TE", "S", "w"]], rsuffix="_true", how="outer")

# Keep only the rows with w == 1 and re-estimate the effect on them.
df_refined = df.join(D_w_true[["w"]])
df_refined = df_refined.loc[df_refined["w"] == 1]

np.random.seed(42)
df_v, pi, pi_m, e_m, testing_data = learn.estimate_dml(
    data=df_refined, outcome="Yobs", treatment="T", sample="S", crossfit=5
)

# For the forest, the Post estimate is the mean of the re-estimated "te"
# column, compared against the S_true == 0 rows with w == 1.
forest_box_r = bias_stdev_table(
    D_w_true, post_estimate=df_v["te"].mean(), post_target_w="w"
)

# Side-by-side table (forest, tree, linear, k-means) emitted as LaTeX.
print(
    pd.concat([forest_box_r, tree_box_r, linear_box_r, brute_box_r], axis=1).to_latex()
)

In [None]:
# Pick up any edits to the learning module before refitting.
importlib.reload(learn)

# Refit the forest-based selector on the linear data.
# NOTE(review): no seed is set here, and leaf_proba / num_trees /
# vote_threshold / k differ from the forest_opt call that produced the table in
# the previous cell, so the figures describe a different fit — confirm this is
# intended.
D_rash, D_forest, w_forest, rashomon_set, f, testing_data = learn.forest_opt(
    data=df,
    outcome=outcome,
    treatment=treatment,
    sample=sample,
    leaf_proba=1,
    num_trees=3000,
    vote_threshold=4 / 5,
    top_k_trees=True,
    k=10,
)

# sqrt(sum(vsq) / (number of S == 0 rows)^2) on the forest's working frame.
# Not used further in this cell.
baseline_loss = np.sqrt(np.sum(D_forest["vsq"]) / ((np.sum((1 - D_forest["S"])) ** 2)))

# ---- Figure 1: the fitted tree `f` -------------------------------------------
fig, ax = plt.subplots(figsize=(10, 5), sharex=True, sharey=True, dpi=600)
tree.plot_tree(
    f,
    filled=True,
    # Pass a plain list: some scikit-learn releases validate `feature_names`
    # as a list and reject a pandas Index.
    feature_names=list(df.drop(columns=[outcome, treatment, sample]).columns),
    ax=ax,
)
fig.tight_layout()
fig.savefig("explain_linear.pdf", format="pdf", dpi=600)

# ---- Figure 2: w_opt over the (X2, X7) plane ---------------------------------
fig, ax = plt.subplots(figsize=(5, 5), sharex=True, sharey=True, dpi=600)
sns.scatterplot(x="X2", y="X7", hue="w_opt", data=D_rash, ax=ax, hue_order=[1, 0])
ax.legend(title="w")
fig.tight_layout()
fig.savefig("forest_linear.pdf", format="pdf", dpi=600)

# ---- Figure 3: the S indicator over the same plane (displayed, not saved) ----
fig, ax = plt.subplots(figsize=(5, 5), sharex=True, sharey=True, dpi=600)
sns.scatterplot(
    x="X2",
    y="X7",
    hue="S",
    data=df,
    ax=ax,
    palette="Set1",
    legend=False,
)
fig.tight_layout()

# Nonlinear DGP

In [None]:
import all_nonlinear

# Regenerate the nonlinear dataset from a freshly reloaded generator module.
# `coef` is returned by the generator but not used in this cell.
importlib.reload(all_nonlinear)
df, Y, coef = all_nonlinear.get_data_nonlinear(n=10_000, seed=1)

# Column names passed to the `learn` calls in this section.
outcome, treatment, sample = "Yobs", "T", "S"

# Ground-truth unit-level effect, stored on a separate copy of the data.
TE = Y["Y1"] - Y["Y0"]
df_true = df.copy(deep=True)
df_true["TE"] = TE

In [None]:
# Total of the "S" column (the number of S == 1 rows if S is a 0/1 indicator).
df.loc[:, "S"].sum()

In [None]:
importlib.reload(learn)
np.random.seed(42)

# Fit the forest-based selector on the nonlinear data.
D_rash, D_forest, w_forest, rashomon_set, f, testing_data = learn.forest_opt(
    data=df,
    outcome=outcome,
    treatment=treatment,
    sample=sample,
    leaf_proba=0.5,
    num_trees=2000,
    vote_threshold=2 / 3,
    top_k_trees=True,
    k=20,
)

# Quick look at w_opt over the (X0, X1) plane (displayed only, not saved).
fig, ax = plt.subplots(figsize=(5, 5), sharex=True, sharey=True, dpi=600)
sns.scatterplot(x="X0", y="X1", hue="w_opt", data=D_rash, ax=ax)
plt.legend(ncols=2, loc="lower left")
plt.tight_layout()

# Label every row of the full data with the fitted selector and line the
# forest output up with the ground truth (clashing names get a "_true" suffix).
covariates = df_true[[col for col in df_true.columns if "X" in col]]
df_true["w"] = f.predict(covariates)
D_w_true = D_rash.join(df_true[["TE", "S", "w"]], rsuffix="_true", how="outer")

# Keep only the rows with w == 1 and re-estimate the effect on them.
df_refined = df.join(D_w_true[["w"]]).loc[lambda frame: frame["w"] == 1]

np.random.seed(42)
df_v, pi, pi_m, e_m, testing_data = learn.estimate_dml(
    data=df_refined, outcome="Yobs", treatment="T", sample="S", crossfit=5
)

# Row masks used by the summary below.
trial_true = D_w_true["S_true"] == 1
target_true = D_w_true["S_true"] == 0
trial = D_w_true["S"] == 1
kept = D_w_true["w"] == 1
trial_kept = trial & kept

# Bias: estimate minus the mean true effect on the S_true == 0 rows.
bias_pre = (
    D_w_true.loc[trial_true, "v"].mean() - D_w_true.loc[target_true, "TE"].mean()
)
bias_post = df_v["te"].mean() - D_w_true.loc[target_true & kept, "TE"].mean()

# Stdev: sqrt(sum(vsq) / n^2) before and after restricting to w == 1.
stdev_pre = np.sqrt(D_w_true.loc[trial, "vsq"].sum() / int(trial.sum()) ** 2)
stdev_post = np.sqrt(
    D_w_true.loc[trial_kept, "vsq"].sum() / int(trial_kept.sum()) ** 2
)

forest_box_r = pd.DataFrame(
    {"Pre": [bias_pre, stdev_pre], "Post": [bias_post, stdev_post]},
    index=["Bias", "Stdev"],
)

# The combined LaTeX table is disabled in this section: tree_box_r,
# linear_box_r and brute_box_r are not recomputed for the nonlinear data.
# print(
#     pd.concat([forest_box_r, tree_box_r, linear_box_r, brute_box_r], axis=1).to_latex()
# )

In [None]:
# Uses D_forest, D_rash and f from the forest fit in the previous cell.
# sqrt(sum(vsq) / (number of S == 0 rows)^2) on the forest's working frame.
# Not used further in this cell.
baseline_loss = np.sqrt(np.sum(D_forest["vsq"]) / ((np.sum((1 - D_forest["S"])) ** 2)))

# ---- Figure 1: the fitted tree `f` -------------------------------------------
fig, ax = plt.subplots(figsize=(10, 5), sharex=True, sharey=True, dpi=600)
tree.plot_tree(
    f,
    filled=True,
    # Pass a plain list: some scikit-learn releases validate `feature_names`
    # as a list and reject a pandas Index.
    feature_names=list(df.drop(columns=[outcome, treatment, sample]).columns),
    ax=ax,
)
fig.tight_layout()
fig.savefig("explain_nonlinear.pdf", format="pdf", dpi=600)

# ---- Figure 2: w_opt over the (X2, X7) plane ---------------------------------
fig, ax = plt.subplots(figsize=(5, 5), sharex=True, sharey=True, dpi=600)
sns.scatterplot(x="X2", y="X7", hue="w_opt", data=D_rash, ax=ax, hue_order=[1, 0])
ax.legend(title="w")
fig.tight_layout()
fig.savefig("forest_nonlinear.pdf", format="pdf", dpi=600)

# ---- Figure 3: the S indicator over the same plane (displayed, not saved) ----
fig, ax = plt.subplots(figsize=(5, 5), sharex=True, sharey=True, dpi=600)
sns.scatterplot(
    x="X2",
    y="X7",
    hue="S",
    data=df,
    ax=ax,
    palette="Set1",
    legend=False,
)
fig.tight_layout()