In [None]:
import importlib
import os

import black
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyreadr
import seaborn as sns
import sklearn.ensemble as en
import sklearn.linear_model as lm
import sklearn.tree as tree
import tqdm
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split

#### main package
import learn_w as learn

importlib.reload(learn)

import warnings

warnings.filterwarnings("ignore")

%load_ext jupyter_black
sns.set(font_scale=1.25, style="whitegrid")
np.random.seed(0)

# Fetch the Real MOUD Data and Set It Up

In [None]:
outcome_cols = ["opioiduse12", "opioiduse24"]
treatment_col = "medicine_assigned"
discrete_cov = ["xrace", "mar", "sex"]

# Root folder that holds every input CSV read below. The default is the original
# author's local path; set the MOUD_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
MOUD_DATA_DIR = os.environ.get(
    "MOUD_DATA_DIR",
    "/Users/harshparikh/Library/CloudStorage/OneDrive-JohnsHopkins/MOUD_data",
)

# Harmonized baseline table; the first CSV column is used as the row index.
baseline_harmonized = pd.read_csv(
    os.path.join(
        MOUD_DATA_DIR,
        "updated_data",
        "ctn0094",
        "drv",
        "clean_patients_with_relapse_wide.csv",
    ),
    index_col=0,
)

# stacked_list_1.csv ... stacked_list_5.csv, loaded in order.
stacked_list = [
    pd.read_csv(os.path.join(MOUD_DATA_DIR, "stacked_list_%d.csv" % i), index_col=0)
    for i in range(1, 6)
]
# Only the first stacked file is used by the rest of this cell.
df = stacked_list[0]

# Rows of the stacked frame that are flagged as non-trial data (trialdata == 0).
df_tedsa = df.loc[df["trialdata"] == 0]

# Rows of the harmonized baseline table belonging to project 27, plus their outcomes.
ct94 = baseline_harmonized.loc[baseline_harmonized["project"] == 27]
outcome94 = ct94[outcome_cols]

# Columns present in both sources.
common_cols = set(df_tedsa.columns) & set(ct94.columns)

# Index with a sorted list, not the raw set: pandas >= 2.0 raises TypeError for set
# indexers, and a sorted list makes the column order identical across kernel restarts.
ct94_cc = ct94[sorted(common_cols)].drop(columns=["edu", "mar"])
ct94_cc["sex"] = (ct94["sex"] == "male").astype(int)  # male = 1 and female = 0

# Fill missing covariate values from the 4 nearest neighbours, weighted by distance.
imputer = KNNImputer(n_neighbors=4, weights="distance", add_indicator=False)
ct94_cc_imputed = imputer.fit_transform(ct94_cc)

# fit_transform returns a bare array; restore the original row and column labels.
ct94_cc = pd.DataFrame(ct94_cc_imputed, index=ct94_cc.index, columns=ct94_cc.columns)

# methadone = 1 and buprenorphine = 0
ct94_cc["med_met"] = (ct94[treatment_col] == "met").astype(int)
ct94_cc = ct94_cc.dropna()

ct94_cc["S"] = 1  # sample indicator: every row of this frame is marked S = 1
# Imputed values are fractional; round them back to integers.
ct94_cc = ct94_cc.round(0).astype(int)

# Attach the outcome columns, keeping only rows present in both frames.
ct94_cc = ct94_cc.join(outcome94, how="inner")
print(ct94_cc.shape)
# Mean outcome per treatment arm. Printed explicitly: as a bare expression in the
# middle of a cell the result was computed and then discarded.
print(ct94_cc.groupby(by="med_met")[outcome_cols].mean())

# Same shared columns as the trial frame. sorted() turns the set into a list because
# pandas >= 2.0 rejects set indexers, and it fixes the column order.
df_tedsa_cc = df_tedsa[sorted(common_cols)].drop(columns=["edu", "mar"])
df_tedsa_cc["S"] = 0  # sample indicator: every row of this frame is marked S = 0

# Invert age categories: map category codes 1-12 to a single numeric value each.
# NOTE(review): presumably these are representative ages in years so the column is
# comparable with the trial's age column -- confirm against the data dictionary.
# Assigned back to the column instead of a chained `replace(..., inplace=True)`,
# which modifies a temporary (and leaves the frame unchanged) under Copy-on-Write.
df_tedsa_cc["age"] = df_tedsa_cc["age"].replace(
    {
        1: 13,
        2: 16,
        3: 18,
        4: 22,
        5: 27,
        6: 32,
        7: 37,
        8: 42,
        9: 47,
        10: 52,
        11: 60,
        12: 68,
    }
)

# Row-shuffle the S = 0 frame (no random_state is passed) and stack the S = 1 frame below it.
df_primary = pd.concat([df_tedsa_cc.sample(frac=1, replace=False), ct94_cc])

# Drop the first outcome column and replace every remaining missing value with 0.
df_ = df_primary.drop(columns=outcome_cols[0]).fillna(0)

outcome, treatment, sample = outcome_cols[1], "med_met", "S"
data = df_
S = data[sample]  # indicator for the sample
Y = data[outcome]  # outcome variable
T = data[treatment]  # indicator for the treatment

# One-hot encode race and switch to human-readable column names.
# NOTE(review): the rename keys assume the dummies come out as xrace_1..xrace_4
# (integer codes); names that do not match are silently left untouched.
data_dummy = pd.get_dummies(data, columns=["xrace"]).rename(
    columns={
        "sex": "Male",
        "age": "Age",
        "ivdrug": "IV Drug Use",
        "bamphetamine30_base": "Hx Amphetamine",
        "bbenzo30_base": "Hx Benzo",
        "bcannabis30_base": "Hx Cannabis",
        "xrace_1": "White",
        "xrace_2": "Black",
        "xrace_3": "Hispanic",
        "xrace_4": "Other Race",
    }
)

# pre-treatment covariates: everything except outcome, treatment and sample indicator
X = data_dummy.drop(columns=[outcome, treatment, sample])

# Generate Synthetic MOUD Data via Modeling
Gradient-boosted classifiers, fit separately on each trial arm, to impute the potential outcomes Y(t), \
Logistic regression to model P(S=1 | X), \
Logistic regression to model P(T=1 | X, S=1)

In [None]:
np.random.seed(42)

# Outcome model for the treated arm of the trial sample (S = 1, T = 1).
y1_m = en.GradientBoostingClassifier(n_estimators=10000)
y1_m.fit(X.loc[(S == 1) & (T == 1)], Y.loc[(S == 1) & (T == 1)])

# Outcome model for the control arm of the trial sample (S = 1, T = 0).
y0_m = en.GradientBoostingClassifier(n_estimators=10000)
y0_m.fit(X.loc[(S == 1) & (T == 0)], Y.loc[(S == 1) & (T == 0)])

In [None]:
# P(S=1 | X): fitted on all rows.
pi_m = lm.LogisticRegressionCV()
pi_m.fit(X, S)

# P(T=1 | X, S=1): fitted on the S = 1 rows only.
e_m = lm.LogisticRegressionCV()
e_m.fit(X.loc[S.eq(1)], T.loc[S.eq(1)])

In [None]:
# Covariates of the synthetic population: an independent copy of the real covariate matrix X.
X_sim = X.copy(deep=True)

# Draw both potential outcomes for every row as Bernoulli variables. The success
# probability is the second predict_proba column of the arm-specific outcome model
# (the positive class when the labels are 0/1).
joint_sim = X_sim.copy(deep=True)
joint_sim["Y(1)"] = np.random.binomial(1, y1_m.predict_proba(X_sim)[:, 1])
joint_sim["Y(0)"] = np.random.binomial(1, y0_m.predict_proba(X_sim)[:, 1])

# Draw sample membership from pi_m and treatment assignment from e_m in the same way.
# All four draws consume the global NumPy RNG, so reordering these statements
# changes the simulated data.
S_sim = np.random.binomial(1, pi_m.predict_proba(X_sim)[:, 1])
T_sim = np.random.binomial(1, e_m.predict_proba(X_sim)[:, 1])
# Observed outcome: Y(1) where T_sim is 1, Y(0) where T_sim is 0.
Y_sim = T_sim * joint_sim["Y(1)"] + (1 - T_sim) * joint_sim["Y(0)"]

In [None]:
# Synthetic analysis frame: simulated covariates plus observed outcome, treatment
# and sample indicator, appended in that column order.
df_sim = X_sim.copy().assign(Y=Y_sim, T=T_sim, S=S_sim)

## Plot feature importance of variables for outcome and selection models from the synthetic DGP

In [None]:
# One row per covariate: summed importances of the two outcome models, and the
# absolute coefficient of the sample-selection model.
feature_imp = pd.DataFrame(
    {
        "treatment effect": np.abs(
            y1_m.feature_importances_ + y0_m.feature_importances_
        ),
        "sample": np.abs(pi_m.coef_[0]),
    },
    index=X.columns,
)

from sklearn.preprocessing import MinMaxScaler

# Rescale each column to [0, 1] so the two importance measures share an axis range.
scaler = MinMaxScaler()
feature_imp_scaled = pd.DataFrame(
    scaler.fit_transform(feature_imp),
    index=feature_imp.index,
    columns=feature_imp.columns,
)


def label_point(data, x, y, val, ax):
    """Write the text of column ``val`` next to every point (``x``, ``y``) on ``ax``.

    Each label is placed 0.01 to the right of its point. Labels containing
    "Hispanic" stay at the point's height; all other labels are lowered by 0.05.

    Parameters
    ----------
    data : DataFrame holding the coordinate and label columns.
    x, y : names of the columns with the point coordinates.
    val : name of the column with the label text.
    ax : object with a ``text(x, y, s)`` method (a matplotlib Axes).
    """
    for x_pos, y_pos, raw_label in zip(data[x], data[y], data[val]):
        label = str(raw_label)
        y_text = y_pos if "Hispanic" in label else y_pos - 0.05
        ax.text(x_pos + 0.01, y_text, label)


# Scatter of the two min-max-scaled importance measures, one labelled point per covariate.
fig, ax = plt.subplots(figsize=(10, 7), dpi=600)
sns.scatterplot(
    x="treatment effect",
    y="sample",
    hue="index",
    data=feature_imp_scaled.reset_index(),
    s=100,
    legend=False,
    ax=ax,
)
label_point(
    feature_imp_scaled.reset_index(), "treatment effect", "sample", "index", ax
)
ax.set_xlabel("Relative Feature Importance\n (Treatment Effect)")
ax.set_ylabel("Relative Feature Importance\n (Sample Selection Function)")
fig.tight_layout()
fig.savefig("feature_importance_synth_case.pdf")

In [None]:
# Unit-level effect from the simulated potential outcomes, and its squared
# deviation from the mean effect.
joint_sim["TE"] = joint_sim["Y(1)"] - joint_sim["Y(0)"]
joint_sim["h"] = (joint_sim["TE"] - joint_sim["TE"].mean()).pow(2)

# Regress that squared deviation on the covariates.
te_exp = en.GradientBoostingRegressor()
te_exp.fit(X_sim, joint_sim["h"])

# Overwrite the "treatment effect" column with this model's feature importances.
feature_imp["treatment effect"] = np.abs(
    pd.Series(te_exp.feature_importances_, index=X_sim.columns)
)

from sklearn.preprocessing import MinMaxScaler

# Rescale both columns to [0, 1] again after the overwrite.
scaler = MinMaxScaler()
feature_imp_scaled = pd.DataFrame(
    scaler.fit_transform(feature_imp),
    index=feature_imp.index,
    columns=feature_imp.columns,
)


def label_point(data, x, y, val, ax):
    """Write the text of column ``val`` next to every point (``x``, ``y``) on ``ax``.

    Each label is placed 0.01 to the right of its point. Labels containing
    "Hispanic" are lowered by 0.05; all other labels stay at the point's height.

    NOTE(review): this redefines the ``label_point`` from the earlier plotting
    cell with the vertical offset rule inverted; the later definition wins.

    Parameters
    ----------
    data : DataFrame holding the coordinate and label columns.
    x, y : names of the columns with the point coordinates.
    val : name of the column with the label text.
    ax : object with a ``text(x, y, s)`` method (a matplotlib Axes).
    """
    for x_pos, y_pos, raw_label in zip(data[x], data[y], data[val]):
        label = str(raw_label)
        y_text = y_pos - 0.05 if "Hispanic" in label else y_pos
        ax.text(x_pos + 0.01, y_text, label)


# Same scatter as the previous figure, but on log2(1 + scaled importance).
fig, ax = plt.subplots(figsize=(10, 7), dpi=600)
sns.scatterplot(
    x="treatment effect",
    y="sample",
    hue="index",
    data=np.log2(feature_imp_scaled + 1).reset_index(),
    s=100,
    legend=False,
    ax=ax,
)
label_point(
    np.log2(feature_imp_scaled + 1).reset_index(),
    "treatment effect",
    "sample",
    "index",
    ax,
)
ax.set_xlabel("Relative Feature Importance\n (Treatment Effect)")
ax.set_ylabel("Relative Feature Importance\n (Sample Selection Function)")
fig.tight_layout()
# NOTE(review): same file name as the previous figure, so this overwrites it.
fig.savefig("feature_importance_synth_case.pdf")

# Analyses

In [None]:
# From here on the analysis runs on the synthetic frame; these names are rebound
# from the real-data values set in the data-loading cell.
data = df_sim
treatment, outcome, sample = "T", "Y", "S"

## Estimate Treatment Effects

In [None]:
# Pick up any edits made to the learn_w module since the last run.
importlib.reload(learn)

# Positional order is (data, outcome, treatment, sample). df_v_est is used below
# for the "a" and "te" columns, and pi_m_est for predict_proba.
(df_v_est, pi_est, pi_m_est, e_m_est, data2_est) = learn.estimate_ipw(
    data, outcome, treatment, sample
)

In [None]:
# Difference in mean outcome between the two arms of the S = 1 rows, in percent.
# NOTE(review): the ± term is the SUM of the two arm SEMs, not
# sqrt(sem_1**2 + sem_0**2) -- confirm this wider interval is intended.
print(
    "RCT-ATE: %.2f ± %.2f"
    % (
        100
        * (
            data.loc[data[sample].eq(1) & data[treatment].eq(1), outcome].mean()
            - data.loc[data[sample].eq(1) & data[treatment].eq(0), outcome].mean()
        ),
        100
        * (
            data.loc[data[sample].eq(1) & data[treatment].eq(1), outcome].sem()
            + data.loc[data[sample].eq(1) & data[treatment].eq(0), outcome].sem()
        ),
    )
)

# Mean and standard error of the "a" column returned by learn.estimate_ipw.
print(
    "RCT-IPW ATE: %.2f ± %.2f"
    % (df_v_est["a"].mean() * 100, df_v_est["a"].sem() * 100)
)

# Mean and standard error of the "te" column returned by learn.estimate_ipw.
print(
    "Transported ATE: %.2f ± %.2f"
    % (df_v_est["te"].mean() * 100, df_v_est["te"].sem() * 100)
)

## Plot Selection Score per Sample

In [None]:
np.random.seed(42)

# Copy of the analysis frame with the estimated selection score and the score
# divided by the overall share of S = 1 rows.
data_dummy_logit = data.copy()
data_dummy_logit["pi(x)"] = pi_m_est.predict_proba(X_sim)[:, 1]
data_dummy_logit["pi(x)/pi"] = data_dummy_logit["pi(x)"].div(
    data_dummy_logit["S"].mean()
)

fig, ax = plt.subplots(sharex=True, figsize=(10, 3), dpi=600)
sns.set(font_scale=1.8, style="whitegrid")
# Split violin of the normalised score, one half per value of S.
sns.violinplot(
    x="pi(x)/pi",
    y="S",
    hue="S",
    data=data_dummy_logit,
    orient="h",
    split=True,
    inner="quart",
    bw=0.25,
    alpha=0.5,
    palette="Set1",
    ax=ax,
)
ax.set_xlabel(r"$\ell(x)/{\ell}$")
fig.tight_layout()

## Learn Underrepresented Groups via 3 different proposed methods

### Indicator Approach

In [None]:
importlib.reload(learn)
np.random.seed(42)

D_brute, f_brute, _ = learn.kmeans_opt(
    data=data, outcome=outcome, treatment=treatment, sample=sample
)

# Mean and standard error (in percent) of "v" over the rows whose "w" equals 1.
print(
    (
        100 * D_brute.loc[D_brute["w"].astype(int).eq(1), "v"].mean(),
        100 * D_brute.loc[D_brute["w"].astype(int).eq(1), "v"].sem(),
    )
)

tree.plot_tree(f_brute, feature_names=X_sim.columns)

### Linear Approximation

In [None]:
importlib.reload(learn)
np.random.seed(42)

# NOTE(review): unlike the other optimiser cells this one is fed data_dummy_logit,
# which also carries the "pi(x)" and "pi(x)/pi" columns -- confirm that is intended
# and that f_linear's features still line up with X_sim.columns in the plot below.
D_linear, f_linear, _ = learn.linear_opt(
    data=data_dummy_logit, outcome=outcome, treatment=treatment, sample=sample
)

# Mean and standard error (in percent) of "v" over the rows whose "w" equals 1.
print(
    (
        100 * D_linear.loc[D_linear["w"].astype(int).eq(1), "v"].mean(),
        100 * D_linear.loc[D_linear["w"].astype(int).eq(1), "v"].sem(),
    )
)

tree.plot_tree(f_linear, feature_names=X_sim.columns)

### Using a Single Tree Optimizer

In [None]:
importlib.reload(learn)
np.random.seed(366)

D_tree, f_tree, _ = learn.tree_opt(
    data=data, outcome=outcome, treatment=treatment, sample=sample, leaf_proba=0.1
)

# Mean and standard error (in percent) of "v" over the rows whose "w" equals 1.
print(
    (
        100 * D_tree.loc[D_tree["w"].astype(int).eq(1), "v"].mean(),
        100 * D_tree.loc[D_tree["w"].astype(int).eq(1), "v"].sem(),
    )
)

tree.plot_tree(f_tree, feature_names=X_sim.columns)

### Using the ROOT-based Forest Optimizer

In [None]:
importlib.reload(learn)
np.random.seed(0)

D_rash, D_forest, w_forest, rashomon_set, f_forest, _ = learn.forest_opt(
    data=data,
    outcome=outcome,
    treatment=treatment,
    sample=sample,
    num_trees=2000,
    vote_threshold=99 / 100,
    explore_proba=0.1,
    feature_est="gbt",
    top_k_trees=1,
)

# Mean and standard error (in percent) of "v" over the rows whose "w_opt" equals 1.
print(
    (
        100 * D_rash.loc[D_rash["w_opt"].astype(int).eq(1), "v"].mean(),
        100 * D_rash.loc[D_rash["w_opt"].astype(int).eq(1), "v"].sem(),
    )
)

### Plotting ROOT Results

In [None]:
# Number of best-scoring trees that vote on the final weights. This was only
# present as a commented-out line, so the cell raised NameError on a fresh kernel;
# 1 matches both that comment and top_k_trees=1 in the forest_opt call.
top_k = 1

baseline_loss = np.sqrt(np.sum(D_forest["vsq"]) / (D_forest.shape[0] ** 2))

# One row per tree, sorted so the smallest local objective comes first.
local_obj = pd.DataFrame(
    np.array([w_forest[i]["local objective"] for i in range(len(w_forest))]),
    columns=["Objective"],
).sort_values(by="Objective")

# Column names in D_forest of the top_k trees with the smallest objective.
# The membership set is built once instead of on every comprehension iteration.
best_tree_ids = set(local_obj.index[:top_k])
w_rash = ["w_tree_%d" % i for i in range(len(w_forest)) if i in best_tree_ids]

# A row gets weight 1 when at least 99% of the selected trees vote for it.
avg_votes = (D_forest[w_rash].mean(axis=1) >= 0.99).astype(int)
D_rash["w_opt"] = avg_votes

# Depth-3 tree fitted on the covariates to give a readable summary of the weights.
np.random.seed(42)
num_trees = 1
explainer = tree.DecisionTreeClassifier(max_depth=3).fit(
    X.loc[avg_votes.index], avg_votes
)

# The explainer is a single tree, so it is drawn directly. The former multi-tree
# branch was unreachable with num_trees = 1 and read `.estimators_`, which a
# DecisionTreeClassifier does not have.
fig, ax = plt.subplots(nrows=num_trees, figsize=(20, 8), dpi=600)
tree.plot_tree(
    explainer,
    feature_names=X.columns,
    ax=ax,
    filled=True,
    fontsize=10,
    impurity=False,
);