In [4]:
import pandas as pd

# Relative location of the dataset CSV. Presumably passed to load_data /
# preprocess_data by later cells — confirm against the calling code.
# NOTE: both of those functions take a parameter that is also named `path`,
# which shadows this module-level name inside their bodies.
path = "../data/dataset.csv"
def load_data(path):
    """
    Read a CSV file into a pandas DataFrame.

    Parameters:
    - path: location of the CSV file; forwarded unchanged to pandas.read_csv

    Returns:
    - DataFrame, the parsed contents of the file
    """
    frame = pd.read_csv(path)
    return frame

In [5]:
def preprocess_data(path):
    """
    Load the dataset and replace its coded column names with descriptive ones.

    The only transformation applied here is the column rename; no rows or
    values are modified.

    Parameters:
    - path: location of the CSV file, handed to load_data

    Returns:
    - DataFrame, the loaded dataset with the renamed columns
    """
    # Coded column name in the CSV -> descriptive name used downstream.
    descriptive_names = {
        'Y': 'StudentAchievementScore',
        'Z': 'GrowthMindsetIntervention',
        'S3': 'FutureSuccessExpectations',
        'C1': 'StudentRaceEthnicity',
        'C2': 'StudentGender',
        'C3': 'FirstGenCollegeStatus',
        'XC': 'SchoolUrbanicity',
        'X1': 'PreInterventionFixedMindset',
        'X2': 'SchoolAchievementLevel',
        'X3': 'SchoolMinorityComposition',
        'X4': 'PovertyConcentration',
        'X5': 'TotalStudentPopulation',
    }
    raw = load_data(path)
    return raw.rename(columns=descriptive_names)

In [6]:
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor

"""
The S-learner uses a single model to estimate the treatment effect by including the treatment as a feature in the model.
"""


def s_fit(data, treatment_col, outcome_col, covariate_cols):
    """
    Train an S-learner: a single outcome model that takes the treatment
    indicator as one of its input features.

    Parameters:
    - data: DataFrame, the preprocessed dataset
    - treatment_col: str, name of the treatment column
    - outcome_col: str, name of the outcome column
    - covariate_cols: sequence of str, names of the covariate columns

    Returns:
    - model: GradientBoostingRegressor fitted on the covariates plus the
      treatment column (treatment column last)
    """
    # The treatment column is placed last. predict_outcomes assigns the
    # treatment column onto a covariate-only frame, which appends it at the
    # end, so the feature order at prediction time matches this one.
    # list(...) lets callers pass a tuple or Index as well as a list.
    feature_cols = list(covariate_cols) + [treatment_col]
    X = data[feature_cols]
    Y = data[outcome_col]

    # One model trained on treated and control rows together.
    model = GradientBoostingRegressor(
        n_estimators=200,
        random_state=42,
        learning_rate=0.1,
        max_depth=3,
    )
    model.fit(X, Y)

    return model


def predict_outcomes(X, model, treatment_col):
    """
    Predict both potential outcomes for every row with a single fitted model.

    Parameters:
    - X: pd.DataFrame, feature matrix; it is copied, never modified in place
    - model: fitted estimator exposing predict()
    - treatment_col: str, name of the treatment column set on each copy

    Returns:
    - pd.DataFrame with columns 'pred_0' (treatment forced to 0) and
      'pred_1' (treatment forced to 1)
    """
    def _predict_under(treatment_value):
        # Work on a copy so the caller's frame keeps its original columns.
        features = X.copy()
        features[treatment_col] = treatment_value
        return model.predict(features)

    control_outcome = _predict_under(0)
    treated_outcome = _predict_under(1)

    return pd.DataFrame({'pred_0': control_outcome, 'pred_1': treated_outcome})


def estimate_CATE(df):
    """
    Estimate the Conditional Average Treatment Effect (CATE) for the S-learner.

    Parameters:
    - df: pd.DataFrame with columns 'pred_1' and 'pred_0' holding the
      predicted outcome under treatment and under control

    Returns:
    - pd.Series with CATE estimates, computed row-wise as pred_1 - pred_0.
    """
    return df['pred_1'] - df['pred_0']

In [7]:
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split


def t_fit(data, treatment_col, outcome_col, covariate_cols):
    """
    Train a T-learner: one outcome model per treatment arm.

    Parameters:
    - data: DataFrame, the preprocessed dataset
    - treatment_col: str, name of the treatment column
    - outcome_col: str, name of the outcome column
    - covariate_cols: list of str, names of the covariate columns

    Returns:
    - model_treated: Lasso model fitted on rows where the treatment equals 1
    - model_control: Lasso model fitted on rows where the treatment equals 0
    """
    def _fit_arm(arm_value):
        # Keep only the rows of this arm and regress outcome on covariates.
        arm_rows = data[data[treatment_col] == arm_value]
        learner = Lasso(alpha=0.1, random_state=42)
        learner.fit(arm_rows[covariate_cols], arm_rows[outcome_col])
        return learner

    treated_learner = _fit_arm(1)
    control_learner = _fit_arm(0)

    return treated_learner, control_learner


def predict_outcomes_t(X, model_treated, model_control):
    """
    Predict both potential outcomes for every row using the two arm models.

    Parameters:
    - X: pd.DataFrame, feature matrix excluding the treatment variable
    - model_treated: fitted model for the treated arm
    - model_control: fitted model for the control arm

    Returns:
    - pd.DataFrame with columns 'pred_treated' and 'pred_control'.
    """
    predictions = {
        'pred_treated': model_treated.predict(X),
        'pred_control': model_control.predict(X),
    }
    return pd.DataFrame(predictions)


def estimate_CATE_t(df):
    """
    Estimate the Conditional Average Treatment Effect (CATE) for the T-learner.

    Parameters:
    - df: pd.DataFrame with columns 'pred_treated' and 'pred_control'

    Returns:
    - pd.Series with CATE estimates (pred_treated - pred_control per row).
    """
    treated_outcome = df['pred_treated']
    control_outcome = df['pred_control']
    return treated_outcome - control_outcome


In [8]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge


def x_fit(data, treatment_col, outcome_col, covariate_cols):
    """
    Fit the per-arm outcome models used by the X-learner cells.

    NOTE(review): only the two arm-specific outcome models are fitted here.
    No imputed-effect or propensity-weighting stage is performed, so together
    with predict_outcomes_x / estimate_CATE_x the resulting estimate is
    pred_treated - pred_control, i.e. the same construction as the T-learner
    but with Ridge base learners.

    Parameters:
    - data: DataFrame, the preprocessed dataset
    - treatment_col: str, name of the treatment column
    - outcome_col: str, name of the outcome column
    - covariate_cols: list of str, names of the covariate columns

    Returns:
    - model_treated: Ridge model fitted on rows where the treatment equals 1
    - model_control: Ridge model fitted on rows where the treatment equals 0
    """
    # Split the data into treatment and control groups
    treated_data = data[data[treatment_col] == 1]
    control_data = data[data[treatment_col] == 0]

    # Covariate matrix and outcome vector for each group
    X_treated = treated_data[covariate_cols]
    y_treated = treated_data[outcome_col]
    X_control = control_data[covariate_cols]
    y_control = control_data[outcome_col]

    # Independent, identically configured models: one per arm
    model_treated = Ridge(alpha=10.0, random_state=42)
    model_control = Ridge(alpha=10.0, random_state=42)

    model_treated.fit(X_treated, y_treated)
    model_control.fit(X_control, y_control)

    return model_treated, model_control


def predict_outcomes_x(X, model_treated, model_control):
    """
    Predict the outcome of every row under each of the two arm models.

    Parameters:
    - X: pd.DataFrame, feature matrix excluding the treatment variable
    - model_treated: fitted model for the treated arm
    - model_control: fitted model for the control arm

    Returns:
    - pd.DataFrame with columns 'pred_treated' and 'pred_control'.
    """
    treated_hat = model_treated.predict(X)
    control_hat = model_control.predict(X)

    columns = {'pred_treated': treated_hat, 'pred_control': control_hat}
    return pd.DataFrame(columns)


def estimate_CATE_x(df):
    """
    Estimate the Conditional Average Treatment Effect (CATE) from the
    predictions produced for the X-learner cells.

    Parameters:
    - df: pd.DataFrame with columns 'pred_treated' and 'pred_control'

    Returns:
    - pd.Series with CATE estimates (pred_treated - pred_control per row).
    """
    effect = df['pred_treated'] - df['pred_control']
    return effect


In [9]:
import pandas as pd
from sklearn.linear_model import Ridge
import numpy as np
from sklearn.model_selection import cross_val_predict

def r_fit(data, treatment_col, outcome_col, covariate_cols):
    """
    Train an R-learner model to estimate the Conditional Average Treatment Effect (CATE).

    The outcome and the treatment are each predicted from the covariates with
    5-fold cross-fitting. The treatment-effect model is then fitted by
    minimising sum((y_residual - tau(X) * t_residual) ** 2), which is done as
    a regression of y_residual / t_residual on X weighted by t_residual ** 2.
    Residuals are left on their original scale so that tau(X) is expressed in
    the units of the outcome.

    Parameters:
    - data: DataFrame, the preprocessed dataset
    - treatment_col: str, name of the treatment column
    - outcome_col: str, name of the outcome column
    - covariate_cols: list of str, names of the covariate columns

    Returns (5-tuple):
    - tau_model: fitted Ridge model for the treatment effect
    - y_model: out-of-fold predictions of the outcome (an array, not an estimator)
    - t_model: out-of-fold predictions of the treatment (an array, not an estimator)
    - y_residual: outcome minus y_model
    - t_residual: treatment minus t_model
    """
    nuisance_model = Ridge(alpha=10.0)
    X = data[covariate_cols]
    T = data[treatment_col]
    y = data[outcome_col]

    # Out-of-fold nuisance predictions; cross_val_predict clones the
    # estimator per fold, so sharing one template instance is safe.
    y_model = cross_val_predict(nuisance_model, X, y, cv=5)
    t_model = cross_val_predict(nuisance_model, X, T, cv=5)

    # Calculate residuals
    y_residual = y - y_model
    t_residual = T - t_model

    # Keep |t_residual| away from zero WITHOUT discarding its sign: a plain
    # lower clip would turn every negative residual into a small positive
    # number and flip the sign of the pseudo-outcome for those rows.
    min_abs_residual = 1e-3
    t_residual_safe = np.copysign(
        np.maximum(np.abs(t_residual), min_abs_residual), t_residual
    )

    # Pseudo-outcome of the R-loss.
    division_result = y_residual / t_residual_safe
    print(f"division result : {division_result.describe()}")

    # Weighting by t_residual ** 2 makes this regression equivalent to the
    # R-loss and stops rows with near-zero treatment residual (huge ratios)
    # from dominating the fit.
    tau_model = Ridge(alpha=0.1)
    tau_model.fit(X, division_result, sample_weight=t_residual_safe ** 2)
    print(f"tau model coefficient :{tau_model.coef_}")

    return tau_model, y_model, t_model, y_residual, t_residual

def predict_outcomes_r(X, tau_model, y_model, t_model):
    """
    Predict the treatment effect for each row and bundle it with the
    supplied nuisance values.

    Parameters:
    - X: pd.DataFrame, feature matrix
    - tau_model: fitted model for the treatment effect, exposing predict()
    - y_model: array-like with one value per row of X; stored unchanged in
      the 'y_residual' column
    - t_model: array-like with one value per row of X; stored unchanged in
      the 't_residual' column

    NOTE(review): the 'y_residual' / 't_residual' column names are kept for
    compatibility with existing callers. This function does not compute
    residuals itself; whatever is passed in is what ends up in those columns.

    Returns:
    - pd.DataFrame with columns 'tau_pred', 'y_residual' and 't_residual'.
    """
    # Predict treatment effect
    tau_pred = tau_model.predict(X)

    return pd.DataFrame({'tau_pred': tau_pred, 'y_residual': y_model, 't_residual': t_model})

def estimate_CATE_r(df):
    """
    Extract the R-learner CATE estimates from a prediction frame.

    Parameters:
    - df: pd.DataFrame containing a 'tau_pred' column

    Returns:
    - pd.Series with CATE estimates (the 'tau_pred' column, unchanged).
    """
    cate = df['tau_pred']
    return cate