In [20]:
import os
import json
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from causallib.estimation import IPW
from causalvis import CohortEvaluator, TreatmentEffectExplorer
from data_preparation import apply_variable_mapping, student_variable_mapping, load_and_prepare_adult_data, load_and_prepare_student_data
from matching import perform_matching
from dag_utils import load_dag

In [12]:
def convert_types(data):
    """Recursively convert numpy values to native Python types for JSON serialization.

    Handles nested dicts, lists and tuples. Any numpy integer, floating or
    boolean scalar is converted to ``int``, ``float`` or ``bool``; numpy arrays
    are converted to (nested) lists. Dict keys are converted as well, because
    ``json.dump`` rejects numpy integer keys. Everything else is returned as is.

    Args:
        data: Arbitrary (possibly nested) value to sanitise.

    Returns:
        A structure with the same layout as ``data`` whose numpy scalars and
        arrays have been replaced by native Python equivalents.
    """
    if isinstance(data, dict):
        return {convert_types(k): convert_types(v) for k, v in data.items()}
    if isinstance(data, list):
        return [convert_types(item) for item in data]
    if isinstance(data, tuple):
        return tuple(convert_types(item) for item in data)
    if isinstance(data, np.ndarray):
        # tolist() already yields native scalars for numeric dtypes; recurse so
        # that object arrays holding numpy scalars are handled too.
        return convert_types(data.tolist())
    if isinstance(data, np.bool_):
        return bool(data)
    if isinstance(data, np.integer):
        # Covers every signed/unsigned width, not only int32/int64.
        return int(data)
    if isinstance(data, np.floating):
        # Covers float16/float32/float64/longdouble.
        return float(data)
    return data

In [18]:
dataset = 'student'  # Change to 'adult', 'student', 'adult_small', or 'student_small'

# One entry per supported dataset: (data file, DAG file, loader function).
dataset_sources = {
    'adult': ('data/adult_cleaned.csv', 'adult_true_confounds.json', load_and_prepare_adult_data),
    'student': ('data/student-por_raw.csv', 'student_true_confounds.json', load_and_prepare_student_data),
    'adult_small': ('data/smaller_adult_dataset.csv', 'small_adult_true_confounds.json', load_and_prepare_adult_data),
    'student_small': ('data/smaller_student_dataset.csv', 'small_student_true_confounds.json', load_and_prepare_student_data),
}

# Fail fast on a mistyped dataset name instead of hitting a NameError on
# df_encoded / labels further down.
if dataset not in dataset_sources:
    raise ValueError(f"Unknown dataset specified: {dataset!r}")

# Load and prepare data based on the dataset selection
data_file, dag_file, load_and_prepare = dataset_sources[dataset]
df_encoded, labels, data = load_and_prepare(data_file)

# NOTE(review): `confounds` and `prognostics` are read below but are not
# assigned anywhere in the visible notebook, so on a fresh kernel the next
# statements raise NameError. They presumably come from the DAG stored in
# `dag_file` (`load_dag` is imported but never called) -- confirm and load them
# explicitly here. Re-running this cell also re-applies the mapping to
# already-mapped names.
if dataset in ('student', 'student_small'):
    # Run the confound/prognostic lists through the student variable mapping.
    confounds = apply_variable_mapping(confounds, student_variable_mapping)
    prognostics = apply_variable_mapping(prognostics, student_variable_mapping)

    # Ensure boolean columns are explicitly converted to integers
    boolean_columns = ['internet_yes', 'higher_yes', 'famsup_yes', 'paid_yes']
    for col in boolean_columns:
        if col in df_encoded.columns:
            df_encoded[col] = df_encoded[col].astype(int)

# Filter confounds and prognostics to include only those present in the dataset
confounds = [var for var in confounds if var in labels]
prognostics = [var for var in prognostics if var in labels]

if not confounds:
    print("Warning: No confounds are present in the dataset. Proceed with caution.")
if not prognostics:
    print("Warning: No prognostics are present in the dataset. Proceed with caution.")

print("Filtered Confounds: ", confounds)
print("Filtered Prognostics: ", prognostics)

Filtered Confounds:  ['Medu', 'health', 'internet_yes', 'failures', 'famsup_yes']
Filtered Prognostics:  ['higher_yes', 'paid_yes', 'schoolsup', 'studytime', 'Pstatus']


In [19]:
# Pick the treatment and outcome columns for the selected dataset family.
if dataset in ('adult', 'adult_small'):
    treatment = 'hours.per.week'
    outcome = 'income'
elif dataset in ('student', 'student_small'):
    treatment = 'absences'
    outcome = 'G_avg'
else:
    raise ValueError("Unknown dataset specified")

# NOTE(review): the recorded run of this cell failed inside causallib with
# "IndexingError: Unalignable boolean Series provided as indexer", i.e. the
# covariate frame and the treatment Series did not share a row index. A
# non-contiguous index on df_encoded (presumably left behind by row filtering
# in the loader -- confirm) is the usual trigger, so matching is given a copy
# with a fresh 0..n-1 index. df_encoded itself is left untouched.
matching_input = df_encoded.reset_index(drop=True)

# Perform matching
print(f"Before matching - df_encoded shape: {df_encoded.shape}")
adjustedCohort, unadjustedCohort = perform_matching(matching_input, confounds, prognostics, treatment, outcome)
print(f"After matching - adjustedCohort length: {len(adjustedCohort)}, unadjustedCohort length: {len(unadjustedCohort)}")

Before matching - df_encoded shape: (410, 13)
Covariates:  ['Medu', 'health', 'internet_yes', 'failures', 'famsup_yes', 'higher_yes', 'paid_yes', 'schoolsup', 'studytime', 'Pstatus']
X shape:  (410, 10)
X types:  Medu            float64
health            int64
internet_yes       bool
failures          int64
famsup_yes         bool
higher_yes         bool
paid_yes           bool
schoolsup         int64
studytime         int64
Pstatus           int64
dtype: object
a shape:  (410,)
a types:  int32
y shape:  (410,)
y types:  float64


  X_at_a = X[self.treatments_ == a].copy()


IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match).

In [15]:
# Replace numpy scalars with native Python values so json can encode the cohorts.
unadjustedCohort = convert_types(unadjustedCohort)
adjustedCohort = convert_types(adjustedCohort)

# Output files are named after the DAG file, without its directory or extension.
base_name = os.path.splitext(os.path.basename(dag_file))[0]
adjusted_file_name = 'adjustedCohort_' + base_name + '.json'
unadjusted_file_name = 'unadjustedCohort_' + base_name + '.json'

# Write both cohorts to disk for the visualization components.
with open(adjusted_file_name, 'w') as f:
    f.write(json.dumps(adjustedCohort, indent=4))
with open(unadjusted_file_name, 'w') as f:
    f.write(json.dumps(unadjustedCohort, indent=4))

In [16]:
# Build a CohortEvaluator component from the unadjusted cohort and render it
# as this cell's output.
cohort_evaluator = CohortEvaluator(unadjustedCohort=unadjustedCohort)
display(cohort_evaluator)

CohortEvaluator(component='CohortEvaluator', props={'unadjustedCohort': [{'race': 1, 'age': 82, 'native.countr…

In [17]:
# Covariates: the confound and prognostic columns of the encoded frame.
X = df_encoded[confounds + prognostics]
# Binarise the treatment: 1 when at or above the column median, 0 otherwise.
a = (df_encoded[treatment] >= df_encoded[treatment].median()).astype(int)
y = df_encoded[outcome]

# Fit causallib's IPW estimator with a logistic-regression learner.
lr = LogisticRegression(solver='lbfgs', max_iter=1000)
ipw = IPW(lr)
ipw.fit(X, a)

# Calculate the ATE
outcomes = ipw.estimate_population_outcome(X, a, y)
ate = outcomes[1] - outcomes[0]
print(f"Average Treatment Effect (ATE): {ate}")

# The cohorts are already converted and written to disk by the earlier
# save-cohorts cell; that step is unrelated to the ATE estimate and is not
# repeated here.

Average Treatment Effect (ATE): 0.12156228163190821
