In [5]:
import os
import json
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from causallib.estimation import IPW
from causalvis import CohortEvaluator, TreatmentEffectExplorer
from data_preparation import load_and_prepare_student_data, load_and_prepare_adult_data, apply_variable_mapping, student_variable_mapping
from matching import perform_matching
from dag_utils import load_dag

In [6]:
def convert_types(data):
    """Recursively convert numpy values to native Python types for JSON serialization.

    Containers (dict, list, tuple) are walked recursively and rebuilt; numpy
    arrays become (nested) lists; numpy scalars become the matching Python
    ``bool``/``int``/``float``. Anything else is returned unchanged.

    Args:
        data: Arbitrarily nested structure of dicts, lists, tuples, numpy
            arrays, numpy scalars and plain Python values.

    Returns:
        A structure of the same shape in which every numpy boolean, integer
        or floating scalar (of any width) has been replaced by a native
        Python value. Dict keys are converted as well, because ``json.dump``
        rejects numpy integer keys. Tuple subclasses come back as plain tuples.
    """
    if isinstance(data, dict):
        # Keys are converted too: a np.int64 key is not a valid JSON key type.
        return {convert_types(k): convert_types(v) for k, v in data.items()}
    if isinstance(data, list):
        return [convert_types(i) for i in data]
    if isinstance(data, tuple):
        return tuple(convert_types(i) for i in data)
    if isinstance(data, np.ndarray):
        # tolist() already yields native values for numeric dtypes; recurse so
        # that object arrays holding numpy scalars are handled as well.
        return convert_types(data.tolist())
    if isinstance(data, np.bool_):
        return bool(data)
    if isinstance(data, np.integer):
        # Covers every signed/unsigned width, not only int32/int64.
        return int(data)
    if isinstance(data, np.floating):
        # Covers float16/float32/float64/longdouble.
        return float(data)
    return data

In [7]:
dataset = 'adult'  # Change to 'student' for the student dataset

# Fail fast on an unrecognised name. Without this check any value other than
# 'adult' would load the student data, while the variable mapping further down
# is applied only for the exact value 'student'.
if dataset not in ('adult', 'student'):
    raise ValueError(f"Unknown dataset {dataset!r}; expected 'adult' or 'student'")

data_file = 'data/adult_cleaned.csv' if dataset == 'adult' else 'data/student-por_raw.csv'

# Load and prepare data
if dataset == 'adult':
    df_encoded, labels, data = load_and_prepare_adult_data(data_file)
else:
    df_encoded, labels, data = load_and_prepare_student_data(data_file)

# Load DAG and map variables
dag_file = 'adult_true_confounders.json' if dataset == 'adult' else 'student_true_confounders.json'
G, confounds, prognostics = load_dag(dag_file)

# Only the student dataset has its DAG variable names passed through
# student_variable_mapping.
if dataset == 'student':
    confounds = apply_variable_mapping(confounds, student_variable_mapping)
    prognostics = apply_variable_mapping(prognostics, student_variable_mapping)

print("Confounds: ", confounds)
print("Prognostics: ", prognostics)

Confounds:  ['race', 'age', 'sex', 'native.country', 'marital.status', 'education']
Prognostics:  ['occupation', 'workclass', 'relationship']


In [8]:
# Pick the treatment and outcome columns for the selected dataset.
if dataset == 'adult':
    treatment, outcome = 'hours.per.week', 'income'
else:
    treatment, outcome = 'absences', 'G_avg'

# Perform matching, reporting sizes before and after.
print(f"Before matching - df_encoded shape: {df_encoded.shape}")
adjustedCohort, unadjustedCohort = perform_matching(
    df_encoded,
    confounds,
    prognostics,
    treatment,
    outcome,
)
print(f"After matching - adjustedCohort length: {len(adjustedCohort)}, unadjustedCohort length: {len(unadjustedCohort)}")

Before matching - df_encoded shape: (30162, 11)
Covariates:  ['race', 'age', 'sex', 'native.country', 'marital.status', 'education', 'occupation', 'workclass', 'relationship']
X shape:  (30162, 9)
X types:  race              object
age                int64
sex               object
native.country    object
marital.status    object
education         object
occupation        object
workclass         object
relationship      object
dtype: object
a shape:  (30162,)
a types:  int32
y shape:  (30162,)
y types:  object
After matching - adjustedCohort length: 29846, unadjustedCohort length: 30162


In [9]:
# Make both cohorts JSON-serializable (numpy scalars -> native Python values).
adjustedCohort = convert_types(adjustedCohort)
unadjustedCohort = convert_types(unadjustedCohort)

# Save cohorts for visualization; file names are derived from the DAG file's stem.
base_name = os.path.splitext(os.path.basename(dag_file))[0]
adjusted_file_name = f'adjustedCohort_{base_name}.json'
unadjusted_file_name = f'unadjustedCohort_{base_name}.json'

for out_path, cohort in (
    (adjusted_file_name, adjustedCohort),
    (unadjusted_file_name, unadjustedCohort),
):
    with open(out_path, 'w') as f:
        json.dump(cohort, f, indent=4)

In [10]:
# Build the causalvis cohort evaluator from the unadjusted cohort and render it.
cohort_evaluator = CohortEvaluator(
    unadjustedCohort=unadjustedCohort,
)
display(cohort_evaluator)

CohortEvaluator(component='CohortEvaluator', props={'unadjustedCohort': [{'race': 1, 'age': 82, 'sex': 2, 'nat…

In [13]:
# Covariates: confounders followed by prognostic variables.
X = df_encoded[confounds + prognostics]
# Binarize the treatment: 1 when the value is at or above the median, else 0.
a = (df_encoded[treatment] >= df_encoded[treatment].median()).astype(int)
y = df_encoded[outcome]

# Inverse probability weighting with a logistic-regression learner.
lr = LogisticRegression(solver='lbfgs', max_iter=1000)
ipw = IPW(lr)
ipw.fit(X, a)

# Calculate the ATE as the difference between the estimated population
# outcomes for treatment value 1 and treatment value 0.
outcomes = ipw.estimate_population_outcome(X, a, y)
ate = outcomes[1] - outcomes[0]
print(f"Average Treatment Effect (ATE): {ate}")

# NOTE: cohort conversion and JSON export are already done in the earlier
# "Save cohorts for visualization" cell, and nothing in this cell modifies the
# cohorts, so that copy-pasted block is not repeated here.

Average Treatment Effect (ATE): 0.11107210131884968
