In [1]:
import os
import json
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from causallib.estimation import IPW
from causalvis import CohortEvaluator, TreatmentEffectExplorer
from data_preparation import apply_variable_mapping, student_variable_mapping
from matching import perform_matching
from dag_utils import load_dag

In [2]:
def convert_types(data):
    """Recursively replace NumPy values with built-in Python equivalents.

    Dicts and lists are walked recursively (dict keys are left as-is).
    NumPy integer, floating and boolean scalars of any width become
    ``int``, ``float`` and ``bool``; NumPy arrays become (nested) lists.
    Anything else is returned unchanged.

    Parameters
    ----------
    data : Any
        A value or an arbitrarily nested dict/list structure.

    Returns
    -------
    Any
        A structure of the same shape containing only built-in scalar types
        where NumPy scalars/arrays were found.
    """
    if isinstance(data, dict):
        return {k: convert_types(v) for k, v in data.items()}
    if isinstance(data, list):
        return [convert_types(i) for i in data]
    if isinstance(data, np.ndarray):
        # tolist() yields nested lists; recurse so object arrays holding
        # NumPy scalars or containers are converted as well.
        return convert_types(data.tolist())
    if isinstance(data, np.bool_):
        return bool(data)
    # Use the abstract scalar families so every width (int8..int64,
    # uint8..uint64, float16..float64) is covered, not just 32/64-bit.
    if isinstance(data, np.integer):
        return int(data)
    if isinstance(data, np.floating):
        return float(data)
    return data

In [3]:
# Selects which dataset the later cells load and analyse; those cells branch on this value.
dataset = 'adult_small'  # 'adult', 'student', 'adult_small', or 'student_small'

In [4]:
# Map each supported dataset name to its (processed CSV, DAG JSON) pair.
# For 'adult_small', 'small_adult_true_confounds.json' is an alternative DAG file.
dataset_files = {
    'adult': ('data/processed_adult.csv', 'adult_true_confounds.json'),
    'student': ('data/processed_student.csv', 'student_true_confounds.json'),
    'adult_small': ('data/processed_adult_small.csv', 'small_adult_lingam_bk.json'),
    'student_small': ('data/processed_student_small.csv', 'small_student_true_confounds.json'),
}

# Fail fast on an unsupported name; otherwise data_file, dag_file and
# df_encoded would be left undefined (or stale from an earlier run).
if dataset not in dataset_files:
    raise ValueError(
        f"Unknown dataset {dataset!r}; expected one of {sorted(dataset_files)}"
    )

data_file, dag_file = dataset_files[dataset]
df_encoded = pd.read_csv(data_file)

# Column names and raw values of the encoded frame
labels = df_encoded.columns.to_list()
data = df_encoded.to_numpy()

# Read the DAG together with its confound and prognostic variable lists
G, confounds, prognostics = load_dag(dag_file)

# The student datasets need their variable names mapped first
if dataset in ('student', 'student_small'):
    confounds = apply_variable_mapping(confounds, student_variable_mapping)
    prognostics = apply_variable_mapping(prognostics, student_variable_mapping)

# Keep only the variables that exist as columns in the dataset
confounds = [name for name in confounds if name in labels]
prognostics = [name for name in prognostics if name in labels]

# Report when either list ended up empty
if len(confounds) == 0:
    print("No confounds are present in the dataset.")
if len(prognostics) == 0:
    print("No prognostics are present in the dataset")

print("Confounds: ", confounds)
print("Prognostics: ", prognostics)

No prognostics are present in the dataset
Confounds:  ['education', 'occupation', 'age', 'native.country']
Prognostics:  []


In [5]:
# Choose the treatment and outcome columns for the selected dataset
if dataset in ('adult', 'adult_small'):
    treatment, outcome = 'hours.per.week', 'income'
elif dataset in ('student', 'student_small'):
    treatment, outcome = 'absences', 'G_avg'
else:
    raise ValueError("Unknown dataset specified")

# Run matching and report the sizes before and after
print(f"Before matching - df_encoded shape: {df_encoded.shape}")
adjustedCohort, unadjustedCohort = perform_matching(
    df_encoded,
    confounds,
    prognostics,
    treatment,
    outcome,
)
print(f"After matching - adjustedCohort length: {len(adjustedCohort)}, unadjustedCohort length: {len(unadjustedCohort)}")

Before matching - df_encoded shape: (30162, 7)
Covariates:  ['education', 'occupation', 'age', 'native.country']
X shape:  (30162, 4)
X types:  education         int64
occupation        int64
age               int64
native.country    int64
dtype: object
a shape:  (30162,)
a types:  int32
y shape:  (30162,)
y types:  int64
After matching - adjustedCohort length: 30162, unadjustedCohort length: 30162


In [6]:
# Pass both cohorts through convert_types before handing them to the visualisation
adjustedCohort, unadjustedCohort = (
    convert_types(cohort) for cohort in (adjustedCohort, unadjustedCohort)
)

In [7]:
# Build the CohortEvaluator component from the unadjusted cohort and render it in the notebook.
# NOTE(review): adjustedCohort is computed and converted earlier but is not passed here — confirm this is intended.
cohort_evaluator = CohortEvaluator(unadjustedCohort=unadjustedCohort)
display(cohort_evaluator)

CohortEvaluator(component='CohortEvaluator', props={'unadjustedCohort': [{'education': 0.17476277261377215, 'o…

In [8]:
# Covariates: confounds followed by prognostics
X = df_encoded[confounds + prognostics]
# Outcome column
y = df_encoded[outcome]
# Binary treatment indicator: 1 where the treatment value is at or above its median, else 0
treatment_values = df_encoded[treatment]
a = (treatment_values >= treatment_values.median()).astype(int)

# Fit an IPW model with a logistic-regression learner on covariates and treatment
lr = LogisticRegression(max_iter=1000, solver='lbfgs')
ipw = IPW(lr)
ipw.fit(X, a)

# ATE = estimated population outcome under treatment 1 minus that under treatment 0
outcomes = ipw.estimate_population_outcome(X, a, y)
ate = outcomes[1] - outcomes[0]
print(f"Average Treatment Effect (ATE): {ate}")

Average Treatment Effect (ATE): 0.17281855446118058
