# Import demo dataframe

Since the *All of Us* data is restricted, we have provided a demo dataframe containing fake data so that the modeling code can be tested.

We now import the relevant dataframe and rename it `birth`

In [None]:
!pip install import-ipynb

In [None]:
# import_ipynb is imported only for its side effect; presumably it lets the
# notebook `rawdemodataframe` be imported like a regular module — TODO confirm
import import_ipynb
print("Thanks for your patience while I import the dataframe.")
# Star-import: the next cell reads `df` without defining it, so it is expected
# to come from here. NOTE(review): `preprocessor`, used by the modeling cells,
# is presumably defined in that notebook as well — verify.
from rawdemodataframe import *
print("All done.")

In [None]:
# Bind the demo dataframe to the name used by the rest of the notebook.
# This is a second reference to the same object, not a copy.
birth = df

# Modeling

### Importing packages

In [None]:
## For data handling
import pandas as pd
import numpy as np

## For plotting
import matplotlib.pyplot as plt
import seaborn as sns

## This sets the plot style
## to have a grid on a white background
sns.set_style("whitegrid")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import recall_score, f1_score, precision_recall_curve, auc
from sklearn.metrics import confusion_matrix, classification_report
#from sklearn.metrics import precision_score
#from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC



### Import Demographics Dataset from Data_Cleaning_Demographics_FINAL

- final dataframe is 'birth'
- response variable is 'birth_class_binary'
- 'preprocessor' is pipeline for one-hot encoding race and ethnicity and scaling numerical features  

### Train-test splits

- Stratified splits
The preterm birth rate in the United States is about 10%. Our data reflects this (although slightly higher), so we will be using stratified splits to ensure that our training and test sets both contain preterm births. 
  
    
- Multiple observations
We are using multiple observations per person. To avoid using future data to predict the past, the *last* birth for each person will be reserved for the test set only. The training set may contain all births up to the last birth, including the first birth if it is the only birth. This is not a perfect solution to avoid data leakage, but it is the first approach we are trying. (*Note: this did not end up being an issue here because of data quality.*)  

### Mark births that are the last birth of multiples

In [None]:
import pandas as pd

# Assuming 'birth' DataFrame has columns 'person_id' and 'birth_order'

# Flag, for every person with more than one recorded birth, the row holding
# their last birth. Expects `birth` to have 'person_id', 'birth_order' and
# 'birth_class' columns.

# Order births chronologically within each person so "last" is well defined
birth = birth.sort_values(by=['person_id', 'birth_order'])

# True for every row whose person_id occurs more than once
has_multiple_births = birth.duplicated(subset='person_id', keep=False)
# True for the final row of each person_id (also True for single-birth persons)
is_final_row_for_person = ~birth.duplicated(subset='person_id', keep='last')
is_last_birth_of_multiples = has_multiple_births & is_final_row_for_person

# Rows that are the last birth of a person with several births
last_births = birth[is_last_birth_of_multiples]

# Build the 0/1 marker straight from the boolean mask. Assigning through
# `.loc[last_births.index]` would also mark unrelated rows whenever the
# dataframe index contains duplicate labels.
birth['last_birth_of_multiples'] = is_last_birth_of_multiples.astype('int64')

# Display the resulting DataFrame
print(birth[['person_id', 'birth_order', 'last_birth_of_multiples']])
print("Number of last births of multiples", birth['last_birth_of_multiples'].sum())

# Check the distribution of term and preterm in this set
filtered_mutliples_df = birth[birth['last_birth_of_multiples'] == 1]
birth_class_percentage = filtered_mutliples_df['birth_class'].value_counts(normalize=True) * 100
print(birth_class_percentage)


In [None]:
# Build the train/test split:
#   * every "last birth of multiples" row goes to the test set only;
#   * the remaining rows are split 80/20, stratified on 'birth_class';
#   * the 20% part is combined with the reserved last births to form the test set.

# Reserved rows: last births of persons with several births (test set only)
birth_test_data = birth[birth['last_birth_of_multiples'] == 1].reset_index(drop=True)

# Everything else is eligible for the stratified split
birth_remaining = birth[birth['last_birth_of_multiples'] == 0].reset_index(drop=True)

# Stratify the remaining births for inclusion in the training set
birth_train_data, birth_test_remaining = train_test_split(birth_remaining,
                                                          test_size=0.2,
                                                          shuffle=True,
                                                          random_state=404,
                                                          stratify=birth_remaining['birth_class'])

# Stack the reserved last births below the stratified test rows.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the supported
# equivalent and keeps the same row order.
birth_test_data = pd.concat([birth_test_remaining, birth_test_data])

# Reset the index for the resulting DataFrames
birth_train_data = birth_train_data.reset_index(drop=True)
birth_test_data = birth_test_data.reset_index(drop=True)


   #### Training and Test Data Set names:
   Test Data Set: birth_test_data  
   Training Data Set: birth_train_data 

#### View dataframes to check that multiples births loop worked as expected

In [None]:
# Sanity check: list the distinct birth_order values present in each split
train_birth_orders = birth_train_data['birth_order'].unique()
test_birth_orders = birth_test_data['birth_order'].unique()

print("Birth Train Data:")
print("birth train unique vals", train_birth_orders)

print("\nBirth Test Data:")
print("birth test unique vals", test_birth_orders)

#### Verifying train-test splits by inspecting percentage of preterm

In [None]:
# Report the class counts and the preterm share for the training split first,
# then for the test split. After the loop `birth_class_counts` and
# `preterm_percentage` hold the test-split values.
for split_label, split_frame in (("Training", birth_train_data), ("Test", birth_test_data)):
    # Number of rows per birth class in this split
    birth_class_counts = split_frame['birth_class'].value_counts()

    print(f"{split_label} Data: counts of each birth class:")
    print(birth_class_counts)

    # Share of rows labelled 'Preterm', as a percentage
    preterm_percentage = (birth_class_counts['Preterm'] / birth_class_counts.sum()) * 100

    print(f"{split_label} Data: percentage of preterm births: {preterm_percentage:.2f}%")


In [None]:
# Relative frequency of each birth class in the training split
normalized_value_counts = birth_train_data.birth_class.value_counts(normalize=True)

# Header line followed by the frequencies
print("Normalized value counts:", normalized_value_counts, sep="\n")


## Baseline Model

Based on our normalized value counts, our baseline model is a random coin flip with probability matching the likelihood of our preterm births (0.145).

In [None]:
import numpy as np
from sklearn.metrics import recall_score, precision_recall_curve, auc, f1_score

# Coin-flip baseline: 1000 times, predict "preterm" for each training row with
# probability 0.145 and score the random predictions against the true labels.

# Per-draw scores
baseline_recalls = []
baseline_f1_scores = []
baseline_pr_aucs = []

# Set the seed for reproducibility
np.random.seed(404)

# True labels are the same for every draw, so look them up once
train_labels = birth_train_data.birth_class_binary.values
n_train_rows = len(birth_train_data)

for obs in range(1000):
    # One Bernoulli(0.145) prediction per training row
    draw = np.random.binomial(n=1, p=0.145, size=n_train_rows)

    # Recall of the random predictions
    baseline_recalls.append(recall_score(train_labels, draw))

    # Area under the precision-recall curve of the random predictions
    curve_precision, curve_recall, _ = precision_recall_curve(train_labels, draw)
    baseline_pr_aucs.append(auc(curve_recall, curve_precision))

    # F1 of the random predictions
    baseline_f1_scores.append(f1_score(train_labels, draw))

# Summary statistics over the 1000 draws (means first, then medians)
print("Here are some statistics for the recall score of our baseline:")
for stat_label, stat_func in (("Mean", np.mean), ("Median", np.median)):
    print(stat_label + " Recall - " + str(round(stat_func(baseline_recalls), 3)))
    print(stat_label + " F1 Score - " + str(round(stat_func(baseline_f1_scores), 3)))
    print(stat_label + " PR AUC - " + str(round(stat_func(baseline_pr_aucs), 3)))


## Training: Setting up for models

We will use stratified 10-fold cross validation to account for the smaller percentage of pre-term births

#### Kfold Splits

In [None]:
# Number of cross-validation folds shared by all model cells
kfold_splits = 10

# Shuffled, stratified splitter with a fixed seed so folds are reproducible
kfold = StratifiedKFold(n_splits=kfold_splits, shuffle=True, random_state=123)

#### Model Features

In [None]:
# Columns passed to the models, in the order they are selected from the dataframe
model_feat = [
    # zip-code level features
    'assisted_income_zip',
    'high_school_education_zip',
    'median_income_zip',
    'no_health_insurance_zip',
    'poverty_zip',
    'vacant_housing_zip',
    'deprivation_index_zip',
    # person / birth level features
    'age_at_birth',
    'race_person',
    'ethnicity_person',
    'birth_order',
]

## AdaBoost model - decision tree classifier with 50 weak learners

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import recall_score, f1_score, precision_recall_curve, auc
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import numpy as np
import pandas as pd


# Cross-validate AdaBoost (default base learner, 50 estimators) on the training
# split, recording recall, F1, PR-AUC and the feature importances of every fold.

# One score slot per fold
ab_recalls = np.zeros(kfold_splits)
ab_f1 = np.zeros(kfold_splits)
ab_pr_auc = np.zeros(kfold_splits)

# feature name -> list of importances (one entry per fold)
feature_importance_dict = {}

cv_folds = kfold.split(birth_train_data, birth_train_data.birth_class_binary)

for counter, (train_index, test_index) in enumerate(cv_folds):
    birth_tt = birth_train_data.iloc[train_index]
    birth_ho = birth_train_data.iloc[test_index]

    # Preprocessing followed by the boosted classifier
    adaboost_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('adaboost', AdaBoostClassifier(n_estimators=50, random_state=404)),
    ])
    adaboost_pipeline.fit(birth_tt[model_feat], birth_tt.birth_class_binary)

    # Pair each post-preprocessing column name with its importance in this fold
    fitted_steps = adaboost_pipeline.named_steps
    feature_importances = fitted_steps['adaboost'].feature_importances_
    preprocessed_columns = fitted_steps['preprocessor'].get_feature_names_out(input_features=model_feat)
    for feature, importance in zip(preprocessed_columns, feature_importances):
        feature_importance_dict.setdefault(feature, []).append(importance)

    # Hard predictions and positive-class probabilities on the holdout fold
    holdout_features = birth_ho[model_feat]
    holdout_labels = birth_ho.birth_class_binary
    ab_pred = adaboost_pipeline.predict(holdout_features)
    holdout_proba = adaboost_pipeline.predict_proba(holdout_features)[:, 1]

    ab_recalls[counter] = recall_score(holdout_labels, ab_pred)
    ab_f1[counter] = f1_score(holdout_labels, ab_pred)

    # Area under the precision-recall curve
    ab_precision, ab_recall, _ = precision_recall_curve(holdout_labels, holdout_proba)
    ab_pr_auc[counter] = auc(ab_recall, ab_precision)

# Average each feature's importance over the folds
mean_feature_importances = {}
for feature, importances in feature_importance_dict.items():
    mean_feature_importances[feature] = np.mean(importances)

for feature, mean_importance in mean_feature_importances.items():
    print(f"Mean Feature Importance: {feature}, Mean Importance: {mean_importance.round(3)}")

print("Mean recall:", np.mean(ab_recalls).round(3))
print("Mean F1:", np.mean(ab_f1).round(3))
print("Mean PR-AUC", np.mean(ab_pr_auc).round(3))


### Features to keep

We looked at feature importance from our AdaBoost model to inform feature selection for future models. The highest numbers are listed below. Ultimately, this did not improve model performance. 

**Mean Feature Importance: cat__race_person_Asian, Mean Importance: 0.006
Mean Feature Importance: cat__race_person_Black or African American, Mean Importance: 0.033999999999999996
Mean Feature Importance: cat__race_person_Middle Eastern or North African, Mean Importance: 0.019999999999999997
Mean Feature Importance: cat__race_person_More than one population, Mean Importance: 0.019999999999999997
Mean Feature Importance: cat__race_person_Native Hawaiian or Other Pacific Islander, Mean Importance: 0.002
Mean Feature Importance: cat__race_person_None of these, Mean Importance: 0.0
Mean Feature Importance: cat__race_person_White, Mean Importance: 0.0
Mean Feature Importance: cat__race_person_no answer, Mean Importance: 0.0**

Mean Feature Importance: cat__ethnicity_person_Hispanic or Latino, Mean Importance: 0.0
Mean Feature Importance: cat__ethnicity_person_None of these, Mean Importance: 0.0
Mean Feature Importance: cat__ethnicity_person_Not Hispanic or Latino, Mean Importance: 0.027999999999999997
Mean Feature Importance: cat__ethnicity_person_no answer, Mean Importance: 0.014000000000000002

Mean Feature Importance: cat__birth_order_1, Mean Importance: 0.008
Mean Feature Importance: cat__birth_order_2, Mean Importance: 0.0
Mean Feature Importance: cat__birth_order_3, Mean Importance: 0.014000000000000002
Mean Feature Importance: cat__birth_order_4, Mean Importance: 0.0

Mean Feature Importance: num__assisted_income_zip, Mean Importance: 0.054
**Mean Feature Importance: num__high_school_education_zip, Mean Importance: 0.120**
Mean Feature Importance: num__median_income_zip, Mean Importance: 0.062
**Mean Feature Importance: num__no_health_insurance_zip, Mean Importance: 0.138**
Mean Feature Importance: num__poverty_zip, Mean Importance: 0.102
**Mean Feature Importance: num__vacant_housing_zip, Mean Importance: 0.144**
Mean Feature Importance: num__deprivation_index_zip, Mean Importance: 0.060
**Mean Feature Importance: num__age_at_birth, Mean Importance: 0.174**

#### Update model features based on feature importance 


In [None]:
#model_feat = ['high_school_education_zip', 'no_health_insurance_zip', 'vacant_housing_zip', 'age_at_birth']


#### Update preprocessor to transform a smaller number of columns

Updated pipeline for using a subset of features based on feature selection. Commented out as the subset of features performed worse than the full set of features. 

In [None]:
#numeric_columns = ['high_school_education_zip', 'no_health_insurance_zip', 'vacant_housing_zip', 'age_at_birth']


#preprocessor = ColumnTransformer(
#    transformers=[
#          ('num', StandardScaler(), numeric_columns)
#    ],
#    remainder='passthrough'  # Keeps the non-categorical columns as they are
#)



## AdaBoost with Log Regression, 50 weak learners

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import recall_score, f1_score, precision_recall_curve, auc
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import numpy as np
import pandas as pd


# Cross-validate AdaBoost with LogisticRegression as the base learner
# (50 estimators) on the training split and report mean recall, F1 and PR-AUC.

# Initialize arrays to store evaluation metrics (one slot per fold)
ab_recalls = np.zeros(kfold_splits)
ab_f1 = np.zeros(kfold_splits)
ab_pr_auc = np.zeros(kfold_splits)

cv_folds = kfold.split(birth_train_data, birth_train_data['birth_class_binary'])

for counter, (train_index, test_index) in enumerate(cv_folds):
    birth_tt = birth_train_data.iloc[train_index]
    birth_ho = birth_train_data.iloc[test_index]

    # The base learner is passed positionally: the keyword was renamed from
    # `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
    # later removed, but it is the first positional parameter in both cases.
    adaboost_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('adaboost', AdaBoostClassifier(LogisticRegression(), n_estimators=50, random_state=404))
    ])

    # Fit the pipeline on the training part of the fold
    adaboost_pipeline.fit(birth_tt[model_feat], birth_tt['birth_class_binary'])

    # Predict on the holdout part of the fold
    ab_pred = adaboost_pipeline.predict(birth_ho[model_feat])

    # Calculate evaluation metrics
    ab_recalls[counter] = recall_score(birth_ho['birth_class_binary'], ab_pred)
    ab_f1[counter] = f1_score(birth_ho['birth_class_binary'], ab_pred)

    # Precision-recall curve from the positive-class probabilities, then its AUC
    ab_precision, ab_recall, _ = precision_recall_curve(birth_ho['birth_class_binary'], adaboost_pipeline.predict_proba(birth_ho[model_feat])[:, 1])
    ab_pr_auc[counter] = auc(ab_recall, ab_precision)

print("Mean recall:", np.mean(ab_recalls).round(3))
print("Mean F1:", np.mean(ab_f1).round(3))
print("Mean PR-AUC:", np.mean(ab_pr_auc).round(3))


## SVC model with class weights and loop to print values for multiple class weights

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import recall_score, f1_score, precision_recall_curve, auc
import numpy as np

# For each candidate preterm weight, cross-validate an SVC trained with
# per-row sample weights and print the mean recall, F1 and PR-AUC.
# Names left bound after the loops (birth_tt, birth_ho, svc_pipeline, svc_pred,
# svc_predictions) refer to the final fold of the final weight; later cells use them.
weights = [5.75]  # add numbers to explore more weights

for weight in weights:

    # Preterm birth (1) gets the heavier weight; the other class (0) weighs 1
    class_weights = {0: 1, 1: weight}

    # One score slot per fold
    svc_recalls = np.zeros(kfold_splits)
    svc_f1 = np.zeros(kfold_splits)
    svc_pr_auc = np.zeros(kfold_splits)

    # Holdout predictions, one array per fold, in fold order
    svc_predictions = []

    cv_folds = kfold.split(birth_train_data, birth_train_data.birth_class_binary)

    for counter, (train_index, test_index) in enumerate(cv_folds):
        # Copy the training part so a weight column can be added to it
        birth_tt = birth_train_data.iloc[train_index].copy()
        birth_ho = birth_train_data.iloc[test_index]
        birth_tt['class_weights'] = birth_tt['birth_class_binary'].map(class_weights)

        # Preprocessing followed by the SVC
        svc_pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('svc', SVC(probability=True, random_state=404)),
        ])

        print("Now fitting the model. Please be patient.")
        # The `svc__` prefix routes the sample weights to the SVC step
        svc_pipeline.fit(
            birth_tt[model_feat],
            birth_tt['birth_class_binary'],
            svc__sample_weight=birth_tt['class_weights'],
        )

        # Hard predictions on the holdout part, kept for the fairness cells
        holdout_features = birth_ho[model_feat]
        svc_pred = svc_pipeline.predict(holdout_features)
        svc_predictions.append(svc_pred)

        holdout_labels = birth_ho.birth_class_binary.values
        svc_recalls[counter] = recall_score(holdout_labels, svc_pred)
        svc_f1[counter] = f1_score(holdout_labels, svc_pred)

        # Area under the precision-recall curve from positive-class probabilities
        holdout_proba = svc_pipeline.predict_proba(holdout_features)[:, 1]
        svc_precision, svc_recall, _ = precision_recall_curve(birth_ho.birth_class_binary, holdout_proba)
        svc_pr_auc[counter] = auc(svc_recall, svc_precision)

    # Per-weight summary
    print("weight:", weight)
    print("Mean recall", np.mean(svc_recalls).round(3))
    print("Mean F1", np.mean(svc_f1).round(3))
    print("Mean PR-AUC", np.mean(svc_pr_auc).round(3))


##### I tried to maximize the PR-AUC, which is still performing below baseline. I'm not able to improve this model further with class weights alone. Basically the model is just over-predicting preterm births. These are some scores for the feature set and class weights listed below:

model_feat = ['assisted_income_zip', 'high_school_education_zip', 'median_income_zip',
              'no_health_insurance_zip', 'poverty_zip', 'vacant_housing_zip', 'deprivation_index_zip', 'age_at_birth']  
              
model_feat.extend(["race_person", "ethnicity_person"])  

model_feat.extend(["birth_order"])  

weight: 5  
Mean recall 0.291
Mean F1 0.237 
Mean PR-AUC 0.182

weight: 5.25  
Mean recall 0.320
Mean F1 0.241
Mean PR-AUC 0.180  

weight: 5.5  
Mean recall 0.355 
Mean F1 0.244
Mean PR-AUC 0.179  

**weight: 5.75  
Mean recall 0.434  
Mean F1 0.261  
Mean PR-AUC 0.181**  

weight: 6  
Mean recall 0.464  
Mean F1 0.250  
Mean PR-AUC 0.179  

weight: 6.25  
Mean recall 0.521  
Mean F1 0.253  
Mean PR-AUC 0.177  

weight: 6.5  
Mean recall 0.581  
Mean F1 0.255  
Mean PR-AUC 0.174  


In [None]:
# Confusion matrix and classification report for the SVC predictions on the
# holdout fold currently bound to `birth_ho` / `svc_pred`
holdout_labels = birth_ho['birth_class_binary']

svc_confusion = confusion_matrix(holdout_labels, svc_pred)
print("Confusion Matrix:\n", svc_confusion)

svc_report = classification_report(holdout_labels, svc_pred)
print("Classification Report:\n", svc_report)



## SVC with GridSearch CV

Remembered there is a better way to tune hyperparameters than the loop above....

Checked with:
param_grid = {
     'svc__C': [.1, 1, 10],  
    'svc__kernel': ['linear', 'poly', 'rbf' ],
    'svc__class_weight': [{0: 1, 1: weight} for weight in np.arange(5, 7, 0.25)]
}

param_grid below updated to best model in order to save computational time

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import average_precision_score


# Grid search over the SVC hyperparameters, scored by average precision.
# The grid is reduced to a single combination to save computation time.
param_grid = {
    'svc__C': [1],
    'svc__kernel': ['linear'],
    'svc__class_weight': [{0: 1, 1: 5.75}],
}

# Starting estimator; the grid overrides C, kernel and class_weight
base_svc = SVC(C=1.0, kernel='linear', class_weight=None, probability=True, random_state=404)
gridsearch_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('svc', base_svc),
])

# 5-fold search
grid_search = GridSearchCV(gridsearch_pipeline, param_grid, scoring='average_precision', cv=5)
print("Created the GridSearch SV object. Now fitting the model. Please be patient.")

# Fit on `birth_tt`, i.e. the training part of whichever CV fold was bound last
# by the preceding cells
search_features = birth_tt[model_feat]
search_labels = birth_tt['birth_class_binary']
grid_search.fit(search_features, search_labels)

print("Best Hyperparameters:", grid_search.best_params_)

# Pipeline with the best hyperparameter combination
best_model = grid_search.best_estimator_



### Getting best hyperparameters for SVC model (class weights, kernel, C)

In [None]:
# Read the selected hyperparameters off the SVC step of the best pipeline
best_svc = best_model.named_steps['svc']
best_kernel = best_svc.kernel
best_C = best_svc.C
best_class_weight = best_svc.class_weight

print("Best Kernel:", best_kernel)
print("Best C:", best_C)
print("Best Class Weight:", best_class_weight)

# Score the best model on the holdout fold currently bound to `birth_ho`
holdout_features = birth_ho[model_feat]
gridsearch_pred = best_model.predict(holdout_features)
gridsearch_scores = best_model.decision_function(holdout_features)

# Average precision computed from the decision-function scores
pr_auc = average_precision_score(birth_ho['birth_class_binary'], gridsearch_scores)
print("PR AUC of the Best Model:", pr_auc.round(3))

In [None]:
# Confusion matrix and classification report for the grid-search model's
# predictions on the holdout fold currently bound to `birth_ho`
holdout_labels = birth_ho['birth_class_binary']

gridsearch_confusion = confusion_matrix(holdout_labels, gridsearch_pred)
print("Confusion Matrix:\n", gridsearch_confusion)

gridsearch_report = classification_report(holdout_labels, gridsearch_pred)
print("Classification Report:\n", gridsearch_report)


# Testing: SVC model with final hyperparameters and pipeline

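- Chose 6.25; highest PR-AUC achieved (note: the code cell below currently sets the preterm class weight to 5.75, so re-run it to confirm that the scores listed here match the weight in use)  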

Recalls 0.5492957746478874  
F1 Scores 0.23330009970089732  
PR-AUC 0.15802294790524057  

In [None]:
# Final evaluation of the weighted SVC on the held-out test set.
# Before scoring, the model is refit on the FULL training split. Reusing
# `svc_pipeline` from the cross-validation cell would score a model fit on
# only the last fold's training subset.

# Class weights: preterm birth (1) weighs 5.75, the other class (0) weighs 1
class_weights = {0: 1, 1: 5.75}

# Kept from the original cell: adds the weight column to the test frame.
# It is not used for prediction (only `model_feat` columns are passed to the model).
birth_test_data = birth_test_data.copy()
birth_test_data['class_weights'] = birth_test_data['birth_class_binary'].map(class_weights)

# Refit the same pipeline specification on all training rows
final_sample_weights = birth_train_data['birth_class_binary'].map(class_weights)
svc_pipeline_final = Pipeline([
    ('preprocessor', preprocessor),
    ('svc', SVC(probability=True, random_state=404))
])
svc_pipeline_final.fit(
    birth_train_data[model_feat],
    birth_train_data['birth_class_binary'],
    svc__sample_weight=final_sample_weights,
)

# Predict on the test data
test_features = birth_test_data[model_feat]
test_labels = birth_test_data.birth_class_binary.values
svc_pred_final = svc_pipeline_final.predict(test_features)

# Calculate evaluation metrics
svc_recalls = recall_score(test_labels, svc_pred_final)
svc_f1 = f1_score(test_labels, svc_pred_final)

# Precision-recall curve from the positive-class probabilities, then its AUC
svc_precision, svc_recall, _ = precision_recall_curve(birth_test_data.birth_class_binary, svc_pipeline_final.predict_proba(test_features)[:, 1])
svc_pr_auc = auc(svc_recall, svc_precision)

print("Recalls", svc_recalls.round(3))
print("F1 Scores", svc_f1.round(3))
print("PR-AUC", svc_pr_auc.round(3))

## Fairness Metrics

### One-hot encoding for fairness metrics because I couldn't get the pipeline to work

#### Birth_encoded - entire birth dataframe for metrics on entire dataset

In [None]:
# One-hot encode race and ethnicity for the whole dataset; the dummy columns
# are named 'race_<value>' and 'ethnicity_<value>'
birth_encoded = pd.get_dummies(
    birth,
    columns=['race_person', 'ethnicity_person'],
    prefix={'race_person': 'race', 'ethnicity_person': 'ethnicity'},
    prefix_sep='_',
)

# Drop datetime and obj col as it causes errors and is not necessary
columns_to_drop = ['condition_start_date', 'birth_class']
birth_encoded = birth_encoded.drop(columns_to_drop, axis=1)

birth_encoded.head()

#### Birth_ho encoding (birth holdout set) 

In [None]:
# One-hot encode race and ethnicity for the holdout fold currently bound to
# `birth_ho`; dummy columns are named 'race_<value>' and 'ethnicity_<value>'
birth_ho_encoded = pd.get_dummies(
    birth_ho,
    columns=['race_person', 'ethnicity_person'],
    prefix={'race_person': 'race', 'ethnicity_person': 'ethnicity'},
    prefix_sep='_',
)

# Remove the date column and the string class label
columns_to_drop = ['condition_start_date', 'birth_class']
birth_ho_encoded = birth_ho_encoded.drop(columns_to_drop, axis=1)
birth_ho_encoded.head()

#### Birth_test_data (Test dataset)

In [None]:
# One-hot encode race and ethnicity for the test set; dummy columns are named
# 'race_<value>' and 'ethnicity_<value>'
birth_test_data_encoded = pd.get_dummies(
    birth_test_data,
    columns=['race_person', 'ethnicity_person'],
    prefix={'race_person': 'race', 'ethnicity_person': 'ethnicity'},
    prefix_sep='_',
)

# Remove the date column and the string class label
columns_to_drop = ['condition_start_date', 'birth_class']
birth_test_data_encoded = birth_test_data_encoded.drop(columns_to_drop, axis=1)
birth_test_data_encoded.head()

##### Calculate SPD and Equalized Odds on Training Data Holdout set using loop to access folds

In [None]:
import aif360
from aif360.metrics import BinaryLabelDatasetMetric, ClassificationMetric
from aif360.datasets import BinaryLabelDataset
from sklearn.metrics import accuracy_score
import numpy as np

# Fairness of the SVC predictions on the cross-validation holdout folds.
#
# For every fold, the holdout rows are re-derived from `kfold` (fixed
# random_state, so the splits repeat) and paired with that fold's predictions
# from `svc_predictions`. The original cell compared the labels of the LAST
# fold with themselves on every iteration, which makes the average odds
# difference 0 by construction and never looks at the predictions.
# NOTE(review): this assumes `svc_predictions` was produced by iterating
# `kfold.split(birth_train_data, birth_train_data.birth_class_binary)` in
# order, as the SVC cross-validation cell does — confirm if that cell changes.

total_folds = kfold.get_n_splits()
if len(svc_predictions) != total_folds:
    raise ValueError(
        f"Expected {total_folds} sets of fold predictions, got {len(svc_predictions)}"
    )

race_categories = ['race_Black or African American', 'race_None of these', 'race_no answer',
                   'race_More than one population', 'race_Asian', 'race_Middle Eastern or North African',
                   'race_Native Hawaiian or Other Pacific Islander']

label_column_name = 'birth_class_binary'
protected_columns = ['race_White'] + race_categories
privileged_groups = [{'race_White': 1}]

# Per-group lists of fairness metrics, one entry per fold
mean_diffs = {group: [] for group in race_categories}
equalized_odds_ratios = {group: [] for group in race_categories}

fold_splits = kfold.split(birth_train_data, birth_train_data[label_column_name])

for fold, (_, holdout_index) in enumerate(fold_splits):

    # Predictions made for this fold's holdout rows
    svc_pred_fold = svc_predictions[fold]

    # Encode this fold's holdout rows the same way as `birth_ho_encoded`
    fold_ho_encoded = pd.get_dummies(
        birth_train_data.iloc[holdout_index],
        columns=['race_person', 'ethnicity_person'],
        prefix=['race', 'ethnicity'], prefix_sep='_'
    ).drop(columns=['condition_start_date', 'birth_class'])

    # A small fold may lack a whole race category; add an all-zero column so
    # the protected attribute exists in every fold
    for column in protected_columns:
        if column not in fold_ho_encoded.columns:
            fold_ho_encoded[column] = 0

    # Ground-truth dataset; label 0 is treated as favorable, 1 as unfavorable
    bld = BinaryLabelDataset(
        favorable_label=0, unfavorable_label=1,
        df=fold_ho_encoded, label_names=[label_column_name],
        protected_attribute_names=protected_columns
    )

    # Same rows and attributes, with the labels replaced by the predictions
    bld_pred = bld.copy(deepcopy=True)
    bld_pred.labels = np.asarray(svc_pred_fold, dtype=float).reshape(-1, 1)

    for minority_group in race_categories:
        unprivileged_groups = [{'race_White': 0, minority_group: 1}]

        # Statistical parity (mean) difference of the PREDICTED labels
        metric_bld = BinaryLabelDatasetMetric(bld_pred, privileged_groups=privileged_groups, unprivileged_groups=unprivileged_groups)
        mean_diffs[minority_group].append(metric_bld.mean_difference())

        # Average odds difference between true labels and predictions
        cm = ClassificationMetric(bld, bld_pred, unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)
        equalized_odds_ratios[minority_group].append(cm.average_odds_difference())

# Print the mean fairness metrics for each minority group across all folds.
# nanmean skips folds where a group had no members (metric undefined there).
# The "Equalized Odds Ratio" label is kept from the original output; the value
# printed is AIF360's average odds difference.
for minority_group in race_categories:
    print(f" Mean Difference ({minority_group}): {np.nanmean(mean_diffs[minority_group]).round(3)}")
    print(f" Equalized Odds Ratio ({minority_group}): {np.nanmean(equalized_odds_ratios[minority_group]).round(3)}")


### Fairness Metrics SPD and Equalized Odds on Test Predictions

In [None]:
import aif360
from aif360.metrics import BinaryLabelDatasetMetric, ClassificationMetric
from aif360.datasets import BinaryLabelDataset
from sklearn.metrics import accuracy_score
import numpy as np



# Fairness of the final SVC predictions on the test set.
#
# The true labels in `birth_test_data_encoded` are compared with the
# predictions in `svc_pred_final`. The original cell compared the dataset with
# itself, which makes the average odds difference 0 by construction and never
# looks at the predictions.
# NOTE(review): relies on `birth_test_data_encoded` having the same row order
# as the `birth_test_data` frame that `svc_pred_final` was predicted from.

race_categories = ['race_Black or African American', 'race_None of these', 'race_no answer',
                   'race_More than one population', 'race_Asian', 'race_Middle Eastern or North African',
                   'race_Native Hawaiian or Other Pacific Islander']

label_column_name = 'birth_class_binary'
privileged_groups = [{'race_White': 1}]

# Per-group lists of fairness metrics (a single entry each for the test set)
mean_diffs = {group: [] for group in race_categories}
equalized_odds_ratios = {group: [] for group in race_categories}

# Ground-truth dataset; label 0 is treated as favorable, 1 as unfavorable.
# It does not depend on the minority group, so it is built once.
bld = BinaryLabelDataset(
    favorable_label=0, unfavorable_label=1,
    df=birth_test_data_encoded, label_names=[label_column_name],
    protected_attribute_names=['race_White'] + race_categories
)

# Same rows and attributes, with the labels replaced by the test predictions
bld_pred = bld.copy(deepcopy=True)
bld_pred.labels = np.asarray(svc_pred_final, dtype=float).reshape(-1, 1)

# Iterate over minority groups
for minority_group in race_categories:

    unprivileged_groups = [{'race_White': 0, minority_group: 1}]

    # Statistical parity (mean) difference of the PREDICTED labels
    metric_bld = BinaryLabelDatasetMetric(bld_pred, privileged_groups=privileged_groups, unprivileged_groups=unprivileged_groups)
    mean_diffs[minority_group].append(metric_bld.mean_difference())

    # Average odds difference between true labels and predictions
    cm = ClassificationMetric(bld, bld_pred, unprivileged_groups=unprivileged_groups, privileged_groups=privileged_groups)
    equalized_odds_ratios[minority_group].append(cm.average_odds_difference())

# Print the fairness metrics for each minority group.
# The "Equalized Odds Ratio" label is kept from the original output; the value
# printed is AIF360's average odds difference.
for minority_group in race_categories:
    print(f" Mean Difference ({minority_group}): {mean_diffs[minority_group]}")
    print(f" Equalized Odds Ratio ({minority_group}): {equalized_odds_ratios[minority_group]}")



#### Ground truth SPD calculation (this applies to entire dataset to see what our actual disparities are); no pipeline

In [None]:
import numpy as np
from aif360.metrics import BinaryLabelDatasetMetric
from aif360.datasets import BinaryLabelDataset

# Statistical parity difference of the TRUE labels over the whole encoded
# dataset, for each minority group relative to 'race_White'
label_column_name = 'birth_class_binary'
privileged_groups = [{'race_White': 1}]

for minority_group in race_categories:

    # Rows that are not 'race_White' and belong to the current group
    unprivileged_groups = [{'race_White': 0, minority_group: 1}]

    # Only the two attributes needed for this comparison are marked protected;
    # label 0 is treated as favorable, 1 as unfavorable
    bld_ground_truth = BinaryLabelDataset(
        df=birth_encoded,
        label_names=[label_column_name],
        protected_attribute_names=['race_White', minority_group],
        favorable_label=0,
        unfavorable_label=1,
    )

    metric_bld_ground_truth = BinaryLabelDatasetMetric(
        bld_ground_truth,
        unprivileged_groups=unprivileged_groups,
        privileged_groups=privileged_groups,
    )

    spd_ground_truth = metric_bld_ground_truth.statistical_parity_difference()

    print(" Statistical Parity Difference (Ground Truth):", minority_group, spd_ground_truth)
   

    