In [4]:
import pandas as pd 
import numpy as np
import tensorflow as tf
import seaborn as sns
import os
import random
import datetime

import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder 
from sklearn.model_selection import train_test_split

from tensorflow.keras.layers import (
    Input, Dense, Conv2D, Flatten, 
    MaxPooling2D, BatchNormalization, Dropout
)

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import (
    EarlyStopping,
    ModelCheckpoint,
    LearningRateScheduler
)
from tensorflow.keras.initializers import RandomUniform
from tensorflow.keras.regularizers import l1, l2
from tensorflow.keras.constraints import Constraint
from tensorflow.keras.optimizers import Adam
from keras.initializers import Constant

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score

from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.layers import GlobalAveragePooling2D
from tqdm import tqdm
from keras.models import load_model
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers
from tensorflow.keras.layers import (
    Input, Dense, Conv2D, Flatten, 
    MaxPooling2D, BatchNormalization, Dropout, Concatenate
)

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import (
    EarlyStopping,
    ModelCheckpoint,
    LearningRateScheduler
)

from tensorflow.keras.regularizers import Regularizer
from tensorflow.keras.models import Sequential, Model
from keras.models import load_model
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import LearningRateScheduler, ModelCheckpoint
from tensorflow.keras.utils import to_categorical

from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

2025-02-06 17:32:10.122846: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-02-06 17:32:10.122937: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-02-06 17:32:10.124700: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-06 17:32:10.134054: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
# -------------------------------
# Custom Gradient Reversal Layer
# -------------------------------
@tf.custom_gradient
def grad_reverse(x, lambda_):
    """Pass ``x`` through unchanged while flipping its gradient.

    Forward pass: returns ``x`` as-is.
    Backward pass: the upstream gradient is multiplied by ``-lambda_`` before
    it is handed back for ``x``; ``None`` is returned as the gradient for
    ``lambda_`` itself.
    """
    def _reversed_gradient(upstream):
        # Negate and scale the incoming gradient; lambda_ receives no gradient.
        flipped = -lambda_ * upstream
        return flipped, None

    return x, _reversed_gradient

# custom Keras layer
"""
This layer is used to make the feature representation independent of a sensitive attribute:
- the feature extractor learns normally in the forward pass
- during backpropagation, the gradients of the classifier that tries to predict the sensitive attribute are reversed, which stops the feature extractor from encoding sensitive information
"""
class GradientReversalLayer(tf.keras.layers.Layer):
    """Keras layer wrapping ``grad_reverse``.

    The layer is the identity in the forward pass. In the backward pass the
    gradient flowing through it is multiplied by ``-lambda_``, so whatever is
    upstream of the layer is pushed to *increase* the loss of whatever is
    downstream of it.

    Args:
        lambda_: strength of the gradient reversal (default 1.0).
        **kwargs: forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, lambda_=1.0, **kwargs):
        super().__init__(**kwargs)
        self.lambda_ = lambda_  # strength of gradient reversal

    def call(self, x):
        return grad_reverse(x, self.lambda_)

    def get_config(self):
        # Include the constructor argument in the layer config so a saved
        # model can rebuild the layer with the same reversal strength.
        config = super().get_config()
        config.update({"lambda_": self.lambda_})
        return config

# -------------------------------
# Data Loading and Preprocessing
# -------------------------------
def set_seed(seed_num):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's legacy global RNG and
    TensorFlow's global RNG with ``seed_num``, then exports the same value as
    the ``PYTHONHASHSEED`` environment variable.
    """
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed_num)
    # NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so setting it
    # here mainly affects child processes started afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed_num)

In [6]:
# -------------------------------
# Adversarial Debiasing Model
# -------------------------------
def build_adversarial_model(input_dim, lambda_adv=1.0, num_sensitive_classes=2, learning_rate=1e-4):
    """
    Build an adversarial debiasing model that learns pseudo-labels Y' from X.

    Architecture:
      - Main branch (encoder): from X, dense layers produce a scalar pseudo-label
        ``pseudo_Y`` (sigmoid output).
      - Adversary branch: ``pseudo_Y`` is passed through a Gradient Reversal Layer
        and then dense layers predict S (softmax over the sensitive classes).
      - Decoder branch: concatenates ``pseudo_Y`` and the one-hot sensitive
        attribute S to predict the observed label Y.

    Losses:
      - ``pseudo_Y`` and ``Y_pred``: binary crossentropy against the observed Y.
      - ``S_pred``: categorical crossentropy against one-hot S.

    Args:
        input_dim: number of features in X.
        lambda_adv: used both as the gradient-reversal strength and as the loss
            weight of the ``S_pred`` output.
            NOTE(review): because it is applied in both places, the reversed
            gradient reaching the encoder appears to scale with lambda_adv**2 -
            confirm this is intended.
        num_sensitive_classes: width of the one-hot S input and of the adversary's
            softmax output (default 2, i.e. a binary sensitive attribute).
        learning_rate: learning rate passed to the Adam optimizer (default 1e-4).

    Returns:
        A compiled Keras model that takes inputs ``[X, S]`` (S one-hot encoded)
        and outputs ``[pseudo_Y, S_pred, Y_pred]``.
    """
    # Layer creation order is kept fixed so that seeded weight initialisation
    # is reproducible from run to run.
    X_input = tf.keras.Input(shape=(input_dim,), name="X")
    S_input = tf.keras.Input(shape=(num_sensitive_classes,), name="S")  # one-hot encoded S

    # Main branch: encoder producing the pseudo-label.
    h = Dense(64, activation='relu')(X_input)
    h = BatchNormalization()(h)
    h = Dense(32, activation='relu')(h)
    h = BatchNormalization()(h)
    # Probability-like value in (0, 1).
    pseudo_Y = Dense(1, activation='sigmoid', name="pseudo_Y")(h)

    # Adversary branch: tries to predict S from pseudo_Y. If it succeeds,
    # pseudo_Y still encodes information about S; the gradient reversal layer
    # makes the encoder work against the adversary to remove that information.
    grl = GradientReversalLayer(lambda_=lambda_adv)(pseudo_Y)
    a = Dense(32, activation='relu')(grl)
    a = BatchNormalization()(a)
    S_pred = Dense(num_sensitive_classes, activation='softmax', name="S_pred")(a)

    # Decoder branch: the observed Y may legitimately depend on S, so the final
    # prediction is made from pseudo_Y together with S. This lets the model keep
    # the explicit dependence on S here while pseudo_Y itself is discouraged
    # from carrying it.
    concat = Concatenate()([pseudo_Y, S_input])
    d = Dense(16, activation='relu')(concat)
    d = BatchNormalization()(d)
    Y_pred = Dense(1, activation='sigmoid', name="Y_pred")(d)

    model = tf.keras.Model(inputs=[X_input, S_input],
                           outputs=[pseudo_Y, S_pred, Y_pred])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate),
                  loss={"pseudo_Y": "binary_crossentropy",
                        "S_pred": "categorical_crossentropy",
                        "Y_pred": "binary_crossentropy"},
                  loss_weights={"pseudo_Y": 1.0, "S_pred": lambda_adv, "Y_pred": 1.0},
                  metrics={"pseudo_Y": "accuracy",
                           "S_pred": "accuracy",
                           "Y_pred": "accuracy"})
    return model

In [7]:
# -------------------------------
# Manual Fairness Metrics
# -------------------------------
def compute_fairness_metrics_manual(y_true, y_pred, sensitive_features):
    """
    Compute group-fairness metrics with plain NumPy.

    Args:
        y_true: binary ground-truth labels (array-like, flattened to 1-D).
        y_pred: continuous scores (array-like, flattened to 1-D); a score
            strictly greater than 0.5 counts as a positive prediction.
        sensitive_features: group membership per sample (array-like, flattened
            to 1-D). Any set of group labels is accepted, not only {0, 1}.

    Returns:
        dict with
          - "demographic_parity_difference": largest minus smallest positive
            prediction rate across groups (for two groups this is the absolute
            difference; 0 means parity).
          - "equalized_odds_difference": average of the TPR gap and the FPR gap
            across groups (each gap is largest minus smallest).
          - "selection_rate": {group: share of samples predicted positive}.
          - "group_accuracy": {group: accuracy of the thresholded predictions}.

    Raises:
        ValueError: if the three inputs do not have the same number of samples.
    """
    y_true = np.asarray(y_true).ravel()
    y_score = np.asarray(y_pred).ravel()
    sensitive_features = np.asarray(sensitive_features).ravel()
    if not (len(y_true) == len(y_score) == len(sensitive_features)):
        raise ValueError(
            "y_true, y_pred and sensitive_features must have the same number of samples"
        )

    def _spread(per_group):
        # Largest gap between any two groups; 0.0 when there is nothing to compare.
        values = list(per_group.values())
        return max(values) - min(values) if values else 0.0

    y_pred_bin = (y_score > 0.5).astype(int)  # scores -> hard 0/1 predictions
    groups = np.unique(sensitive_features)

    pos_rates, tprs, fprs, group_acc = {}, {}, {}, {}
    for g in groups:
        mask = sensitive_features == g
        y_true_g = y_true[mask]
        y_pred_g = y_pred_bin[mask]

        # Demographic parity / selection rate: P(Y_hat = 1 | S = g).
        pos_rates[g] = np.mean(y_pred_g)

        # Equalized odds ingredients. A group with no positives (or no
        # negatives) gets a rate of 0.0 instead of dividing by zero.
        positives = y_true_g == 1
        negatives = y_true_g == 0
        tprs[g] = np.mean(y_pred_g[positives] == 1) if positives.any() else 0.0
        fprs[g] = np.mean(y_pred_g[negatives] == 1) if negatives.any() else 0.0

        # Group-wise accuracy of the thresholded predictions.
        group_acc[g] = float(np.mean(y_true_g == y_pred_g))

    dp_diff = _spread(pos_rates)
    # Average of the two error-rate gaps, as described in the docstring.
    eo_diff = (_spread(tprs) + _spread(fprs)) / 2

    return {
        "demographic_parity_difference": dp_diff,
        "equalized_odds_difference": eo_diff,
        "selection_rate": dict(pos_rates),
        "group_accuracy": group_acc
    }

In [8]:
# -------------------------------
# Plotting Function
# -------------------------------
def plot_comparison(metrics_baseline, metrics_fair):
    """
    Draw a 2x2 grid of bar charts comparing the baseline and the fair model.

    Both arguments are dictionaries holding the evaluation metrics under the keys
    'auc', 'accuracy', 'demographic_parity_difference' and
    'equalized_odds_difference'.
    """
    models = ['Baseline', 'Fair']

    def _both(key):
        # [baseline value, fair value] for one metric.
        return [metrics_baseline[key], metrics_fair[key]]

    aucs = _both('auc')
    accs = _both('accuracy')
    dp_diff = _both("demographic_parity_difference")
    eo_diff = _both("equalized_odds_difference")

    fig, axs = plt.subplots(2, 2, figsize=(14, 10))

    performance_colors = ['blue', 'green']
    fairness_colors = ['orange', 'purple']
    # (grid position, title, values, bar colors, y-limits or None)
    panels = [
        # AUC: how well the model separates positives from negatives; a lower
        # value for the fair model indicates a fairness/performance trade-off.
        ((0, 0), 'AUC', aucs, performance_colors, [0, 1]),
        # Accuracy: correct predictions / all predictions.
        ((0, 1), 'Accuracy', accs, performance_colors, [0, 1]),
        # Demographic parity difference: lower means fairer.
        ((1, 0), 'Demographic Parity Difference', dp_diff, fairness_colors, None),
        # Equalized odds difference: lower means TPR/FPR are closer across groups.
        ((1, 1), 'Equalized Odds Difference', eo_diff, fairness_colors, None),
    ]
    for position, title, values, colors, y_limits in panels:
        ax = axs[position]
        ax.bar(models, values, color=colors)
        ax.set_title(title)
        if y_limits is not None:
            ax.set_ylim(y_limits)

    plt.suptitle("Comparison: Baseline (X → Y) vs. Fair (X → Y') Model")
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()

In [9]:
# -------------------------------
# Main Function: Comparison and Visualization
# -------------------------------
def main(data_url, dataset_name, lambda_adv=1.0, epochs=30, batch_size=128, seed=42):
    """
    Run the full comparison pipeline on one dataset and plot the result.

    Steps:
      1. Load the dataset with the loader matching ``dataset_name`` and split
         it 80/20 into train and test.
      2. Train the adversarial model and threshold its ``pseudo_Y`` output on
         the training set to obtain binary pseudo-labels Y'.
      3. Train a baseline logistic regression on the observed labels (X -> Y).
      4. Train a "fair" logistic regression on the pseudo-labels (X -> Y').
      5. Evaluate both against the observed test labels (AUC, accuracy,
         fairness metrics), print the results and call ``plot_comparison``.

    Args:
        data_url: location handed to the dataset loader.
        dataset_name: one of "compas", "german" or "adult". Any other value
            prints "Invalid dataset_name" and returns without doing any work.
        lambda_adv: adversary strength forwarded to ``build_adversarial_model``.
        epochs: training epochs for the adversarial model (default 30).
        batch_size: batch size for the adversarial model (default 128).
        seed: seed for ``set_seed`` and for the train/test split (default 42).

    Raises:
        ValueError: if the adversarial model assigns the same pseudo-label to
            every training sample, since a classifier cannot be fitted on a
            single class.

    NOTE(review): the loaders ``load_and_preprocess_compas_data_binary``,
    ``load_and_preprocess_german_data`` and ``load_and_preprocess_adult_data``
    are looked up only when their dataset is requested; they are presumably
    defined in another cell - confirm before running.
    """
    set_seed(seed)

    # Per-dataset wording for the shape summary: (label note, sensitive attribute description).
    dataset_notes = {
        "compas": ("(Recidivism: 1=recid, 0=non-recid)", "Sensitive Attribute (Race, binarized)"),
        "german": ("(Credit risk: 1=good, 0=bad)", "Sensitive Attribute (Age, binarized)"),
        "adult": ("(Label from 'income')", "Sensitive Attribute (Sex)"),
    }
    if dataset_name not in dataset_notes:
        print ("Invalid dataset_name")
        return

    # Announce the work before it starts rather than after the data is loaded.
    print(f"Loading and preprocessing {dataset_name} data...")
    if dataset_name == "compas":
        X, Y_obs, S = load_and_preprocess_compas_data_binary(data_url)  # S is binary
    elif dataset_name == "german":
        X, Y_obs, S = load_and_preprocess_german_data(data_url)  # S is binary
    else:
        X, Y_obs, S = load_and_preprocess_adult_data(data_url)  # S is binary

    # Work on flat NumPy arrays so masking and reshaping below behave the same
    # whether the loader returned arrays or pandas objects.
    Y_obs = np.asarray(Y_obs).ravel()
    S = np.asarray(S).ravel()

    X_train, X_test, Y_train_obs, Y_test_obs, S_train, S_test = train_test_split(
        X, Y_obs, S, test_size=0.2, random_state=seed
    )

    label_note, sensitive_desc = dataset_notes[dataset_name]
    print(f"Features shape: {X.shape}")
    print(f"Observed Label Y shape: {Y_obs.shape}   {label_note}")
    print(f"{sensitive_desc} shape: {S.shape}")

    input_dim = X_train.shape[1]

    # One-hot encode S for adversarial model training.
    S_train_oh = tf.keras.utils.to_categorical(S_train, num_classes=2)

    ### 1. Train adversarial debiasing model (X → Y' with adversary)
    print("\nTraining adversarial model (X → Y' with adversary) ...")
    adv_model = build_adversarial_model(input_dim, lambda_adv=lambda_adv)
    # The observed Y is the training target for both pseudo_Y and Y_pred;
    # reshape to (-1, 1) because those outputs are scalars.
    Y_train_obs_exp = Y_train_obs.reshape(-1, 1)
    adv_model.fit([X_train, S_train_oh],
                  {"pseudo_Y": Y_train_obs_exp, "S_pred": S_train_oh, "Y_pred": Y_train_obs_exp},
                  epochs=epochs, batch_size=batch_size, verbose=1)

    # Only the pseudo_Y head is used downstream; S_pred and Y_pred are discarded.
    pseudo_Y_train, _, _ = adv_model.predict([X_train, S_train_oh])

    # Threshold pseudo-labels to get binary labels.
    pseudo_Y_train_bin = (pseudo_Y_train > 0.5).astype(np.float32)

    # Fairness sanity check: share of positive pseudo-labels per group.
    print("\nPseudo-label statistics (training):")
    for g in np.unique(S_train):
        mask = (S_train == g)
        print(f"Group {g} pseudo-positive rate: {np.mean(pseudo_Y_train_bin[mask]):.4f}")

    if np.unique(pseudo_Y_train_bin).size < 2:
        raise ValueError(
            "The adversarial model assigned the same pseudo-label to every training "
            "sample, so the fair classifier cannot be trained; try a smaller lambda_adv."
        )

    ### 2. Baseline logistic regression on observed Y (X → Y), no fairness constraints
    print("\nTraining baseline logistic regression classifier (X → Y)...")
    baseline_clf = LogisticRegression(solver='lbfgs', max_iter=1000)
    baseline_clf.fit(X_train, Y_train_obs)
    baseline_preds = baseline_clf.predict_proba(X_test)[:, 1]
    baseline_auc = roc_auc_score(Y_test_obs, baseline_preds)
    baseline_acc = accuracy_score(Y_test_obs, (baseline_preds > 0.5).astype(int))
    baseline_fairness = compute_fairness_metrics_manual(Y_test_obs, baseline_preds, sensitive_features=S_test)

    ### 3. Fair logistic regression on the pseudo-labels from the adversarial model (X → Y')
    print("\nTraining fair logistic regression classifier (X → Y') using pseudo-labels...")
    fair_clf = LogisticRegression(solver='lbfgs', max_iter=1000)
    fair_clf.fit(X_train, pseudo_Y_train_bin.ravel())
    fair_preds = fair_clf.predict_proba(X_test)[:, 1]
    # The fair model is still scored against the observed test labels.
    fair_auc = roc_auc_score(Y_test_obs, fair_preds)
    fair_acc = accuracy_score(Y_test_obs, (fair_preds > 0.5).astype(int))
    fair_fairness = compute_fairness_metrics_manual(Y_test_obs, fair_preds, sensitive_features=S_test)

    # Aggregate metrics for plotting.
    metrics_baseline = {
        "auc": baseline_auc,
        "accuracy": baseline_acc,
        "demographic_parity_difference": baseline_fairness["demographic_parity_difference"],
        "equalized_odds_difference": baseline_fairness["equalized_odds_difference"]
    }
    metrics_fair = {
        "auc": fair_auc,
        "accuracy": fair_acc,
        "demographic_parity_difference": fair_fairness["demographic_parity_difference"],
        "equalized_odds_difference": fair_fairness["equalized_odds_difference"]
    }

    print("\nBaseline Logistic Regression (X → Y) Evaluation:")
    print(f"AUC: {baseline_auc:.4f}, Accuracy: {baseline_acc:.4f}")
    print("Fairness metrics:", baseline_fairness)

    print("\nFair Logistic Regression (X → Y') Evaluation (compared to observed Y):")
    print(f"AUC: {fair_auc:.4f}, Accuracy: {fair_acc:.4f}")
    print("Fairness metrics:", fair_fairness)

    # Plot comparison.
    plot_comparison(metrics_baseline, metrics_fair)

### Application on Datasets

#### Drug Consumption Datasets

In [None]:
# NOTE(review): in the visible notebook `X` is first assigned in a later cell
# (the drug-consumption preprocessing cell), so this cell fails on a fresh
# kernel when the notebook is run top to bottom.
X["gender"].value_counts()

In [None]:
# NOTE(review): in the visible notebook `df` is first assigned in a later cell
# (the drug-consumption preprocessing cell), so this cell fails on a fresh
# kernel when the notebook is run top to bottom.
df["education"].value_counts()

In [None]:
from ucimlrepo import fetch_ucirepo

# The UCI fetch (fetch_ucirepo(id=373)) is currently disabled; the data is read
# from a local CSV instead.
path = os.path.join("data", "drug_consumption.csv")
df = pd.read_csv(path)
df.columns = df.columns.str.lower().str.strip()

# Drop the first column, then collapse the seven cannabis usage codes
# (CL0..CL6) into four classes.
df = df.loc[:, df.columns[1:]].replace(
    {
        "cannabis": {
            f"CL{level}": usage
            for level, usage in enumerate(
                [
                    "never_used",
                    "not_in_last_year",
                    "not_in_last_year",
                    "used_in_last_year",
                    "used_in_last_year",
                    "used_in_last_week",
                    "used_in_last_week",
                ]
            )
        }
    }
)

# Education levels encoded as 1; every other value becomes 0.
educated_cat = {
    "University degree",
    "Masters degree",
    "Doctorate degree",
    "Professional certificate/ diploma"
}
df["education"] = df["education"].map(lambda level: int(level in educated_cat))

# The same encoder object is refit for each column, so after this cell it
# holds the fit for "country".
label_encoder = LabelEncoder()
df["age"] = label_encoder.fit_transform(df["age"])

# "M" -> 1, anything else -> 0.
df["gender"] = df["gender"].map(lambda code: int(code == "M"))

df["country"] = label_encoder.fit_transform(df["country"])

# Features: columns at positions 1..12 of the processed frame; target: cannabis class.
X = df.loc[:, df.columns[1:13]]
Y = df["cannabis"]
X


# # data (as pandas dataframes) 
# X = drug_consumption_quantified.data.features 
# # will just look at cannabis usuage likeliness only 
# y = drug_consumption_quantified.data.targets["cannabis"] 

# df = pd.concat([X,y], axis = 1)


# Y = df["cannabis"]
# S = df["education"].astype(np.int32).values 
# X = df.drop(["cannabis", "education"], axis = 1)
# scaler = StandardScaler()

# label_encoder = LabelEncoder()
# Y = label_encoder.fit(Y)

# # scaling features
# X = scaler.fit_transform(X.values.astype(np.float32))                         


In [None]:
df

#### Compas Dataset

In [None]:
# TODO: come back to this if we want to support a sensitive feature that is not just binary.
def load_and_preprocess_compas_data_multi_cat(data_url):
    """
    Read and preprocess the COMPAS dataset.

    Intended as the starting point for a multi-category sensitive attribute;
    as written, the sensitive attribute is still binarized (see S below).

    The CSV at ``data_url`` is expected to contain at least these columns:
    'age', 'race', 'priors_count', 'juv_fel_count', 'juv_misd_count',
    'juv_other_count', 'two_year_recid'. Rows with a missing value in any of
    them are dropped.

    Returns a tuple ``(X, Y, S)``:
      - X: the five numerical features ('age', 'priors_count', 'juv_fel_count',
        'juv_misd_count', 'juv_other_count') cast to float32 and standardized
        with ``StandardScaler``.
      - Y: the 'two_year_recid' column as an array (observed label).
      - S: 1 where 'race' equals "African-American", 0 for every other race.
    """
    feature_cols = ["age", "priors_count", "juv_fel_count", "juv_misd_count", "juv_other_count"]
    required_cols = ["age", "race", "priors_count", "juv_fel_count", "juv_misd_count", "juv_other_count", "two_year_recid"]

    # Load and keep only rows that are complete in the columns we rely on.
    frame = pd.read_csv(data_url).dropna(subset=required_cols)

    # Observed label.
    labels = frame["two_year_recid"].values

    # Sensitive attribute: African-American vs. all other races.
    is_african_american = frame["race"].eq("African-American")
    sensitive = is_african_american.astype(int).values

    # Standardize the numerical feature subset.
    raw_features = frame[feature_cols].astype(np.float32).values
    scaled_features = StandardScaler().fit_transform(raw_features)

    return scaled_features, labels, sensitive

# URL for the ProPublica COMPAS dataset
compas_data_url = "https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv"
# You can adjust lambda_adv as desired (e.g., lambda_adv=15.5 as in your German data experiment)
# NOTE(review): `main_compas` is not defined anywhere in the visible notebook; the
# pipeline entry point defined above is `main(data_url, dataset_name, lambda_adv)`.
# Presumably this call predates that refactor - confirm and update before running.
main_compas(compas_data_url, lambda_adv=3.1)

## Health Readmission Dataset

In [41]:
import kagglehub

# Fetch the readmission dataset through kagglehub; `path` is the directory it returns.
path = kagglehub.dataset_download("vanpatangan/readmission-dataset")

# Directory listing kept for inspection (uncomment the print to see it).
dataset_files = os.listdir(path)
# print("Files in dataset folder:", dataset_files)

# Read the two CSV files from the dataset directory.
train_df, test_df = (
    pd.read_csv(os.path.join(path, csv_name))
    for csv_name in ("train_df.csv", "test_df.csv")
)



In [42]:
train_df.head()

Unnamed: 0,age,gender,primary_diagnosis,num_procedures,days_in_hospital,comorbidity_score,discharge_to,readmitted
0,69,Male,Heart Disease,1,2,1,Home Health Care,0
1,32,Female,COPD,2,13,2,Rehabilitation Facility,0
2,89,Male,Diabetes,1,7,1,Home,0
3,78,Male,COPD,9,2,2,Skilled Nursing Facility,0
4,38,Male,Diabetes,6,4,4,Rehabilitation Facility,0


In [43]:
test_df.head()

Unnamed: 0,age,gender,primary_diagnosis,num_procedures,days_in_hospital,comorbidity_score,discharge_to
0,52,Male,Heart Disease,3,9,3,Home
1,47,Female,Diabetes,2,4,0,Skilled Nursing Facility
2,72,Female,Heart Disease,7,12,4,Home
3,18,Female,COPD,5,14,3,Home
4,32,Male,Heart Disease,9,2,4,Rehabilitation Facility


In [44]:
train_df.isnull().sum()

age                  0
gender               0
primary_diagnosis    0
num_procedures       0
days_in_hospital     0
comorbidity_score    0
discharge_to         0
readmitted           0
dtype: int64

In [45]:
test_df.isnull().sum()

age                  0
gender               0
primary_diagnosis    0
num_procedures       0
days_in_hospital     0
comorbidity_score    0
discharge_to         0
dtype: int64

In [46]:
print(train_df['readmitted'].value_counts())

readmitted
0    4060
1     940
Name: count, dtype: int64


In [47]:
print(test_df['discharge_to'].value_counts())

discharge_to
Home                        521
Rehabilitation Facility     514
Skilled Nursing Facility    497
Home Health Care            468
Name: count, dtype: int64


In [48]:
test_df

Unnamed: 0,age,gender,primary_diagnosis,num_procedures,days_in_hospital,comorbidity_score,discharge_to
0,52,Male,Heart Disease,3,9,3,Home
1,47,Female,Diabetes,2,4,0,Skilled Nursing Facility
2,72,Female,Heart Disease,7,12,4,Home
3,18,Female,COPD,5,14,3,Home
4,32,Male,Heart Disease,9,2,4,Rehabilitation Facility


In [58]:
# Class legend from the original notes:
#   No Readmission = Class 0
#   Short-term Readmission = Class 1
#   Long-term Readmission = Class 2
# NOTE(review): the legend talks about readmission, but the mapping below encodes
# the `discharge_to` destination - confirm the intended meaning of classes 0/1/2.
mapping_discharge = dict.fromkeys(('Home', 'Home Health Care'), 0)
mapping_discharge.update({'Skilled Nursing Facility': 1, 'Rehabilitation Facility': 2})

# Male -> 0, Female -> 1.
mapping_gender = dict(Male=0, Female=1)

# Add the encoded columns to the test frame; values missing from a mapping become NaN.
test_df['discharge_category'] = test_df['discharge_to'].map(mapping_discharge)
test_df['gender_category'] = test_df['gender'].map(mapping_gender)


In [59]:
print(test_df['primary_diagnosis'].value_counts())

primary_diagnosis
Hypertension      433
COPD              410
Diabetes          401
Kidney Disease    382
Heart Disease     374
Name: count, dtype: int64


In [60]:
test_df

Unnamed: 0,age,gender,primary_diagnosis,num_procedures,days_in_hospital,comorbidity_score,discharge_to,discharge_category,gender_category
0,52,Male,Heart Disease,3,9,3,Home,0,0
1,47,Female,Diabetes,2,4,0,Skilled Nursing Facility,1,1
2,72,Female,Heart Disease,7,12,4,Home,0,1
3,18,Female,COPD,5,14,3,Home,0,1
4,32,Male,Heart Disease,9,2,4,Rehabilitation Facility,2,0
...,...,...,...,...,...,...,...,...,...
1995,35,Female,Kidney Disease,7,4,4,Home,0,1
1996,65,Male,Hypertension,5,3,2,Home Health Care,0,0
1997,89,Female,Kidney Disease,2,7,2,Rehabilitation Facility,2,1
1998,83,Female,Diabetes,4,4,0,Rehabilitation Facility,2,1
