In [1]:
# Install fairlearn into the kernel's environment; the import cell below pulls
# its fairness metrics and ThresholdOptimizer from this package.
%pip install fairlearn



In [17]:
# --- Import libraries ---
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from fairlearn.metrics import demographic_parity_difference, equalized_odds_difference
from fairlearn.postprocessing import ThresholdOptimizer
from sklearn.pipeline import Pipeline

# --- Load Data ---
# Read the applicant records and their labels, then keep only applicants that
# appear in both files (inner join on the shared Ind_ID key).
apps = pd.read_csv("Credit_card.csv")
labels = pd.read_csv("Credit_card_label.csv")
df = pd.merge(apps, labels, how="inner", on="Ind_ID")
print("Merged shape:", df.shape)

# --- Basic Data Checks ---
# Quick sanity report: schema, a sample of rows, missing values per column and
# the number of fully duplicated rows.
print("\nDataframe info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
df.info()
print("\nFirst 5 rows:")
print(df.head())
print("\nMissing values per column:")
print(df.isna().sum())
print("\nDuplicates:", df.duplicated().sum())

# --- Optional: Drop columns with >25% missing values ---
# A column is discarded when its share of missing entries exceeds `threshold`.
threshold = 0.25
n_rows = len(df)
missing = df.isna().sum().div(n_rows)  # fraction of missing values per column
drop_cols = [col for col, frac in missing.items() if frac > threshold]
if len(drop_cols) > 0:
    print("\nDropping columns due to too many missing values:", drop_cols)
    df = df.drop(columns=drop_cols)

# --- Feature Engineering: Age Bins ---
# Target column: the complement of `label`
# (assumes `label` is a 0/1 flag — TODO confirm against the label file).
df["approved"] = 1 - df["label"]

# Age in years: Birthday_count is negated and divided by 365.25.
# NOTE(review): presumably Birthday_count stores negative day offsets — confirm.
ages = (-df["Birthday_count"]).div(365.25)

bins = [0, 25, 40, 60, 200]
age_labels = ["<25", "25-40", "40-60", "60+"]
# right=False makes every interval closed on the left: [0, 25), [25, 40), ...
df["AGE_BIN"] = pd.cut(ages, bins=bins, labels=age_labels, right=False)

print("\nAGE_BIN distribution:")
age_bin_counts = df["AGE_BIN"].value_counts(dropna=False)  # keep the NaN bucket visible
print(age_bin_counts)

# --- Sensitive Attributes & Columns ---
TARGET = "approved"
ATTRS = ["AGE_BIN", "Marital_status", "Type_Income"]

# --- Feature Lists ---
# Columns that must never reach the model: the raw label, the derived target,
# and Ind_ID. Ind_ID is only the merge key of the two input files; as a row
# identifier it would otherwise be picked up as a numeric column, scaled and
# used as a feature. It has to be excluded from BOTH lists, because cat_cols is
# defined as "everything that is not numeric and not excluded".
NON_FEATURES = ["label", TARGET, "Ind_ID"]
num_cols = df.select_dtypes(include=np.number).columns.difference(NON_FEATURES)
cat_cols = df.columns.difference(num_cols.union(NON_FEATURES))

print("\nNumeric columns:", list(num_cols))
print("Categorical columns:", list(cat_cols))

# --- Pipelines for Preprocessing (handles missing values) ---
# Numeric columns: fill NAs with the median, then scale without centering
# (with_mean=False).
num_steps = [
    ("imp", SimpleImputer(strategy="median")),
    ("sc", StandardScaler(with_mean=False)),
]
num_pipe = Pipeline(steps=num_steps)

# Categorical columns: fill NAs with the most frequent value, then one-hot
# encode; handle_unknown="ignore" keeps transform() from failing on categories
# that were not present when the encoder was fitted.
cat_steps = [
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
]
cat_pipe = Pipeline(steps=cat_steps)

# Route each column list through its own pipeline.
preproc = ColumnTransformer(
    transformers=[
        ("num", num_pipe, num_cols),
        ("cat", cat_pipe, cat_cols),
    ]
)

# --- Fairness Audit Loop ---
# One full train/evaluate pass per sensitive attribute listed in ATTRS.
# NOTE(review): `summary` presumably collects per-attribute results further
# down the loop — that part is not visible here.
summary = []
for S in ATTRS:
    print(f"\n=== Sensitive attribute: {S} ===")
    # Only rows with a known value of the sensitive attribute are used.
    data = df.loc[df[S].notna()]
    y = data[TARGET]
    A = data[S]
    X = data.drop(columns=["label", TARGET])

    # Train-test split: 30% held out, stratified on the target, fixed seed.
    # The sensitive attribute is split alongside so it stays row-aligned.
    X_tr, X_te, y_tr, y_te, A_tr, A_te = train_test_split(
        X,
        y,
        A,
        test_size=0.3,
        stratify=y,
        random_state=0,
    )

    # Preprocess (handles missing values!)
    # The transformer is fitted on the training split only and then applied
    # unchanged to the test split.
    X_tr_enc = preproc.fit_transform(X_tr)
    X_te_enc = preproc.transform(X_te)
    # Check that there are no missing values after preprocessing
    nan_train = np.isnan(X_tr_enc).sum()
    print("Missing values after preprocessing (train):", nan_train)
    nan_test = np.isnan(X_te_enc).sum()
    print("Missing values after preprocessing (test):", nan_test)

    # --- Baseline Model ---
    # Class-balanced logistic regression with no fairness intervention.
    base = LogisticRegression(class_weight="balanced", max_iter=2000, random_state=0)
    base.fit(X_tr_enc, y_tr)
    pred_b = base.predict(X_te_enc)
    # Column 1 of predict_proba is the score of the second entry in base.classes_.
    proba_b = base.predict_proba(X_te_enc)
    prob_b = proba_b[:, 1]

    # Predictive quality on the test split.
    acc_b = accuracy_score(y_te, pred_b)
    f1_b = f1_score(y_te, pred_b, zero_division=0)
    auc_b = roc_auc_score(y_te, prob_b)
    # Fairness gaps across the groups of the sensitive attribute.
    dp_b = demographic_parity_difference(y_te, pred_b, sensitive_features=A_te)
    eo_b = equalized_odds_difference(y_te, pred_b, sensitive_features=A_te)
    # Selection rate: mean of the test predictions.
    sel_b = np.mean(pred_b)
    baseline_report = f"Baseline  → acc {acc_b:.3f} | F1 {f1_b:.3f} | AUC {auc_b:.3f} | sel {sel_b:.3f} | DP {dp_b:.3f} | EO {eo_b:.3f}"
    print(baseline_report)

    # --- Manual Re-Weighing for Fair Model ---
    # Every training row receives the weight of its (group, outcome) cell:
    #     w(s, y) = count(s) * count(y) / (n * count(s, y))
    # i.e. the cell frequency expected if group and outcome were independent,
    # divided by the frequency actually observed.
    tbl = pd.DataFrame({"y": y_tr, "s": A_tr})
    n = len(tbl)
    # Marginal counts are computed once instead of rescanning the frame for
    # every (group, outcome) cell.
    s_counts = tbl["s"].value_counts()
    y_counts = tbl["y"].value_counts()
    # observed=True: when the sensitive attribute is categorical (AGE_BIN comes
    # from pd.cut), only cells that actually contain rows are produced. Without
    # it, empty category/outcome combinations appear with a count of 0 and the
    # weight formula divides by zero (inf/nan entries plus a RuntimeWarning).
    sy_counts = tbl.groupby(["s", "y"], observed=True).size()
    w_map = {
        (sv, yv): (s_counts.loc[sv] * y_counts.loc[yv]) / (n * n_sy)
        for (sv, yv), n_sy in sy_counts.items()
    }
    # Direct dictionary lookup per row; replaces the slower row-wise
    # DataFrame.apply and yields the same weights in the same row order.
    w_tr = np.array([w_map[key] for key in zip(tbl["s"], tbl["y"])], dtype=float)

    # Logistic regression fitted with the re-weighing sample weights w_tr
    # (no class_weight is set on this model).
    fair = LogisticRegression(random_state=0, max_iter=2000)
    fair.fit(X_tr_enc, y_tr, sample_weight=w_tr)
    # Column 1 of predict_proba is the score of the second entry in fair.classes_.
    proba_f = fair.predict_proba(X_te_enc)
    prob_f = proba_f[:, 1]
    auc_f = roc_auc_score(y_te, prob_f)
    print(f"Weighted model AUC {auc_f:.3f} (pre-threshold)")

    # --- ThresholdOptimizer Post-processing ---
    # prefit=True: `fair` is already trained, so the optimizer only learns
    # group-specific decision thresholds on top of its scores.
    post = ThresholdOptimizer(estimator=fair, constraints="demographic_parity", prefit=True)
    # Learn the thresholds on the TRAINING split. Fitting them on the test split
    # and then scoring that same split leaks the evaluation labels and group
    # membership into the model, which makes the reported fairness and accuracy
    # figures optimistic.
    post.fit(X_tr_enc, y_tr, sensitive_features=A_tr)
    # ThresholdOptimizer predictions are randomized; a fixed random_state makes
    # the reported metrics reproducible across runs.
    pred_p = post.predict(X_te_enc, sensitive_features=A_te, random_state=0)

    sel_p = pred_p.mean()
    dp_p = demographic_parity_difference(y_te, pred_p, sensitive_features=A_te)
    eo_p = equaliz_


Merged shape: (1548, 19)

Dataframe info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1548 entries, 0 to 1547
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Ind_ID           1548 non-null   int64  
 1   GENDER           1541 non-null   object 
 2   Car_Owner        1548 non-null   object 
 3   Propert_Owner    1548 non-null   object 
 4   CHILDREN         1548 non-null   int64  
 5   Annual_income    1525 non-null   float64
 6   Type_Income      1548 non-null   object 
 7   EDUCATION        1548 non-null   object 
 8   Marital_status   1548 non-null   object 
 9   Housing_type     1548 non-null   object 
 10  Birthday_count   1526 non-null   float64
 11  Employed_days    1548 non-null   int64  
 12  Mobile_phone     1548 non-null   int64  
 13  Work_Phone       1548 non-null   int64  
 14  Phone            1548 non-null   int64  
 15  EMAIL_ID         1548 non-null   int64  
 16  Type_Occupation  1