In [1]:
%pip install fairlearn



This notebook builds a credit-card approval model that decides which applications to approve. Fairness is crucial here: discriminatory decisions can lead to legal liability, reputational damage, and lost business.

Why is it important to the company?

Compliance with the law: Equal treatment under fair-lending laws.

Customer confidence: Perceived discrimination deters good customers.

Long-term profitability: Avoids excessive concentration of risk in one group.

## Import Libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from fairlearn.metrics import demographic_parity_difference, equalized_odds_difference
from fairlearn.postprocessing import ThresholdOptimizer
from sklearn.pipeline import Pipeline
import plotly.express as px

## Load Dataset

Dataset: <https://www.kaggle.com/datasets/rohitudageri/credit-card-details?select=Credit_card_label.csv>

In [3]:
# Applicant features and approval labels ship as two CSV files that share the
# applicant identifier column Ind_ID.
apps = pd.read_csv("Credit_card.csv")
labels = pd.read_csv("Credit_card_label.csv")

# validate="one_to_one" makes pandas raise MergeError if Ind_ID repeats in
# either file, instead of silently multiplying rows in the joined frame.
df = apps.merge(labels, on="Ind_ID", how="inner", validate="one_to_one")
df.head()

Unnamed: 0,Ind_ID,GENDER,Car_Owner,Propert_Owner,CHILDREN,Annual_income,Type_Income,EDUCATION,Marital_status,Housing_type,Birthday_count,Employed_days,Mobile_phone,Work_Phone,Phone,EMAIL_ID,Type_Occupation,Family_Members,label
0,5008827,M,Y,Y,0,180000.0,Pensioner,Higher education,Married,House / apartment,-18772.0,365243,1,0,0,0,,2,1
1,5009744,F,Y,N,0,315000.0,Commercial associate,Higher education,Married,House / apartment,-13557.0,-586,1,1,1,0,,2,1
2,5009746,F,Y,N,0,315000.0,Commercial associate,Higher education,Married,House / apartment,,-586,1,1,1,0,,2,1
3,5009749,F,Y,N,0,,Commercial associate,Higher education,Married,House / apartment,-13557.0,-586,1,1,1,0,,2,1
4,5009752,F,Y,N,0,315000.0,Commercial associate,Higher education,Married,House / apartment,-13557.0,-586,1,1,1,0,,2,1


## Data Exploration

In [4]:
# Count rows that are exact copies of an earlier row.
n_duplicates = df.duplicated().sum()
print("Duplicates:", n_duplicates)

Duplicates: 0


In [5]:
# (rows, columns) of the merged frame.
n_rows, n_cols = df.shape
print("Shape:", (n_rows, n_cols))

Shape: (1548, 19)


In [6]:
# Per-column count of missing values. Using sep="\n" (rather than embedding the
# newline in the first argument) avoids the stray leading space that print's
# default separator put in front of the first row and misaligned it.
print("Missing values in column:", df.isna().sum(), sep="\n")

Missing values in column:
 Ind_ID               0
GENDER               7
Car_Owner            0
Propert_Owner        0
CHILDREN             0
Annual_income       23
Type_Income          0
EDUCATION            0
Marital_status       0
Housing_type         0
Birthday_count      22
Employed_days        0
Mobile_phone         0
Work_Phone           0
Phone                0
EMAIL_ID             0
Type_Occupation    488
Family_Members       0
label                0
dtype: int64


## Pre-processing

In [7]:
# Type_Occupation is missing for 488 of the 1548 rows, so it is dropped rather
# than imputed. errors="ignore" keeps the cell safe to re-run, and rebinding
# `df` avoids mutating the frame in place.
df = df.drop(columns=["Type_Occupation"], errors="ignore")

# Columns that must never be fed to the model:
#   Ind_ID   - row identifier used only as the merge key, not a predictor
#   label    - raw target
#   approved - derived target (created in the feature-engineering cell)
#   AGE_BIN  - derived grouping column (created in the feature-engineering cell);
#              listed here so the column lists do not depend on cell run order
NON_FEATURES = ["Ind_ID", "label", "approved", "AGE_BIN"]

# Numeric predictors: every numeric column except the excluded ones.
num_cols = df.select_dtypes(include="number").columns.difference(NON_FEATURES)
# Categorical predictors: whatever remains once numeric and excluded columns are removed.
cat_cols = df.columns.difference(num_cols.union(NON_FEATURES))

# Numeric branch: fill gaps with the column median, then scale to unit variance.
# with_mean=False scales without centering.
num_pipe = Pipeline([
    ("imp", SimpleImputer(strategy="median")),
    ("sc",  StandardScaler(with_mean=False)),
])
# Categorical branch: fill gaps with the most frequent value, then one-hot encode.
# handle_unknown="ignore" encodes categories unseen during fit as all zeros.
cat_pipe = Pipeline([
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
])
# Route each column list through its branch. Columns in neither list are
# dropped by ColumnTransformer's default remainder setting.
preproc = ColumnTransformer([
    ("num", num_pipe, num_cols),
    ("cat", cat_pipe, cat_cols),
])

## Feature Engineering

In [8]:
# Target: the complement of the raw label.
# NOTE(review): this assumes label == 1 means "rejected" - confirm against the dataset card.
df["approved"] = 1 - df["label"]

# Age in years from Birthday_count (the sample rows show negative day counts,
# hence the sign flip), bucketed into left-closed bins: [0,25), [25,40), [40,60), [60,200).
AGE_EDGES = [0, 25, 40, 60, 200]
AGE_NAMES = ["<25", "25-40", "40-60", "60+"]
age_years = -df["Birthday_count"] / 365.25
df["AGE_BIN"] = pd.cut(age_years, bins=AGE_EDGES, labels=AGE_NAMES, right=False)

In [9]:
# Accumulates one metrics dict per audited attribute; later cells turn it into plots.
summary: list = []

## Train-test split and Baseline Evaluation

In [10]:
# Fairness audit, run once per sensitive attribute.
#
# The steps below (split -> baseline -> re-weighted model + ThresholdOptimizer
# -> metrics -> summary row) all belong inside the `for S in ATTRS` loop.
# Previously the loop body ended after the split, so only the last attribute
# was ever modelled and `summary` held a single row.

TARGET = "approved"
ATTRS  = ["AGE_BIN", "Marital_status", "Type_Income"]


def _reweighing_weights(y, s):
    """Return one sample weight per row that decorrelates label and group.

    Every (group, label) cell receives the weight
    P(group) * P(label) / P(group, label), estimated from counts, i.e.
    n_group * n_label / (n * n_cell).

    Parameters
    ----------
    y : pandas.Series of labels.
    s : pandas.Series of group memberships, on the same index as `y`.

    Returns
    -------
    pandas.Series of weights on the same index, in the same row order.
    """
    cells = pd.DataFrame({"y": y, "s": s})
    n_group = cells.groupby("s")["y"].transform("size")
    n_label = cells.groupby("y")["s"].transform("size")
    n_cell = cells.groupby(["s", "y"])["y"].transform("size")
    return (n_group * n_label) / (len(cells) * n_cell)


# Start from an empty list so re-running this cell never duplicates rows.
summary.clear()

for S in ATTRS:
    print(f"\n=== Sensitive attribute: {S} ===")

    # ---- Train-test split ------------------------------------------------
    # Rows without a value for the audited attribute cannot be assigned to a group.
    sub = df.dropna(subset=[S])
    X, y = sub.drop(columns=["label", TARGET]), sub[TARGET]
    # Plain string labels: AGE_BIN is a Categorical, which can carry unused
    # categories into group-wise computations.
    A = sub[S].astype(str)

    X_tr, X_te, y_tr, y_te, A_tr, A_te = train_test_split(
        X, y, A, test_size=0.3, random_state=0, stratify=y
    )

    # Fit the preprocessing on the training rows only, then apply to both splits.
    X_tr_enc = preproc.fit_transform(X_tr)
    X_te_enc = preproc.transform(X_te)

    # ---- Train & evaluate baseline model ---------------------------------
    base = LogisticRegression(max_iter=2000, class_weight="balanced", random_state=0)
    base.fit(X_tr_enc, y_tr)
    pred_base = base.predict(X_te_enc)
    prob_base = base.predict_proba(X_te_enc)[:, 1]

    acc_b = accuracy_score(y_te, pred_base)
    f1_b  = f1_score(y_te, pred_base, zero_division=0)
    auc_b = roc_auc_score(y_te, prob_base)
    sel_b = pred_base.mean()  # selection rate: share of applicants predicted approved
    dp_b  = demographic_parity_difference(y_te, pred_base, sensitive_features=A_te)
    eo_b  = equalized_odds_difference(y_te, pred_base, sensitive_features=A_te)
    print("Before fairness: ",
          f"acc {acc_b:.3f}", f"F1 {f1_b:.3f}", f"AUC {auc_b:.3f}",
          f"sel {sel_b:.3f}", f"DP {dp_b:.3f}", f"EO {eo_b:.3f}")

    # ---- Fairness model (re-weigh + ThresholdOptimizer) ------------------
    w_tr = _reweighing_weights(y_tr, A_tr)

    fair = LogisticRegression(max_iter=2000, random_state=0)
    fair.fit(X_tr_enc, y_tr, sample_weight=w_tr)

    post = ThresholdOptimizer(estimator=fair,
                              constraints="demographic_parity",
                              prefit=True)
    # Learn the group-specific thresholds on the TRAINING split. Fitting them on
    # the test split and then scoring on that same split leaks the evaluation
    # data and overstates both accuracy and fairness.
    post.fit(X_tr_enc, y_tr, sensitive_features=A_tr)
    # ThresholdOptimizer predictions are randomised; seed them for reproducible output.
    pred_fair = post.predict(X_te_enc, sensitive_features=A_te, random_state=0)

    # ---- Evaluate post-fairness ------------------------------------------
    # AUC is computed from the re-weighted model's own scores. Reusing the
    # baseline probabilities here made the "after" AUC identical to the
    # "before" AUC by construction.
    prob_fair = fair.predict_proba(X_te_enc)[:, 1]

    acc_f = accuracy_score(y_te, pred_fair)
    f1_f  = f1_score(y_te, pred_fair, zero_division=0)
    auc_f = roc_auc_score(y_te, prob_fair)
    sel_f = pred_fair.mean()
    dp_f  = demographic_parity_difference(y_te, pred_fair, sensitive_features=A_te)
    eo_f  = equalized_odds_difference(y_te, pred_fair, sensitive_features=A_te)
    print("After fairness →",
          f"acc {acc_f:.3f}", f"F1 {f1_f:.3f}", f"AUC {auc_f:.3f}",
          f"sel {sel_f:.3f}", f"DP {dp_f:.3f}", f"EO {eo_f:.3f}")

    # ---- Collect metrics for visualization -------------------------------
    summary.append({
        "Attribute": S,
        "acc_b": acc_b, "acc_f": acc_f,
        "dp_b":  dp_b,  "dp_f": dp_f,
        "eo_b":  eo_b,  "eo_f": eo_f
    })

In [15]:
# Tabulate the collected metrics, one row per audited attribute.
summary_df = pd.DataFrame(summary).set_index("Attribute")

# Attribute names plus bar positions / bar width.
attrs = list(summary_df.index)
x = np.arange(len(attrs))
w = 0.35

# Plotting frame: "Attribute" back as a regular column, plus a 1-based
# position column used as the numeric x axis in the charts.
dfp = summary_df.reset_index()
dfp["idx"] = np.arange(1, len(summary_df) + 1)

In [16]:
# Grouped bars: accuracy per attribute, baseline (acc_b) beside post-mitigation (acc_f).
fig = (
    px.bar(
        dfp,
        x='idx',
        y=['acc_b', 'acc_f'],
        barmode='group',
        template='plotly_dark',
        color_discrete_sequence=['white', 'green'],
        labels={'value': 'Accuracy', 'variable': 'Stage', 'idx': 'Attribute'},
    )
    # x is a numeric position; relabel its ticks with the attribute names.
    .update_xaxes(tickmode='array', tickvals=dfp['idx'], ticktext=dfp['Attribute'])
    .update_layout(title='Accuracy: Before vs After Fairness')
)
fig.show()

In [19]:
# Grouped bars: demographic-parity gap per attribute, baseline (dp_b) beside post-mitigation (dp_f).
fig = (
    px.bar(
        dfp,
        x='idx',
        y=['dp_b', 'dp_f'],
        barmode='group',
        template='plotly_dark',
        color_discrete_sequence=['white', 'blue'],
        labels={'value': 'DP Gap', 'variable': 'Stage', 'idx': 'Attribute'},
    )
    # x is a numeric position; relabel its ticks with the attribute names.
    .update_xaxes(tickmode='array', tickvals=dfp['idx'], ticktext=dfp['Attribute'])
    .update_layout(title='Demographic Parity Gap: Before vs After')
)
fig.show()

In [23]:
# Long format: one row per (attribute, metric, stage) so plotly can facet by
# metric and colour by stage.
long_gaps = dfp.melt(
    id_vars=['idx', 'Attribute'],
    value_vars=['dp_b', 'eo_b', 'dp_f', 'eo_f'],
    var_name='MetricStage',
    value_name='Gap',
)
metric_stage = long_gaps['MetricStage']
dfm = long_gaps.assign(
    # "dp_b" -> "DP", "eo_f" -> "EO"
    Metric=metric_stage.str.split('_').str[0].str.upper(),
    # names ending in "_f" are post-mitigation, everything else is the baseline
    Stage=np.where(metric_stage.str.endswith('_f'), 'After', 'Before'),
)

fig = (
    px.bar(
        dfm,
        x='idx',
        y='Gap',
        color='Stage',
        facet_col='Metric',
        barmode='group',
        template='plotly_dark',
        color_discrete_sequence=['orange', 'green'],
        labels={'idx': 'Attribute'},
    )
    # x is a numeric position; relabel its ticks with the attribute names.
    .update_xaxes(tickmode='array', tickvals=dfp['idx'], ticktext=dfp['Attribute'])
    .update_layout(title='DP & EO Gaps: Before vs After Fairness')
)
fig.show()