<a href="https://colab.research.google.com/github/jitesh4144/AIES-LAB/blob/main/exp_2_aies.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install fairlearn
import numpy as np
import pandas as pd
from fairlearn.metrics import MetricFrame, true_positive_rate, false_positive_rate, false_negative_rate, selection_rate
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# === Step 1: Load the data ===
# Path of the input CSV; edit this if the file lives somewhere else.
file_path = '/content/data.csv'
# Read the whole file into the working frame used by every later step.
df = pd.read_csv(filepath_or_buffer=file_path)

# === Step 2: Clean and prepare ===
# Every column read below; a row missing any of them is dropped up front so
# the target, the sensitive attribute and the features all stay row-aligned.
REQUIRED_COLUMNS = [
    'closing_rank', 'category', 'program_duration', 'degree_short',
    'institute_short', 'round_no', 'opening_rank',
]
# A row counts as a "good rank" when closing_rank is strictly below this value.
GOOD_RANK_CUTOFF = 2000
# Categorical inputs (one-hot encoded) and numeric inputs (used as-is).
CATEGORICAL_FEATURES = ['program_duration', 'degree_short', 'institute_short']
NUMERIC_FEATURES = ['round_no', 'opening_rank']

df = df.dropna(subset=REQUIRED_COLUMNS)

# Target: 1 if closing_rank < GOOD_RANK_CUTOFF else 0.
# Vectorised comparison instead of a per-row lambda; explicit int64 keeps the
# same dtype the original `.apply` produced.
df['GoodRank'] = (df['closing_rank'] < GOOD_RANK_CUTOFF).astype('int64')

# Sensitive attribute: category (GEN, OBC-NCL, SC, ST, etc.)
# Kept out of X on purpose; it is only used for stratification and for
# grouping the fairness metrics.
sensitive_feature = df['category']

# Features: one-hot encode the categoricals (dropping the first level of each
# to avoid a redundant column) and place them next to the numeric columns.
# NOTE(review): opening_rank is presumably tightly coupled to closing_rank,
# from which the target is derived — confirm this is not target leakage.
df_encoded = pd.get_dummies(df[CATEGORICAL_FEATURES], drop_first=True)
df_numeric = df[NUMERIC_FEATURES].copy()
X = pd.concat([df_numeric, df_encoded], axis=1)
y = df['GoodRank']

# === Step 3: Train/Test Split ===
# Hold out 30% of the rows for evaluation. The sensitive attribute is split in
# lockstep with X and y and is also the stratification key, so both partitions
# keep the same category proportions. The fixed seed makes the split repeatable.
(
    X_train, X_test,
    y_train, y_test,
    s_train, s_test,
) = train_test_split(
    X,
    y,
    sensitive_feature,
    stratify=sensitive_feature,
    test_size=0.3,
    random_state=42,
)

# === Step 4: Train model ===
# The recorded run on unscaled features stopped with
# "TOTAL NO. OF ITERATIONS REACHED LIMIT", i.e. the solver had not converged and
# the fairness numbers were computed from a partially fitted model. The inputs
# mix rank-scale values with 0/1 dummy columns, so standardise them first.
# Wrapping the scaler and the classifier in one pipeline means:
#   * the scaler is fitted on the training rows only (no test-set leakage);
#   * `clf.fit` / `clf.predict` still take the raw, unscaled frames.
# `clf` is now a Pipeline; the fitted LogisticRegression itself is `clf[-1]`
# (e.g. `clf[-1].coef_`).
clf = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
clf.fit(X_train, y_train)
# Hard 0/1 predictions on the held-out rows, consumed by the fairness metrics.
y_pred = clf.predict(X_test)

# === Step 5: Fairness Metrics (especially TPR) ===
# Named metric functions to evaluate; each key becomes a column of the
# per-group table.
fairness_metrics = {
    'TPR': true_positive_rate,
    'FPR': false_positive_rate,
    'FNR': false_negative_rate,
    'Selection Rate': selection_rate,
}

# Evaluate every metric on the held-out predictions, broken down by the
# sensitive attribute values of the test rows.
metric_frame = MetricFrame(
    metrics=fairness_metrics,
    y_true=y_test,
    y_pred=y_pred,
    sensitive_features=s_test,
)

# Heading, blank line, then one row per category.
print("🎯 Fairness Metrics by Category:\n")
print(metric_frame.by_group)

# === Step 6: Highlight Lower TPR Groups ===
# Group every other category is compared against.
REFERENCE_GROUP = 'GEN'
# Absolute TPR shortfall (in rate units, i.e. 0.05 = 5 percentage points)
# beyond which a category is reported.
TPR_GAP_THRESHOLD = 0.05

by_group = metric_frame.by_group
# None means the reference group is absent from the test split.
gen_tpr = by_group.loc[REFERENCE_GROUP, 'TPR'] if REFERENCE_GROUP in by_group.index else None

# Compare against None explicitly: a reference TPR of exactly 0.0 is a valid
# value and must not be mistaken for "reference group missing".
if gen_tpr is not None:
    print("\n🔍 Categories with significantly lower TPR than GEN:")
    for cat in by_group.index:
        if cat == REFERENCE_GROUP:
            continue
        cat_tpr = by_group.loc[cat, 'TPR']
        gap = gen_tpr - cat_tpr
        # Positive gap => this category's TPR is below the reference group's.
        if gap > TPR_GAP_THRESHOLD:
            print(f" - {cat}: TPR = {cat_tpr:.3f} (↓{gap:.3f})")


Collecting fairlearn
  Downloading fairlearn-0.12.0-py3-none-any.whl.metadata (7.0 kB)
Downloading fairlearn-0.12.0-py3-none-any.whl (240 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m240.0/240.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fairlearn
Successfully installed fairlearn-0.12.0


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


🎯 Fairness Metrics by Category:

                  TPR       FPR       FNR  Selection Rate
category                                                 
GEN          0.978852  0.044332  0.021148        0.123748
GEN-EWS      0.939302  0.010559  0.060698        0.428620
GEN-EWS-PWD  1.000000  0.000000  0.000000        1.000000
GEN-PWD      0.951197  0.020000  0.048803        0.910211
OBC-NCL      0.989378  0.036268  0.010622        0.206392
OBC-NCL-PWD  1.000000  0.000000  0.000000        1.000000
SC           0.912835  0.052342  0.087165        0.397249
SC-PWD       1.000000  0.000000  0.000000        1.000000
ST           0.969094  0.061785  0.030906        0.724042
ST-PWD       1.000000  0.000000  0.000000        1.000000

🔍 Categories with significantly lower TPR than GEN:
 - SC: TPR = 0.913 (↓0.066)
