<a href="https://colab.research.google.com/github/kusum83/Pipelines-Projects/blob/main/M515_Ethical_Issues_for_AI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as mpt
import seaborn as sb

In [None]:
# Read the job-applicant CSV (path points at Colab's /content directory) and
# preview the first rows.
csv_path = "/content/job_applicant_dataset.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Job Applicant Name,Age,Gender,Race,Ethnicity,Resume,Job Roles,Job Description,Best Match
0,Daisuke Mori,29,Male,Mongoloid/Asian,Vietnamese,"Proficient in Injury Prevention, Motivation, N...",Fitness Coach,A Fitness Coach is responsible for helping cl...,0
1,Taichi Shimizu,31,Male,Mongoloid/Asian,Filipino,"Proficient in Healthcare, Pharmacology, Medica...",Physician,"Diagnose and treat illnesses, prescribe medica...",0
2,Sarah Martin,46,Female,White/Caucasian,Dutch,"Proficient in Forecasting, Financial Modelling...",Financial Analyst,"As a Financial Analyst, you will be responsibl...",0
3,Keith Hughes,43,Male,Negroid/Black,Caribbean,"Proficient in Budgeting, Supply Chain Optimiza...",Supply Chain Manager,A Supply Chain Manager oversees the entire sup...,1
4,James Davis,49,Male,White/Caucasian,English,"Proficient in Logistics, Negotiation, Procurem...",Supply Chain Manager,A Supply Chain Manager oversees the entire sup...,1


In [None]:
# Remove the "Job Applicant Name" column.
# Reassigning (rather than inplace=True) together with errors="ignore" keeps
# this cell safe to re-run: a second execution is a no-op instead of a KeyError.
df = df.drop(columns=["Job Applicant Name"], errors="ignore")

In [None]:
# Integer-encode the categorical columns in place. One encoder is kept per
# column so the integer codes can later be mapped back to their labels.
categoricals = ["Gender", "Race", "Ethnicity", "Job Roles"]
label_encoders = {column: LabelEncoder() for column in categoricals}
for column, encoder in label_encoders.items():
    df[column] = encoder.fit_transform(df[column])

In [None]:
# TF-IDF Vectorization for text features
# Each free-text column gets its own vectorizer (English stop words removed,
# vocabulary capped at 100 terms), so the two vocabularies stay independent.
# NOTE(review): both vectorizers are fitted on the full dataset, i.e. before
# the train/test split — confirm this is acceptable for the evaluation.
tfidf_resume = TfidfVectorizer(max_features=100, stop_words="english")
resume_features = tfidf_resume.fit_transform(df["Resume"])

tfidf_jobdesc = TfidfVectorizer(max_features=100, stop_words="english")
jobdesc_features = tfidf_jobdesc.fit_transform(df["Job Description"])

In [None]:
# Combine features
# Dense design matrix, column order: the five tabular columns first, then the
# resume TF-IDF terms, then the job-description TF-IDF terms.
x_other = df[["Age", "Gender", "Race", "Ethnicity", "Job Roles"]].to_numpy()
x = np.concatenate(
    [x_other, resume_features.toarray(), jobdesc_features.toarray()],
    axis=1,
)
# Target: the "Best Match" column as a plain NumPy array.
y = df["Best Match"].to_numpy()

In [None]:
# Train-test split
# Hold out 20% of the rows for testing; the fixed seed makes the partition
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# 3. Train Multiple ML Models
# --------------------------
# Candidate classifiers, keyed by the display name used in the printed reports.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Support Vector Machine": SVC(kernel='linear', probability=True)
}

# Per-model summary: overall accuracy, the metrics of class '1', and the raw
# test-set predictions (kept for the later per-group analysis).
results = {}

for name, model in models.items():
    # Fit on the training rows, then score the held-out rows.
    y_pred = model.fit(x_train, y_train).predict(x_test)

    # Only the row for class '1' of the dict-form report is stored.
    positive = classification_report(y_test, y_pred, output_dict=True)['1']
    results[name] = {
        "accuracy": accuracy_score(y_test, y_pred),
        **{metric: positive[metric] for metric in ("precision", "recall", "f1-score")},
        "predictions": y_pred,
    }

    # Human-readable report covering both classes.
    print(f"\nClassification Report - {name}")
    print(classification_report(y_test, y_pred))


Classification Report - Logistic Regression
              precision    recall  f1-score   support

           0       0.66      0.66      0.66      1023
           1       0.65      0.64      0.64       977

    accuracy                           0.65      2000
   macro avg       0.65      0.65      0.65      2000
weighted avg       0.65      0.65      0.65      2000


Classification Report - Random Forest
              precision    recall  f1-score   support

           0       0.87      0.88      0.87      1023
           1       0.87      0.86      0.87       977

    accuracy                           0.87      2000
   macro avg       0.87      0.87      0.87      2000
weighted avg       0.87      0.87      0.87      2000


Classification Report - Support Vector Machine
              precision    recall  f1-score   support

           0       0.69      0.71      0.70      1023
           1       0.69      0.67      0.68       977

    accuracy                           0.69      2000

In [None]:
# 4. Evaluate Bias in Predictions
# --------------------------
def group_accuracy(df, sensitive_attr, predictions, labels):
    """Return the prediction accuracy within each group of a sensitive attribute.

    Parameters
    ----------
    df : pandas.DataFrame
        The rows that were scored (one row per prediction); must contain
        the ``sensitive_attr`` column.
    sensitive_attr : str
        Name of the column to group by, e.g. ``"Gender"``.
    predictions, labels : array-like
        Predicted and true labels, positionally aligned with the rows of ``df``.

    Returns
    -------
    pandas.Series
        Fraction of correct predictions per group, indexed by group value.

    Raises
    ------
    ValueError
        If ``df``, ``predictions`` and ``labels`` do not have the same length.
    """
    # Work on plain arrays so the comparison is positional and cannot be
    # silently re-aligned by a pandas index.
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    if not (len(df) == len(predictions) == len(labels)):
        raise ValueError(
            "df, predictions and labels must have the same length "
            f"(got {len(df)}, {len(predictions)}, {len(labels)})"
        )
    scored = pd.DataFrame({
        sensitive_attr: df[sensitive_attr].to_numpy(),
        "correct": predictions == labels,
    })
    return scored.groupby(sensitive_attr)["correct"].mean()


# y_test is a plain NumPy array (it was built from `.values`), so it has no
# `.index`. Recover the positions of the test rows by splitting the row
# positions with the SAME test_size / random_state as the train-test split:
# for a given number of rows and seed the shuffle is identical.
_, test_positions = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=42
)
df_test = df.iloc[test_positions].copy()

# Guard against the two splits drifting apart (e.g. parameters edited in only
# one place): the recovered rows must carry exactly the held-out labels.
assert np.array_equal(df_test["Best Match"].to_numpy(), np.asarray(y_test)), \
    "recovered test rows do not match y_test - keep split parameters in sync"

# Map the integer codes back to the original category labels so the report
# is readable.
for attr in ("Gender", "Race"):
    df_test[attr] = label_encoders[attr].inverse_transform(df_test[attr])

for name, result in results.items():
    print(f"\nBias Detection - {name}")
    df_test["pred"] = result["predictions"]
    for attr in ("Gender", "Race"):
        print(f"Accuracy by {attr}:")
        print(group_accuracy(df_test, attr, result["predictions"], y_test))