Imports and Data Cleaning

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

*Baseline Representation*

In [3]:
# Switch to your own path
df = pd.read_csv("Crime_Data_from_2020_to_Present.csv")

df

Unnamed: 0,DR_NO,Date Rptd,DATE OCC,TIME OCC,AREA,AREA NAME,Rpt Dist No,Part 1-2,Crm Cd,Crm Cd Desc,...,Status,Status Desc,Crm Cd 1,Crm Cd 2,Crm Cd 3,Crm Cd 4,LOCATION,Cross Street,LAT,LON
0,211507896,04/11/2021 12:00:00 AM,11/07/2020 12:00:00 AM,845,15,N Hollywood,1502,2,354,THEFT OF IDENTITY,...,IC,Invest Cont,354.0,,,,7800 BEEMAN AV,,34.2124,-118.4092
1,201516622,10/21/2020 12:00:00 AM,10/18/2020 12:00:00 AM,1845,15,N Hollywood,1521,1,230,"ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT",...,IC,Invest Cont,230.0,,,,ATOLL AV,N GAULT,34.1993,-118.4203
2,240913563,12/10/2024 12:00:00 AM,10/30/2020 12:00:00 AM,1240,9,Van Nuys,933,2,354,THEFT OF IDENTITY,...,IC,Invest Cont,354.0,,,,14600 SYLVAN ST,,34.1847,-118.4509
3,210704711,12/24/2020 12:00:00 AM,12/24/2020 12:00:00 AM,1310,7,Wilshire,782,1,331,THEFT FROM MOTOR VEHICLE - GRAND ($950.01 AND ...,...,IC,Invest Cont,331.0,,,,6000 COMEY AV,,34.0339,-118.3747
4,201418201,10/03/2020 12:00:00 AM,09/29/2020 12:00:00 AM,1830,14,Pacific,1454,1,420,THEFT FROM MOTOR VEHICLE - PETTY ($950 & UNDER),...,IC,Invest Cont,420.0,,,,4700 LA VILLA MARINA,,33.9813,-118.4350
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1004986,252104112,02/02/2025 12:00:00 AM,02/02/2025 12:00:00 AM,130,21,Topanga,2103,2,946,OTHER MISCELLANEOUS CRIME,...,IC,Invest Cont,946.0,,,,22100 ROSCOE BL,,34.2259,-118.6126
1004987,250404100,02/18/2025 12:00:00 AM,02/18/2025 12:00:00 AM,1000,4,Hollenbeck,479,2,237,CHILD NEGLECT (SEE 300 W.I.C.),...,IC,Invest Cont,237.0,,,,3500 PERCY ST,,34.0277,-118.1979
1004988,251304095,01/31/2025 12:00:00 AM,01/30/2025 12:00:00 AM,1554,13,Newton,1372,2,850,INDECENT EXPOSURE,...,IC,Invest Cont,850.0,,,,300 E 53RD ST,,33.9942,-118.2701
1004989,251704066,01/17/2025 12:00:00 AM,01/17/2025 12:00:00 AM,1600,17,Devonshire,1774,2,624,BATTERY - SIMPLE ASSAULT,...,IC,Invest Cont,624.0,,,,9600 ZELZAH AV,,34.2450,-118.5233


In [5]:
df.drop(columns=[
    'Status', 'Status Desc', 'Crm Cd', 'Crm Cd 1', 'Crm Cd 2', 
    'Crm Cd 3', 'Crm Cd 4', 'DR_NO', 'LOCATION', 'Cross Street'
], inplace=True, errors='ignore')

In [6]:
for col in ['Date Rptd', 'DATE OCC']:
    df[col] = pd.to_datetime(df[col], errors='coerce')

  df[col] = pd.to_datetime(df[col], errors='coerce')
  df[col] = pd.to_datetime(df[col], errors='coerce')


In [7]:
df.dropna(subset=['Crm Cd Desc', 'Vict Age', 'Vict Sex', 'Vict Descent', 'LAT', 'LON'], inplace=True)


In [8]:
df = df[(df['Vict Age'] > 0) & (df['Vict Age'] < 120)]

In [9]:
def simplify_crime_type(desc):
    desc = str(desc).lower()

    # --- Theft / financial crimes ---
    if any(word in desc for word in [
        "theft", "robbery", "burglary", "stolen", "shoplift", "stealing",
        "pickpocket", "purse snatching", "till tap", "forgery", "embezzlement",
        "bunco", "prowler", "larceny", "credit card", "fraud", "counterfeit",
        "document worthless", "stolen property"
    ]):
        return "Theft"

    # --- Assault / violent interpersonal crimes ---
    elif any(word in desc for word in ["assault", "battery", "fighting", "mayhem"]):
        return "Assault"

    # --- Homicide-related ---
    elif any(word in desc for word in ["homicide", "murder", "manslaughter"]):
        return "Homicide"

    # --- Sexual crimes ---
    elif any(word in desc for word in [
        "rape", "sexual", "lewd", "oral copulation", "indecent", "molest",
        "child pornography", "sex", "peeping tom", "pimping"
    ]):
        return "Sexual Crime"

    # --- Property damage / vandalism / trespass ---
    elif any(word in desc for word in [
        "vandalism", "arson", "trespass", "malicious mischief", "graffiti",
        "illegal dumping", "property damage", "telephone"
    ]):
        return "Property Crime"

    # --- Drugs / alcohol ---
    elif any(word in desc for word in ["drunk", "narcotic", "drug", "alcohol", "under influence"]):
        return "Drug/Alcohol"

    # --- Weapons / shootings ---
    elif any(word in desc for word in ["weapon", "firearm", "gun", "shooting", "shots fired"]):
        return "Weapon Offense"

    # --- Kidnapping or abduction ---
    elif any(word in desc for word in ["kidnap", "abduction", "child stealing", "false imprisonment"]):
        return "Kidnapping"

    # --- Threats, harassment, stalking, restraining order, or disruption ---
    elif any(word in desc for word in [
        "threat", "extortion", "stalking", "harass", "intimidation", "restraining order",
        "court order", "contempt", "violation of restraining", "disturbing the peace",
        "disrupt school", "riot", "lynching", "bomb"
    ]):
        return "Threat/Intimidation"

    # --- Vehicle / traffic crimes ---
    elif any(word in desc for word in ["vehicle", "traffic", "hit and run", "failure to yield", "driving"]):
        return "Vehicle Crime"

    # --- Family or child-related crimes ---
    elif any(word in desc for word in [
        "child", "chld", "abandonment", "neglect", "runaway", "custody", "domestic",
        "pandering", "contributing", "cruelty to animals", "family", "disrupt school", "bigamy"
    ]):
        return "Family/Child Issue"

    # --- Cyber / computer crimes ---
    elif any(word in desc for word in ["computer", "cyber", "unauthorized access", "hacking"]):
        return "Cyber Crime"

    # --- Miscellaneous or rare ---
    else:
        return "Other"


In [10]:
# Apply the simplification
df["Crime_Category"] = df["Crm Cd Desc"].apply(simplify_crime_type)

# Check distribution
print(df["Crime_Category"].value_counts())

Crime_Category
Theft                  367298
Assault                197608
Property Crime          76724
Weapon Offense          35269
Threat/Intimidation     25905
Sexual Crime            19388
Other                    4713
Family/Child Issue       3985
Homicide                 1552
Vehicle Crime            1387
Kidnapping               1260
Cyber Crime               463
Drug/Alcohol               44
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Crime_Category"] = df["Crm Cd Desc"].apply(simplify_crime_type)


In [11]:
df["Crime_Category"] = df["Crm Cd Desc"].apply(simplify_crime_type)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Crime_Category"] = df["Crm Cd Desc"].apply(simplify_crime_type)


In [12]:
df["OCC_Year"] = df["DATE OCC"].dt.year
df["OCC_Month"] = df["DATE OCC"].dt.month
df["OCC_Day"] = df["DATE OCC"].dt.day
df["OCC_Weekday"] = df["DATE OCC"].dt.day_name()
df["OCC_Hour"] = df["TIME OCC"] // 100

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["OCC_Year"] = df["DATE OCC"].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["OCC_Month"] = df["DATE OCC"].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["OCC_Day"] = df["DATE OCC"].dt.day
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row

In [13]:
demo_cols = ["Vict Age", "Vict Sex", "Vict Descent"]
contextual_cols = ["AREA", "AREA NAME", "Rpt Dist No"]

In [14]:
model_cols = demo_cols + contextual_cols + [
    "Part 1-2", "OCC_Year", "OCC_Month", "OCC_Weekday", "OCC_Hour"
]

df_model = df[model_cols + ["Crime_Category"]].dropna().copy()

In [15]:
print("Shape:", df_model.shape)
print("\nColumns ready for modeling:\n", df_model.columns.tolist())
print("\nTarget distribution:\n", df_model["Crime_Category"].value_counts())

Shape: (735596, 12)

Columns ready for modeling:
 ['Vict Age', 'Vict Sex', 'Vict Descent', 'AREA', 'AREA NAME', 'Rpt Dist No', 'Part 1-2', 'OCC_Year', 'OCC_Month', 'OCC_Weekday', 'OCC_Hour', 'Crime_Category']

Target distribution:
 Crime_Category
Theft                  367298
Assault                197608
Property Crime          76724
Weapon Offense          35269
Threat/Intimidation     25905
Sexual Crime            19388
Other                    4713
Family/Child Issue       3985
Homicide                 1552
Vehicle Crime            1387
Kidnapping               1260
Cyber Crime               463
Drug/Alcohol               44
Name: count, dtype: int64


Pre-processing Steps

In [16]:
X = df_model.drop(columns=["Crime_Category"])
y = df_model["Crime_Category"]

In [17]:
cat_cols = [c for c in X.columns if X[c].dtype == "object"]
num_cols = [c for c in X.columns if np.issubdtype(X[c].dtype, np.number)]

In [18]:
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
])

In [19]:
# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, stratify=y, random_state=42)

Model 1: Logistic Regression

In [20]:
logreg_pipe = Pipeline([
    ("pre", preprocessor),
    ("model", LogisticRegression(max_iter=300, n_jobs=-1))
])

In [21]:
logreg_pipe.fit(X_train, y_train)
y_pred_logreg = logreg_pipe.predict(X_test)

In [22]:
print("Logistic Regression Results:")
print(classification_report(y_test, y_pred_logreg, zero_division=0))
print("Accuracy:", round(accuracy_score(y_test, y_pred_logreg), 4))

Logistic Regression Results:
                     precision    recall  f1-score   support

            Assault       0.42      0.57      0.49     39522
        Cyber Crime       0.00      0.00      0.00        92
       Drug/Alcohol       0.00      0.00      0.00         9
 Family/Child Issue       0.49      0.36      0.41       797
           Homicide       0.00      0.00      0.00       310
         Kidnapping       0.00      0.00      0.00       252
              Other       0.00      0.00      0.00       943
     Property Crime       0.38      0.12      0.19     15345
       Sexual Crime       0.00      0.00      0.00      3878
              Theft       0.71      0.85      0.77     73460
Threat/Intimidation       0.00      0.00      0.00      5181
      Vehicle Crime       0.00      0.00      0.00       277
     Weapon Offense       0.00      0.00      0.00      7054

           accuracy                           0.59    147120
          macro avg       0.15      0.15      0.14    

Model 2: Gradient Boost

In [23]:
gb_pipe = Pipeline([
    ("pre", preprocessor),
    ("model", GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42))
])

In [24]:
gb_pipe.fit(X_train, y_train)
y_pred_gb = gb_pipe.predict(X_test)

KeyboardInterrupt: 

In [None]:
print("Gradient Boosting Results")
print(classification_report(y_test, y_pred_gb, zero_division=0))
print("Accuracy:", round(accuracy_score(y_test, y_pred_gb), 4))

Model 3: Random Forest

In [None]:
rf_pipe = Pipeline([
    ("pre", preprocessor),
    ("model", RandomForestClassifier(
        n_estimators=150, max_depth=15, n_jobs=-1, random_state=42))
])

In [None]:
rf_pipe.fit(X_train, y_train)
y_pred_rf = rf_pipe.predict(X_test)

In [None]:
print("Random Forest Results")
print(classification_report(y_test, y_pred_rf, zero_division=0))
print("Accuracy:", round(accuracy_score(y_test, y_pred_rf), 4))

Fairness Evaluation 

In [None]:
# Map LAPD Vict Descent codes to full subgroup names (make reading fairness easier)
descent_mapping = {
    "A": "Other Asian",
    "B": "Black",
    "C": "Chinese",
    "D": "Cambodian",
    "F": "Filipino",
    "G": "Guamanian",
    "H": "Hispanic/Latino",
    "I": "American Indian/Alaskan Native",
    "J": "Japanese",
    "K": "Korean",
    "L": "Laotian",
    "O": "Other",
    "P": "Pacific Islander",
    "S": "Samoan",
    "U": "Hawaiian",
    "V": "Vietnamese",
    "W": "White",
    "X": "Unknown",
    "Z": "Asian Indian"
}

df["Vict Descent Full"] = df["Vict Descent"].map(descent_mapping).fillna("Unknown")

In [None]:
# Logistic Regression Fairness
df_logisticreg = pd.DataFrame({
    "y_true" : y_test,
    "y_pred" : y_pred_logreg,
    "group" :  df.loc[y_test.index, "Vict Descent Full"]
})

acc_by_group = df_logisticreg.groupby("group").apply(lambda x: accuracy_score(x.y_true, x.y_pred))
gap = acc_by_group.max() - acc_by_group.min()

print("Fairness Results — Logistic Regression")
print(acc_by_group.sort_values(ascending=False).round(3))
print(f"\nSubgroup Accuracy Gap: {gap:.3f}")

In [None]:
# Gradient Boost Fairness
df_gb = pd.DataFrame({
    "y_true": y_test,
    "y_pred": y_pred_gb,
    "group": df.loc[y_test.index, "Vict Descent Full"]
})

acc_by_group_gb = df_gb.groupby("group").apply(lambda x: accuracy_score(x.y_true, x.y_pred))
gap_gb = acc_by_group_gb.max() - acc_by_group_gb.min()

print("Fairness Results — Gradient Boosting")
print(acc_by_group_gb.sort_values(ascending=False).round(3))
print(f"\nSubgroup Accuracy Gap: {gap_gb:.3f}")

In [None]:
# Random Forest Fairness
df_rf = pd.DataFrame({
    "y_true": y_test,
    "y_pred": y_pred_rf,
    "group": df.loc[y_test.index, "Vict Descent Full"]
})

acc_by_group_rf = df_rf.groupby("group").apply(lambda x: accuracy_score(x.y_true, x.y_pred))
gap_rf = acc_by_group_rf.max() - acc_by_group_rf.min()

print("Fairness Results — Random Forest")
print(acc_by_group_rf.sort_values(ascending=False).round(3))
print(f"\nSubgroup Accuracy Gap: {gap_rf:.3f}")

In [None]:
# Overall
fairness_summary = pd.DataFrame({
    "Model": ["Logistic Regression", "Random Forest", "Gradient Boosting"],
    "Subgroup Accuracy Gap": [gap, gap_rf, gap_gb]
}).sort_values("Subgroup Accuracy Gap")

print("\nOverall Fairness Comparison:")
print(fairness_summary.round(3))

Feature Importance

In [None]:
# Logistic Regression 
log_model = logreg_pipe.named_steps["model"]
pre = logreg_pipe.named_steps["pre"]

coefs = log_model.coef_[0]  # one-vs-rest (first class)
feature_names = pre.get_feature_names_out()

idx = np.argsort(np.abs(coefs))[-10:]
plt.figure(figsize=(8,6))
plt.barh(np.array(feature_names)[idx], np.abs(coefs[idx]))
plt.title("Logistic Regression — Top 15 Coefficients (by Magnitude)")
plt.xlabel("|Coefficient|")
plt.tight_layout()
plt.show()

In [None]:
# Gradient Boost 
gb_model = gb_pipe.named_steps["model"]
pre = gb_pipe.named_steps["pre"]

feature_names = pre.get_feature_names_out()
importances = gb_model.feature_importances_

idx = np.argsort(importances)[-10:]
plt.figure(figsize=(8,6))
plt.barh(np.array(feature_names)[idx], importances[idx])
plt.title("Gradient Boosting — Top 15 Feature Importances")
plt.xlabel("Importance")
plt.tight_layout()
plt.show()



In [None]:
rf_model = rf_pipe.named_steps["model"]
pre = rf_pipe.named_steps["pre"]

feature_names = pre.get_feature_names_out()
importances = rf_model.feature_importances_

# Sort and select top 15
idx = np.argsort(importances)[-10:]
plt.figure(figsize=(8,6))
plt.barh(np.array(feature_names)[idx], importances[idx])
plt.title("Random Forest — Top 15 Feature Importances")
plt.xlabel("Importance")
plt.tight_layout()
plt.show()

In [None]:
# Overall Analysis
top_features_rf = pd.DataFrame({
    "Feature": np.array(feature_names)[idx],
    "Importance": importances[idx]
}).sort_values("Importance", ascending=False)

display(top_features_rf)


*Improved Representation*

In [None]:
df_imp = df.copy()

In [None]:
# Age bins (interpretable)
df_imp["Vict Age Bin"] = pd.cut(
    df_imp["Vict Age"],
    bins=[0, 17, 29, 49, 64, 120],
    labels=["0-17","18-29","30-49","50-64","65+"],
    right=True
)

In [None]:
# Group rare AREA NAMEs
min_freq = 0.01  
area_counts = df_imp["AREA NAME"].value_counts(normalize=True)
rare_areas = area_counts[area_counts < min_freq].index
df_imp["AREA NAME Grouped"] = df_imp["AREA NAME"].where(~df_imp["AREA NAME"].isin(rare_areas), "Other")

In [None]:
# Same datetime features as baseline
df_imp["OCC_Year"] = df_imp["DATE OCC"].dt.year
df_imp["OCC_Month"] = df_imp["DATE OCC"].dt.month
df_imp["OCC_Weekday"] = df_imp["DATE OCC"].dt.day_name()
df_imp["OCC_Hour"] = df_imp["TIME OCC"] // 100

In [None]:
demo_cols_imp = ["Vict Age Bin"]
context_cols_imp = ["AREA", "AREA NAME Grouped", "Rpt Dist No"]

In [None]:
model_cols_imp = demo_cols_imp + context_cols_imp + ["Part 1-2","OCC_Year","OCC_Month","OCC_Weekday","OCC_Hour"]
df_model_improved = df_imp[model_cols_imp + ["Crime_Category","Vict Descent Full"]].dropna().copy()

In [None]:
X_imp = df_model_improved.drop(columns=["Crime_Category", "Vict Descent Full"])
y_imp = df_model_improved["Crime_Category"]
sens_imp = df_model_improved["Vict Descent Full"]

In [None]:
Xtr_imp, Xte_imp, ytr_imp, yte_imp = train_test_split(
    X_imp, y_imp, test_size=0.2, stratify=y_imp, random_state=42
)

Preprocessing

In [None]:
cat_cols_imp = [
    c for c in X_imp.columns 
    if pd.api.types.is_object_dtype(X_imp[c]) or pd.api.types.is_categorical_dtype(X_imp[c])
]

num_cols_imp = [
    c for c in X_imp.columns 
    if pd.api.types.is_numeric_dtype(X_imp[c])
]

In [None]:
pre_imp = ColumnTransformer([
    ("num", StandardScaler(), num_cols_imp),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols_imp)
])

Model 1: Linear Regression

In [None]:
logreg_imp = Pipeline([
    ("pre", pre_imp),
    ("model", LogisticRegression(max_iter=300, n_jobs=-1))
])

In [None]:
logreg_imp.fit(Xtr_imp, ytr_imp)
y_pred_log_imp = logreg_imp.predict(Xte_imp)

In [None]:
print("Linear Regression (Improved)")
print(classification_report(yte_imp, y_pred_log_imp, zero_division=0))
print("Accuracy:", round(accuracy_score(yte_imp, y_pred_log_imp), 4))

Model 2: Gradient Boost 

In [None]:
gb_imp = Pipeline([
    ("pre", pre_imp),
    ("model", GradientBoostingClassifier(
        n_estimators=100, max_depth=3, learning_rate=0.1, random_state=42))
])

In [None]:
gb_imp.fit(Xtr_imp, ytr_imp)
y_pred_gb_imp = gb_imp.predict(Xte_imp)

In [None]:
print("Gradient Boost (Improved)")
print(classification_report(yte_imp, y_pred_gb_imp, zero_division=0))
print("Accuracy:", round(accuracy_score(yte_imp, y_pred_gb_imp), 4))

Model 3: Random Forest (Improved)

In [None]:
rf_imp = Pipeline([
    ("pre", pre_imp),
    ("model", RandomForestClassifier(
        n_estimators=150, max_depth=15, n_jobs=-1, random_state=42))
])

In [None]:
rf_imp.fit(Xtr_imp, ytr_imp)
y_pred_rf_imp = rf_imp.predict(Xte_imp)

In [None]:
print("Random Forest (Improved)")
print(classification_report(yte_imp, y_pred_rf_imp, zero_division=0))
print("Accuracy:", round(accuracy_score(yte_imp, y_pred_rf_imp), 4))

Fairness Report

In [None]:
# Logistic Regression Fairness (Improved)
df_logreg_imp = pd.DataFrame({
    "y_true": yte_imp,
    "y_pred": y_pred_log_imp,
    "group": sens_imp.loc[yte_imp.index]
})

acc_by_group = df_logreg_imp.groupby("group").apply(lambda x: accuracy_score(x.y_true, x.y_pred))
gap_log_imp = acc_by_group.max() - acc_by_group.min()

print("Fairness Results — Logistic Regression (Improved)")
print(acc_by_group.sort_values(ascending=False).round(3))
print(f"\nSubgroup Accuracy Gap: {gap:.3f}")

In [None]:
# Gradient Boosting Fairness (Improved)
df_gb_imp = pd.DataFrame({
    "y_true": yte_imp,
    "y_pred": y_pred_gb_imp,
    "group": sens_imp.loc[yte_imp.index]
})

acc_by_group = df_gb_imp.groupby("group").apply(lambda x: accuracy_score(x.y_true, x.y_pred))
gap_gb_imp = acc_by_group.max() - acc_by_group.min()

print("Fairness Results — Gradient Boosting (Improved)")
print(acc_by_group.sort_values(ascending=False).round(3))
print(f"\nSubgroup Accuracy Gap: {gap:.3f}")

In [None]:
# Random Forest Fairness (Improved)
df_rf_imp = pd.DataFrame({
    "y_true": yte_imp,
    "y_pred": y_pred_rf_imp,
    "group": sens_imp.loc[yte_imp.index]
})

acc_by_group = df_rf_imp.groupby("group").apply(lambda x: accuracy_score(x.y_true, x.y_pred))
gap_rf_imp = acc_by_group.max() - acc_by_group.min()

print("Fairness Results — Random Forest (Improved)")
print(acc_by_group.sort_values(ascending=False).round(3))
print(f"\nSubgroup Accuracy Gap: {gap:.3f}")

In [None]:
# Overall Fairness Summary
fairness_summary = pd.DataFrame({
    "Model": ["Logistic Regression (Improved)", "Random Forest (Improved)", "Gradient Boosting (Improved)"],
    "Subgroup Accuracy Gap": [gap_log_imp, gap_rf_imp, gap_gb_imp]
}).sort_values("Subgroup Accuracy Gap")

print("\nOverall Fairness Comparison:")
print(fairness_summary.round(3))


Feature Importance

In [None]:
# Logistic Regression
log_model = logreg_imp.named_steps["model"]
pre = logreg_imp.named_steps["pre"]

feature_names = pre.get_feature_names_out()
coefs = log_model.coef_[0]

idx = np.argsort(np.abs(coefs))[-10:]
plt.figure(figsize=(8,6))
plt.barh(np.array(feature_names)[idx], np.abs(coefs[idx]))
plt.title("Logistic Regression (Improved) — Top 15 Coefficients by Magnitude")
plt.xlabel("|Coefficient|")
plt.tight_layout()
plt.show()

In [None]:
# Gradient Boosting
gb_model = gb_imp.named_steps["model"]
pre = gb_imp.named_steps["pre"]

feature_names = pre.get_feature_names_out()
importances = gb_model.feature_importances_

idx = np.argsort(importances)[-10:]
plt.figure(figsize=(8,6))
plt.barh(np.array(feature_names)[idx], importances[idx])
plt.title("Gradient Boosting (Improved) — Top 15 Feature Importances")
plt.xlabel("Importance")
plt.tight_layout()
plt.show()

In [None]:
rf_model = rf_imp.named_steps["model"]
pre = rf_imp.named_steps["pre"]

feature_names = pre.get_feature_names_out()
importances = rf_model.feature_importances_

idx = np.argsort(importances)[-10:]
plt.figure(figsize=(8,6))
plt.barh(np.array(feature_names)[idx], importances[idx])
plt.title("Random Forest (Improved) — Top 15 Feature Importances")
plt.xlabel("Importance")
plt.tight_layout()
plt.show()


*Comparing Baseline to Improved*

In [None]:
report = classification_report(yte_imp, y_pred_log_imp, output_dict=True, zero_division=0)
f1_log_imp = report["weighted avg"]["f1-score"]
acc_log_imp = report["accuracy"]

In [None]:
# Extracting Report from Baseline

# Logistic Regression (Baseline)
report_log = classification_report(y_test, y_pred_logreg, output_dict=True, zero_division=0)
acc_log = report_log["accuracy"]
f1_log = report_log["weighted avg"]["f1-score"]

# Random Forest (Baseline)
report_rf = classification_report(y_test, y_pred_rf, output_dict=True, zero_division=0)
acc_rf = report_rf["accuracy"]
f1_rf = report_rf["weighted avg"]["f1-score"]

# Gradient Boosting (Baseline)
report_gb = classification_report(y_test, y_pred_gb, output_dict=True, zero_division=0)
acc_gb = report_gb["accuracy"]
f1_gb = report_gb["weighted avg"]["f1-score"]

In [None]:
# Extracting Report from Improved

# Logistic Regression (Improved)
report_log_imp = classification_report(yte_imp, y_pred_log_imp, output_dict=True, zero_division=0)
acc_log_imp = report_log_imp["accuracy"]
f1_log_imp = report_log_imp["weighted avg"]["f1-score"]

# Random Forest (Improved)
report_rf_imp = classification_report(yte_imp, y_pred_rf_imp, output_dict=True, zero_division=0)
acc_rf_imp = report_rf_imp["accuracy"]
f1_rf_imp = report_rf_imp["weighted avg"]["f1-score"]

# Gradient Boosting (Improved)
report_gb_imp = classification_report(yte_imp, y_pred_gb_imp, output_dict=True, zero_division=0)
acc_gb_imp = report_gb_imp["accuracy"]
f1_gb_imp = report_gb_imp["weighted avg"]["f1-score"]

In [None]:
df_logisticreg = pd.DataFrame({
    "y_true" : y_test,
    "y_pred" : y_pred_logreg,
    "group" :  df.loc[y_test.index, "Vict Descent Full"]
})
acc_by_group = df_logisticreg.groupby("group").apply(lambda x: accuracy_score(x.y_true, x.y_pred))
gap = acc_by_group.max() - acc_by_group.min()


In [None]:
gap_log = gap   # after Logistic Regression baseline fairness
gap_rf = gap    # after Random Forest baseline fairness
gap_gb = gap    # after Gradient Boosting baseline fairness


In [None]:
comparison_df = pd.DataFrame({
    "Model": [
        "Logistic Regression (Baseline)", "Logistic Regression (Improved)",
        "Random Forest (Baseline)", "Random Forest (Improved)",
        "Gradient Boosting (Baseline)", "Gradient Boosting (Improved)"
    ],
    "Accuracy": [
        acc_log, acc_log_imp,
        acc_rf, acc_rf_imp,
        acc_gb, acc_gb_imp
    ],
    "F1-Score": [
        f1_log, f1_log_imp,
        f1_rf, f1_rf_imp,
        f1_gb, f1_gb_imp
    ],
    "Fairness Gap": [
        gap_log, gap_log_imp,
        gap_rf, gap_rf_imp,
        gap_gb, gap_gb_imp
    ]
})
comparison_df.round(3)


In [None]:
# Baseline vs Improved Visualizations

models = ["LogReg", "RandomForest", "GradBoost"]
x = np.arange(len(models))
width = 0.35

acc_baseline = [0.594, 0.613, 0.616]
acc_improved = [0.585, 0.608, 0.607]

f1_baseline = [0.539, 0.548, 0.565]
f1_improved = [0.523, 0.541, 0.548]

gap_baseline = [0.390, 0.390, 0.390]
gap_improved = [0.383, 0.425, 0.362]

fig, axes = plt.subplots(1, 3, figsize=(15,5))
titles = ["Accuracy", "F1-Score", "Fairness Gap (Lower = Fairer)"]
metrics = [(acc_baseline, acc_improved),
           (f1_baseline, f1_improved),
           (gap_baseline, gap_improved)]

for ax, (baseline, improved), title in zip(axes, metrics, titles):
    ax.bar(x - width/2, baseline, width, label="Baseline", alpha=0.8)
    ax.bar(x + width/2, improved, width, label="Improved", alpha=0.8)
    ax.set_xticks(x)
    ax.set_xticklabels(models)
    ax.set_title(title)
    ax.legend()
    ax.grid(axis='y', linestyle='--', alpha=0.5)

plt.suptitle("Baseline vs Improved — Model Performance and Fairness", fontsize=14)
plt.tight_layout()
plt.show()
