In [9]:
import pandas as pd

In [20]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import OneHotEncoder

# Load the data
df = pd.read_csv('processed_data.csv')

# Define the sensitive attributes
sensitive_attributes = ['race1', 'gender']

# Assuming 'bar' is the target column
X = df.drop(columns=['target'])  # Features
y = df['target']  # Target

# Define one-hot encoding for race1 (adjust based on your actual columns)
race1_cols = ['race1_asian', 'race1_black', 'race1_hisp', 'race1_other', 'race1_white']

# Define logistic regression model with balanced class weights
log_reg = LogisticRegression(max_iter=200, class_weight='balanced')

# Define the parameter grid for grid search
param_grid = {
    'penalty': ['l2'],
    'C': [0.1, 1, 10],
    'solver': ['liblinear', 'lbfgs'],
}

# Set up cross-validation and grid search
kf = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(estimator=log_reg, param_grid=param_grid, cv=kf, scoring='accuracy')

# Store results for each subgroup
results = []

# Train and evaluate the model for each sensitive attribute
for attribute in sensitive_attributes:
    print(f"\nEvaluation for attribute: {attribute}\n")
    
    # Subset X and y for the current attribute
    if attribute == 'race1':
        X_group = X[race1_cols]  # Subset features for race1
    else:
        X_group = X[[attribute]]  # Subset features for gender
    
    y_group = y  # Use the entire target variable for simplicity
    
    # Perform grid search with cross-validation
    grid_search.fit(X_group, y_group)
    
    # Get the best model and its performance metrics
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_group)
    
    # Calculate evaluation metrics: accuracy, sensitivity (TPR), specificity (TNR)
    accuracy = accuracy_score(y_group, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_group, y_pred).ravel()
    tpr = tp / (tp + fn)  # True Positive Rate (Sensitivity)
    tnr = tn / (tn + fp)  # True Negative Rate (Specificity)
    
    # Print and store results for each subgroup
    print(f"Accuracy: {accuracy}")
    print(f"True Positive Rate (Sensitivity): {tpr}")
    print(f"True Negative Rate (Specificity): {tnr}")
    print()
    
    # Store results in a structured format for further analysis
    results.append({
        'Attribute': attribute,
        'Accuracy': accuracy,
        'Sensitivity': tpr,
        'Specificity': tnr
    })
    
    # Print classification report for more detailed metrics
    print(f"Classification Report for {attribute}:")
    print(classification_report(y_group, y_pred))
    
# Convert results to DataFrame for further analysis or visualization
results_df = pd.DataFrame(results)

# Display the results DataFrame
print("Results for each attribute:")
print(results_df)



Evaluation for attribute: race1

Accuracy: 0.8318382648279555
True Positive Rate (Sensitivity): 0.8522930596101328
True Negative Rate (Specificity): 0.4602224123182207

Classification Report for race1:
              precision    recall  f1-score   support

           0       0.15      0.46      0.22      1169
           1       0.97      0.85      0.91     21238

    accuracy                           0.83     22407
   macro avg       0.56      0.66      0.56     22407
weighted avg       0.92      0.83      0.87     22407


Evaluation for attribute: gender

Accuracy: 0.5600035703128486
True Positive Rate (Sensitivity): 0.5640832470100763
True Negative Rate (Specificity): 0.48588537211291705

Classification Report for gender:
              precision    recall  f1-score   support

           0       0.06      0.49      0.10      1169
           1       0.95      0.56      0.71     21238

    accuracy                           0.56     22407
   macro avg       0.51      0.52      0.41   