# Auto Identification of Beneficiaries - Fairness Audit

**Use Case:** AI-PLATFORM-03 - Auto Identification of Beneficiaries  
**Objective:** Audit eligibility identification for demographic fairness and bias detection  
**MLflow Experiment:** `smart/identification_beneficiary/*`

## Overview

This notebook performs fairness audits on:
- Eligibility rates by demographic groups (caste, gender, geography)
- ML score distributions across protected attributes
- Rule engine bias detection
- Hybrid evaluator fairness metrics
- Geographic coverage equity
- Scheme-wise fairness analysis


In [None]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yaml
import warnings
from scipy import stats
warnings.filterwarnings('ignore')

# Make the shared utilities importable: they live under <project_root>/shared/utils,
# where project_root is four directory levels above the current working directory.
project_root = Path().absolute().parent.parent.parent.parent
shared_utils_dir = project_root / 'shared' / 'utils'
sys.path.append(str(shared_utils_dir))
from db_connector import DBConnector  # must come after the sys.path tweak above

# Plot appearance used by every chart in this notebook
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (14, 7)})

# Read the database settings from config/db_config.yaml (two levels above the cwd)
config_path = Path().absolute().parent.parent / "config" / "db_config.yaml"
with config_path.open('r') as f:
    config = yaml.safe_load(f)

# Open the database connection using the "database" section of the config
db_settings = config['database']
db = DBConnector(
    host=db_settings['host'],
    port=db_settings['port'],
    database=db_settings['name'],
    user=db_settings['user'],
    password=db_settings['password'],
)
db.connect()

# Report where we are connected (the schema is read from the same config section)
print("‚úÖ Connected to database")
print(f"   Database: {db_settings['name']} at {db_settings['host']}:{db_settings['port']}")
print(f"   Schema: {db_settings['schema']}")


## 1. Demographic Fairness Analysis


In [None]:
# Analyze eligibility by demographic groups (combining with Golden Records if available)


def caste_outcome_table(group_df):
    """Build the caste x outcome contingency table in units of candidates.

    Args:
        group_df: Aggregated frame with one row per caste/gender/scheme
            combination and the columns ``caste_group``, ``eligible_count``
            and ``not_eligible_count``.

    Returns:
        DataFrame indexed by caste group with the summed ``eligible_count`` and
        ``not_eligible_count``. Castes without any decided candidate and outcome
        columns that are zero everywhere are removed, because
        ``scipy.stats.chi2_contingency`` raises on zero expected frequencies.
    """
    table = group_df.groupby('caste_group')[['eligible_count', 'not_eligible_count']].sum()
    table = table.loc[table.sum(axis=1) > 0]
    return table.loc[:, table.sum(axis=0) > 0]


def weighted_group_mean(group_df, value_col, by='caste_group', weight_col='total_candidates'):
    """Average per-row means into per-group means, weighted by candidate count.

    A plain mean of the per-row averages would give a combination with 5
    candidates the same influence as one with 5,000.

    Args:
        group_df: Aggregated frame (one row per demographic/scheme combination).
        value_col: Column holding the per-row average (e.g. ``avg_ml_score``).
        by: Column to group on.
        weight_col: Column holding the number of candidates behind each row.

    Returns:
        Series indexed by group. Rows whose average is missing contribute
        nothing; a group with no usable rows yields NaN.
    """
    # astype(float) also normalises None / Decimal values a DB driver may return.
    values = group_df[value_col].astype(float)
    # NOTE(review): the weight is the row's total candidate count; if only some
    # candidates in a row have a score this is an approximation - confirm.
    weights = group_df[weight_col].where(values.notna(), 0)
    keys = group_df[by]
    weighted_sum = (values.fillna(0) * weights).groupby(keys).sum()
    return weighted_sum / weights.groupby(keys).sum()


try:
    # First, try to get data from eligibility snapshots joined with golden records.
    # No LIMIT: truncating by total_candidates would drop exactly the smallest
    # demographic groups and distort every aggregate computed below.
    fairness_query = """
    SELECT 
        CASE 
            WHEN gr.caste_id = 1 THEN 'GEN'
            WHEN gr.caste_id = 2 THEN 'OBC'
            WHEN gr.caste_id = 3 THEN 'SC'
            WHEN gr.caste_id = 4 THEN 'ST'
            ELSE 'OTHER'
        END as caste_group,
        CASE 
            WHEN gr.gender = 'M' THEN 'Male'
            WHEN gr.gender = 'F' THEN 'Female'
            ELSE 'Other'
        END as gender_group,
        es.scheme_code,
        COUNT(*) as total_candidates,
        COUNT(*) FILTER (WHERE es.rule_status = 'ELIGIBLE') as eligible_count,
        COUNT(*) FILTER (WHERE es.rule_status = 'NOT_ELIGIBLE') as not_eligible_count,
        AVG(es.ml_score) as avg_ml_score,
        AVG(es.hybrid_score) as avg_hybrid_score
    FROM eligibility.eligibility_snapshots es
    LEFT JOIN golden_record.golden_records gr ON es.beneficiary_id = gr.gr_id
    WHERE gr.caste_id IS NOT NULL
    GROUP BY caste_group, gender_group, es.scheme_code
    ORDER BY total_candidates DESC
    """
    fairness_df = pd.read_sql(fairness_query, db.connection)

    if not fairness_df.empty:
        print("üìä Demographic Fairness Analysis:")
        print(f"   Found {len(fairness_df)} demographic-group-scheme combinations")

        # Calculate eligibility rates (total_candidates is COUNT(*) per group, so >= 1)
        fairness_df['eligibility_rate'] = fairness_df['eligible_count'] / fairness_df['total_candidates']

        # Visualization
        fig, axes = plt.subplots(2, 2, figsize=(16, 12))

        # Eligibility rate by caste (sum counts first, then divide, so the rate is
        # candidate-weighted across genders and schemes)
        caste_eligibility = fairness_df.groupby('caste_group').agg({
            'eligible_count': 'sum',
            'total_candidates': 'sum'
        })
        caste_eligibility['eligibility_rate'] = caste_eligibility['eligible_count'] / caste_eligibility['total_candidates']
        axes[0, 0].bar(caste_eligibility.index, caste_eligibility['eligibility_rate'].values,
                      color=['blue', 'green', 'orange', 'red', 'purple'], alpha=0.7)
        axes[0, 0].set_title('Eligibility Rate by Caste Group')
        axes[0, 0].set_ylabel('Eligibility Rate')
        axes[0, 0].set_ylim(0, 1)

        # Eligibility rate by gender
        gender_eligibility = fairness_df.groupby('gender_group').agg({
            'eligible_count': 'sum',
            'total_candidates': 'sum'
        })
        gender_eligibility['eligibility_rate'] = gender_eligibility['eligible_count'] / gender_eligibility['total_candidates']
        axes[0, 1].bar(gender_eligibility.index, gender_eligibility['eligibility_rate'].values,
                      color=['blue', 'pink', 'gray'], alpha=0.7)
        axes[0, 1].set_title('Eligibility Rate by Gender')
        axes[0, 1].set_ylabel('Eligibility Rate')
        axes[0, 1].set_ylim(0, 1)

        # ML Score by caste, weighted by the number of candidates behind each row
        caste_scores = weighted_group_mean(fairness_df, 'avg_ml_score')
        axes[1, 0].bar(caste_scores.index, caste_scores.values,
                      color=['blue', 'green', 'orange', 'red', 'purple'], alpha=0.7)
        axes[1, 0].set_title('Average ML Score by Caste Group')
        axes[1, 0].set_ylabel('Average ML Score')

        # Hybrid Score by caste, weighted the same way
        caste_hybrid = weighted_group_mean(fairness_df, 'avg_hybrid_score')
        axes[1, 1].bar(caste_hybrid.index, caste_hybrid.values,
                      color=['blue', 'green', 'orange', 'red', 'purple'], alpha=0.7)
        axes[1, 1].set_title('Average Hybrid Score by Caste Group')
        axes[1, 1].set_ylabel('Average Hybrid Score')

        plt.tight_layout()
        plt.show()

        # Statistical tests for fairness
        print("\nüìà Statistical Fairness Tests:")
        print("="*60)

        # Chi-square test for independence (caste vs eligibility).
        # The table must hold candidate counts; cross-tabulating the aggregated
        # rows would only count caste/gender/scheme combinations.
        contingency_caste = caste_outcome_table(fairness_df)
        print("\nCaste vs Eligibility (Chi-square test):")
        if min(contingency_caste.shape) < 2:
            # Fewer than two castes or only one outcome present: nothing to compare.
            print("   Skipped: need at least two caste groups and both outcomes present")
        else:
            chi2_caste, p_value_caste, dof_caste, expected_caste = stats.chi2_contingency(contingency_caste)
            print(f"   Chi-square statistic: {chi2_caste:.4f}")
            print(f"   P-value: {p_value_caste:.4f}")
            print(f"   {'‚ö†Ô∏è Significant bias detected' if p_value_caste < 0.05 else '‚úÖ No significant bias detected'} (Œ±=0.05)")

    else:
        print("‚ö†Ô∏è No demographic fairness data found.")
        print("   This may require joining with Golden Records data.")
        print("   Ensure eligibility snapshots have beneficiary_id linking to golden_records.")
except Exception as e:
    # Deliberate best-effort: the notebook should keep running when the tables
    # are missing, so the error is reported rather than raised.
    print(f"‚ö†Ô∏è Demographic fairness analysis error: {e}")
    print("   This is expected if eligibility snapshots haven't been created yet or")
    print("   if the data structure doesn't include demographic information.")


## 2. Geographic Equity Analysis


In [None]:
# Geographic equity: compare eligibility outcomes district by district.
# Only districts with at least 10 snapshot rows are returned by the query.
try:
    geo_fairness_query = """
    SELECT 
        district_id,
        COUNT(*) as total_candidates,
        COUNT(*) FILTER (WHERE rule_status = 'ELIGIBLE') as eligible_count,
        AVG(hybrid_score) as avg_hybrid_score,
        STDDEV(hybrid_score) as std_hybrid_score
    FROM eligibility.eligibility_snapshots
    WHERE district_id IS NOT NULL
    GROUP BY district_id
    HAVING COUNT(*) >= 10
    ORDER BY total_candidates DESC
    """
    geo_fairness_df = pd.read_sql(geo_fairness_query, db.connection)

    if geo_fairness_df.empty:
        print("‚ö†Ô∏è No geographic fairness data found.")
    else:
        # Share of candidates per district whose rule_status is ELIGIBLE
        geo_fairness_df['eligibility_rate'] = (
            geo_fairness_df['eligible_count'] / geo_fairness_df['total_candidates']
        )
        rates = geo_fairness_df['eligibility_rate']
        rate_mean = rates.mean()
        rate_std = rates.std()

        print(f"üó∫Ô∏è Geographic Equity Analysis ({len(geo_fairness_df)} districts):")
        print(f"   Average Eligibility Rate: {rate_mean:.3f}")
        print(f"   Std Dev of Eligibility Rate: {rate_std:.3f}")
        print(f"   Coefficient of Variation: {rate_std / rate_mean:.3f}")

        # 2x2 dashboard: histogram, best districts, worst districts, size-vs-rate scatter
        fig, axes = plt.subplots(2, 2, figsize=(16, 12))
        (ax_hist, ax_top), (ax_bottom, ax_scatter) = axes

        # Distribution of district eligibility rates with the mean marked
        ax_hist.hist(rates, bins=20, edgecolor='black', alpha=0.7)
        ax_hist.axvline(rate_mean, color='red', linestyle='--', label=f'Mean: {rate_mean:.3f}')
        ax_hist.set_title('Eligibility Rate Distribution by District')
        ax_hist.set_xlabel('Eligibility Rate')
        ax_hist.set_ylabel('Number of Districts')
        ax_hist.legend()

        # Ten highest- and ten lowest-rate districts, drawn with the same recipe
        top_districts = geo_fairness_df.nlargest(10, 'eligibility_rate')
        bottom_districts = geo_fairness_df.nsmallest(10, 'eligibility_rate')
        ranked_panels = (
            (ax_top, top_districts, 'green', 'Top 10'),
            (ax_bottom, bottom_districts, 'red', 'Bottom 10'),
        )
        for panel, subset, bar_color, tag in ranked_panels:
            positions = range(len(subset))
            panel.barh(positions, subset['eligibility_rate'].values,
                       alpha=0.7, color=bar_color, label=tag)
            panel.set_yticks(positions)
            panel.set_yticklabels(subset['district_id'])
            panel.set_title(f'{tag} Districts by Eligibility Rate')
            panel.set_xlabel('Eligibility Rate')

        # Does the eligibility rate vary with how many candidates a district has?
        ax_scatter.scatter(geo_fairness_df['total_candidates'], rates, alpha=0.6, s=100)
        ax_scatter.set_title('Candidate Count vs Eligibility Rate by District')
        ax_scatter.set_xlabel('Total Candidates')
        ax_scatter.set_ylabel('Eligibility Rate')

        plt.tight_layout()
        plt.show()

        # Spread between the best and worst district, also relative to the mean rate
        rate_max = rates.max()
        rate_min = rates.min()
        equity_gap = rate_max - rate_min
        print(f"\n‚ö†Ô∏è Equity Gap Analysis:")
        print(f"   Maximum Eligibility Rate: {rate_max:.3f}")
        print(f"   Minimum Eligibility Rate: {rate_min:.3f}")
        print(f"   Equity Gap: {equity_gap:.3f} ({equity_gap / rate_mean * 100:.1f}% of mean)")
except Exception as e:
    # Report the failure instead of raising so the remaining cells can still run
    print(f"‚ö†Ô∏è Geographic equity analysis error: {e}")


In [None]:
# Closing report: banner, completion notice, recommendations and follow-up actions.
# Entries that start with "\n" put a blank line before that section.
banner = "=" * 80
summary_lines = [
    banner,
    "üìä FAIRNESS AUDIT SUMMARY",
    banner,
    "\n‚úÖ Fairness audit completed",
    "\nüí° Recommendations:",
    "1. Monitor demographic parity metrics regularly",
    "2. Investigate schemes with significant demographic bias",
    "3. Review rule engine criteria for potential discriminatory patterns",
    "4. Consider ML model retraining with fairness constraints if bias detected",
    "5. Implement geographic equity adjustments if needed",
    "\nüîç Next Steps:",
    "- Set up automated fairness monitoring",
    "- Review rule expressions for potential bias",
    "- Consider fairness-aware ML model training",
    "- Document fairness metrics in MLflow experiments",
]
for summary_line in summary_lines:
    print(summary_line)


In [None]:
# Close database connection
# Run this cell last: the analysis cells above read through db.connection,
# so they cannot be re-run after this until db.connect() is called again.
db.disconnect()
print("‚úÖ Database connection closed")
