In [1]:
# AIF360 Bias Detection using SST-2 Dataset and BERT-base-uncased
# Detect gender bias in sentiment analysis using real-world movie reviews

import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from transformers import BertTokenizer, BertModel
import warnings
warnings.filterwarnings('ignore')

# AIF360 imports
from aif360.datasets import StandardDataset
from aif360.metrics import BinaryLabelDatasetMetric, ClassificationMetric
from aif360.algorithms.preprocessing import Reweighing, DisparateImpactRemover

# Download and import datasets
from datasets import load_dataset
import spacy

# Run on the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print("🔍 AIF360 Bias Detection with SST-2 Dataset")
print(f"Device: {device}")

# spaCy provides the NER pipeline used to find person names in the reviews.
# `nlp` is left as None when the English model is not installed so that the
# rest of the script can detect the situation instead of crashing here.
try:
    nlp = spacy.load("en_core_web_sm")
    print("✅ SpaCy model loaded")
except OSError:
    nlp = None
    print("❌ Please install spaCy English model: python -m spacy download en_core_web_sm")

# Lower-cased first names mapped to the gender label they are assumed to signal.
# Each name appears exactly once: the previous literal repeated ~20 female
# names (copy-paste duplicates), which a set silently collapses anyway but
# which made the list look larger than it is (80 male / 62 female entries).
# NOTE(review): several entries ('kelly', 'ashley', 'austin', 'mason') are
# used for more than one gender in practice — treat results as a heuristic.
GENDER_NAMES = {
    'male': {
        'james', 'john', 'robert', 'michael', 'william', 'david', 'richard', 'charles', 'joseph', 'thomas',
        'daniel', 'matthew', 'anthony', 'mark', 'donald', 'steven', 'paul', 'andrew', 'joshua', 'kenneth',
        'kevin', 'brian', 'george', 'edward', 'ronald', 'timothy', 'jason', 'jeffrey', 'ryan', 'jacob',
        'gary', 'nicholas', 'eric', 'jonathan', 'stephen', 'larry', 'justin', 'scott', 'brandon', 'benjamin',
        'samuel', 'gregory', 'alexander', 'patrick', 'frank', 'raymond', 'jack', 'dennis', 'jerry', 'tyler',
        'aaron', 'jose', 'henry', 'adam', 'douglas', 'nathan', 'peter', 'zachary', 'kyle', 'noah',
        'alan', 'ethan', 'jeremy', 'lionel', 'mason', 'luke', 'wayne', 'roy', 'eugene', 'louis',
        'philip', 'arthur', 'ralph', 'sean', 'austin', 'carl', 'harold', 'roger', 'joe', 'albert',
    },
    'female': {
        'mary', 'patricia', 'jennifer', 'linda', 'elizabeth', 'barbara', 'susan', 'jessica', 'sarah', 'karen',
        'nancy', 'lisa', 'betty', 'helen', 'sandra', 'donna', 'carol', 'ruth', 'sharon', 'michelle',
        'laura', 'kimberly', 'deborah', 'dorothy', 'emily', 'amy', 'angela', 'ashley', 'brenda', 'emma',
        'olivia', 'cynthia', 'marie', 'janet', 'catherine', 'frances', 'christine', 'samantha', 'debra', 'rachel',
        'carolyn', 'virginia', 'maria', 'heather', 'diane', 'julie', 'joyce', 'victoria', 'kelly', 'christina',
        'joan', 'evelyn', 'lauren', 'judith', 'megan', 'cheryl', 'andrea', 'hannah', 'jacqueline', 'martha',
        'gloria', 'sara',
    },
}

def load_bert_model():
    """Return a ``(tokenizer, model)`` pair for ``bert-base-uncased``.

    The model is moved to the module-level ``device`` and switched to
    evaluation mode before being returned.
    """
    checkpoint = 'bert-base-uncased'
    print("📥 Loading BERT-base-uncased...")

    bert_tokenizer = BertTokenizer.from_pretrained(checkpoint)
    bert_model = BertModel.from_pretrained(checkpoint)

    bert_model.to(device)
    bert_model.eval()
    return bert_tokenizer, bert_model

def load_sst2_dataset():
    """Load GLUE SST-2 and return its train and validation examples combined.

    Returns:
        A ``(sentences, labels)`` pair of lists; the train split comes first
        and the validation split is appended after it.
    """
    print("📥 Loading SST-2 dataset...")
    splits = load_dataset("glue", "sst2")

    # Pool both splits: only a fraction of sentences mention a usable name,
    # so more raw data means more annotated samples downstream.
    all_sentences = []
    all_labels = []
    for split_name in ('train', 'validation'):
        split = splits[split_name]
        all_sentences.extend(split['sentence'])
        all_labels.extend(split['label'])

    print(f"✅ Loaded {len(all_sentences)} sentences from SST-2")
    return all_sentences, all_labels

# Honorifics that can lead a PERSON entity; they are skipped so that the
# token after them is treated as the first name.
_NAME_TITLES = frozenset({'mr', 'mrs', 'ms', 'miss', 'dr', 'prof', 'sir'})


def extract_names_from_text(text):
    """Extract lower-cased first names of PERSON entities found by spaCy.

    Args:
        text: Sentence to run through the module-level ``nlp`` pipeline.

    Returns:
        A list with one first name per PERSON entity, in document order.
        Entities whose first non-title token is a single letter (or that
        contain no usable token) contribute nothing. Returns ``[]`` when the
        spaCy pipeline is unavailable (``nlp is None``).
    """
    if nlp is None:
        return []

    names = []
    for ent in nlp(text).ents:
        if ent.label_ != "PERSON":
            continue

        # Normalise each token: lower-case, no commas or periods ("Dr." -> "dr").
        tokens = [
            part.replace(',', '').replace('.', '')
            for part in ent.text.lower().strip().split()
        ]
        # Previously the first token was taken unconditionally, so
        # "dr. john smith" produced "dr"; skip honorifics and empty tokens.
        first_name = next(
            (tok for tok in tokens if tok and tok not in _NAME_TITLES),
            None,
        )
        if first_name is not None and len(first_name) > 1:  # Avoid single letters
            names.append(first_name)

    return names

def infer_gender_from_name(name):
    """Look a first name up in ``GENDER_NAMES``.

    The lookup is case-insensitive. Returns ``'male'`` or ``'female'`` when
    the name is listed (the male list is checked first) and ``'unknown'``
    otherwise.
    """
    key = name.lower()
    for label in ('male', 'female'):
        if key in GENDER_NAMES[label]:
            return label
    return 'unknown'

def create_gender_annotated_dataset(sentences, labels):
    """Annotate sentences with a gender inferred from the names they mention.

    Args:
        sentences: Iterable of sentence strings.
        labels: Iterable of sentiment labels paired with ``sentences``.

    Returns:
        A list of dicts, one per sentence for which a name with a known
        gender was found, with keys ``sentence``, ``sentiment``, ``names``,
        ``primary_name``, ``gender`` and ``gender_binary`` (1=male, 0=female).
        Sentences without names, or with only unrecognised names, are counted
        in the printed statistics but left out of the result.
    """
    print("🔄 Extracting names and inferring gender from SST-2...")

    tally = {'male': 0, 'female': 0, 'unknown': 0, 'no_names': 0}
    records = []

    for text, sentiment in zip(sentences, labels):
        found_names = extract_names_from_text(text)
        if not found_names:
            tally['no_names'] += 1
            continue

        # The first name with a recognised gender decides the annotation;
        # the for/else falls through when none of the names is recognised.
        for candidate in found_names:
            gender = infer_gender_from_name(candidate)
            if gender != 'unknown':
                break
        else:
            tally['unknown'] += 1
            continue

        records.append({
            'sentence': text,
            'sentiment': sentiment,  # 0=negative, 1=positive
            'names': found_names,
            'primary_name': candidate,
            'gender': gender,
            'gender_binary': int(gender == 'male'),  # 1=male, 0=female
        })
        tally[gender] += 1

    print(f"✅ Gender annotation statistics:")
    for category, count in tally.items():
        print(f"   {category}: {count}")

    print(f"📊 Usable samples: {len(records)}")
    return records

def get_bert_embedding(text, tokenizer, model):
    """Return the [CLS] embedding of ``text`` as a NumPy array.

    Args:
        text: Sentence to embed.
        tokenizer: Callable tokenizer producing PyTorch tensors.
        model: Model whose output exposes ``last_hidden_state``.

    Returns:
        The first token's vector of the first (only) sequence, or ``None``
        if anything goes wrong; the error is printed, not raised, so one bad
        sentence does not abort a long embedding run.
    """
    try:
        encoded = tokenizer(
            text,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=256,
        )
        on_device = {key: tensor.to(device) for key, tensor in encoded.items()}

        with torch.no_grad():
            hidden_states = model(**on_device).last_hidden_state
            # Position 0 of the sequence is the CLS token.
            cls_vector = hidden_states[0][0]

        return cls_vector.cpu().numpy()
    except Exception as e:
        print(f"Error getting embedding: {e}")
        return None

def create_bert_embeddings(data, tokenizer, model, max_samples=2000):
    """Embed a gender-balanced subset of ``data`` with BERT.

    Args:
        data: Annotated records carrying at least ``sentence``, ``sentiment``,
            ``gender``, ``gender_binary`` and ``primary_name``.
        tokenizer: Tokenizer forwarded to ``get_bert_embedding``.
        model: Model forwarded to ``get_bert_embedding``.
        max_samples: Upper bound on the total number of records embedded.

    Returns:
        A list of dicts copying the fields above plus an ``embedding`` entry.
        Records whose embedding could not be computed are dropped.
    """
    print(f"🔄 Creating BERT embeddings (max {max_samples} samples)...")

    # Partition by gender, keeping the original order inside each group.
    by_gender = {'male': [], 'female': []}
    for record in data:
        label = record['gender']
        if label in by_gender:
            by_gender[label].append(record)
    males, females = by_gender['male'], by_gender['female']

    # Same number of records from each gender, limited by the smaller group.
    n_per_gender = min(max_samples // 2, len(males), len(females))
    selected = males[:n_per_gender] + females[:n_per_gender]
    total = len(selected)

    print(f"📊 Using {total} balanced samples ({n_per_gender} per gender)")

    copied_fields = ('sentence', 'sentiment', 'gender', 'gender_binary', 'primary_name')
    results = []
    for position, record in enumerate(selected):
        if position % 100 == 0:
            print(f"   Processing {position}/{total}...")

        vector = get_bert_embedding(record['sentence'], tokenizer, model)
        if vector is None:
            continue

        entry = {field: record[field] for field in copied_fields}
        entry['embedding'] = vector
        results.append(entry)

    print(f"✅ Created {len(results)} embeddings")
    return results

def create_aif360_dataset(data):
    """Convert embedded records to an AIF360 ``StandardDataset``.

    Args:
        data: Non-empty list of dicts with keys ``embedding`` (equal-length
            1-D vectors), ``gender_binary``, ``sentiment``, ``gender``,
            ``sentence`` and ``primary_name``.

    Returns:
        ``(aif_dataset, df)``. ``df`` holds every column, including the
        text ones, plus one ``embed_<i>`` column per embedding dimension.
        ``aif_dataset`` is built from the numeric columns only, with
        ``sentiment`` as the label (1 is favourable) and ``gender_binary``
        as the protected attribute (1 = male is the privileged class).

    Raises:
        ValueError: If ``data`` is empty.
    """
    print("🔧 Converting to AIF360 format...")

    if not data:
        # Without this the failure was an opaque IndexError on shape[1].
        raise ValueError("Cannot build an AIF360 dataset from an empty sample list")

    embeddings = np.array([item['embedding'] for item in data])

    meta_df = pd.DataFrame({
        'gender_binary': [item['gender_binary'] for item in data],
        'sentiment': [item['sentiment'] for item in data],
        'gender': [item['gender'] for item in data],
        'sentence': [item['sentence'] for item in data],
        'primary_name': [item['primary_name'] for item in data]
    })

    # Attach all embedding dimensions in one concat rather than inserting
    # hundreds of columns one at a time (which fragments the DataFrame).
    embed_df = pd.DataFrame(
        embeddings,
        columns=[f'embed_{i}' for i in range(embeddings.shape[1])],
    )
    df = pd.concat([meta_df, embed_df], axis=1)

    # AIF360 requires an all-numeric frame; the free-text columns are only
    # kept in `df` for inspection and must be dropped here, otherwise the
    # constructor fails with "DataFrame values must be numerical".
    aif_dataset = StandardDataset(
        df=df,
        label_name='sentiment',
        favorable_classes=[1],  # positive sentiment is favorable
        protected_attribute_names=['gender_binary'],
        privileged_classes=[[1]],  # male is privileged class
        features_to_drop=['gender', 'sentence', 'primary_name'],
    )

    return aif_dataset, df

def compute_bias_metrics(dataset):
    """Compute dataset-level fairness metrics with AIF360.

    Args:
        dataset: AIF360 binary-label dataset whose protected attribute
            ``gender_binary`` is 1 for male (privileged) and 0 for female
            (unprivileged).

    Returns:
        Dict mapping metric name to value. Per the AIF360 definitions the
        difference metrics are ``rate(unprivileged) - rate(privileged)``,
        i.e. female minus male, and Disparate Impact is the corresponding
        ratio.
    """
    print("📊 Computing bias metrics...")

    metric = BinaryLabelDatasetMetric(
        dataset,
        unprivileged_groups=[{'gender_binary': 0}],  # female
        privileged_groups=[{'gender_binary': 1}]     # male
    )

    female_rate = metric.base_rate(privileged=False)
    male_rate = metric.base_rate(privileged=True)

    metrics = {
        'Statistical Parity Difference': metric.statistical_parity_difference(),
        'Disparate Impact': metric.disparate_impact(),
        'Mean Difference': metric.mean_difference(),
        'Base Rate (Female)': female_rate,
        'Base Rate (Male)': male_rate,
        # BinaryLabelDatasetMetric has no selection_rate() (it belongs to
        # ClassificationMetric). At dataset level the "selection rate" is the
        # share of favourable labels, which is exactly the base rate.
        'Selection Rate (Female)': female_rate,
        'Selection Rate (Male)': male_rate,
        'Smoothed EDF': metric.smoothed_empirical_differential_fairness()
    }

    return metrics

def train_sentiment_classifier(dataset, df):
    """Fit a logistic-regression sentiment model and measure its fairness.

    Args:
        dataset: Not used by this function; accepted for call compatibility.
        df: DataFrame with ``embed_*`` feature columns plus ``sentiment`` and
            ``gender_binary`` columns.

    Returns:
        ``(classification_metrics, test_df, y_test, y_pred)`` where the
        metrics dict holds accuracy and AIF360 group-fairness figures for the
        30% held-out split, and ``test_df`` holds that split's protected
        attribute, labels, predictions, probabilities and features.
    """
    print("🤖 Training sentiment classifier...")

    # Features are the embedding columns; gender is carried along only so it
    # can be split consistently with X and y.
    embed_cols = [name for name in df.columns if name.startswith('embed_')]
    features = df[embed_cols].values
    targets = df['sentiment'].values
    groups = df['gender_binary'].values

    X_train, X_test, y_train, y_test, _, gender_test = train_test_split(
        features, targets, groups,
        test_size=0.3, random_state=42, stratify=targets,
    )

    model = LogisticRegression(random_state=42, max_iter=1000)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    positive_proba = model.predict_proba(X_test)[:, 1]

    # Held-out frame: outcome columns first, then the embedding features.
    outcome_df = pd.DataFrame({
        'gender_binary': gender_test,
        'sentiment': y_test,
        'prediction': y_pred,
        'prediction_proba': positive_proba
    })
    test_df = pd.concat(
        [outcome_df, pd.DataFrame(X_test, columns=embed_cols)],
        axis=1,
    )

    # Ground-truth dataset in AIF360 form.
    test_dataset = StandardDataset(
        df=test_df,
        label_name='sentiment',
        favorable_classes=[1],
        protected_attribute_names=['gender_binary'],
        privileged_classes=[[1]]
    )

    # Same dataset with the labels swapped for the model's predictions.
    pred_dataset = test_dataset.copy()
    pred_dataset.labels = test_df['prediction'].values.reshape(-1, 1)

    female_group = [{'gender_binary': 0}]
    male_group = [{'gender_binary': 1}]
    fairness = ClassificationMetric(
        test_dataset, pred_dataset,
        unprivileged_groups=female_group,
        privileged_groups=male_group
    )

    classification_metrics = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Equal Opportunity Difference': fairness.equal_opportunity_difference(),
        'Average Odds Difference': fairness.average_odds_difference(),
        'Theil Index': fairness.theil_index(),
        'TPR (Female)': fairness.true_positive_rate(privileged=False),
        'TPR (Male)': fairness.true_positive_rate(privileged=True),
        'FPR (Female)': fairness.false_positive_rate(privileged=False),
        'FPR (Male)': fairness.false_positive_rate(privileged=True)
    }

    return classification_metrics, test_df, y_test, y_pred

def _severity_color(metric_name, value):
    """Return a traffic-light colour for a metric's distance from parity."""
    # Disparate Impact is a ratio whose fair value is 1; the other plotted
    # metrics are differences whose fair value is 0.
    parity = 1.0 if metric_name == 'Disparate Impact' else 0.0
    gap = abs(value - parity)
    if gap > 0.1:
        return 'red'
    if gap > 0.05:
        return 'orange'
    return 'green'


def visualize_bias_analysis(bias_metrics, classification_metrics, test_df):
    """Draw a 2x3 summary figure of the bias analysis, save it and show it.

    Args:
        bias_metrics: Dataset-level metrics; must contain 'Statistical Parity
            Difference', 'Disparate Impact', 'Mean Difference' and the two
            'Selection Rate (...)' entries.
        classification_metrics: Classifier-level metrics; must contain the
            TPR/FPR entries per gender, 'Equal Opportunity Difference',
            'Average Odds Difference' and 'Theil Index'.
        test_df: Held-out frame with ``gender_binary`` (0=female, 1=male) and
            ``sentiment`` (0=negative, 1=positive) columns.

    Side effects:
        Writes ``bert_sst2_bias_analysis.png`` to the working directory and
        calls ``plt.show()``.
    """
    print("📊 Creating bias visualizations...")

    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    fig.suptitle('BERT Gender Bias Analysis on SST-2 Dataset', fontsize=16, fontweight='bold')

    # 1. Bias Metrics Bar Chart
    metrics_to_plot = ['Statistical Parity Difference', 'Disparate Impact', 'Mean Difference']
    values = [bias_metrics[metric] for metric in metrics_to_plot]
    # Colour by distance from each metric's own parity value (0 or 1);
    # using abs(value) flagged a perfectly fair Disparate Impact of 1 as red.
    colors = [_severity_color(name, value) for name, value in zip(metrics_to_plot, values)]

    bars = axes[0,0].bar(range(len(metrics_to_plot)), values, color=colors, alpha=0.7)
    axes[0,0].set_xlabel('Bias Metrics')
    axes[0,0].set_ylabel('Metric Value')
    axes[0,0].set_title('AIF360 Bias Metrics')
    axes[0,0].set_xticks(range(len(metrics_to_plot)))
    axes[0,0].set_xticklabels([m.replace(' ', '\n') for m in metrics_to_plot], rotation=0)
    axes[0,0].axhline(y=0, color='black', linestyle='-', alpha=0.3)

    # Value labels sit just beyond the end of each bar, above for positive
    # bars and below for non-positive ones.
    for bar, value in zip(bars, values):
        axes[0,0].text(bar.get_x() + bar.get_width()/2,
                      bar.get_height() + 0.01 if value > 0 else bar.get_height() - 0.01,
                      f'{value:.3f}', ha='center', va='bottom' if value > 0 else 'top')

    # 2. Selection Rates by Gender
    male_rate = bias_metrics['Selection Rate (Male)']
    female_rate = bias_metrics['Selection Rate (Female)']

    bars = axes[0,1].bar(['Male', 'Female'], [male_rate, female_rate],
                        color=['lightblue', 'lightcoral'], alpha=0.7)
    axes[0,1].set_ylabel('Positive Sentiment Rate')
    axes[0,1].set_title('Selection Rates by Gender')

    for bar, rate in zip(bars, [male_rate, female_rate]):
        axes[0,1].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
                      f'{rate:.3f}', ha='center', va='bottom')

    # 3. Classification Performance by Gender
    tpr_male = classification_metrics['TPR (Male)']
    tpr_female = classification_metrics['TPR (Female)']
    fpr_male = classification_metrics['FPR (Male)']
    fpr_female = classification_metrics['FPR (Female)']

    x = np.arange(2)
    width = 0.35

    axes[0,2].bar(x - width/2, [tpr_male, tpr_female], width,
                 label='True Positive Rate', alpha=0.7, color='green')
    axes[0,2].bar(x + width/2, [fpr_male, fpr_female], width,
                 label='False Positive Rate', alpha=0.7, color='red')

    axes[0,2].set_xlabel('Gender')
    axes[0,2].set_ylabel('Rate')
    axes[0,2].set_title('Classification Performance by Gender')
    axes[0,2].set_xticks(x)
    axes[0,2].set_xticklabels(['Male', 'Female'])
    axes[0,2].legend()

    # 4. Gender Distribution in Dataset
    # reindex guarantees both groups exist (count 0 if absent) so the lookups
    # below cannot raise KeyError on a one-sided test split.
    gender_counts = test_df['gender_binary'].value_counts().reindex([0, 1], fill_value=0)
    axes[1,0].pie([gender_counts[0], gender_counts[1]], labels=['Female', 'Male'],
                 autopct='%1.1f%%', colors=['lightcoral', 'lightblue'])
    axes[1,0].set_title('Gender Distribution in Test Set')

    # 5. Sentiment Distribution by Gender
    # Force a full 2x2 table so the relabelling below always matches its shape
    # and missing combinations show as 0 instead of NaN.
    sentiment_by_gender = (
        test_df.groupby(['gender_binary', 'sentiment']).size()
        .unstack(fill_value=0)
        .reindex(index=[0, 1], columns=[0, 1], fill_value=0)
    )
    sentiment_by_gender.index = ['Female', 'Male']
    sentiment_by_gender.columns = ['Negative', 'Positive']

    sentiment_by_gender.plot(kind='bar', ax=axes[1,1], color=['lightcoral', 'lightgreen'])
    axes[1,1].set_xlabel('Gender')
    axes[1,1].set_ylabel('Count')
    axes[1,1].set_title('Sentiment Distribution by Gender')
    axes[1,1].legend()
    axes[1,1].tick_params(axis='x', rotation=0)

    # 6. Summary Text
    spd = bias_metrics['Statistical Parity Difference']
    if abs(spd) > 0.1:
        bias_level = "Strong"
    elif abs(spd) > 0.05:
        bias_level = "Moderate"
    else:
        bias_level = "Weak"

    # AIF360 differences are unprivileged minus privileged (female - male),
    # so a negative value means males receive the favourable outcome more
    # often. The earlier text stated the opposite.
    summary_text = "\n".join([
        "BIAS ANALYSIS SUMMARY",
        "",
        "Dataset: SST-2 Movie Reviews",
        "Model: BERT-base-uncased",
        f"Samples: {len(test_df)}",
        "",
        "KEY FINDINGS:",
        f"Statistical Parity Diff: {spd:.4f}",
        f"Disparate Impact: {bias_metrics['Disparate Impact']:.4f}",
        f"Mean Difference: {bias_metrics['Mean Difference']:.4f}",
        "",
        "FAIRNESS METRICS:",
        f"Equal Opportunity Diff: {classification_metrics['Equal Opportunity Difference']:.4f}",
        f"Average Odds Diff: {classification_metrics['Average Odds Difference']:.4f}",
        f"Theil Index: {classification_metrics['Theil Index']:.4f}",
        "",
        "INTERPRETATION (differences = female - male):",
        "Positive values indicate bias favoring females",
        "Negative values indicate bias favoring males",
        "Values near 0 (Disparate Impact near 1) indicate fairness",
        "",
        f"BIAS LEVEL: {bias_level}",
    ])

    axes[1,2].text(0.05, 0.95, summary_text, transform=axes[1,2].transAxes,
                  fontsize=9, verticalalignment='top', fontfamily='monospace',
                  bbox=dict(boxstyle="round,pad=0.5", facecolor="lightgray", alpha=0.8))
    axes[1,2].set_xlim(0, 1)
    axes[1,2].set_ylim(0, 1)
    axes[1,2].axis('off')

    plt.tight_layout()
    plt.savefig('bert_sst2_bias_analysis.png', dpi=300, bbox_inches='tight')
    plt.show()

def print_detailed_results(bias_metrics, classification_metrics):
    """Print the metrics with a severity tag, an interpretation and advice.

    Args:
        bias_metrics: Dataset-level metrics; must contain
            'Statistical Parity Difference'. Entries whose name contains
            'Difference', and 'Disparate Impact', get a severity annotation.
        classification_metrics: Classifier-level metrics; entries whose name
            contains 'Difference' get a fairness annotation.

    The direction of bias is read from the Statistical Parity Difference,
    which AIF360 defines as rate(unprivileged) - rate(privileged); with
    female as the unprivileged group a negative value means males receive
    positive sentiment more often.
    """
    print("\n" + "="*70)
    print("🔍 DETAILED BIAS ANALYSIS RESULTS")
    print("="*70)

    print("\n📊 DATASET-LEVEL BIAS METRICS (AIF360):")
    print("-" * 50)
    for metric, value in bias_metrics.items():
        interpretation = ""
        if 'Difference' in metric:
            if abs(value) > 0.1:
                interpretation = " (Strong bias)"
            elif abs(value) > 0.05:
                interpretation = " (Moderate bias)"
            else:
                interpretation = " (Weak bias)"
        elif metric == 'Disparate Impact':
            # Ratio metric: parity is 1, not 0.
            if value < 0.8 or value > 1.2:
                interpretation = " (Significant disparity)"
            else:
                interpretation = " (Acceptable)"

        print(f"  • {metric}: {value:.6f}{interpretation}")

    print("\n🤖 CLASSIFICATION-LEVEL BIAS METRICS:")
    print("-" * 50)
    for metric, value in classification_metrics.items():
        interpretation = ""
        if 'Difference' in metric:
            if abs(value) > 0.1:
                interpretation = " (Unfair)"
            elif abs(value) > 0.05:
                interpretation = " (Moderate unfairness)"
            else:
                interpretation = " (Fair)"

        print(f"  • {metric}: {value:.6f}{interpretation}")

    print("\n🎯 BIAS INTERPRETATION:")
    print("-" * 30)
    spd = bias_metrics['Statistical Parity Difference']

    # spd = female rate - male rate. The branches were previously swapped,
    # reporting male bias when females actually had the higher rate.
    if spd < -0.05:
        print("  🔴 MALE BIAS DETECTED: Males more likely to get positive sentiment")
    elif spd > 0.05:
        print("  🔴 FEMALE BIAS DETECTED: Females more likely to get positive sentiment")
    else:
        print("  🟢 MINIMAL BIAS: Sentiment predictions fairly balanced")

    print(f"\n📈 RECOMMENDATIONS:")
    print("-" * 20)
    if abs(spd) > 0.1:
        print("  • Apply bias mitigation techniques (Reweighing, Adversarial Debiasing)")
        print("  • Consider demographic parity constraints")
        print("  • Audit training data for gender representation")
    elif abs(spd) > 0.05:
        print("  • Monitor bias in production")
        print("  • Consider light bias mitigation")
    else:
        print("  • Current model shows acceptable fairness")
        print("  • Continue monitoring in deployment")

# Script entry point
def main():
    """Run the whole pipeline: load, annotate, embed, measure, plot, report.

    Returns early (after printing a message) when the spaCy pipeline is not
    available or when fewer than 100 sentences could be annotated with a
    gender.
    """
    print("🚀 Starting BERT SST-2 Bias Analysis...")

    # Name extraction is impossible without the spaCy pipeline.
    if nlp is None:
        print("❌ SpaCy not available. Please install: python -m spacy download en_core_web_sm")
        return

    # Model and raw data.
    tokenizer, model = load_bert_model()
    sentences, labels = load_sst2_dataset()

    # Keep only sentences that mention a name with a known gender.
    annotated = create_gender_annotated_dataset(sentences, labels)
    if len(annotated) < 100:
        print("❌ Insufficient data with gender information. Need more movie reviews with names.")
        return

    # Embed a balanced subset and wrap it for AIF360.
    embedded = create_bert_embeddings(annotated, tokenizer, model)
    aif_dataset, df = create_aif360_dataset(embedded)

    # Dataset-level bias first, then classifier-level fairness.
    bias_metrics = compute_bias_metrics(aif_dataset)
    classification_metrics, test_df, _y_test, _y_pred = train_sentiment_classifier(aif_dataset, df)

    # Report: figure, then console details.
    visualize_bias_analysis(bias_metrics, classification_metrics, test_df)
    print_detailed_results(bias_metrics, classification_metrics)

    print("\n🎉 BIAS ANALYSIS COMPLETE!")
    print("📁 Visualization saved as 'bert_sst2_bias_analysis.png'")


# Only run the analysis when executed as a script, not on import.
if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm
pip install 'aif360[AdversarialDebiasing]'
pip install 'aif360[AdversarialDebiasing]'
pip install 'aif360[Reductions]'
pip install 'aif360[Reductions]'
pip install 'aif360[inFairness]'
pip install 'aif360[Reductions]'


ModuleNotFoundError: No module named 'spacy'