# Income Classifier on Adult Dataset

This notebook trains a classifier that predicts whether a person's income is above 50K (the ">50K" label of the Adult dataset).

## 1. Import Libraries

In [None]:
import numpy as np
import pandas as pd
from aif360.datasets import AdultDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report,ConfusionMatrixDisplay
import matplotlib.pyplot as plt

## 2. Load and Preprocess the Adult Dataset

We use **custom preprocessing** to align with the project requirements:
- **Age**: Binarized at the dataset median (`age_binary` = 1 if age is strictly above the median, 0 otherwise)
- **Sex**: Binary (Male=1, Female=0)  
- **Race**: Simplified to binary (White=1, non-White=0)

This preprocessing ensures consistency across all tasks (Classification, Fairness, Privacy, etc.).

In [None]:
# Custom preprocessing function (aligned with Fairness.ipynb)
def custom_preprocessing(df):
    """
    Binarize age and encode race/sex as 0/1 floats without mutating the input.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the raw ``age``, ``race`` and ``sex`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame in which:

        - ``age`` is removed and ``age_binary`` is appended
          (1.0 if age is strictly above the median age of ``df``, else 0.0);
        - ``race`` is 1.0 where the raw value equals ``'White'``, else 0.0;
        - ``sex`` is 1.0 where the raw value equals ``'Male'``, else 0.0.

        All other columns are passed through unchanged.
    """
    # Work on a copy: the transformation used to drop/overwrite columns on the
    # caller's frame, which silently corrupted any raw frame that was reused.
    out = df.copy()

    # The threshold is the median of the frame that is passed in.
    median_age = out['age'].median()
    out['age_binary'] = (out['age'] > median_age).astype(float)
    out = out.drop(columns=['age'])

    out['race'] = (out['race'] == 'White').astype(float)
    out['sex'] = (out['sex'] == 'Male').astype(float)
    return out

# Build the AIF360 Adult dataset. The raw frame is routed through
# custom_preprocessing, and the binary columns 'age_binary' and 'sex' are
# declared as protected attributes with 1.0 as the privileged value of each.
dataset = AdultDataset(
    protected_attribute_names=['age_binary', 'sex'],
    privileged_classes=[np.array([1.0]), np.array([1.0])],
    custom_preprocessing=custom_preprocessing,
)

# Mirror the dataset into a pandas frame (feature columns + label) for exploration
df = pd.DataFrame(dataset.features, columns=dataset.feature_names).assign(
    income=dataset.labels.ravel()
)

print("Dataset shape:", df.shape)
print("\nFirst 10 rows:", df.head(10), sep="\n")
print("\nIncome distribution:", df['income'].value_counts(), sep="\n")
print("\nFeature names:", list(df.columns))

In [None]:
# Sanity-check the binarized age column: counts per value and overall mean
print("Age binary distribution:", df['age_binary'].value_counts(), sep="\n")
print("\nAge binary mean: {:.4f}".format(df['age_binary'].mean()))
print("(Should be close to 0.5 since we used median threshold)")

# Value counts of the two other binary-encoded demographic columns
print("\nSex distribution (1=Male, 0=Female):", df['sex'].value_counts(), sep="\n")
print("\nRace distribution (1=White, 0=non-White):", df['race'].value_counts(), sep="\n")

## 3. Prepare Features and Target

Extract features (X) and target (y) for model training.

In [None]:
# Separate the label from the predictors.
y = df['income']               # target: income class
X = df.drop(columns='income')  # every remaining column (age_binary, sex, race included)

print("Features shape:", X.shape)
print("Target shape:", y.shape)
print("\nFeature names:", list(X.columns))
print("\nTarget distribution:", y.value_counts(normalize=True), sep="\n")

## 4. Split Data into Train, Validation, and Test Sets

In [None]:
# Two-stage stratified split: 70% train / 15% validation / 15% test.
# random_state=1 is used in both stages (kept for consistency with Fairness.ipynb).

# Stage 1: hold out 30% of the rows as a temporary pool
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=1,
)

# Stage 2: halve the pool, giving validation and test 15% of the total each
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=1,
)

# Report absolute and relative size of each split
print("Training set size: {} ({:.1f}%)".format(len(X_train), len(X_train) / len(X) * 100))
print("Validation set size: {} ({:.1f}%)".format(len(X_val), len(X_val) / len(X) * 100))
print("Test set size: {} ({:.1f}%)".format(len(X_test), len(X_test) / len(X) * 100))

# Stratification should keep the class proportions close across the splits
print("\nClass distribution in training set:", y_train.value_counts(normalize=True), sep="\n")
print("\nClass distribution in validation set:", y_val.value_counts(normalize=True), sep="\n")
print("\nClass distribution in test set:", y_test.value_counts(normalize=True), sep="\n")

## 5. Feature Scaling

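Standardization puts all features on a comparable scale so that no single feature dominates the learning process merely because of its magnitude. This is especially important for Logistic Regression, whose regularization penalty and solver convergence are both sensitive to the scale of the input features. The scaler is fitted on the training set only and then applied unchanged to the validation and test sets, so no information from the held-out data leaks into training.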

In [None]:
# Learn the per-feature mean and standard deviation from the training split only,
# then apply that same transformation to all three splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

print(
    "Features scaled using StandardScaler",
    f"Training set scaled shape: {X_train_scaled.shape}",
    f"Validation set scaled shape: {X_val_scaled.shape}",
    f"Test set scaled shape: {X_test_scaled.shape}",
    sep="\n",
)

## 6. Train the Classifier

In [None]:
# Fit the logistic-regression model on the standardized training split
lr_classifier = LogisticRegression(random_state=1, max_iter=1000)
lr_classifier.fit(X_train_scaled, y_train)

# Short training summary: estimator class plus the training matrix dimensions
print(
    "✓ Logistic Regression trained successfully",
    f"  Model: {lr_classifier.__class__.__name__}",
    f"  Features: {X_train_scaled.shape[1]}",
    f"  Training samples: {X_train_scaled.shape[0]}",
    sep="\n",
)

### Saving "The Classifier"

This is **the base classifier** referenced in the project statement (Section 1: Classification). 
We save it with a clear name for reuse in subsequent tasks (Fairness, Privacy, etc.).

In [None]:
import os

import joblib

# Make sure the output directory exists before writing into it
os.makedirs("models", exist_ok=True)

# Bundle everything needed to reuse the model: the fitted estimator, the
# fitted scaler, the training column order, and metadata about preprocessing.
artifact = dict(
    model=lr_classifier,                         # the trained classifier
    scaler=scaler,                               # the fitted StandardScaler
    feature_names=X.columns.tolist(),            # column order used for training
    preprocessing="custom",                      # marks that custom preprocessing was used
    protected_attributes=["age_binary", "sex"],  # for fairness analysis
)

# Persist THE CLASSIFIER (primary reference for the entire project)
joblib.dump(artifact, "models/the_classifier.joblib")

print(
    "=" * 70,
    "✓ THE CLASSIFIER saved successfully!",
    "=" * 70,
    "  Location: models/the_classifier.joblib",
    f"  Model: {lr_classifier.__class__.__name__}",
    f"  Features: {len(artifact['feature_names'])}",
    "  Preprocessing: Custom (age_binary, binary race/sex)",
    f"  Protected attributes: {artifact['protected_attributes']}",
    "=" * 70,
    sep="\n",
)
print(
    "\nThis model will be reused in:",
    "  - Fairness.ipynb (fairness assessment)",
    "  - Privacy tasks (baseline comparison)",
    "  - Explainability analysis",
    "=" * 70,
    sep="\n",
)

## 7. Measure Performance on Validation Set

In [None]:
# Score the validation split with the trained model
y_val_pred = lr_classifier.predict(X_val_scaled)

# Headline metrics; precision/recall/F1 are computed with scikit-learn's
# default settings (no explicit pos_label or averaging is passed).
val_accuracy = accuracy_score(y_val, y_val_pred)
val_precision = precision_score(y_val, y_val_pred)
val_recall = recall_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)

print(
    "VALIDATION SET PERFORMANCE",
    "=" * 50,
    "Accuracy:  {:.4f}".format(val_accuracy),
    "Precision: {:.4f}".format(val_precision),
    "Recall:    {:.4f}".format(val_recall),
    "F1-Score:  {:.4f}".format(val_f1),
    sep="\n",
)

# Per-class breakdown, with the two classes labelled in sorted label order
print(
    "Detailed Classification Report:",
    classification_report(y_val, y_val_pred, target_names=['<=50K', '>50K']),
    sep="\n",
)

In [None]:
# Confusion matrix of the validation predictions (rows: true class, columns: predicted)
cm_val = confusion_matrix(y_val, y_val_pred)
print("\nConfusion Matrix:", cm_val, sep="\n")

# Render the same matrix as a labelled plot
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm_val,
    display_labels=['<=50K', '>50K'],
).plot()
plt.title("Validation Set - Confusion Matrix")
plt.show()

## 8. Measure Performance on Test Set

In [None]:
# Score the held-out test split with the trained model
y_test_pred = lr_classifier.predict(X_test_scaled)

# Headline metrics, computed the same way as for the validation split
test_accuracy = accuracy_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

print(
    "=" * 70,
    "TEST SET PERFORMANCE (THE CLASSIFIER)",
    "=" * 70,
    "Accuracy:  {:.4f}".format(test_accuracy),
    "Precision: {:.4f}".format(test_precision),
    "Recall:    {:.4f}".format(test_recall),
    "F1-Score:  {:.4f}".format(test_f1),
    "=" * 70,
    sep="\n",
)

# Per-class breakdown
print(
    "\nDetailed Classification Report:",
    classification_report(y_test, y_test_pred, target_names=['<=50K', '>50K']),
    sep="\n",
)

# Confusion matrix of the test predictions, printed and then plotted
cm_test = confusion_matrix(y_test, y_test_pred)
print("\nConfusion Matrix:", cm_test, sep="\n")

disp = ConfusionMatrixDisplay(
    confusion_matrix=cm_test,
    display_labels=['<=50K', '>50K'],
).plot()
plt.title("Test Set - THE CLASSIFIER Confusion Matrix")
plt.show()

## Summary: Task 1 - Classification Complete ✅

**THE CLASSIFIER** has been successfully trained and saved for reuse throughout the project.

### Key Details:
- **Model**: Logistic Regression (max_iter=1000)
- **Dataset**: Adult Dataset with custom preprocessing
- **Features**: Age (binary), Sex (binary), Race (binary), and other socioeconomic features
- **Protected Attributes**: `age_binary`, `sex` (for fairness analysis)
- **Data Split**: 70% train / 15% validation / 15% test
- **Saved Location**: `models/the_classifier.joblib`

### Next Steps:
1. **Task 2 - Fairness**: Load THE CLASSIFIER and assess fairness metrics
2. **Task 3 - Privacy**: Use as baseline for privacy-preserving classifier comparison
3. **Task 5 - Explainability**: Analyze confident mistakes and feature importance

### Important Notes:
- All subsequent notebooks should **load** this classifier (not retrain)
- Custom preprocessing must be applied consistently across all tasks
- The saved artifact includes model, scaler, and feature names for consistency