## Exploration of base dataset - Adult

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

In [None]:
# 1. Location of the raw Adult files (as shipped inside the aif360 package).
# NOTE(review): this is an absolute, machine-specific path — adjust it when
# running the notebook in another environment.
DATA_DIR = Path("/opt/anaconda3/lib/python3.11/site-packages/aif360/data/raw/adult")

# 2. Column names (from adult.names)
COLUMNS = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income'
]

# 3. Read train & test, then concatenate
# The regex separator consumes the whitespace that follows each comma, so
# values arrive unpadded and the '?' missing-value marker is matched exactly.
train = pd.read_csv(DATA_DIR/"adult.data", names=COLUMNS,
                    sep=r',\s*', engine='python', na_values='?')
# skiprows=1 drops the first line of adult.test, which is not a data record.
test  = pd.read_csv(DATA_DIR/"adult.test", names=COLUMNS,
                    sep=r',\s*', engine='python', skiprows=1, na_values='?')

df = pd.concat([train, test], ignore_index=True)

# 4. Harmonise the target labels
# adult.test writes the labels with a trailing period ('>50K.' / '<=50K.').
# Normalising immediately after loading keeps every later step (summary
# tables, rates) from treating them as separate classes.
df['income'] = df['income'].str.strip().str.rstrip('.')

In [None]:
# Do not truncate cell contents, so the long "Values" strings stay readable.
pd.set_option('display.max_colwidth', None)

# How many of the most frequent categories are listed per categorical feature.
top_n = 5


def _describe_feature(name, values, limit):
    """Build one row of the overview table for a single column.

    Numeric columns report min / max / mean / std; every other column lists
    its most frequent non-missing values, followed by 'Other' when more than
    ``limit`` distinct values exist.
    """
    label = 'Target: income' if name == 'income' else name

    if pd.api.types.is_numeric_dtype(values):
        return {
            'Feature': label,
            'Type': 'Continuous',
            'Min': values.min(),
            'Max': values.max(),
            'Mean': round(values.mean(), 2),
            'Std': round(values.std(), 2),
            'Values': '',
        }

    # value_counts() sorts by frequency, so the head holds the top categories.
    frequent = values.dropna().value_counts().index.tolist()
    if len(frequent) > limit:
        frequent = frequent[:limit] + ['Other']

    return {
        'Feature': label,
        'Type': 'Categorical',
        'Min': '',
        'Max': '',
        'Mean': '',
        'Std': '',
        'Values': ", ".join(str(v) for v in frequent),
    }


rows = [_describe_feature(name, df[name], top_n) for name in df.columns]

column_order = ['Feature', 'Type', 'Min', 'Max', 'Mean', 'Std', 'Values']
summary_df = pd.DataFrame(rows)[column_order]

# Number the features from 1 instead of 0.
summary_df.index = pd.RangeIndex(1, len(summary_df) + 1)
summary_df

In [None]:
def _positive_rate_by(frame, by):
    """Return the percentage of '>50K' rows within each group of column ``by``.

    The result is a Series indexed by the group values, rounded to one decimal.
    """
    is_positive = frame['income'].eq('>50K')
    return is_positive.groupby(frame[by]).mean().mul(100).round(1)


# Strip stray whitespace and a trailing '.' so all labels compare equal to
# '>50K' / '<=50K' (idempotent, safe to re-run).
df['income'] = df['income'].str.strip().str.rstrip('.')

n_instances = len(df)
# Exclude the target and the derived 'race_group' column by name: this cell
# adds 'race_group' to df further down, so a plain `df.shape[1] - 1` would
# over-count the features whenever the cell is executed a second time.
n_features = df.columns.difference(['income', 'race_group']).size
positive_rate = df['income'].eq('>50K').mean() * 100

# Positive-class rate by sex
sex_rates = _positive_rate_by(df, 'sex')

# Positive-class rate by race
race_rates = _positive_rate_by(df, 'race')

# Define White vs. Non-White
# (any row whose race is not exactly 'White', including a missing value,
# lands in 'Non-White')
df['race_group'] = np.where(df['race'] == 'White', 'White', 'Non-White')

# Positive-class rate by race_group
racegrp_rates = _positive_rate_by(df, 'race_group')

# Intersectional 2×2 matrix (race_group × sex) out of curiosity
matrix = (
    df
    .assign(positive=df['income'].eq('>50K').astype(int))
    .pivot_table(
        index='race_group',
        columns='sex',
        values='positive',
        aggfunc='mean'
    )
    .mul(100)
    .round(1)
)

# Print all summaries
print(f"Instances:        {n_instances}")
print(f"Features:         {n_features}")
print(f"Positive-class:   {positive_rate:.1f}%")
print("Protected attrs:  race, sex")

print("\n Positive-class rate by race")
print(race_rates.astype(str) + '%')

print("\n Positive-class rate by sex")
print(sex_rates.astype(str) + '%')

print("\nPositive-class rate (White vs. Non-White)")
print(racegrp_rates.astype(str) + '%')

print("\nIntersection (Race × Sex)")
print(matrix.astype(str) + '%')