In [6]:
# Install the 'ucimlrepo' package, which allows you to easily access datasets
# from the UCI Machine Learning Repository.
# (Run this only once per environment â€” remove the '!' if running in a script.)
!pip install ucimlrepo  



In [7]:
# Import the function 'fetch_ucirepo' from the ucimlrepo package.
# This function is used to download datasets by their ID number.
from ucimlrepo import fetch_ucirepo 

# Fetch the dataset with ID = 2, which corresponds to the "Adult" dataset
# (also called the "Census Income" dataset).
adult = fetch_ucirepo(id=2)

# Extract the data (features and target) from the downloaded dataset.
# 'features' are the input variables (e.g., age, education, hours per week, etc.)
# 'targets' are what we want to predict (e.g., whether income > $50K).
X = adult.data.features 
y = adult.data.targets 

# Print metadata (general information) about the dataset.
# This usually includes the dataset name, description, number of rows/columns, etc.
print(adult.metadata)

# Print details about each variable (feature) in the dataset.
# This helps you understand the data types and what each column represents.
print(adult.variables)


{'uci_id': 2, 'name': 'Adult', 'repository_url': 'https://archive.ics.uci.edu/dataset/2/adult', 'data_url': 'https://archive.ics.uci.edu/static/public/2/data.csv', 'abstract': 'Predict whether annual income of an individual exceeds $50K/yr based on census data. Also known as "Census Income" dataset. ', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 48842, 'num_features': 14, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Income', 'Education Level', 'Other', 'Race', 'Sex'], 'target_col': ['income'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1996, 'last_updated': 'Tue Sep 24 2024', 'dataset_doi': '10.24432/C5XW20', 'creators': ['Barry Becker', 'Ronny Kohavi'], 'intro_paper': None, 'additional_info': {'summary': "Extraction was done by Barry Becker from the 1994 Census database.  A set of reasonably clean records was extracted using the fol

In [8]:
import pandas as pd

# Combine features and target for easier viewing
df = pd.concat([X, y], axis=1)

# Quick cleaning example
df = df.dropna()  # remove rows with missing values
df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)  # remove extra spaces

df.head()


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [9]:
# Pick a sensitive attribute
attr = 'sex'

# Combine target (income) and sensitive attribute
df.groupby(attr)['income'].value_counts(normalize=True)

# Calculate % earning >50K for each sex
positive_rates = df[df['income'] == '>50K'][attr].value_counts(normalize=True)

print(positive_rates)




sex
Male      0.849637
Female    0.150363
Name: proportion, dtype: float64


In [10]:
# Convert to numeric for correlation
df['sex_num'] = (df['sex'] == 'Male').astype(int)
df['target_num'] = (df['income'] == '>50K').astype(int)

df[['sex_num', 'target_num']].corr()


Unnamed: 0,sex_num,target_num
sex_num,1.0,0.169082
target_num,0.169082,1.0
