In [2]:
# Import libraries with specific purposes in mind
import sys
import pandas as pd              # For data manipulation
import numpy as np               # For numerical operations
import matplotlib.pyplot as plt  # For visualizations
import sklearn
import imblearn
import seaborn as sns           # For attractive statistical plots
from ucimlrepo import fetch_ucirepo

In [3]:
# Record the interpreter and core library versions so the outputs below can be
# tied to a specific environment.
print(
    f"Python version: {sys.version}",
    f"NumPy version: {np.__version__}",
    f"Pandas version: {pd.__version__}",
    f"Scikit-learn version: {sklearn.__version__}",
    f"Imbalanced-learn version: {imblearn.__version__}",
    sep="\n",
)

Python version: 3.12.3 (main, Feb  4 2025, 14:48:35) [GCC 13.3.0]
NumPy version: 2.3.0
Pandas version: 2.3.0
Scikit-learn version: 1.6.1
Imbalanced-learn version: 0.13.0


In [4]:
# Smoke test: build a small synthetic frame to confirm pandas/numpy work.
# A seeded Generator is used (instead of the unseeded global np.random state)
# so this cell prints the same rows on every Restart & Run All.
SEED = 42
rng = np.random.default_rng(SEED)

data = pd.DataFrame({
    'feature1': rng.standard_normal(100),
    'feature2': rng.standard_normal(100),
    'gender': rng.choice(['Male', 'Female'], 100),
    'income': rng.choice([0, 1], 100)
})

print("Test dataframe created successfully!")
print(data.head())
print(f"\nGender distribution:\n{data['gender'].value_counts()}")


Test dataframe created successfully!
   feature1  feature2  gender  income
0 -1.148213 -0.619333  Female       1
1 -2.246665 -1.279985    Male       1
2 -0.376425 -0.314782  Female       0
3 -0.869122  0.875327    Male       1
4  1.668132  0.022896  Female       0

Gender distribution:
gender
Female    56
Male      44
Name: count, dtype: int64


In [5]:
print("Downloading UCI Adult dataset...")

# id=2 is the repository ID used here for the Adult dataset; the returned
# object exposes (at least) `.metadata` and `.data`, both used in this notebook.
adult = fetch_ucirepo(id=2)

# Confirm the download by echoing the dataset name and the first 200
# characters of its abstract.
print(
    "\nDataset downloaded successfully!",
    f"Dataset name: {adult.metadata['name']}",
    f"Dataset description: {adult.metadata['abstract'][:200]}...",
    sep="\n",
)

Downloading UCI Adult dataset...

Dataset downloaded successfully!
Dataset name: Adult
Dataset description: Predict whether annual income of an individual exceeds $50K/yr based on census data. Also known as "Census Income" dataset. ...


In [6]:
# Split the fetched dataset into its feature table (X, independent variables)
# and its target table (y, the income label).
X, y = adult.data.features, adult.data.targets

# Report the size of the feature table: one row per individual, one column per feature.
print(
    f"\nDataset shape: {X.shape}",
    f"This means we have {X.shape[0]:,} individuals and {X.shape[1]} features about each person",
    sep="\n",
)


Dataset shape: (48842, 14)
This means we have 48,842 individuals and 14 features about each person


In [7]:
# Show the first five feature rows to sanity-check column names and value formats.
print("Features (X) Preview:")
display(X.head(n=5))

Features (X) Preview:


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


In [8]:
# Show the first five target rows to see how the income label is spelled.
print("\nTarget (y) Preview:")
display(y.head(n=5))


Target (y) Preview:


Unnamed: 0,income
0,<=50K
1,<=50K
2,<=50K
3,<=50K
4,<=50K


In [9]:
# include="all" makes describe() cover categorical columns (count/unique/top/freq)
# alongside the numeric statistics (mean/std/quantiles).
print("\nFeatures Summary:")
display(X.describe(include="all"))


Features Summary:


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
count,48842.0,47879,48842.0,48842,48842.0,48842,47876,48842,48842,48842,48842.0,48842.0,48842.0,48568
unique,,9,,16,,7,15,6,5,2,,,,42
top,,Private,,HS-grad,,Married-civ-spouse,Prof-specialty,Husband,White,Male,,,,United-States
freq,,33906,,15784,,22379,6172,19716,41762,32650,,,,43832
mean,38.643585,,189664.1,,10.078089,,,,,,1079.067626,87.502314,40.422382,
std,13.71051,,105604.0,,2.570973,,,,,,7452.019058,403.004552,12.391444,
min,17.0,,12285.0,,1.0,,,,,,0.0,0.0,1.0,
25%,28.0,,117550.5,,9.0,,,,,,0.0,0.0,40.0,
50%,37.0,,178144.5,,10.0,,,,,,0.0,0.0,40.0,
75%,48.0,,237642.0,,12.0,,,,,,0.0,0.0,45.0,


In [10]:
# Summarize the target column (count / unique / top / freq).
# NOTE(review): income is meant to be a two-class label; if `unique` is larger
# than 2 here, the raw labels presumably contain formatting variants and need
# normalizing before class rates are computed - confirm with y['income'].unique().
print("\nTarget Summary:")
display(y.describe(include="all"))


Target Summary:


Unnamed: 0,income
count,48842
unique,4
top,<=50K
freq,24720


#### Checking dimensions and metadata

In [11]:
# Shapes are (rows, columns); X and y should have the same number of rows.
print(f"X shape: {X.shape}", f"y shape: {y.shape}", sep="\n")

# List the column names of both tables.
print("\nX Columns:", list(X.columns))
print("y Column:", list(y.columns))

X shape: (48842, 14)
y shape: (48842, 1)

X Columns: ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']
y Column: ['income']


In [12]:
print("Features in our dataset:")
print("=" * 60)

# Per-feature summary: dtype, number of distinct values, and missingness.
# NOTE(review): isna() only counts NaN/None; placeholder strings (e.g. '?')
# would not be counted as missing here - verify against the raw values.
missing_counts = X.isna().sum()
feature_info = pd.DataFrame({
    'Feature': X.columns,
    'Type': X.dtypes,
    'Unique_Values': X.nunique(),
    'Missing_Values': missing_counts,
    'Missing_Percentage': (missing_counts / len(X) * 100).map("{:.1f}%".format)
})

print(feature_info.to_string())

# The target summary earlier in this notebook reports 4 unique labels for what
# is a two-class outcome. Comparing the raw values against '>50K' therefore
# misses the variant spellings and under-reports the high-income share.
# Normalize first: strip surrounding whitespace and a trailing '.' so variants
# such as '>50K.' fold into '>50K'.
income_clean = y['income'].str.strip().str.rstrip('.')
assert income_clean.nunique() == 2, (
    f"Expected 2 income classes after normalization, got: {income_clean.unique().tolist()}"
)

print("\n\nTarget variable (income) distribution:")
print(income_clean.value_counts())
print(f"\nPercentage earning >50K: {(income_clean == '>50K').mean() * 100:.2f}%")

Features in our dataset:
                       Feature    Type  Unique_Values  Missing_Values Missing_Percentage
age                        age   int64             74               0               0.0%
workclass            workclass  object              9             963               2.0%
fnlwgt                  fnlwgt   int64          28523               0               0.0%
education            education  object             16               0               0.0%
education-num    education-num   int64             16               0               0.0%
marital-status  marital-status  object              7               0               0.0%
occupation          occupation  object             15             966               2.0%
relationship      relationship  object              6               0               0.0%
race                      race  object              5               0               0.0%
sex                        sex  object              2               0               0

## Feature description

In [13]:


# Human-readable notes on each feature and why it matters for the gender-bias
# analysis. Keys are column names of X.
feature_descriptions = {
    'age': "Continuous variable. Important for understanding career stage and earning potential.",
    'workclass': "Employment sector. Shows if gender bias varies by employer type (private, government, etc.)",
    'fnlwgt': "Final weight - represents how many people in the population this person represents. Critical for accurate statistics.",
    'education': "Highest education level. May reveal gender disparities in educational opportunities.",
    'education-num': "Numerical encoding of education. Useful for ordered analysis.",
    'marital-status': "Marital status. Often shows different patterns for men and women in career progression.",
    'occupation': "Job type. Key for identifying occupational segregation by gender.",
    'relationship': "Family role. Can reveal societal expectations by gender.",
    'race': "Racial category. Important for intersectional analysis with gender.",
    'sex': "THE KEY VARIABLE - the protected attribute analyzing for bias.",
    'capital-gain': "Investment income. May show wealth accumulation differences by gender.",
    'capital-loss': "Investment losses. Part of the complete financial picture.",
    'hours-per-week': "Work hours. Can reveal work-life balance differences by gender.",
    'native-country': "Country of origin. Useful for understanding immigrant status effects."
}

# Print each description together with a quick look at the column's values.
for feature, description in feature_descriptions.items():
    # Skip descriptions for columns that are not present in X.
    if feature not in X.columns:
        continue

    column = X[feature]
    print(f"\n{feature.upper()}:")
    print(f"  Description: {description}")

    # Branch on "is numeric" rather than `dtype == 'object'`, so text columns
    # stored as string/category dtypes are not reported with a min/max range.
    if pd.api.types.is_numeric_dtype(column):
        print(f"  Range: {column.min()} to {column.max()}")
    else:
        # value_counts() sorts by frequency, so these are the 3 most common values.
        print(f"  Sample values: {column.value_counts().head(3).index.tolist()}")



AGE:
  Description: Continuous variable. Important for understanding career stage and earning potential.
  Range: 17 to 90

WORKCLASS:
  Description: Employment sector. Shows if gender bias varies by employer type (private, government, etc.)
  Sample values: ['Private', 'Self-emp-not-inc', 'Local-gov']

FNLWGT:
  Description: Final weight - represents how many people in the population this person represents. Critical for accurate statistics.
  Range: 12285 to 1490400

EDUCATION:
  Description: Highest education level. May reveal gender disparities in educational opportunities.
  Sample values: ['HS-grad', 'Some-college', 'Bachelors']

EDUCATION-NUM:
  Description: Numerical encoding of education. Useful for ordered analysis.
  Range: 1 to 16

MARITAL-STATUS:
  Description: Marital status. Often shows different patterns for men and women in career progression.
  Sample values: ['Married-civ-spouse', 'Never-married', 'Divorced']

OCCUPATION:
  Description: Job type. Key for identifying 

In [20]:
# Gender distribution analysis
gender_analysis = pd.DataFrame()

# Basic gender distribution
gender_counts = X['sex'].value_counts()
gender_analysis['Count'] = gender_counts
gender_analysis['Percentage'] = (gender_counts / len(X) * 100).round(2)

In [21]:
# Print the count/percentage table, then the male-to-female ratio as "N.NN:1".
print(
    "Gender Distribution in Dataset:",
    gender_analysis,
    f"\nGender Ratio (Male:Female): {gender_counts['Male']/gender_counts['Female']:.2f}:1",
    sep="\n",
)

Gender Distribution in Dataset:
        Count  Percentage
sex                      
Male    32650       66.85
Female  16192       33.15

Gender Ratio (Male:Female): 2.02:1
