# **Importing Required Libraries**

In [24]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler


# **Step 1 : Data Loading**

In [2]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
df = pd.read_csv(
    url,
    header=None,
    names=[
        'age', 'workclass', 'fnlwgt', 'education', 'education-num',
        'marital-status', 'occupation', 'relationship', 'race', 'sex',
        'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income'
    ],
    na_values=' ?',
    skipinitialspace=True,
    delimiter=','
)


In [4]:
# Preview the first 5 rows
print(df.head())

# Show dataset info
print(df.info())


# Get the shape (rows, columns) of the dataset
print("Dataset Dimensions:", df.shape)



   age         workclass  fnlwgt  education  education-num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13   

       marital-status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capital-gain  capital-loss  hours-per-week native-country income  
0          2174             0              40  United-States  <=50K  
1             0             0             

# **Step 2 : Initial Data Inspection and Cleaning**

### Step 2.1 : Display dataset information and preview data

**Preview Data:** Using head() lets students see the actual records, which is critical for understanding the context of the data.

**Dataset Info:** The info() function shows non-null counts and datatypes, which helps in quickly spotting missing values or incorrect data formats.



In [5]:
# Display dataset information and preview data
# -> Display the first 5 rows and print the basic states for data
# -> Hint: head and info

# Preview the first 5 rows
print(df.head())

# Show dataset info
print(df.info())




   age         workclass  fnlwgt  education  education-num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13   

       marital-status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capital-gain  capital-loss  hours-per-week native-country income  
0          2174             0              40  United-States  <=50K  
1             0             0             

**Dataset Dimensions:** Knowing the shape of the data informs students about its scale, which can affect computation time and choice of algorithms.

In [6]:
# Display the dimension of the data
# -> Hint : shape

# Get the shape (rows, columns) of the dataset
print("Dataset Dimensions:", df.shape)


Dataset Dimensions: (32561, 15)


### **Step 2.2 : Basic Statistical Summary:**

The describe() function provides vital statistics (min, max, mean, standard deviation) that help identify any outliers or anomalies in numerical columns.

In [29]:
# Check the min , max , count , means , standard diviation etc
# Hint -> describe

# Get summary statistics for numeric columns
df.describe()



Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,education_hours_interaction
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456,412.342219
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429,177.835317
min,17.0,12285.0,1.0,0.0,0.0,1.0,5.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0,320.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0,400.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0,520.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0,1584.0


# **Step 2.3 : Counting Missing Values:**

This step is essential for diagnosing data quality. Missing values can lead to biased or inaccurate models if not handled properly.

In [8]:
# Count missing values in each column

# Count the number of missing values in each column
print(df.isnull().sum())


age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64


### **Step 2.4 : Checking for Duplicate Records:**

Duplicates can skew the analysis by over-representing some data, so it's important to remove them.

In [9]:
# Check for duplicate records

# Count and optionally display duplicate rows
duplicates = df[df.duplicated()]
print("Number of duplicate rows:", len(duplicates))




Number of duplicate rows: 24


### **Step 2.5 : Inspecting Unique Values in Categorical Columns:**

Unique value inspection reveals if there are any unexpected values (e.g., a '?' or extra spaces) that need cleaning. This is important for ensuring reliable encoding later.

In [10]:
# Check the unique data of each Categorical column to find if there is any irrelevant record or data e.g one record contains ? mark
categorical_columns = ['workclass', 'education', 'marital-status', 'occupation',
                      'relationship', 'race', 'sex', 'native-country', 'income']

print("Unique values in categorical columns:")
for col in categorical_columns:
    print(f"\n{col} unique values:")
    print(df[col].unique())
    print("-" * 50)

Unique values in categorical columns:

workclass unique values:
['State-gov' 'Self-emp-not-inc' 'Private' 'Federal-gov' 'Local-gov' '?'
 'Self-emp-inc' 'Without-pay' 'Never-worked']
--------------------------------------------------

education unique values:
['Bachelors' 'HS-grad' '11th' 'Masters' '9th' 'Some-college' 'Assoc-acdm'
 'Assoc-voc' '7th-8th' 'Doctorate' 'Prof-school' '5th-6th' '10th'
 '1st-4th' 'Preschool' '12th']
--------------------------------------------------

marital-status unique values:
['Never-married' 'Married-civ-spouse' 'Divorced' 'Married-spouse-absent'
 'Separated' 'Married-AF-spouse' 'Widowed']
--------------------------------------------------

occupation unique values:
['Adm-clerical' 'Exec-managerial' 'Handlers-cleaners' 'Prof-specialty'
 'Other-service' 'Sales' 'Craft-repair' 'Transport-moving'
 'Farming-fishing' 'Machine-op-inspct' 'Tech-support' '?'
 'Protective-serv' 'Armed-Forces' 'Priv-house-serv']
--------------------------------------------------



### **Step 2.6 : Validating and Converting Data Types:**

Converting columns to the correct datatype (like converting an ID column to a string) prevents errors in operations such as merging, filtering, or encoding.

In [11]:
# Check the data type of each column and if wrong datatype convert it to the suitable datatype
print("\nCurrent data types:")
print(df.dtypes)


Current data types:
age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
income            object
dtype: object


### **Step 2.7 : Checking Value Counts for Categorical Columns:**

Value counts help in understanding the distribution within each category. They are useful for detecting class imbalances and anomalies.

In [12]:
# Check value count for each Cateorical column
print("\nValue counts for categorical columns:")
for col in categorical_columns:
    print(f"\n{col} value counts:")
    print(df[col].value_counts())
    print("-" * 50)



Value counts for categorical columns:

workclass value counts:
workclass
Private             22696
Self-emp-not-inc     2541
Local-gov            2093
?                    1836
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: count, dtype: int64
--------------------------------------------------

education value counts:
education
HS-grad         10501
Some-college     7291
Bachelors        5355
Masters          1723
Assoc-voc        1382
11th             1175
Assoc-acdm       1067
10th              933
7th-8th           646
Prof-school       576
9th               514
12th              433
Doctorate         413
5th-6th           333
1st-4th           168
Preschool          51
Name: count, dtype: int64
--------------------------------------------------

marital-status value counts:
marital-status
Married-civ-spouse       14976
Never-married            10683
Divorced                  4443
Separated    

### **Step 2.8 : Handling Missing Values using SimpleImputer:**

Imputation preserves the dataset size while ensuring that no null values interfere with analysis. Different strategies are used for numerical (mean) and categorical (mode) columns based on their characteristics.

In [33]:
# TODO: fill null values either by mean , median  or mode based on type of data

# Separate numeric and categorical columns

# Numeric: fill with mean

# Categorical: fill with most frequent (mode)

numeric_columns = ['age', 'fnlwgt', 'education-num', 'capital-gain',
                  'capital-loss', 'hours-per-week']

# Create and fit numeric imputer
numeric_imputer = SimpleImputer(strategy='mean')
df[numeric_columns] = numeric_imputer.fit_transform(df[numeric_columns])

# Create and fit categorical imputer
categorical_imputer = SimpleImputer(strategy='most_frequent')
df[categorical_columns] = categorical_imputer.fit_transform(df[categorical_columns])


print(df[categorical_columns].head())


          workclass  education      marital-status         occupation  \
0         State-gov  Bachelors       Never-married       Adm-clerical   
1  Self-emp-not-inc  Bachelors  Married-civ-spouse    Exec-managerial   
2           Private    HS-grad            Divorced  Handlers-cleaners   
3           Private       11th  Married-civ-spouse  Handlers-cleaners   
4           Private  Bachelors  Married-civ-spouse     Prof-specialty   

    relationship   race     sex native-country income  
0  Not-in-family  White    Male  United-States  <=50K  
1        Husband  White    Male  United-States  <=50K  
2  Not-in-family  White    Male  United-States  <=50K  
3        Husband  Black    Male  United-States  <=50K  
4           Wife  Black  Female           Cuba  <=50K  


# **Step 3 : Converting Data Types and Cleaning Categorical Data**

### **Step 3.1 :Removing Leading/Trailing Spaces:**

Standardize entries in categorical columns so that no extra spaces lead to misclassification of similar values.



In [14]:
# Check for leading/trailing spaces in categorical data and remove them
for col in categorical_columns:
    df[col] = df[col].str.strip()


### **Step 3.2 : Checking and Converting Data Types:**

Ensure that every column is of the correct data type (e.g., IDs as strings, dates as datetime).

In [None]:
# Check the data type of each column and if wrong datatype convert it to the suitable datatype
# all are correct

### **3.3 : Converting to 'category' Datatype:**

Transform columns that represent categorical data (like Gender or Embarked port) into the 'category' type to optimize memory and computational performance.

In [15]:
# Convert suitable columns to 'category' datatype
for col in categorical_columns:
    df[col] = df[col].astype('category')

# **Step 4 : Feature Engineering**

### **Step 4.1 : Creating "age_group" Feature:**

Binning converts continuous age values into meaningful categories (e.g., 'Young', 'Adult') that are easier to analyze and interpret.

In [16]:
bins = [0, 25, 45, 65, np.inf]
labels = ['Young', 'Adult', 'Middle-Aged', 'Senior']
df['age_group'] = pd.cut(df['age'], bins=bins, labels=labels)

### **Step 4.2 : Creating "education_hours_interaction" Feature:**

Interaction features help capture complex relationships between variables. In this case, the interaction between education (via 'education-num') and work intensity ('hours-per-week') may reveal underlying patterns related to social or economic outcomes.

In [17]:
# Create an interaction feature "education_hours_interaction": education-num multiplied by hours-per-week (as a proxy for workload vs. education level)
df['education_hours_interaction'] = df['education-num'] * df['hours-per-week']

# **Step 5 : Encoding Categorical Data**

**One-Hot Encoding:** Converts multiple categorical values into binary columns to prevent ordinality.


In [21]:
# One-hot encode the categorical columns (sex, workclass, education, etc.).
categorical_cols_to_encode = [col for col in categorical_columns if col != 'income']
df_encoded = pd.get_dummies(df, columns=categorical_cols_to_encode)



**Label Encoding for Income:** Maps income to binary labels for binary classification tasks.

In [27]:
# Use label encoding for the income column, converting ≤50K to 0 and 50K to 1.
le = LabelEncoder()
df_encoded['income'] = le.fit_transform(df_encoded['income'])

# **Step 6 : Normalization and Standardization**

Standardization transforms the specified columns to a mean of 0 and a standard deviation of 1, which is important to ensure comparability among numerical features during model training.

In [30]:
# Standardize the "age", "hours-per-week", "capital-gain" and "capital-loss" column to have a mean of 0 and a standard deviation of 1.
columns_to_standardize = ['age', 'hours-per-week', 'capital-gain', 'capital-loss']
scaler = StandardScaler()
df_encoded[columns_to_standardize] = scaler.fit_transform(df_encoded[columns_to_standardize])

# Display the final transformed dataset
print("\nFinal dataset shape:", df_encoded.shape)
print("\nFirst few rows of transformed dataset:")
df_encoded.head()


Final dataset shape: (32561, 111)

First few rows of transformed dataset:


Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,income,age_group,education_hours_interaction,workclass_?,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,0.030671,77516.0,13.0,0.148453,-0.21666,-0.035429,0,Adult,520.0,False,...,False,False,False,False,False,False,False,True,False,False
1,0.837109,83311.0,13.0,-0.14592,-0.21666,-2.222153,0,Middle-Aged,169.0,False,...,False,False,False,False,False,False,False,True,False,False
2,-0.042642,215646.0,9.0,-0.14592,-0.21666,-0.035429,0,Adult,360.0,False,...,False,False,False,False,False,False,False,True,False,False
3,1.057047,234721.0,7.0,-0.14592,-0.21666,-0.035429,0,Middle-Aged,280.0,False,...,False,False,False,False,False,False,False,True,False,False
4,-0.775768,338409.0,13.0,-0.14592,-0.21666,-0.035429,0,Adult,520.0,False,...,False,False,False,False,False,False,False,False,False,False
