In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
import warnings
warnings.filterwarnings("ignore",category=FutureWarning)

In [2]:
file_path = "adult_with_headers (1).csv"
df = pd.read_csv(file_path)

print("===== FIRST 5 ROWS =====")
print(df.head())

print("\n===== DATA TYPES =====")
print(df.dtypes)

print("\n===== MISSING VALUES =====")
print(df.isnull().sum())

print("\n===== SUMMARY STATISTICS =====")
print(df.describe(include='all'))

===== FIRST 5 ROWS =====
   age          workclass  fnlwgt   education  education_num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital_status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital_gain  capital_loss  hours_per_week  native_country  income  
0          2174             0              

### 1 Handle Missing Values

In [10]:
df.replace('?', np.nan, inplace=True)

In [11]:
# Separate numerical and categorical
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
cat_cols = df.select_dtypes(include=['object']).columns

In [12]:
# Impute numerical with median
for col in num_cols:
    df[col].fillna(df[col].median(), inplace=True)

In [13]:
# Impute categorical with mode
for col in cat_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)

In [14]:
print("\nMissing values after handling:")
print(df.isnull().sum())


Missing values after handling:
age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64


In [15]:
#a)Scaling Numerical Features

In [16]:
# Standard Scaling
standard_scaler = StandardScaler()
df_standard_scaled = df.copy()
df_standard_scaled[num_cols] = standard_scaler.fit_transform(df[num_cols])

Standard Scaling
- StandardScaler is sensitive to outliers.
- Centers data to mean = 0 and std = 1

In [17]:
#b) Min-Max Scaling
minmax_scaler = MinMaxScaler()
df_minmax_scaled = df.copy()
df_minmax_scaled[num_cols] = minmax_scaler.fit_transform(df[num_cols])

print("\nStandard Scaled Sample:")
print(df_standard_scaled[num_cols].head())

print("\nMinMax Scaled Sample:")
print(df_minmax_scaled[num_cols].head())


Standard Scaled Sample:
        age    fnlwgt  education_num  capital_gain  capital_loss  \
0  0.030671 -1.063611       1.134739      0.148453      -0.21666   
1  0.837109 -1.008707       1.134739     -0.145920      -0.21666   
2 -0.042642  0.245079      -0.420060     -0.145920      -0.21666   
3  1.057047  0.425801      -1.197459     -0.145920      -0.21666   
4 -0.775768  1.408176       1.134739     -0.145920      -0.21666   

   hours_per_week  
0       -0.035429  
1       -2.222153  
2       -0.035429  
3       -0.035429  
4       -0.035429  

MinMax Scaled Sample:
        age    fnlwgt  education_num  capital_gain  capital_loss  \
0  0.301370  0.044302       0.800000       0.02174           0.0   
1  0.452055  0.048238       0.800000       0.00000           0.0   
2  0.287671  0.138113       0.533333       0.00000           0.0   
3  0.493151  0.151068       0.400000       0.00000           0.0   
4  0.150685  0.221488       0.800000       0.00000           0.0   

   hours_per_w

Min-Max Scaling-
- Scales features between 0 and 1.
- You need bounded values.
- Commonly used for Neural Networks.


### 2 Encoding Techniques

In [18]:
encoded_df = df.copy()

for col in cat_cols:
    unique_count = df[col].nunique()
    
    if unique_count < 5:
        # One-Hot Encoding
        encoded_df = pd.get_dummies(encoded_df, columns=[col], drop_first=True)
    else:
        # Label Encoding
        le = LabelEncoder()
        encoded_df[col] = le.fit_transform(encoded_df[col])

print("\nEncoded Data Sample:")
print(encoded_df.head())


Encoded Data Sample:
   age  workclass  fnlwgt  education  education_num  marital_status  \
0   39          7   77516          9             13               4   
1   50          6   83311          9             13               2   
2   38          4  215646         11              9               0   
3   53          4  234721          1              7               2   
4   28          4  338409          9             13               2   

   occupation  relationship  race  capital_gain  capital_loss  hours_per_week  \
0           1             1     4          2174             0              40   
1           4             0     4             0             0              13   
2           6             1     4             0             0              40   
3           6             0     2             0             0              40   
4          10             5     2             0             0              40   

   native_country  sex_ Male  income_ >50K  
0              39  

### 3 Feature Engineering

In [19]:
encoded_df['capital_net'] = df['capital_gain'] - df['capital_loss']

#Work intensity = hours-per-week * education-num
encoded_df['work_intensity'] = df['hours_per_week'] * df['education_num']

print("\nNew Feature Samples:")
print(encoded_df[['capital_net','work_intensity']].head())


New Feature Samples:
   capital_net  work_intensity
0         2174             520
1            0             169
2            0             360
3            0             280
4            0             520


In [20]:
# Log Transformation
# Applying log transform on skewed 'capital_gain'

encoded_df['capital_gain_log'] = np.log1p(df['capital_gain'])

print("\nLog Transformed Feature Sample:")
print(encoded_df[['capital_gain','capital_gain_log']].head())


Log Transformed Feature Sample:
   capital_gain  capital_gain_log
0          2174          7.684784
1             0          0.000000
2             0          0.000000
3             0          0.000000
4             0          0.000000


Log Transformation
- capital-gain is highly skewed.

In [21]:
# Processed Dataset
print("\nProcessed Dataset Shape:", encoded_df.shape)
print(encoded_df.head())


Processed Dataset Shape: (32561, 18)
   age  workclass  fnlwgt  education  education_num  marital_status  \
0   39          7   77516          9             13               4   
1   50          6   83311          9             13               2   
2   38          4  215646         11              9               0   
3   53          4  234721          1              7               2   
4   28          4  338409          9             13               2   

   occupation  relationship  race  capital_gain  capital_loss  hours_per_week  \
0           1             1     4          2174             0              40   
1           4             0     4             0             0              13   
2           6             1     4             0             0              40   
3           6             0     2             0             0              40   
4          10             5     2             0             0              40   

   native_country  sex_ Male  income_ >50K  capi

### Pros and Cons

One-Hot Encoding-

Pros
- No ordinal relationship introduced.
- Works well for nominal categories.

Cons
- Increases dimensionality.

Label Encoding-

Pros
- Simple and memory efficient.

Cons
- May introduce false ordinal relationships.