<h1> 1 - Data Engineering 

In [42]:
import numpy as np
import pandas as pd

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 140)

In [43]:
DATA_PATH = "adult_all.csv"
df = pd.read_csv(DATA_PATH)

print("Data shape:", df.shape)
df.head()

Data shape: (48842, 15)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,?,12285,Some-college,10,Never-married,?,Not-in-family,Amer-Indian-Eskimo,Female,0,0,20,United-States,<=50K
1,32,?,13862,HS-grad,9,Never-married,?,Not-in-family,Amer-Indian-Eskimo,Female,0,0,38,United-States,<=50K
2,29,?,19793,Some-college,10,Divorced,?,Unmarried,White,Female,0,0,8,United-States,<=50K
3,18,?,20057,Some-college,10,Never-married,?,Not-in-family,Asian-Pac-Islander,Female,0,0,16,United-States,<=50K
4,19,?,20469,HS-grad,9,Never-married,?,Other-relative,Asian-Pac-Islander,Female,0,0,12,South,<=50K


In [44]:
print("Columns:", df.columns.tolist())

TARGET_COL = "income" 
assert TARGET_COL in df.columns, "Target column not found!"

print("Target unique values (raw):", df[TARGET_COL].unique())

Columns: ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']
Target unique values (raw): ['<=50K' '>50K']


In [45]:
# data cleaning

df_clean = df.copy()

obj_cols = df_clean.select_dtypes(include=["object"]).columns
for c in obj_cols:
    df_clean[c] = df_clean[c].astype("string").str.strip()

df_clean = df_clean.replace("?", np.nan)

print("After cleaning, sample rows:")
df_clean.head()

After cleaning, sample rows:


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,,12285,Some-college,10,Never-married,,Not-in-family,Amer-Indian-Eskimo,Female,0,0,20,United-States,<=50K
1,32,,13862,HS-grad,9,Never-married,,Not-in-family,Amer-Indian-Eskimo,Female,0,0,38,United-States,<=50K
2,29,,19793,Some-college,10,Divorced,,Unmarried,White,Female,0,0,8,United-States,<=50K
3,18,,20057,Some-college,10,Never-married,,Not-in-family,Asian-Pac-Islander,Female,0,0,16,United-States,<=50K
4,19,,20469,HS-grad,9,Never-married,,Other-relative,Asian-Pac-Islander,Female,0,0,12,South,<=50K


In [46]:
# حذف ردیف‌هایی که برچسب ندارند
before = df_clean.shape[0]
df_clean = df_clean.dropna(subset=[TARGET_COL])
after = df_clean.shape[0]

print(f"Dropped rows with missing target: {before - after}")
print("New shape:", df_clean.shape)

X = df_clean.drop(columns=[TARGET_COL]).copy()
y = df_clean[TARGET_COL].copy()

print("X shape:", X.shape)
print("y distribution (count):\n", y.value_counts())
print("\ny distribution (ratio):\n", y.value_counts(normalize=True))

Dropped rows with missing target: 0
New shape: (48842, 15)
X shape: (48842, 14)
y distribution (count):
 income
<=50K    37155
>50K     11687
Name: count, dtype: Int64

y distribution (ratio):
 income
<=50K    0.760718
>50K     0.239282
Name: proportion, dtype: Float64


In [47]:
numeric_cols = X.select_dtypes(include=["number"]).columns.tolist()
categorical_cols = X.select_dtypes(exclude=["number"]).columns.tolist()

print("Number of features:", X.shape[1])
print("Numeric features:", len(numeric_cols), numeric_cols)
print("Categorical features:", len(categorical_cols), categorical_cols)

Number of features: 14
Numeric features: 6 ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
Categorical features: 8 ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']


In [48]:
# خلاصه آماری عددی‌ها
if len(numeric_cols) > 0:
    display(X[numeric_cols].describe().T)

# Missing summary
missing_counts = X.isna().sum().sort_values(ascending=False)
missing_ratio = (missing_counts / len(X)).sort_values(ascending=False)

missing_summary = pd.DataFrame({
    "missing_count": missing_counts,
    "missing_ratio": (missing_ratio * 100).round(2)
})

print("Missing summary (only columns with missing > 0):")
display(missing_summary[missing_summary["missing_count"] > 0])

rows_with_missing = X.isna().any(axis=1).mean()
print(f"Rows with at least one missing value: {rows_with_missing*100:.2f}%")

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,48842.0,38.643196,13.711223,4.0,28.0,37.0,48.0,90.0
fnlwgt,48842.0,189664.134597,105604.025423,12285.0,117550.5,178144.5,237642.0,1490400.0
education-num,48842.0,10.078089,2.570973,1.0,9.0,10.0,12.0,16.0
capital-gain,48842.0,1079.067626,7452.019058,0.0,0.0,0.0,0.0,99999.0
capital-loss,48842.0,87.502314,403.004552,0.0,0.0,0.0,0.0,4356.0
hours-per-week,48842.0,40.422382,12.391444,1.0,40.0,40.0,45.0,99.0


Missing summary (only columns with missing > 0):


Unnamed: 0,missing_count,missing_ratio
occupation,2809,5.75
workclass,2799,5.73
native-country,857,1.75


Rows with at least one missing value: 7.41%


In [49]:
print("Final dataset shape:", df_clean.shape)
print("X shape:", X.shape, "y shape:", y.shape)

print("\nNumeric features (final):", len(numeric_cols), numeric_cols)
print("Categorical features (final):", len(categorical_cols), categorical_cols)

print("\nTarget distribution (ratio):")
print(y.value_counts(normalize=True))

missing_counts = X.isna().sum()
missing_ratio = (missing_counts / len(X) * 100).round(2)
missing_summary = pd.DataFrame({"missing_count": missing_counts, "missing_ratio(%)": missing_ratio})
print("\nMissing columns (missing_count > 0):")
display(missing_summary[missing_summary["missing_count"] > 0].sort_values("missing_count", ascending=False))

Final dataset shape: (48842, 15)
X shape: (48842, 14) y shape: (48842,)

Numeric features (final): 6 ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
Categorical features (final): 8 ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']

Target distribution (ratio):
income
<=50K    0.760718
>50K     0.239282
Name: proportion, dtype: Float64

Missing columns (missing_count > 0):


Unnamed: 0,missing_count,missing_ratio(%)
occupation,2809,5.75
workclass,2799,5.73
native-country,857,1.75


<h1> 2 - Feature Engineering

In [50]:
# Manual feature engineering (safe before split)
df_fe = df_clean.copy()

# New feature: net capital (gain - loss)
df_fe["capital_total"] = df_fe["capital-gain"] - df_fe["capital-loss"]

df_fe[["capital-gain", "capital-loss", "capital_total"]].head()

Unnamed: 0,capital-gain,capital-loss,capital_total
0,0,0,0
1,0,0,0
2,0,0,0
3,0,0,0
4,0,0,0


In [51]:
df_fe[df_fe["capital_total"] != 0][
    ["capital-gain", "capital-loss", "capital_total"]
].head()

Unnamed: 0,capital-gain,capital-loss,capital_total
15,0,1719,-1719
27,2936,0,2936
31,0,2415,-2415
37,2228,0,2228
48,9386,0,9386


In [52]:
# Drop census weight (usually weakly related to income prediction)
DROP_COLS = ["fnlwgt"]

df_fe = df_fe.drop(columns=DROP_COLS)

print("Dropped columns:", DROP_COLS)
print("New shape:", df_fe.shape)

Dropped columns: ['fnlwgt']
New shape: (48842, 15)


In [53]:
X = df_fe.drop(columns=[TARGET_COL]).copy()
y = df_fe[TARGET_COL].copy()

numeric_cols = X.select_dtypes(include=["number"]).columns.tolist()
categorical_cols = X.select_dtypes(exclude=["number"]).columns.tolist()

print("X shape:", X.shape)
print("Numeric features:", len(numeric_cols), numeric_cols)
print("Categorical features:", len(categorical_cols), categorical_cols)
print("\nTarget ratio:\n", y.value_counts(normalize=True))

X shape: (48842, 14)
Numeric features: 6 ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week', 'capital_total']
Categorical features: 8 ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']

Target ratio:
 income
<=50K    0.760718
>50K     0.239282
Name: proportion, dtype: Float64


In [54]:
from sklearn.model_selection import train_test_split

# Main split (used in all next sections)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=RANDOM_SEED,
    stratify=y
)

print("Train shape:", X_train.shape, " Test shape:", X_test.shape)

Train shape: (39073, 14)  Test shape: (9769, 14)


In [63]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer


# Numerical preprocessing: impute + scale
numeric_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical preprocessing: impute + one-hot encode
categorical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

# Combine pipelines
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_pipeline, numeric_cols),
    ("cat", categorical_pipeline, categorical_cols)
])

In [65]:
# Fit only on training data to avoid data leakage
X_train_prep = preprocessor.fit_transform(X_train)
X_test_prep = preprocessor.transform(X_test)

print("Before preprocessing:", X_train.shape)
print("After preprocessing:", X_train_prep.shape)

TypeError: boolean value of NA is ambiguous