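In this notebook we add engineered features to Lara: binned versions of `capital-gain`, `capital-loss`, and `age`. We then retrain the logistic-regression model on the engineered feature set.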

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

# Read the cleaned dataset from disk (the path is relative to the current
# working directory) and print its (rows, columns) shape as a quick sanity check.
data_path = '../data/adult_clean.csv'
df = pd.read_csv(data_path)
print(f"Loaded: {df.shape}")

Loaded: (45222, 14)


In [3]:
# Revisit the capital-gain distribution: summary statistics first, then how
# many rows are exactly zero versus strictly positive.
gain = df['capital-gain']
gain_is_zero = gain == 0  # boolean mask reused for both the count and the share

print(gain.describe())
print(f"\nZeros: {gain_is_zero.sum()} ({gain_is_zero.mean() * 100:.1f}%)")
print(f"Non-zeros: {(gain > 0).sum()}")

count    45222.000000
mean      1101.430344
std       7506.430084
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max      99999.000000
Name: capital-gain, dtype: float64

Zeros: 41432 (91.6%)
Non-zeros: 3790


In [4]:
# Bucket capital-gain into three ordered bands. pd.cut builds right-closed
# intervals by default, so the edges below mean:
#   (-1, 0]     -> 'none'
#   (0, 5000]   -> 'low'
#   (5000, inf] -> 'high'
# The top edge is open-ended (np.inf) instead of a hard-coded 99999, so a
# value above that number still lands in 'high' rather than silently
# becoming NaN.
df['capital-gain-bin'] = pd.cut(
    df['capital-gain'],
    bins=[-1, 0, 5000, np.inf],
    labels=['none', 'low', 'high']
)

# Bin sizes, then the share of each `class` value within every bin
# (normalize='index' makes each row sum to 1).
print(df['capital-gain-bin'].value_counts())
print(pd.crosstab(df['capital-gain-bin'], df['class'], normalize='index'))

capital-gain-bin
none    41432
high     2338
low      1452
Name: count, dtype: int64
class                <=50K      >50K
capital-gain-bin                    
none              0.786807  0.213193
low               0.811983  0.188017
high              0.100941  0.899059


In [5]:
# How sparse is capital-loss? Count and percentage of rows that are exactly zero.
print(f"Capital-loss zeros: {(df['capital-loss'] == 0).sum()} ({(df['capital-loss'] == 0).mean() * 100:.1f}%)")

# Bucket capital-loss into three ordered bands. pd.cut builds right-closed
# intervals by default, so the edges below mean:
#   (-1, 0]     -> 'none'
#   (0, 2000]   -> 'low'
#   (2000, inf] -> 'high'
# The top edge is open-ended (np.inf) instead of a hard-coded 99999, so a
# value above that number still lands in 'high' rather than silently
# becoming NaN.
df['capital-loss-bin'] = pd.cut(
    df['capital-loss'],
    bins=[-1, 0, 2000, np.inf],
    labels=['none', 'low', 'high']
)

# Bin sizes, then the share of each `class` value within every bin
# (normalize='index' makes each row sum to 1).
print(df['capital-loss-bin'].value_counts())
print(pd.crosstab(df['capital-loss-bin'], df['class'], normalize='index'))

Capital-loss zeros: 43082 (95.3%)
capital-loss-bin
none    43082
low      1690
high      450
Name: count, dtype: int64
class                <=50K      >50K
capital-loss-bin                    
none              0.765331  0.234669
low               0.463905  0.536095
high              0.573333  0.426667


In [6]:
# Group ages into five ordered bands. Intervals are right-closed (pd.cut
# default): (0, 25], (25, 35], (35, 50], (50, 65], (65, 100].
# NOTE(review): an age outside (0, 100] would be mapped to NaN here —
# confirm the data never contains one.
age_edges = [0, 25, 35, 50, 65, 100]
age_labels = ['young', 'early-career', 'mid-career', 'late-career', 'retirement']
df['age-bin'] = pd.cut(df['age'], bins=age_edges, labels=age_labels)

# sort_index() lists the bands in their category (age) order rather than by
# frequency.
age_bin_counts = df['age-bin'].value_counts().sort_index()
print(age_bin_counts)

# Share of each `class` value within every age band (each row sums to 1).
age_bin_by_class = pd.crosstab(df['age-bin'], df['class'], normalize='index')
print(age_bin_by_class)

age-bin
young            8441
early-career    12074
mid-career      16026
late-career      7337
retirement       1344
Name: count, dtype: int64
class            <=50K      >50K
age-bin                         
young         0.980689  0.019311
early-career  0.808680  0.191320
mid-career    0.634594  0.365406
late-career   0.647404  0.352596
retirement    0.782738  0.217262


In [7]:
# Prepare the modelling data with the engineered features.
# Work on a copy so `df` keeps its original columns, and drop the three raw
# columns that were replaced by their binned versions.
df_eng = df.copy().drop(columns=['capital-gain', 'capital-loss', 'age'])

# Separate the predictors from the target column.
X_eng = df_eng.drop(columns=['class'])
y_eng = df_eng['class']

# Hold out 20% of the rows for testing. Stratifying on the target keeps the
# class proportions the same in both splits; the fixed seed makes the split
# repeatable.
X_train_eng, X_test_eng, y_train_eng, y_test_eng = train_test_split(
    X_eng,
    y_eng,
    test_size=0.2,
    random_state=42,
    stratify=y_eng
)

# Column lists used to route features to the right preprocessing step.
# 'number' matches every numeric dtype, not only int64/float64, so a numeric
# column stored at another width (e.g. int32 or float32) is not missed.
cat_cols_eng = X_eng.select_dtypes(include=['object', 'category']).columns.tolist()
num_cols_eng = X_eng.select_dtypes(include='number').columns.tolist()

# Fail loudly if a predictor ended up in neither list: a ColumnTransformer
# built only from these two lists would drop such a column without warning.
unassigned_cols_eng = [
    col for col in X_eng.columns
    if col not in cat_cols_eng and col not in num_cols_eng
]
assert not unassigned_cols_eng, (
    f"Columns in neither the categorical nor the numeric list: {unassigned_cols_eng}"
)

print(f"Categorical: {cat_cols_eng}")
print(f"Numeric: {num_cols_eng}")

Categorical: ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'capital-gain-bin', 'capital-loss-bin', 'age-bin']
Numeric: ['education-num', 'hours-per-week']


In [8]:
# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones.
# NOTE(review): with drop='first' together with handle_unknown='ignore', a
# category not seen during fit is encoded as all zeros, which is the same
# encoding as the dropped first category — confirm this is acceptable.
numeric_step_eng = ('num', StandardScaler(), num_cols_eng)
categorical_step_eng = (
    'cat',
    OneHotEncoder(drop='first', handle_unknown='ignore'),
    cat_cols_eng,
)
preprocessor_eng = ColumnTransformer(
    transformers=[numeric_step_eng, categorical_step_eng]
)

# Chain preprocessing and the classifier so both are fitted on the training
# split only.
model_eng = Pipeline(steps=[
    ('preprocessor', preprocessor_eng),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42)),
])

# Fit on the training split, then predict the held-out test split.
model_eng.fit(X_train_eng, y_train_eng)
y_pred_eng = model_eng.predict(X_test_eng)

# Report overall accuracy followed by per-class precision / recall / F1.
accuracy_eng = accuracy_score(y_test_eng, y_pred_eng)
print(f"Engineered Model Accuracy: {accuracy_eng:.4f}")
print(classification_report(y_test_eng, y_pred_eng))

Engineered Model Accuracy: 0.8504
              precision    recall  f1-score   support

       <=50K       0.88      0.93      0.90      6803
        >50K       0.74      0.61      0.67      2242

    accuracy                           0.85      9045
   macro avg       0.81      0.77      0.79      9045
weighted avg       0.84      0.85      0.85      9045

