In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import roc_auc_score, confusion_matrix, RocCurveDisplay
import matplotlib.pyplot as plt

In [2]:
pd.set_option('display.max_rows', 500)

In [3]:
df = pd.read_csv('lecture_03_data.csv',
                names = ['age',
                         'workclass',
                         'fnlwgt',
                         'education',
                         'education-num',
                         'marital-status',
                         'occupation',
                         'relationship',
                         'race',
                         'sex',
                         'capital-gain',
                         'capital-loss',
                         'hours-per-week',
                         'native-country',
                         'income'])

In [4]:
num_vars = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
cat_vars = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
target = 'income'

In [5]:
X = df[num_vars + cat_vars]
y = df[target].replace({' <=50K': 0, ' >50K': 1})
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

In [6]:
numeric_tx = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
])
cat_tx = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

In [7]:
preprocessor = ColumnTransformer(transformers=[
    ('numeric', numeric_tx, num_vars),
    ('categorical', cat_tx, cat_vars)
])

In [8]:
classifier_list = [LogisticRegression(random_state=4),
                   RandomForestClassifier(),
                   GradientBoostingClassifier(n_estimators=1000),]

In [9]:
for clf in classifier_list:
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', clf)])
    model_pipeline = pipeline.fit(X_train, y_train)
    preds = model_pipeline.predict_proba(X_test)
    print(roc_auc_score(y_test, preds[:,1]))

0.5688060797833213
0.895695061716149
0.9217372640452988
