In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import roc_auc_score, confusion_matrix, RocCurveDisplay
import matplotlib.pyplot as plt

In [2]:
# Let pandas render up to 500 rows before truncating a displayed frame.
pd.options.display.max_rows = 500

In [3]:
# Column labels are supplied by hand; because `names` is passed explicitly,
# pandas does not take a header row from the file.
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income',
]
df = pd.read_csv('../lecture_03/lecture_03_data.csv', names=column_names)

In [4]:
# Columns routed to the numeric branch of the preprocessing.
num_vars = [
    'age',
    'fnlwgt',
    'education-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]

# Columns routed to the categorical branch of the preprocessing.
cat_vars = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
]

# Column holding the label to predict.
target = 'income'

In [5]:
# Feature matrix: numeric columns first, then categorical ones.
X = df[num_vars + cat_vars]

# Encode the target as 0/1 (the label strings are matched with their leading
# space, exactly as written below).
# `.map` is used instead of `.replace` for two reasons:
#   * `.replace` leaves any label missing from the dict untouched, silently
#     yielding a mixed str/int target;
#   * turning an object column into ints through `.replace` relies on implicit
#     downcasting, which pandas has deprecated.
label_codes = {' <=50K': 0, ' >50K': 1}
y = df[target].map(label_codes)

# Fail loudly on labels we do not know how to encode (they map to NaN).
unmapped = y.isna()
if unmapped.any():
    raise ValueError(
        f"Unexpected {target!r} labels: {df.loc[unmapped, target].unique().tolist()}"
    )
y = y.astype('int64')

# Hold out 20% of the rows for testing; the split is seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

In [6]:
# Numeric branch: fill missing values with the column median.
numeric_steps = [('imputer', SimpleImputer(strategy='median'))]
numeric_tx = Pipeline(steps=numeric_steps)

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. Categories not seen during fit are ignored at
# transform time instead of raising.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
]
cat_tx = Pipeline(steps=categorical_steps)

In [7]:
# Route each column group to its own transformer pipeline.
column_routes = [
    ('numeric', numeric_tx, num_vars),
    ('categorical', cat_tx, cat_vars),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [8]:
# Hyper-parameter values to try: 2 learning rates x 2 ensemble sizes.
param_grid = dict(
    learning_rate=[0.1, 0.01],
    n_estimators=[100, 1000],
)

In [10]:
# Seed the booster so repeated runs give the same trees, CV scores and test
# AUC. The value matches the seed already used for the train/test split.
gbm = GradientBoostingClassifier(random_state=4)

# Search over `param_grid`, ranking candidates by cross-validated ROC AUC
# (GridSearchCV's default cross-validation is used).
clf = GridSearchCV(gbm, param_grid, scoring='roc_auc')

In [11]:
# Chain the preprocessing with the grid-searched classifier.
# NOTE(review): the grid search sits *inside* the pipeline, so the
# preprocessor is fitted once on all of X_train before the search splits the
# data into folds. Validation folds therefore share imputation statistics and
# encoder categories with the training folds. Wrapping the whole pipeline in
# the search would avoid this -- confirm whether that matters here.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', clf),
]
pipeline = Pipeline(steps=pipeline_steps)

# Fit on the training split, then score the held-out rows.
model_pipeline = pipeline.fit(X_train, y_train)
preds = model_pipeline.predict_proba(X_test)

In [12]:
# ROC AUC on the held-out set, using the second probability column
# (index 1) as the score.
test_auc = roc_auc_score(y_test, preds[:, 1])
print(test_auc)

0.9218368483233558


In [13]:
# Tabulate the grid-search results: one row per parameter combination.
cv_results = clf.cv_results_
pd.DataFrame(cv_results)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1.749145,0.04112,0.009388,0.000191,0.1,100,"{'learning_rate': 0.1, 'n_estimators': 100}",0.917272,0.921858,0.919333,0.928066,0.927739,0.922854,0.004372,2
1,16.676295,0.410902,0.109577,0.001852,0.1,1000,"{'learning_rate': 0.1, 'n_estimators': 1000}",0.925754,0.92598,0.924852,0.931898,0.931792,0.928055,0.003117,1
2,1.852213,0.043305,0.006351,0.000147,0.01,100,"{'learning_rate': 0.01, 'n_estimators': 100}",0.888223,0.891827,0.885985,0.895226,0.899943,0.892241,0.004974,4
3,17.463866,0.41326,0.108682,0.001109,0.01,1000,"{'learning_rate': 0.01, 'n_estimators': 1000}",0.917096,0.920704,0.919257,0.928117,0.927352,0.922505,0.004428,3
