In [1]:
# packages used
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the adult census dataset from the project's data directory.
adult_census = pd.read_csv('../data/adult-census.csv')

# The label to predict is the "class" column; everything else is a feature.
# "education-num" is removed as well: the data exploration notebook
# identifies it as a duplicated column.
target = adult_census['class']
features = adult_census.drop(columns=['class', 'education-num'])

# Hold out a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    random_state=123,
)

In [2]:
# packages used
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Dtype-based selectors: object columns are treated as categorical,
# every other dtype as numerical.
categorical_columns_selector = selector(dtype_include=object)
numerical_columns_selector = selector(dtype_exclude=object)

# Resolve each selector against the feature frame to get column names.
categorical_columns = categorical_columns_selector(features)
numerical_columns = numerical_columns_selector(features)

# One preprocessor per column family: one-hot encoding for categorical
# columns (unknown categories are ignored rather than raising), and
# standard scaling for numerical columns.
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
numerical_preprocessor = StandardScaler()

# Bind each preprocessor to the columns it should be applied to.
preprocessor = ColumnTransformer(
    transformers=[
        ('one-hot-encoder', categorical_preprocessor, categorical_columns),
        ('standard_scaler', numerical_preprocessor, numerical_columns),
    ]
)

In [3]:
# packages used
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline

# Chain the preprocessing step and a k-nearest-neighbours classifier into
# a single estimator, so both are fitted and applied together.
model = make_pipeline(preprocessor, KNeighborsClassifier())

# Fit on the training split. The call is made for its side effect only:
# a notebook cell displays just its last expression, so no dummy assignment
# is needed to hide the return value here.
model.fit(X_train, y_train)

# Score on the held-out test split; as the last expression of the cell,
# this value is what gets displayed.
model.score(X_test, y_test)

0.8385062648431741

In [4]:
%%time
from sklearn.model_selection import cross_validate

# 5-fold cross-validation of the full pipeline on the training split.
# The returned dict holds per-fold fit times, score times and test scores.
cv_result = cross_validate(
    model,
    X_train,
    y_train,
    cv=5,
)
cv_result

Wall time: 1min 28s


{'fit_time': array([0.1311717 , 0.18043804, 0.18053198, 0.12667203, 0.1276691 ]),
 'score_time': array([17.74016333, 18.96764588, 17.04785895, 19.8152709 , 13.95120072]),
 'test_score': array([0.82967108, 0.83401583, 0.83756484, 0.83933934, 0.83387933])}

In [5]:
# Summarise the per-fold test scores as mean +/- standard deviation.
scores = cv_result["test_score"]
summary = (
    "The mean cross-validation accuracy is: "
    f"{scores.mean():.3f} +/- {scores.std():.3f}"
)
print(summary)

The mean cross-validation accuracy is: 0.835 +/- 0.003


In [6]:
# Several metrics can be evaluated in a single cross-validation run by
# passing a list of scorer names; each one is reported under its own
# "test_<name>" key in the result.
metrics = ['accuracy', 'roc_auc']

cross_validate(
    model,
    X_train,
    y_train,
    scoring=metrics,
    cv=5,
)

{'fit_time': array([0.15161324, 0.14263082, 0.12866664, 0.35407567, 0.14262724]),
 'score_time': array([31.18683505, 31.9885776 , 30.87956667, 34.94589925, 36.12701845]),
 'test_accuracy': array([0.82967108, 0.83401583, 0.83756484, 0.83933934, 0.83387933]),
 'test_roc_auc': array([0.84983487, 0.85843635, 0.86509668, 0.85774895, 0.85791983])}

In [7]:
# Record the scikit-learn version that was used to run this notebook.
import sklearn
sklearn.__version__

'1.0.2'