In [99]:
# code from https://medium.com/district-data-labs/building-a-classifier-from-census-data-18f996c4d7cf
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss
import xgboost as xgb

import mlflow
import mlflow.xgboost
import mlflow.sklearn

# prepare train and test data
names = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'income',
]
data = pd.read_csv('census_data/adult.data', names=names)

import json
meta = {
    'target_names': list(data.income.unique()),
    'feature_names': list(data.columns),
    'categorical_features': {
        column: list(data[column].unique())
        for column in data.columns
        if data[column].dtype == 'object'
    },
}
with open('census_data/meta.json', 'w') as f:
    json.dump(meta, f, indent=2)

In [100]:
from sklearn.datasets.base import Bunch
from sklearn.preprocessing import LabelEncoder
def load_data():
    # Load the meta data from the file
    with open('census_data/meta.json', 'r') as f:
        meta = json.load(f)
    names = meta['feature_names']
    # Load the training and test data, skipping the bad row in the test data
    train = pd.read_csv('census_data/adult.data', names=names)
    test  = pd.read_csv('census_data/adult.test', names=names, skiprows=1)
    # Remove the target from the categorical features
    meta['categorical_features'].pop('income')
    # Return the bunch with the appropriate data chunked apart
    return Bunch(
        data = train[names[:-1]],
        target = train[names[-1]],
        data_test = test[names[:-1]],
        target_test = test[names[-1]],
        target_names = meta['target_names'],
        feature_names = meta['feature_names'],
        categorical_features = meta['categorical_features'],
        DESCR = "descr",
    )
dataset = load_data()

from sklearn.base import BaseEstimator, TransformerMixin
class EncodeCategorical(BaseEstimator, TransformerMixin):
    """
    Encodes a specified list of columns or all columns if None.
    """
    def __init__(self, columns=None):
        self.columns  = columns
        self.encoders = None
    def fit(self, data, target=None):
        """
        Expects a data frame with named columns to encode.
        """
        # Encode all columns if columns is None
        if self.columns is None:
            self.columns = data.columns
        # Fit a label encoder for each column in the data frame
        self.encoders = {
            column: LabelEncoder().fit(data[column])
            for column in self.columns
        }
        return self
    def transform(self, data):
        """
        Uses the encoders to transform a data frame.
        """
        output = data.copy()
        for column, encoder in self.encoders.items():
            output[column] = encoder.transform(data[column])
        return output
encoder = EncodeCategorical(dataset.categorical_features.keys())
dataset.data = encoder.fit_transform(dataset.data)
dataset.data_test = encoder.fit_transform(dataset.data_test)

In [101]:
from sklearn.impute import SimpleImputer
import numpy as np

class ImputeCategorical(BaseEstimator, TransformerMixin):
    """
    Encodes a specified list of columns or all columns if None.
    """
    def __init__(self, columns=None):
        self.columns = columns
        self.imputer = None
    def fit(self, data, target=None):
        """
        Expects a data frame with named columns to impute.
        """
        # Encode all columns if columns is None
        if self.columns is None:
            self.columns = data.columns
        # Fit an imputer for each column in the data frame
        self.imputer = SimpleImputer(missing_values=0, strategy='most_frequent')
        self.imputer.fit(data[self.columns])
        return self
    def transform(self, data):
        """
        Uses the encoders to transform a data frame.
        """
        output = data.copy()
        output[self.columns] = self.imputer.transform(output[self.columns])
        return output
imputer = ImputeCategorical(['workclass', 'native-country', 'occupation'])
dataset.data = imputer.fit_transform(dataset.data)
dataset.data_test = imputer.fit_transform(dataset.data_test)

In [102]:
X_train = dataset.data
yencode = LabelEncoder().fit(dataset.target)
y_train = yencode.transform(dataset.target)

X_test = dataset.data_test
y_test = yencode.transform([y.rstrip(".") for y in dataset.target_test])
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# enable auto logging
mlflow.xgboost.autolog()

def grid_test_xgboost(colsample_tree, subsample, max_depth, min_child_weight, eta):
    with mlflow.start_run():
        # train model
        params = {
            'objective': 'multi:softprob',
            'num_class': 2,
            'eval_metric': 'mlogloss',
            'max_depth': max_depth,
            'min_child_weight': min_child_weight,
            'eta':eta,
            'subsample': subsample,
            'colsample_bytree': colsample_tree
        }
        model = xgb.train(params, dtrain, evals=[(dtrain, 'train')])

        # evaluate model
        y_proba = model.predict(dtest)
        y_pred = y_proba.argmax(axis=1)
        loss = log_loss(y_test, y_proba)
        acc = accuracy_score(y_test, y_pred)

        # log metrics
        mlflow.log_metrics({'log_loss': loss, 'accuracy': acc})
        return acc
    
from sklearn.svm import SVC
def grid_test_svm(kernel, gamma, C):
    with mlflow.start_run():
        clf = SVC(kernel=kernel, gamma=gamma, C=C).fit(X_train, y_train)
        accuracy = clf.score(X_test, y_test)
        mlflow.log_metrics({'accuracy': accuracy})
        mlflow.log_params({'kernel': kernel, 'gamma': gamma, 'C': C})
        return accuracy

# Grid Search XGBoost and Sklearn

In [None]:
#values from https://blog.cambridgespark.com/hyperparameter-tuning-in-xgboost-4ff9100a3b2f
colsample_tree = [0.2, 0.5, 1.0]
subsample = [0.05, 1.0]
max_depth = [1, 5, 10]
min_child_weight = [6]
eta = [.9, .3, .01, .005]

for i in colsample_tree:
    for j in subsample:
        for k in max_depth:
            for l in min_child_weight:
                for m in eta:
                    grid_test_xgboost(i, j, k, l, m)

kernel = ['rbf']
gamma = [1e-3, 1e-4]
C = [1, 10, 100, 1000]
for i in kernel:
    for j in gamma:
        for k in C:
            grid_test_svm(i, j, k)

In [95]:
print(grid_test(0.05, 0.05, 10, 10, 0.1))

[0]	train-mlogloss:0.65907
[1]	train-mlogloss:0.63675
[2]	train-mlogloss:0.60855
[3]	train-mlogloss:0.59430
[4]	train-mlogloss:0.57516
[5]	train-mlogloss:0.55993
[6]	train-mlogloss:0.54759
[7]	train-mlogloss:0.53285
[8]	train-mlogloss:0.52194
[9]	train-mlogloss:0.51432


  all_arg_names = inspect.getargspec(original)[0]  # pylint: disable=W1505


0.7767950371598796


In [89]:
import os
import pandas as pd

dirs = os.listdir("mlruns/0")
dictionary = []
for i in dirs:
    row = {}
    if "meta" in i or "DS_Store" in i:
        continue
    accuracy = float(open('mlruns/0/' + i + '/metrics/accuracy').read().split(" ")[1])
    row['accuracy'] = accuracy
    row['model'] = 'xgboost'
    params_files = os.listdir('mlruns/0/' + i + '/params')
    model_params = {}
    for j in params_files:
        if 'kernel' in j:
            continue
        model_params[j] = open('mlruns/0/' + i + '/params/' + j).read()
    
    row['model_params'] = model_params
    dictionary.append(row)
    
df = pd.DataFrame(dictionary)
df.to_csv('mlflow_workflows.csv')