In [1]:
# code from https://medium.com/district-data-labs/building-a-classifier-from-census-data-18f996c4d7cf
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss
import xgboost as xgb

import mlflow
import mlflow.xgboost
import mlflow.sklearn
import pandas as pd

# Column names for the census CSV, in file order. They are supplied
# explicitly via `names=`; the last entry ('income') is the target column.
names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income',
]
data = pd.read_csv('census_data/adult.data', names=names)

import json

# Columns pandas parsed as `object` dtype are treated as categorical.
object_columns = [
    column for column, dtype in data.dtypes.items() if dtype == 'object'
]

# Metadata written to disk: the distinct target values, every column name,
# and the distinct values observed in each categorical column.
meta = {
    'target_names': list(data.income.unique()),
    'feature_names': list(data.columns),
    'categorical_features': {
        column: list(data[column].unique()) for column in object_columns
    },
}
with open('census_data/meta.json', 'w') as f:
    json.dump(meta, f, indent=2)

In [2]:
from sklearn.datasets.base import Bunch
from sklearn.preprocessing import LabelEncoder
def load_data():
    """
    Read the census metadata and both CSV splits and bundle them in a Bunch.

    Returns a Bunch holding the training features/target (``data``,
    ``target``), the test features/target (``data_test``, ``target_test``)
    and the ``target_names``, ``feature_names`` and ``categorical_features``
    entries read from ``census_data/meta.json``. The last name in
    ``feature_names`` is used as the target column.
    """
    with open('census_data/meta.json', 'r') as meta_file:
        meta = json.load(meta_file)

    columns = meta['feature_names']
    feature_columns, target_column = columns[:-1], columns[-1]

    train_frame = pd.read_csv('census_data/adult.data', names=columns)
    # skiprows=1 drops the first line of the test file (a bad row)
    test_frame = pd.read_csv('census_data/adult.test', skiprows=1, names=columns)

    # The target is not a feature, so take it out of the categorical features
    categorical = meta['categorical_features']
    categorical.pop('income')

    bundle = Bunch(
        data=train_frame[feature_columns],
        target=train_frame[target_column],
        data_test=test_frame[feature_columns],
        target_test=test_frame[target_column],
        target_names=meta['target_names'],
        feature_names=meta['feature_names'],
        categorical_features=categorical,
        DESCR="descr",
    )
    return bundle
# Load the train/test splits and metadata once; the code below reads and
# reassigns `dataset.data` / `dataset.data_test` as it preprocesses them.
dataset = load_data()

from sklearn.base import BaseEstimator, TransformerMixin
class EncodeCategorical(BaseEstimator, TransformerMixin):
    """
    Label-encodes a specified list of columns, or all columns if None.

    One LabelEncoder is fitted per column in ``fit`` and kept in
    ``self.encoders`` (a dict keyed by column name); ``transform`` applies
    those fitted encoders to a copy of the frame it is given.
    """
    def __init__(self, columns=None):
        self.columns  = columns
        self.encoders = None
    def fit(self, data, target=None):
        """
        Expects a data frame with named columns to encode.

        ``target`` is accepted for pipeline compatibility and ignored.
        Returns ``self``.
        """
        # Resolve the columns locally instead of overwriting self.columns:
        # the constructor parameter stays exactly as the caller passed it,
        # so a later fit on a different frame with columns=None picks up
        # that frame's columns rather than stale ones from a previous fit.
        columns = data.columns if self.columns is None else self.columns
        # Fit a label encoder for each column in the data frame
        self.encoders = {
            column: LabelEncoder().fit(data[column])
            for column in columns
        }
        return self
    def transform(self, data):
        """
        Uses the fitted encoders to transform a data frame.

        Returns a copy of ``data`` in which every fitted column is replaced
        by its encoded values; the input frame is not modified.
        """
        output = data.copy()
        for column, encoder in self.encoders.items():
            output[column] = encoder.transform(data[column])
        return output
# Fit the label encoders on the training features only, then apply the very
# same fitted encoders to the test features. Re-fitting on the test frame
# (fit_transform) would build a separate label->integer mapping from the test
# data, so the same category could receive different codes in train and test.
encoder = EncodeCategorical(dataset.categorical_features.keys())
dataset.data = encoder.fit_transform(dataset.data)
dataset.data_test = encoder.transform(dataset.data_test)



In [3]:
from sklearn.impute import SimpleImputer
import numpy as np

class ImputeCategorical(BaseEstimator, TransformerMixin):
    """
    Imputes a specified list of columns, or all columns if None.

    A single SimpleImputer is fitted on the selected columns; it treats the
    value 0 as missing and replaces it with the column's most frequent value.
    NOTE(review): presumably 0 is the label-encoded code of the dataset's
    missing-value placeholder -- confirm against the encoder output.
    """
    def __init__(self, columns=None):
        self.columns = columns
        self.imputer = None
    def fit(self, data, target=None):
        """
        Expects a data frame with named columns to impute.

        ``target`` is accepted for pipeline compatibility and ignored.
        Returns ``self``.
        """
        # Keep the constructor parameter untouched and record the columns
        # actually used in a fitted attribute; with columns=None this means
        # all columns of the frame passed to this fit call.
        self.columns_ = list(
            data.columns if self.columns is None else self.columns
        )
        # Fit one imputer over the selected columns
        self.imputer = SimpleImputer(missing_values=0, strategy='most_frequent')
        self.imputer.fit(data[self.columns_])
        return self
    def transform(self, data):
        """
        Uses the fitted imputer to transform a data frame.

        Returns a copy of ``data`` with the fitted columns imputed; the input
        frame is not modified.
        """
        output = data.copy()
        output[self.columns_] = self.imputer.transform(output[self.columns_])
        return output
# Learn the most frequent value of each column from the training data only
# and reuse it to fill the test data. Calling fit_transform on the test frame
# would re-learn the fill values from the test set itself.
imputer = ImputeCategorical(['workclass', 'native-country', 'occupation'])
dataset.data = imputer.fit_transform(dataset.data)
dataset.data_test = imputer.transform(dataset.data_test)

In [4]:
# Feature matrices come straight from the preprocessed dataset.
X_train, X_test = dataset.data, dataset.data_test

# The label encoder is fitted on the training labels only.
yencode = LabelEncoder()
y_train = yencode.fit_transform(dataset.target)

# Strip any trailing "." from the test labels before encoding them with the
# encoder fitted on the training labels.
y_test = yencode.transform(
    [label.rstrip(".") for label in dataset.target_test]
)

# XGBoost's native training API consumes DMatrix objects.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# enable auto logging
mlflow.xgboost.autolog()

def grid_test_xgboost(colsample_tree, subsample, max_depth, min_child_weight, eta,
                      num_boost_round=10):
    """
    Train one XGBoost model inside its own MLflow run and score it on the test set.

    Reads the module-level ``dtrain``/``dtest`` matrices and ``y_test`` labels.

    Parameters
    ----------
    colsample_tree : passed to XGBoost as ``colsample_bytree``
    subsample : passed to XGBoost as ``subsample``
    max_depth : passed to XGBoost as ``max_depth``
    min_child_weight : passed to XGBoost as ``min_child_weight``
    eta : passed to XGBoost as ``eta``
    num_boost_round : number of boosting rounds handed to ``xgb.train``.
        Defaults to 10, the value ``xgb.train`` uses when the argument is
        omitted, so existing five-argument calls behave as before.

    Returns
    -------
    The accuracy of the trained model on the test set. The test log loss and
    accuracy are also logged to the active MLflow run.
    """
    with mlflow.start_run():
        # train model
        params = {
            'objective': 'multi:softprob',
            'num_class': 2,
            'eval_metric': 'mlogloss',
            'max_depth': max_depth,
            'min_child_weight': min_child_weight,
            'eta': eta,
            'subsample': subsample,
            'colsample_bytree': colsample_tree
        }
        # Only the training set is passed as an eval set, so the per-round
        # mlogloss printed during training is the training loss.
        model = xgb.train(
            params,
            dtrain,
            num_boost_round=num_boost_round,
            evals=[(dtrain, 'train')],
        )

        # evaluate model: the prediction is indexed per class along axis 1,
        # so argmax over that axis picks the predicted class
        y_proba = model.predict(dtest)
        y_pred = y_proba.argmax(axis=1)
        loss = log_loss(y_test, y_proba)
        acc = accuracy_score(y_test, y_pred)

        # log metrics
        mlflow.log_metrics({'log_loss': loss, 'accuracy': acc})
        return acc
    
from sklearn.svm import SVC
def grid_test_svm(kernel, gamma, C):
    """
    Fit one SVC inside its own MLflow run and return its test accuracy.

    Reads the module-level ``X_train``/``y_train`` and ``X_test``/``y_test``.
    The accuracy is logged as a metric and ``kernel``, ``gamma`` and ``C``
    are logged as the run's parameters.
    """
    with mlflow.start_run():
        classifier = SVC(kernel=kernel, gamma=gamma, C=C)
        classifier.fit(X_train, y_train)
        test_accuracy = classifier.score(X_test, y_test)
        mlflow.log_metrics({'accuracy': test_accuracy})
        mlflow.log_params({'kernel': kernel, 'gamma': gamma, 'C': C})
    return test_accuracy

# Grid Search XGBoost and Sklearn

In [5]:
#values from https://blog.cambridgespark.com/hyperparameter-tuning-in-xgboost-4ff9100a3b2f
# Wider grid (currently disabled):
# colsample_tree = [0.2, 0.5, 1.0]
# subsample = [0.05, 1.0]
# max_depth = [1, 5, 10]
# min_child_weight = [1, 6, 10, 15]
# eta = [.9, .3, .01, .005]

import itertools

colsample_tree = [1.0]
subsample = [1.0]
max_depth = [1, 10]
min_child_weight = [1, 10]
eta = [.9, .3, .01, .005]

# itertools.product walks the grid in the same order as nested for-loops
# (rightmost list varies fastest); each combination is one MLflow run.
for combination in itertools.product(
    colsample_tree, subsample, max_depth, min_child_weight, eta
):
    grid_test_xgboost(*combination)

# SVM grid (currently disabled):
# kernel = ['rbf']
# gamma = [1e-3, 1e-4]
# C = [1, 10, 100, 1000]
# for i in kernel:
#     for j in gamma:
#         for k in C:
#             grid_test_svm(i, j, k)

[0]	train-mlogloss:0.49011
[1]	train-mlogloss:0.43253
[2]	train-mlogloss:0.39726
[3]	train-mlogloss:0.37644
[4]	train-mlogloss:0.36613
[5]	train-mlogloss:0.35632
[6]	train-mlogloss:0.34694
[7]	train-mlogloss:0.33944
[8]	train-mlogloss:0.33460
[9]	train-mlogloss:0.33097


  all_arg_names = inspect.getargspec(original)[0]  # pylint: disable=W1505


[0]	train-mlogloss:0.59448
[1]	train-mlogloss:0.53447
[2]	train-mlogloss:0.49733
[3]	train-mlogloss:0.46914
[4]	train-mlogloss:0.45038
[5]	train-mlogloss:0.43607
[6]	train-mlogloss:0.42425
[7]	train-mlogloss:0.41402
[8]	train-mlogloss:0.40580
[9]	train-mlogloss:0.39957


  all_arg_names = inspect.getargspec(original)[0]  # pylint: disable=W1505


[0]	train-mlogloss:0.68931
[1]	train-mlogloss:0.68554
[2]	train-mlogloss:0.68185
[3]	train-mlogloss:0.67822
[4]	train-mlogloss:0.67467
[5]	train-mlogloss:0.67119
[6]	train-mlogloss:0.66777
[7]	train-mlogloss:0.66443
[8]	train-mlogloss:0.66113
[9]	train-mlogloss:0.65791


  all_arg_names = inspect.getargspec(original)[0]  # pylint: disable=W1505


[0]	train-mlogloss:0.69122
[1]	train-mlogloss:0.68931
[2]	train-mlogloss:0.68743
[3]	train-mlogloss:0.68556
[4]	train-mlogloss:0.68370
[5]	train-mlogloss:0.68187
[6]	train-mlogloss:0.68005
[7]	train-mlogloss:0.67824
[8]	train-mlogloss:0.67648
[9]	train-mlogloss:0.67471


  all_arg_names = inspect.getargspec(original)[0]  # pylint: disable=W1505


[0]	train-mlogloss:0.49011
[1]	train-mlogloss:0.43253
[2]	train-mlogloss:0.39726
[3]	train-mlogloss:0.37644
[4]	train-mlogloss:0.36613
[5]	train-mlogloss:0.35632
[6]	train-mlogloss:0.34694
[7]	train-mlogloss:0.33944
[8]	train-mlogloss:0.33460
[9]	train-mlogloss:0.33097


  all_arg_names = inspect.getargspec(original)[0]  # pylint: disable=W1505


[0]	train-mlogloss:0.59448
[1]	train-mlogloss:0.53447
[2]	train-mlogloss:0.49733
[3]	train-mlogloss:0.46914
[4]	train-mlogloss:0.45038
[5]	train-mlogloss:0.43607
[6]	train-mlogloss:0.42425
[7]	train-mlogloss:0.41402
[8]	train-mlogloss:0.40580
[9]	train-mlogloss:0.39957


  all_arg_names = inspect.getargspec(original)[0]  # pylint: disable=W1505


[0]	train-mlogloss:0.68931
[1]	train-mlogloss:0.68554
[2]	train-mlogloss:0.68185
[3]	train-mlogloss:0.67822
[4]	train-mlogloss:0.67467
[5]	train-mlogloss:0.67119
[6]	train-mlogloss:0.66777
[7]	train-mlogloss:0.66443
[8]	train-mlogloss:0.66113
[9]	train-mlogloss:0.65791


  all_arg_names = inspect.getargspec(original)[0]  # pylint: disable=W1505


[0]	train-mlogloss:0.69122
[1]	train-mlogloss:0.68931
[2]	train-mlogloss:0.68743
[3]	train-mlogloss:0.68556
[4]	train-mlogloss:0.68370
[5]	train-mlogloss:0.68187
[6]	train-mlogloss:0.68005
[7]	train-mlogloss:0.67824
[8]	train-mlogloss:0.67648
[9]	train-mlogloss:0.67471


  all_arg_names = inspect.getargspec(original)[0]  # pylint: disable=W1505


[0]	train-mlogloss:0.35137
[1]	train-mlogloss:0.29012
[2]	train-mlogloss:0.25856
[3]	train-mlogloss:0.24660
[4]	train-mlogloss:0.23894


  all_arg_names = inspect.getargspec(original)[0]  # pylint: disable=W1505


[5]	train-mlogloss:0.23205
[6]	train-mlogloss:0.22535
[7]	train-mlogloss:0.21473
[8]	train-mlogloss:0.21162
[9]	train-mlogloss:0.20655
[0]	train-mlogloss:0.53026
[1]	train-mlogloss:0.44020
[2]	train-mlogloss:0.38422
[3]	train-mlogloss:0.34592


  all_arg_names = inspect.getargspec(original)[0]  # pylint: disable=W1505


[4]	train-mlogloss:0.31856
[5]	train-mlogloss:0.29773
[6]	train-mlogloss:0.28320
[7]	train-mlogloss:0.27275
[8]	train-mlogloss:0.26330
[9]	train-mlogloss:0.25594
[0]	train-mlogloss:0.68681
[1]	train-mlogloss:0.68062
[2]	train-mlogloss:0.67455
[3]	train-mlogloss:0.66859


  all_arg_names = inspect.getargspec(original)[0]  # pylint: disable=W1505


[4]	train-mlogloss:0.66275
[5]	train-mlogloss:0.65702
[6]	train-mlogloss:0.65140
[7]	train-mlogloss:0.64588
[8]	train-mlogloss:0.64046
[9]	train-mlogloss:0.63515
[0]	train-mlogloss:0.68998
[1]	train-mlogloss:0.68683
[2]	train-mlogloss:0.68373
[3]	train-mlogloss:0.68066


  all_arg_names = inspect.getargspec(original)[0]  # pylint: disable=W1505


[4]	train-mlogloss:0.67762
[5]	train-mlogloss:0.67460
[6]	train-mlogloss:0.67161
[7]	train-mlogloss:0.66866
[8]	train-mlogloss:0.66573
[9]	train-mlogloss:0.66283
[0]	train-mlogloss:0.35914
[1]	train-mlogloss:0.30188


  all_arg_names = inspect.getargspec(original)[0]  # pylint: disable=W1505


[2]	train-mlogloss:0.27647
[3]	train-mlogloss:0.26558
[4]	train-mlogloss:0.25858
[5]	train-mlogloss:0.25334
[6]	train-mlogloss:0.24617
[7]	train-mlogloss:0.24391
[8]	train-mlogloss:0.24169
[9]	train-mlogloss:0.23779
[0]	train-mlogloss:0.53375
[1]	train-mlogloss:0.44542
[2]	train-mlogloss:0.39028
[3]	train-mlogloss:0.35321


  all_arg_names = inspect.getargspec(original)[0]  # pylint: disable=W1505


[4]	train-mlogloss:0.32694
[5]	train-mlogloss:0.30873
[6]	train-mlogloss:0.29541
[7]	train-mlogloss:0.28431
[8]	train-mlogloss:0.27661
[9]	train-mlogloss:0.27003
[0]	train-mlogloss:0.68695
[1]	train-mlogloss:0.68087
[2]	train-mlogloss:0.67492
[3]	train-mlogloss:0.66908
[4]	train-mlogloss:0.66336


  all_arg_names = inspect.getargspec(original)[0]  # pylint: disable=W1505


[5]	train-mlogloss:0.65774
[6]	train-mlogloss:0.65223
[7]	train-mlogloss:0.64682
[8]	train-mlogloss:0.64151
[9]	train-mlogloss:0.63631
[0]	train-mlogloss:0.69004
[1]	train-mlogloss:0.68696
[2]	train-mlogloss:0.68392
[3]	train-mlogloss:0.68090


  all_arg_names = inspect.getargspec(original)[0]  # pylint: disable=W1505


[4]	train-mlogloss:0.67792
[5]	train-mlogloss:0.67496
[6]	train-mlogloss:0.67204
[7]	train-mlogloss:0.66914
[8]	train-mlogloss:0.66627
[9]	train-mlogloss:0.66343


In [89]:
import os

# Build one summary row per MLflow run found under the "mlruns/0" directory
# and export the rows to 'mlflow_workflows.csv'.
experiment_dir = "mlruns/0"
dirs = os.listdir(experiment_dir)
dictionary = []
for run_name in dirs:
    # Skip the experiment's metadata file and macOS Finder artifacts
    if "meta" in run_name or "DS_Store" in run_name:
        continue
    run_dir = os.path.join(experiment_dir, run_name)
    # Anything else that is not a directory cannot be a run
    if not os.path.isdir(run_dir):
        continue

    # The second space-separated token of the metric file is read as the
    # value. NOTE(review): presumably the file-store line format is
    # "<timestamp> <value> <step>" -- confirm for the MLflow version in use.
    with open(os.path.join(run_dir, 'metrics', 'accuracy')) as metric_file:
        accuracy = float(metric_file.read().split(" ")[1])

    params_dir = os.path.join(run_dir, 'params')
    params_files = os.listdir(params_dir)
    model_params = {}
    for param_name in params_files:
        # The kernel parameter is left out of the exported params
        if 'kernel' in param_name:
            continue
        # Context manager so each parameter file is closed after reading
        with open(os.path.join(params_dir, param_name)) as param_file:
            model_params[param_name] = param_file.read()

    dictionary.append({
        'accuracy': accuracy,
        'model_params': model_params,
        # grid_test_svm logs exactly three params (kernel, gamma, C); any
        # other param count is treated as an XGBoost run
        'model': 'SVM' if len(params_files) == 3 else 'xgboost',
    })

df = pd.DataFrame(dictionary)
df.to_csv('mlflow_workflows.csv')