In [1]:
# code from https://medium.com/district-data-labs/building-a-classifier-from-census-data-18f996c4d7cf
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss

import pandas as pd

# prepare train and test data
# Column headers supplied to read_csv for the census file below.
# The last entry, 'income', is the column later used as the target.
names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income',
]
data = pd.read_csv('census_data/adult.data', names=names)

import json

# Collect the distinct values of every object-dtype (string) column.
categorical_levels = {}
for column in data.columns:
    if data[column].dtype == 'object':
        categorical_levels[column] = list(data[column].unique())

# Metadata describing the frame: target classes, column order, category levels.
meta = {
    'target_names': list(data.income.unique()),
    'feature_names': list(data.columns),
    'categorical_features': categorical_levels,
}

# Persist the metadata so load_data() can read it back from disk.
with open('census_data/meta.json', 'w') as meta_file:
    json.dump(meta, meta_file, indent=2)
    
from sklearn.preprocessing import LabelEncoder
try:
    # Public location of Bunch in current scikit-learn releases.
    from sklearn.utils import Bunch
except ImportError:
    # Legacy module path, kept only as a fallback for old installs.
    from sklearn.datasets.base import Bunch
def load_data(data_dir='census_data'):
    """
    Load the census train/test frames together with their metadata.

    Parameters
    ----------
    data_dir : str, default 'census_data'
        Directory containing ``meta.json``, ``adult.data`` and ``adult.test``.
        The default matches the location the notebook writes ``meta.json`` to.

    Returns
    -------
    Bunch
        ``data`` / ``target``: training features and target column.
        ``data_test`` / ``target_test``: the same split of the test file.
        ``target_names``, ``feature_names``: copied from ``meta.json``.
        ``categorical_features``: mapping of categorical feature name to its
        list of values, with the target column removed.
        ``DESCR``: placeholder description string.
    """
    import os

    # Load the meta data from the file
    with open(os.path.join(data_dir, 'meta.json'), 'r') as f:
        meta = json.load(f)
    names = meta['feature_names']
    # The last listed column is the target; every other column is a feature.
    feature_columns, target_column = names[:-1], names[-1]
    # Load the training and test data, skipping the bad row in the test data
    train = pd.read_csv(os.path.join(data_dir, 'adult.data'), names=names)
    test  = pd.read_csv(os.path.join(data_dir, 'adult.test'), names=names, skiprows=1)
    # Remove the target from the categorical features. A default is passed so
    # a meta file that does not list the target as categorical is tolerated.
    meta['categorical_features'].pop(target_column, None)
    # Return the bunch with the appropriate data chunked apart
    return Bunch(
        data = train[feature_columns],
        target = train[target_column],
        data_test = test[feature_columns],
        target_test = test[target_column],
        target_names = meta['target_names'],
        feature_names = meta['feature_names'],
        categorical_features = meta['categorical_features'],
        DESCR = "descr",
    )
dataset = load_data()

from sklearn.base import BaseEstimator, TransformerMixin
class EncodeCategorical(BaseEstimator, TransformerMixin):
    """
    Label-encodes a specified list of columns, or all columns if None.

    fit() trains one LabelEncoder per column; transform() returns a copy of
    the input frame in which each of those columns is replaced by the output
    of its encoder.
    """
    def __init__(self, columns=None):
        self.columns  = columns
        self.encoders = None
    def fit(self, data, target=None):
        """
        Expects a data frame with named columns to encode.

        ``target`` is accepted for pipeline compatibility and ignored.
        Returns ``self``.
        """
        # Resolve the column list locally instead of overwriting self.columns:
        # with columns=None every fit() then uses the columns of the frame it
        # is given, rather than a list remembered from an earlier fit.
        columns = data.columns if self.columns is None else self.columns
        # Fit a label encoder for each column in the data frame
        self.encoders = {
            column: LabelEncoder().fit(data[column])
            for column in columns
        }
        return self
    def transform(self, data):
        """
        Uses the fitted encoders to transform a data frame.

        Returns a copy of ``data``; the input frame is not modified.
        fit() must have been called first.
        """
        output = data.copy()
        for column, encoder in self.encoders.items():
            output[column] = encoder.transform(data[column])
        return output
# Fit the label encoders on the training frame only, then reuse those same
# fitted encoders on the test frame so that every category maps to the same
# integer code in both splits. Refitting on the test frame could assign
# different codes whenever the two splits do not contain identical categories.
# NOTE(review): LabelEncoder.transform raises ValueError for a test value that
# never occurred in training — confirm the test file has no such values.
encoder = EncodeCategorical(dataset.categorical_features.keys())
dataset.data = encoder.fit_transform(dataset.data)
dataset.data_test = encoder.transform(dataset.data_test)

from sklearn.impute import SimpleImputer
import numpy as np

class ImputeCategorical(BaseEstimator, TransformerMixin):
    """
    Imputes a specified list of columns, or all columns if None.

    Entries equal to 0 in the selected columns are treated as missing and
    replaced with the most frequent value of that column seen during fit().
    """
    def __init__(self, columns=None):
        self.columns = columns
        self.imputer = None
    def fit(self, data, target=None):
        """
        Expects a data frame with named columns to impute.

        ``target`` is accepted for pipeline compatibility and ignored.
        Returns ``self``.
        """
        # Resolve the column list without overwriting the constructor
        # parameter; the resolved list is kept in ``columns_`` for transform().
        columns = data.columns if self.columns is None else self.columns
        self.columns_ = list(columns)
        # Fit a single imputer over the selected columns.
        # NOTE(review): missing_values=0 presumably targets the label-encoded
        # code of the dataset's missing-value marker — confirm with the encoder.
        self.imputer = SimpleImputer(missing_values=0, strategy='most_frequent')
        self.imputer.fit(data[self.columns_])
        return self
    def transform(self, data):
        """
        Uses the fitted imputer to fill the selected columns of a data frame.

        Returns a copy of ``data``; the input frame is not modified.
        fit() must have been called first.
        """
        output = data.copy()
        output[self.columns_] = self.imputer.transform(output[self.columns_])
        return output
# Learn the most-frequent fill values from the training frame only and apply
# the same fitted imputer to the test frame, so the test split is filled with
# training statistics instead of values re-estimated from the test data.
imputer = ImputeCategorical(['workclass', 'native-country', 'occupation'])
dataset.data = imputer.fit_transform(dataset.data)
dataset.data_test = imputer.transform(dataset.data_test)

# Learn the label -> integer mapping from the training target.
yencode = LabelEncoder().fit(dataset.target)

X_train = dataset.data
y_train = yencode.transform(dataset.target)

# Test labels have any trailing "." characters stripped before encoding.
X_test = dataset.data_test
test_labels = [label.rstrip(".") for label in dataset.target_test]
y_test = yencode.transform(test_labels)



In [2]:
from davincicode import DaVinciCode

# Hand the prepared train/test splits to DaVinciCode.
app = DaVinciCode(
    port=8090,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
~/Documents/GitHub/ml-viz/ml-viz/davincicode/DaVinciCode.py in update_pc(
    clickData='main/MLPClassifier/max_iter=nan/',
    rangeData=[0, 1],
    n=242
)
    428                     else:
    429                         selected_df = selected_df[selected_df['model'] == i]
--> 430                 sample_vals = selected_df.iloc[0]
        sample_vals = undefined
        selected_df.iloc = <pandas.core.indexing._iLocIndexer object at 0

In [3]:
from sklearn.neural_network import MLPClassifier

# First MLP run: 100 iterations, alpha 0.001.
app.experiment(
    library='sklearn',
    model=MLPClassifier,
    params={'max_iter': 100, 'alpha': 0.001},
)

In [None]:
# Two 400-iteration runs with decreasing alpha.
for mlp_params in (
    {'max_iter': 400, 'alpha': 0.0001},
    {'max_iter': 400, 'alpha': 0.00001},
):
    app.experiment(library='sklearn', model=MLPClassifier, params=mlp_params)

# The final run stays outside the loop so it remains the cell's last
# expression, exactly as before.
app.experiment(
    library='sklearn',
    model=MLPClassifier,
    params={'max_iter': 200, 'alpha': 0.001},
)

In [None]:
# Settings for the first xgboost run; later cells modify this same dict.
params = dict(
    objective='multi:softprob',
    num_class=2,
    eval_metric='mlogloss',
    max_depth=10,
    min_child_weight=10,
    eta=0.2,
    subsample=0.05,
    colsample_bytree=0.05,
)

app.experiment(library='xgboost', model=None, params=params)

In [None]:
# Overwrite selected settings in the shared params dict (in place), then rerun.
params.update({
    'max_depth': 1,
    'min_child_weight': 6,
    'subsample': 1.0,
    'colsample_bytree': 0.05,
    'eta': 0.15,
})
app.experiment('xgboost', None, params)

In [None]:
# Overwrite selected settings in the shared params dict (in place), then rerun.
# Keys not listed here keep whatever value earlier cells left in the dict.
params.update({
    'max_depth': 10,
    'min_child_weight': 10,
    'subsample': 1.0,
    'colsample_bytree': 1.0,
})
app.experiment('xgboost', None, params)

In [None]:
from sklearn.svm import SVC

# Single SVC run with gamma 0.01 and C 1.
svc_params = {'gamma': 0.01, 'C': 1}
app.experiment('sklearn', SVC, svc_params)