# Prepare Census Data

In [3]:
# code from https://medium.com/district-data-labs/building-a-classifier-from-census-data-18f996c4d7cf
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss

import pandas as pd

# Column labels for the census CSV files; they are passed to read_csv via
# `names`, so the file itself is not expected to carry a header row.
# The last entry, 'income', is the column used as the prediction target.
names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income',
]

# Read the training split into a data frame labelled with the names above.
data = pd.read_csv('census_data/adult.data', names=names)

import json
# Collect the distinct values of every object-typed column, keyed by column.
categorical_values = {}
for column in data.columns:
    if data[column].dtype == 'object':
        categorical_values[column] = list(data[column].unique())

# Metadata describing the dataset: the distinct target values, all column
# names (target included), and the per-column categorical values.
meta = {
    'target_names': list(data['income'].unique()),
    'feature_names': list(data.columns),
    'categorical_features': categorical_values,
}

# Persist the metadata as JSON alongside the data files.
with open('census_data/meta.json', 'w') as meta_file:
    json.dump(meta, meta_file, indent=2)

In [4]:
from sklearn.datasets.base import Bunch
from sklearn.preprocessing import LabelEncoder
def load_data(data_dir='census_data'):
    """
    Load the census train/test files and their metadata into a Bunch.

    Parameters
    ----------
    data_dir : str, default 'census_data'
        Directory containing ``meta.json``, ``adult.data`` and ``adult.test``.

    Returns
    -------
    Bunch
        data, target
            All-but-last columns and the last column of the training file.
        data_test, target_test
            The same split of the test file (its first row is skipped).
        target_names, feature_names, categorical_features
            Taken from ``meta.json``; the target column is removed from
            ``categorical_features``. ``feature_names`` still lists every
            column, including the target.
        DESCR
            Placeholder description string.

    Raises
    ------
    OSError
        If any of the three files cannot be opened.
    KeyError
        If ``meta.json`` lacks one of the expected top-level keys.
    """
    import os  # function-scope import: keeps the helper self-contained

    # Load the meta data from the file
    with open(os.path.join(data_dir, 'meta.json'), 'r') as f:
        meta = json.load(f)
    names = meta['feature_names']
    # By construction of the metadata the target is the last listed column
    feature_columns, target_column = names[:-1], names[-1]

    # Load the training and test data, skipping the bad row in the test data
    train = pd.read_csv(os.path.join(data_dir, 'adult.data'), names=names)
    test = pd.read_csv(os.path.join(data_dir, 'adult.test'), names=names, skiprows=1)

    # Remove the target from the categorical features; tolerate it being
    # absent (e.g. a non-object target) instead of raising KeyError
    meta['categorical_features'].pop(target_column, None)

    # Return the bunch with the appropriate data chunked apart
    return Bunch(
        data = train[feature_columns],
        target = train[target_column],
        data_test = test[feature_columns],
        target_test = test[target_column],
        target_names = meta['target_names'],
        feature_names = meta['feature_names'],
        categorical_features = meta['categorical_features'],
        DESCR = "descr",
    )
# Build the dataset bunch (train/test frames plus metadata) from the files on disk
dataset = load_data()

from sklearn.base import BaseEstimator, TransformerMixin
class EncodeCategorical(BaseEstimator, TransformerMixin):
    """
    Label-encodes the selected data frame columns, or every column when
    ``columns`` is None.
    """
    def __init__(self, columns=None):
        self.columns  = columns
        self.encoders = None
    def fit(self, data, target=None):
        """
        Fit one LabelEncoder per selected column of the data frame ``data``.

        ``target`` is accepted for pipeline compatibility and is not used.
        Returns self.
        """
        # No explicit selection means every column of the frame
        if self.columns is None:
            self.columns = data.columns
        # Build the column -> fitted encoder mapping, then publish it
        fitted = {}
        for name in self.columns:
            fitted[name] = LabelEncoder().fit(data[name])
        self.encoders = fitted
        return self
    def transform(self, data):
        """
        Return a copy of ``data`` in which each fitted column is replaced by
        its encoded values; other columns are left as they are.
        """
        encoded = data.copy()
        for name in self.encoders:
            encoded[name] = self.encoders[name].transform(data[name])
        return encoded
# Fit the label encoders on the training split only, then reuse those fitted
# encoders on the test split. Refitting on the test split (fit_transform)
# would derive a separate category -> integer mapping from the test data, so
# the same category could receive different codes in train and test.
# Note: LabelEncoder.transform raises ValueError if the test split contains a
# category that never occurred in the training split.
encoder = EncodeCategorical(dataset.categorical_features.keys())
dataset.data = encoder.fit_transform(dataset.data)
dataset.data_test = encoder.transform(dataset.data_test)

In [5]:
from sklearn.impute import SimpleImputer
import numpy as np

class ImputeCategorical(BaseEstimator, TransformerMixin):
    """
    Imputes a specified list of columns or all columns if None.

    Values equal to 0 are treated as missing and are replaced using the
    'most_frequent' strategy of SimpleImputer.
    """
    def __init__(self, columns=None):
        self.columns = columns
        self.imputer = None
    def fit(self, data, target=None):
        """
        Fit a single SimpleImputer on the selected columns of ``data``.

        ``target`` is accepted for pipeline compatibility and is not used.
        Returns self.
        """
        # No explicit selection means every column of the frame
        if self.columns is None:
            self.columns = data.columns
        # One imputer handles all selected columns at once
        self.imputer = SimpleImputer(strategy='most_frequent', missing_values=0)
        self.imputer.fit(data[self.columns])
        return self
    def transform(self, data):
        """
        Return a copy of ``data`` whose selected columns hold the imputed
        values; other columns are left as they are.
        """
        result = data.copy()
        imputed = self.imputer.transform(result[self.columns])
        result[self.columns] = imputed
        return result
# Learn the replacement (most frequent) values from the training split only,
# then apply those same values to the test split. Calling fit_transform on the
# test split would relearn the replacements from test data, so the two splits
# could be imputed with different values.
imputer = ImputeCategorical(['workclass', 'native-country', 'occupation'])
dataset.data = imputer.fit_transform(dataset.data)
dataset.data_test = imputer.transform(dataset.data_test)

In [6]:
# Feature frames for the two splits
X_train = dataset.data
X_test = dataset.data_test

# Fit the label encoder on the training targets and encode them
yencode = LabelEncoder()
yencode.fit(dataset.target)
y_train = yencode.transform(dataset.target)

# Drop any trailing "." from the test targets before encoding them with the
# encoder fitted on the training targets
test_labels = [label.rstrip(".") for label in dataset.target_test]
y_test = yencode.transform(test_labels)

# Start Viz

In [1]:
from davincicode import DaVinciCode
# Create the DaVinciCode app object whose `experiment` method is called by the cells that follow.
# NOTE(review): 8090 is presumably the port the app serves on — confirm against davincicode.
app = DaVinciCode(8090)

In [None]:
# Initial parameter dictionary handed to the 'xgboost' experiment
params = dict(
    objective='multi:softprob',
    num_class=2,
    eval_metric='mlogloss',
    max_depth=10,
    min_child_weight=10,
    eta=0.2,
    subsample=0.05,
    colsample_bytree=0.05,
)
app.experiment('xgboost', None, params, X_train, X_test, y_train, y_test)

In [7]:
# Override selected entries of the shared `params` dict in place, then rerun
params.update(
    max_depth=1,
    min_child_weight=6,
    subsample=1.0,
    colsample_bytree=0.05,
    eta=0.2,
)
app.experiment('xgboost', None, params, X_train, X_test, y_train, y_test)

[0]	train-mlogloss:0.63553
[1]	train-mlogloss:0.59159
[2]	train-mlogloss:0.55848
[3]	train-mlogloss:0.53685
[4]	train-mlogloss:0.51831
[5]	train-mlogloss:0.50185
[6]	train-mlogloss:0.49213
[7]	train-mlogloss:0.48078
[8]	train-mlogloss:0.47256
[9]	train-mlogloss:0.46526


In [8]:
# Override selected entries of the shared `params` dict in place, then rerun;
# entries not listed here keep whatever value they already had
params.update(
    max_depth=10,
    min_child_weight=10,
    subsample=1.0,
    colsample_bytree=1.0,
)
app.experiment('xgboost', None, params, X_train, X_test, y_train, y_test)

[0]	train-mlogloss:0.58085
[1]	train-mlogloss:0.50532
[2]	train-mlogloss:0.45114
[3]	train-mlogloss:0.41146
[4]	train-mlogloss:0.38068
[5]	train-mlogloss:0.35712
[6]	train-mlogloss:0.33790
[7]	train-mlogloss:0.32295
[8]	train-mlogloss:0.31026
[9]	train-mlogloss:0.30062


In [None]:
# for max_depth in [5, 10, 15]:
#     for min_child_weight in [1, 5, 10]:
#         for subsample in [0.3, 0.6, 1.0]:
#             for colsample_bytree in [0.3, 0.6, 1.0]:
#                 for eta in [0.01, 0.2, 0.5]:
#                     params['max_depth'] = max_depth
#                     params['min_child_weight'] = min_child_weight
#                     params['subsample'] = subsample
#                     params['colsample_bytree'] = colsample_bytree
#                     params['eta'] = eta
#                     app.experiment('xgboost', None, params, X_train, X_test, y_train, y_test)
                

In [9]:
from sklearn.svm import SVC
# Constructor arguments for the SVC used in the 'sklearn' experiment
svc_params = {'gamma': 0.01, 'C': 1}
app.experiment('sklearn', SVC, svc_params, X_train, X_test, y_train, y_test)

NameError: name 'experiment' is not defined

In [7]:
from sklearn.neural_network import MLPClassifier
# Every (max_iter, alpha) combination, with max_iter varying slowest so the
# experiments run in the same order as a nested max_iter/alpha loop
mlp_grid = [
    {'max_iter': max_iter, 'alpha': alpha}
    for max_iter in [200, 300, 500, 700]
    for alpha in [0.01, 0.001, 0.0001]
]
for mlp_params in mlp_grid:
    app.experiment('sklearn', MLPClassifier, mlp_params, X_train, X_test, y_train, y_test)

In [8]:
from sklearn.neural_network import MLPClassifier
# Single MLPClassifier run with explicitly chosen settings
mlp_params = {'max_iter': 300, 'alpha': 0.001}
app.experiment('sklearn', MLPClassifier, mlp_params, X_train, X_test, y_train, y_test)

In [None]:
# Single MLPClassifier run with explicitly chosen settings
mlp_params = {'max_iter': 400, 'alpha': 0.0001}
app.experiment('sklearn', MLPClassifier, mlp_params, X_train, X_test, y_train, y_test)