In [None]:
from legion.toolchain.metrics import Metric
from legion.toolchain import model

In [None]:
# Initialise the Legion model context before any metrics are sent or the model is exported.
# NOTE(review): the arguments look like a model name ('income') and a version ('1.0') —
# confirm against the legion.toolchain API.
model.init('income', '1.0')

In [None]:
import sys

# Record the interpreter version in the cell output so a run can be traced to its Python build.
print(sys.version)

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
# Suppress FutureWarning messages for the rest of the session so they do not clutter cell output.
warnings.simplefilter(action = "ignore", category = FutureWarning)

%matplotlib inline


## Fetch the data and load it into pandas

In [None]:
from urllib.request import urlretrieve

In [None]:
# Source of the raw dataset; the local copy keeps the remote file's base name.
url = ("https://archive.ics.uci.edu/ml/machine-learning-databases"
       "/adult/adult.data")
local_filename = os.path.basename(url)
if not os.path.exists(local_filename):
    print("Downloading Adult Census datasets from UCI")
    # Download under a temporary name and rename only on success: an
    # interrupted transfer must not leave a truncated file that the
    # existence check above would accept on the next run.
    partial_filename = local_filename + ".part"
    urlretrieve(url, partial_filename)
    os.replace(partial_filename, local_filename)

In [None]:
# Column names supplied explicitly because they are passed to read_csv via `names=`.
names = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week",
    "native-country", "income",
]
data = pd.read_csv(local_filename, names=names)

In [None]:
# Preview the first ten rows of the freshly loaded frame.
data.head(10)

In [None]:
# Strip spaces
# Only object-dtype columns are touched; each has leading/trailing
# whitespace removed from every value.
text_columns = data.select_dtypes(['object']).columns

data[text_columns] = data[text_columns].apply(lambda col: col.str.strip())


In [None]:
# Look at the first rows again now that the string columns have been stripped.
data.head()

In [None]:
# Number of non-null entries in each column.
data.count()

In [None]:
# Summary statistics using describe()'s default column selection.
data.describe()

In [None]:
# Row count for each distinct occupation value.
data.groupby('occupation').size()

In [None]:
# Row count for each distinct native-country value.
data.groupby('native-country').size()

In [None]:
# Histogram of education-num in 15 bins; the trailing ';' suppresses the returned-object repr.
data.hist(column='education-num', bins=15);

In [None]:
# Histogram of age in 15 bins; the trailing ';' suppresses the returned-object repr.
data.hist(column='age', bins=15);

In [None]:
# Histogram of hours-per-week in 15 bins, naming the column by keyword like the sibling histogram cells.
data.hist(column='hours-per-week', bins=15);

In [None]:
# Scatter of age against hours-per-week. The very low alpha makes individual
# points nearly transparent, so darker regions correspond to many overlapping rows.
data.plot(x='age', y='hours-per-week', kind='scatter',
          alpha=0.02, s=50);

In [None]:
# Number of rows for each income label.
data.groupby('income')['income'].count()

In [None]:
# Fraction of rows whose income label is exactly '>50K' (mean of a boolean mask).
np.mean(data['income'] == '>50K')

In [None]:
# Distinct income labels present before rows with missing values are dropped.
data['income'].unique()

In [None]:
# Drop every row that contains at least one NaN, rebinding `data` to the filtered frame.
# NOTE(review): this only removes values pandas parsed as NaN. If the source file marks
# missing entries with a placeholder string (e.g. '?'), those rows are kept — confirm.
data = data.dropna()

In [None]:
# Distinct income labels remaining after the dropna step.
data['income'].unique()

In [None]:
# Keep the distinct income labels (in order of first appearance) and display them.
target_names = data['income'].unique()
target_names

In [None]:
# Split the rows by income label; both subsets are reused by the scatter plot in the next cell.
low_income = data[data['income'] == '<=50K']
high_income = data[data['income'] == '>50K']

# Shared bin edges (20 edges from 10 to 90) so the two overlaid age histograms are directly comparable.
bins = np.linspace(10, 90, 20)
plt.hist(low_income['age'].values, bins=bins, alpha=0.5, label='<=50K')
plt.hist(high_income['age'].values, bins=bins, alpha=0.5, label='>50K')
plt.legend(loc='best');

In [None]:
# Overlay age vs. hours-per-week for the two income groups on one axes:
# blue for '<=50K', green for '>50K'. Low alpha keeps dense regions readable.
plt.scatter(low_income['age'], low_income['hours-per-week'],
            alpha=0.03, s=50, c='b', label='<=50K');
plt.scatter(high_income['age'], high_income['hours-per-week'],
            alpha=0.03, s=50, c='g', label='>50K');
plt.legend()
plt.xlabel('age'); plt.ylabel('hours-per-week');

## Building predictive models

In [None]:
# Separate the label column from the predictor columns.
target = data['income']
features_data = data.drop('income', axis=1)
# Columns with an integer or float dtype are used as-is; all others are
# treated as categorical and integer-encoded below.
numeric_features = [c for c in features_data if features_data[c].dtype.kind in ('i', 'f')]

# `axis` is passed by keyword: the positional form was deprecated in
# pandas 1.3 and removed in pandas 2.0.
categorical_data = features_data.drop(numeric_features, axis=1)
# categories maps column name -> {category value: integer code}, with codes
# assigned in order of first appearance (pd.factorize's uniques).
categories = {}
for c in list(categorical_data):
    categorical_data[c] = categorical_data[c].str.strip()
    idx = pd.factorize(categorical_data[c])[1]
    categories[c] = {v: k for k, v in enumerate(idx)}


In [None]:
def prepare(features_data):
    """Convert a raw feature frame into the float32 matrix fed to the model.

    Columns listed in the module-level ``numeric_features`` are passed through
    unchanged. Every remaining column is encoded with the module-level
    ``categories`` mapping (column name -> {category value: integer code}).

    Parameters
    ----------
    features_data : pandas.DataFrame
        Frame holding the numeric columns named in ``numeric_features`` plus
        the categorical columns.

    Returns
    -------
    numpy.ndarray
        2-D ``float32`` array with the numeric columns first, followed by the
        encoded categorical columns.
    """
    numeric_data = features_data[numeric_features]
    # `axis` is passed by keyword: the positional form was deprecated in
    # pandas 1.3 and removed in pandas 2.0.
    categorical_data = features_data.drop(numeric_features, axis=1)
    categorical_data_encoded = categorical_data.replace(categories)

    features = pd.concat([numeric_data, categorical_data_encoded], axis=1)
    return features.values.astype(np.float32)


In [None]:
# Encode the full predictor frame into the numeric matrix used for training.
X = prepare(features_data)

In [None]:
# Binary labels as int32: 1 where the income label is '>50K', 0 otherwise.
y = (target.values == '>50K').astype(np.int32)

In [None]:
# Shape of the feature matrix, for comparison with y.shape.
X.shape

In [None]:
# Shape of the label vector, for comparison with X.shape.
y.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state=0 fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

# Baseline model: a decision tree limited to depth 8.
clf = DecisionTreeClassifier(max_depth=8)

# 5-fold cross-validated ROC AUC on the training split only; report mean and spread across folds.
scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc')
print("ROC AUC Decision Tree: {:.4f} +/-{:.4f}".format(
    np.mean(scores), np.std(scores)))

## Model error analysis

In [None]:
from sklearn.model_selection import learning_curve


def plot_learning_curve(estimator, X, y, ylim=(0, 1.1), cv=5,
                        n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 5),
                        scoring=None):
    """Plot mean training and cross-validation scores against training-set size.

    The curves are drawn on the current pyplot figure. The mean validation
    score at the last entry of the returned training sizes is printed.

    Parameters
    ----------
    estimator : estimator object forwarded to ``learning_curve``; its class
        name is used in the plot title.
    X, y : training data and labels forwarded to ``learning_curve``.
    ylim : (low, high) limits applied to the y axis.
    cv, n_jobs, train_sizes, scoring : forwarded unchanged to ``learning_curve``.

    Returns
    -------
    None
    """
    # Configure the axes before computing anything, in the same order as the labels appear.
    plt.title("Learning curves for %s" % type(estimator).__name__)
    plt.ylim(*ylim)
    plt.grid()
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y,
        cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, scoring=scoring)

    # Average over axis 1 (one column per CV fold) to get one point per training size.
    mean_fit = np.mean(fit_scores, axis=1)
    mean_cv = np.mean(cv_scores, axis=1)

    for series, colour, caption in (
            (mean_fit, "r", "Training score"),
            (mean_cv, "g", "Cross-validation score")):
        plt.plot(sizes, series, 'o-', color=colour, label=caption)
    plt.legend(loc="best")
    print("Best validation score: {:.4f}".format(mean_cv[-1]))

In [None]:
# Learning curve for a tree with no depth limit (max_depth=None), scored by ROC AUC.
clf = DecisionTreeClassifier(max_depth=None)
plot_learning_curve(clf, X_train, y_train, scoring='roc_auc')

In [None]:
# Learning curve for a tree limited to depth 15, scored by ROC AUC.
clf = DecisionTreeClassifier(max_depth=15)
plot_learning_curve(clf, X_train, y_train, scoring='roc_auc')

In [None]:
# Learning curve for a tree limited to depth 8, scored by ROC AUC.
clf = DecisionTreeClassifier(max_depth=8)
plot_learning_curve(clf, X_train, y_train, scoring='roc_auc')

In [None]:
# Learning curve for a tree limited to depth 4, scored by ROC AUC.
clf = DecisionTreeClassifier(max_depth=4)
plot_learning_curve(clf, X_train, y_train, scoring='roc_auc')

In [None]:
from sklearn.model_selection import validation_curve


def plot_validation_curve(estimator, X, y, param_name, param_range,
                          ylim=(0, 1.1), cv=5, n_jobs=-1, scoring=None):
    """Plot training and cross-validation scores across a hyper-parameter grid.

    The curves are drawn on the current pyplot figure with a logarithmic
    x axis. The best mean cross-validation score over ``param_range`` is
    printed and returned.

    Parameters
    ----------
    estimator : estimator object forwarded to ``validation_curve``; its class
        name is used in the plot title.
    X, y : training data and labels forwarded to ``validation_curve``.
    param_name : name of the estimator parameter being varied.
    param_range : sequence of values tried for ``param_name``.
    ylim : (low, high) limits applied to the y axis.
    cv, n_jobs, scoring : forwarded unchanged to ``validation_curve``.

    Returns
    -------
    float
        Highest mean cross-validation score found over ``param_range``.
    """
    estimator_name = type(estimator).__name__
    plt.title("Validation curves for %s on %s"
              % (param_name, estimator_name))
    plt.ylim(*ylim); plt.grid()
    plt.xlim(min(param_range), max(param_range))
    plt.xlabel(param_name)
    plt.ylabel("Score")

    # param_name / param_range are keyword-only in current scikit-learn
    # releases, so they must not be passed positionally.
    train_scores, test_scores = validation_curve(
        estimator, X, y, param_name=param_name, param_range=param_range,
        cv=cv, n_jobs=n_jobs, scoring=scoring)

    # Average over axis 1 (one column per CV fold) to get one point per parameter value.
    train_scores_mean = np.mean(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    plt.semilogx(param_range, train_scores_mean, 'o-', color="r",
                 label="Training score")
    plt.semilogx(param_range, test_scores_mean, 'o-', color="g",
                 label="Cross-validation score")
    plt.legend(loc="best")

    # Report the maximum over the grid. The last grid point is merely the
    # largest parameter value tried, which is not necessarily the best one.
    best_test_score = float(np.max(test_scores_mean))
    print("Best test score: {:.4f}".format(best_test_score))
    return best_test_score

In [None]:
# Sweep max_depth over powers of two and plot training vs. cross-validated ROC AUC.
# The max_depth=8 given here is only the starting configuration; the sweep varies it.
clf = DecisionTreeClassifier(max_depth=8)
param_name = 'max_depth'
param_range = [1, 2, 4, 8, 16, 32]

score = plot_validation_curve(clf, X_train, y_train,
                              param_name, param_range, scoring='roc_auc')

# NOTE(review): `score` is a cross-validated ROC AUC (scoring='roc_auc') computed on the
# training split, yet it is reported under Metric.TEST_ACCURACY scaled by 100 —
# confirm this is the intended metric key.
model.send_metric(Metric.TEST_ACCURACY, score * 100)

In [None]:
# Refit a depth-8 tree on the training split; this is the estimator captured by apply_func below.
clf = DecisionTreeClassifier(max_depth=8)
clf.fit(X=X_train, y=y_train)

# Export through Legion:
#   apply_func   - predicts on the prepared input and returns the first prediction as a plain int under 'result'
#   prepare_func - forwards the incoming frame to prepare()
#   input_data_frame - the predictor frame, passed as-is
# NOTE(review): the exact contract of model.export(...).save() is defined by legion.toolchain — confirm there.
model.export(apply_func=lambda x : {'result': int(clf.predict(x)[0]) },
                    prepare_func=lambda x : prepare(x),
                    input_data_frame=features_data).save()