# Initialisation

Installing standard libraries, which only needs to be done once:
- Scientific computing (numpy, scipy)
- Data analysis (pandas)
- Plotting (matplotlib, seaborn)
- Machine learning (scikit-learn)
- Gradient-boosted trees (xgboost, catboost, lightgbm)
- Deep learning (tensorflow, keras, scikeras)
- File persistence (joblib, h5py)

Uncomment for first run!!

In [None]:
'''import sys
!{sys.executable} -m pip install black
!{sys.executable} -m pip install jupyter_black
!{sys.executable} -m pip install numpy
!{sys.executable} -m pip install scipy
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install matplotlib
!{sys.executable} -m pip install seaborn
!{sys.executable} -m pip install scikit-learn
!{sys.executable} -m pip install xgboost
!{sys.executable} -m pip install catboost
!{sys.executable} -m pip install lightgbm
!{sys.executable} -m pip install tensorflow
!{sys.executable} -m pip install keras
!{sys.executable} -m pip install scikeras
!{sys.executable} -m pip install joblib
!{sys.executable} -m pip install h5py
!{sys.executable} -m pip install dataframe_image
!{sys.executable} -m pip install scikit-learn-intelex'''

In [None]:
import jupyter_black

jupyter_black.load(lab=False)

In [None]:
from IPython.core.display import HTML

HTML("<style> .container{ width:90%; } </style>")

In [None]:
import os

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sns

%matplotlib inline

Enable the Intel(R) Extension for Scikit-learn (https://github.com/intel/scikit-learn-intelex)

In [None]:
from sklearnex import patch_sklearn

patch_sklearn()

# Data Loading

Loading the CSV file into a pandas DataFrame, a tabular data structure.

Printing first few rows, the shape of data (rows, column), the data types, and some basic analysis of the numeric features

In [None]:
col_names = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "income",
]

df = pd.read_csv(
    "data/census_income.csv", names=col_names, skipinitialspace=True, na_values=["?"]
)
df.head(10)

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.describe()

Based on the data description available at https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names, 
feature 'fnlwgt' does not add any useful information, so it is best to remove.

Feature "education" is also redundant because "education-num" encodes the same information with numerical values.

In [None]:
df = df.drop("fnlwgt", axis=1)
df = df.drop("education", axis=1)

# Exploratory Data Analysis

Check whether numeric features are correlated. Since they are not, all of them bring new relevant information

In [None]:
# Correlate the numeric columns only: df still contains string-valued
# categorical columns at this point, and pandas >= 2.0 raises when corr() is
# asked to handle them. select_dtypes works on every pandas version.
plt.figure(figsize=(10, 4))
sns.heatmap(
    df.select_dtypes(include="number").corr(),
    annot=True,
    cmap="Blues",
    linecolor="white",
    linewidths=1,
)
plt.show()

Plot data to understand it, determine whether any features look strongly correlated with high or low income

In [None]:
plt.figure(figsize=(12, 6))
sns.countplot(
    x=df["income"],
    hue=df["education-num"],
    palette="rainbow",
    edgecolor=[(0, 0, 0), (0, 0, 0)],
)
plt.show()

In [None]:
plt.figure(figsize=(12, 4))
sns.countplot(
    y=df["income"], hue=df["sex"], palette="Set1", edgecolor=[(0, 0, 0), (0, 0, 0)]
)
plt.show()

In [None]:
plt.figure(figsize=(12, 4))
sns.countplot(
    y=df["income"],
    hue=df["marital-status"],
    palette="Set1",
    edgecolor=[(0, 0, 0), (0, 0, 0)],
)
plt.show()

In [None]:
plt.figure(figsize=(12, 4))
sns.countplot(
    y=df["income"],
    hue=df["relationship"],
    palette="Set1",
    edgecolor=[(0, 0, 0), (0, 0, 0)],
)
plt.show()

In [None]:
plt.figure(figsize=(12, 6))
sns.countplot(
    y=df["income"],
    hue=df["occupation"],
    palette="tab20",
    edgecolor=[(0, 0, 0), (0, 0, 0)],
)
plt.show()

In [None]:
plt.figure(figsize=(12, 4))
sns.countplot(
    y=df["income"], hue=df["race"], palette="Set1", edgecolor=[(0, 0, 0), (0, 0, 0)]
)
plt.show()

In [None]:
plt.figure(figsize=(12, 5))
sns.countplot(
    y=df["income"],
    hue=df["workclass"],
    palette="Set1",
    edgecolor=[(0, 0, 0), (0, 0, 0)],
)
plt.show()

In [None]:
plt.figure(figsize=(12, 6))
sns.boxenplot(x="income", y="age", data=df, hue="sex", palette="Set1")
plt.show()

# Dealing with NULLs

In [None]:
df.info()

In [None]:
df[df.isnull().any(axis=1)]

In [None]:
df["native-country"].value_counts(dropna=False).head(10)

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# df["native-country"] selection: that chained in-place call emits a
# FutureWarning in pandas 2.x and leaves df untouched under Copy-on-Write.
df["native-country"] = df["native-country"].fillna("United-States")
# Sanity check: this should now display an empty frame.
df[df["native-country"].isnull()]

In [None]:
df["workclass"].value_counts(dropna=False)

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# df["workclass"] selection: that chained in-place call emits a FutureWarning
# in pandas 2.x and leaves df untouched under Copy-on-Write.
df["workclass"] = df["workclass"].fillna("Private")
# Sanity check: this should now display an empty frame.
df[df["workclass"].isnull()]

In [None]:
df["occupation"].value_counts(dropna=False)

In [None]:
# Missing occupations get their own explicit "unknown" category. Assign the
# result back instead of calling fillna(inplace=True) on the df["occupation"]
# selection: that chained in-place call emits a FutureWarning in pandas 2.x
# and leaves df untouched under Copy-on-Write.
df["occupation"] = df["occupation"].fillna("unknown")
# Sanity check: this should now display an empty frame.
df[df["occupation"].isnull()]

In [None]:
df[df.isnull().any(axis=1)]

In [None]:
df.dropna(inplace=True)
df[pd.isnull(df).any(axis=1)]

# Transforming Categorical Features

In [None]:
tdf = pd.get_dummies(
    df,
    columns=[
        "workclass",
        "marital-status",
        "occupation",
        "relationship",
        "race",
        "sex",
        "native-country",
    ],
    drop_first=True,
)

In [None]:
tdf.info()

# Train-Test Split

Split the dataframe into features (X) and labels (y)

In [None]:
from sklearn.preprocessing import label_binarize

# Features: every column except the target.
X = tdf.drop("income", axis=1)
# label_binarize returns an (n_samples, 1) column for a two-class target;
# flatten it to the 1-D label vector (0 = "<=50K", 1 = ">50K") that
# scikit-learn estimators expect, so fitting does not have to rely on an
# implicit conversion (and its DataConversionWarning).
y = label_binarize(tdf["income"], classes=["<=50K", ">50K"]).ravel()

Put aside 20% of the samples for testing, and train with the remaining 80%

In [None]:
from sklearn.model_selection import train_test_split

# Fix the seed so that Restart & Run All reproduces the same split (and hence
# the same metrics), and stratify on the target so that train and test keep
# the same proportion of ">50K" samples.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

# Empirical class priors of the training labels (0 = "<=50K", 1 = ">50K"),
# passed later to the estimators that accept explicit priors.
prior_0 = len(y_train[y_train == 0]) / len(y_train)
prior_1 = 1 - prior_0

# Feature Selection

Now our problem is we have 80 variables to predict a single one. Many of these variables will have absolutely no impact on the income of our citizens, but will add a lot of useless noise to the regression, and may even make it not converge.

We use a little trick called SelectKBest that will give us the k=30 best features based on a standard statistical score. In this case, we use Pearson's Chi Squared test.

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

feature_select = SelectKBest(chi2, k=30)
feature_select.fit(X_train, y_train)

List the surviving features, along with their scores

In [None]:
uni_features = list(zip(feature_select.scores_, X_train.columns))
sorted(uni_features, reverse=True)[0:30]

# Utility Functions

In [None]:
import warnings
import joblib
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    classification_report,
    auc,
    roc_auc_score,
    roc_curve,
    det_curve,
    precision_recall_curve,
    recall_score,
    average_precision_score,
    precision_score,
)
from sklearn.exceptions import DataConversionWarning

# Shared registry: classifier name -> dict of scores, filled in by the
# plot_* helpers and classify_and_plot below.
metrics = {}

warnings.filterwarnings(action="ignore", category=DataConversionWarning)
warnings.filterwarnings(action="ignore", category=UserWarning)

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before persisting the feature column order.
os.makedirs("models", exist_ok=True)
joblib.dump(list(X.columns), "models/columns.pkl")

In [None]:
def plot_metrics(classifier, classifier_name, ax):
    """Score a fitted classifier on the test set and draw its confusion matrix.

    Uses the notebook-level ``X_test`` / ``y_test``. Prints a one-line summary
    plus the full classification report, draws the confusion matrix as a
    heatmap on ``ax``, and (re)initialises ``metrics[classifier_name]`` with
    the computed scores.

    Parameters
    ----------
    classifier : fitted estimator exposing ``predict``.
    classifier_name : str
        Key under which the scores are stored in ``metrics``.
    ax : matplotlib Axes that receives the heatmap.
    """
    class_labels = ["<=50K", ">50K"]
    y_pred = classifier.predict(X_test)

    # Key order matters: it becomes the column order of the final ratings table.
    scores = {
        "f1": f1_score(y_test, y_pred),
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        # Specificity is the recall of the negative class.
        "specificity": recall_score(y_test, y_pred, pos_label=0),
    }

    summary = (
        "F1 Score: %0.3f. Accuracy: %0.3f. Precision: %0.3f. Recall: %0.3f. Specificity: %0.3f"
        % (
            scores["f1"],
            scores["accuracy"],
            scores["precision"],
            scores["recall"],
            scores["specificity"],
        )
    )
    print(summary)
    print(classification_report(y_test, y_pred, target_names=class_labels))

    sns.heatmap(
        confusion_matrix(y_test, y_pred),
        annot=True,
        fmt="d",
        annot_kws={"size": 14},
        xticklabels=class_labels,
        yticklabels=class_labels,
        ax=ax,
    )
    ax.set(
        xlabel="Predicted label",
        ylabel="True label",
        title="Confusion matrix",
    )

    metrics[classifier_name] = scores

In [None]:
def plot_roc_curve(classifier, classifier_name, ax):
    """Draw the ROC curve of a fitted classifier on the test set and record its AUC.

    Uses the notebook-level ``X_test`` / ``y_test``. Scores are taken from
    column 1 of ``predict_proba``; estimators without ``predict_proba`` fall
    back to ``decision_function`` min-max scaled to [0, 1], and a note is added
    to the legend.

    Parameters
    ----------
    classifier : fitted estimator exposing ``predict_proba`` or ``decision_function``.
    classifier_name : str
        Legend label and key under which the score is stored in ``metrics``.
    ax : matplotlib Axes that receives the plot.

    Side effects
    ------------
    Stores the area under the curve in ``metrics[classifier_name]["roc_score"]``.
    """
    try:
        prob_pos = classifier.predict_proba(X_test)[:, 1]
    except (AttributeError, NotImplementedError):
        # No probability estimates available: use the decision function instead.
        prob_pos = classifier.decision_function(X_test)
        prob_pos = (prob_pos - prob_pos.min()) / (prob_pos.max() - prob_pos.min())
        ax.plot([], [], "white", alpha=0, label="*estimated with decision function")

    fpr, tpr, _ = roc_curve(y_test, prob_pos)
    # The area must be computed from the same continuous scores as the curve.
    # Feeding hard predict() labels collapses the curve to a single operating
    # point and reports a different (generally lower) number than is plotted.
    roc_score = roc_auc_score(y_test, prob_pos)

    ax.plot(fpr, tpr, label="%s (area = %0.3f)" % (classifier_name, roc_score))
    ax.plot([0, 1], [0, 1], "r--", label="Random guess (area = 0.500)")
    ax.set_xlim([-0.05, 1.05])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel("False Positive Rate = 1 - Specificity")
    ax.set_ylabel("True Positive Rate = Recall")
    ax.set_title("Receiver operating characteristic")
    ax.legend(loc="lower right")
    # Shade the region between the curve and the diagonal, then the triangle
    # below the diagonal (the part a random guess already achieves).
    ax.fill(fpr, tpr, "lightblue")
    ax.fill([0, 1, 1], [0, 1, 0], "thistle")

    # setdefault keeps the helper usable even when plot_metrics has not
    # created the entry for this classifier yet.
    metrics.setdefault(classifier_name, {})["roc_score"] = roc_score

In [None]:
def plot_precision_recall(classifier, classifier_name, ax):
    """Draw the precision-recall curve of a fitted classifier on the test set.

    Uses the notebook-level ``X_test`` / ``y_test``. Scores are taken from
    column 1 of ``predict_proba``; estimators without ``predict_proba`` fall
    back to ``decision_function`` min-max scaled to [0, 1], and a note is added
    to the legend.

    Parameters
    ----------
    classifier : fitted estimator exposing ``predict_proba`` or ``decision_function``.
    classifier_name : str
        Legend label and key under which the score is stored in ``metrics``.
    ax : matplotlib Axes that receives the plot.

    Side effects
    ------------
    Stores the average precision in ``metrics[classifier_name]["avg_prec"]``.
    """
    try:
        prob_pos = classifier.predict_proba(X_test)[:, 1]
    except (AttributeError, NotImplementedError):
        # No probability estimates available: use the decision function instead.
        prob_pos = classifier.decision_function(X_test)
        prob_pos = (prob_pos - prob_pos.min()) / (prob_pos.max() - prob_pos.min())
        ax.plot([], [], "white", alpha=0, label="*estimated with decision function")

    precision, recall, _ = precision_recall_curve(y_test, prob_pos)
    avg_prec = average_precision_score(y_test, prob_pos)
    area = auc(recall, precision)

    # A classifier with no skill has a precision equal to the share of
    # positive samples, whatever the recall.
    no_skill = len(y_test[y_test == 1]) / len(y_test)

    ax.plot(
        recall,
        precision,
        label="%s\n(avg_prec = %0.3f, area = %0.3f)"
        % (classifier_name, avg_prec, area),
    )
    ax.fill_between(recall, 0, precision, color="lightblue")
    ax.plot([0, 1], [no_skill, no_skill], "r--", label="Random guess")
    ax.set_xlim([-0.05, 1.05])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel("Recall")
    ax.set_ylabel("Precision")
    ax.set_title("Precision-recall curve")
    ax.legend(loc="lower left")

    # setdefault keeps the helper usable even when plot_metrics has not
    # created the entry for this classifier yet.
    metrics.setdefault(classifier_name, {})["avg_prec"] = avg_prec

In [None]:
def plot_cap_curve(classifier, classifier_name, ax):
    """Draw the cumulative accuracy profile (CAP) of a fitted classifier.

    Test samples (notebook-level ``X_test`` / ``y_test``) are ranked by
    decreasing score; the curve shows which fraction of all positive samples
    has been captured after looking at a given fraction of the ranked data.
    The accuracy ratio (AR) compares the area between the model and the
    random-guess diagonal with the same area for a perfect ranking.

    Scores are taken from column 1 of ``predict_proba``; estimators without
    ``predict_proba`` fall back to ``decision_function`` min-max scaled to
    [0, 1], and a note is added to the legend.

    Parameters
    ----------
    classifier : fitted estimator exposing ``predict_proba`` or ``decision_function``.
    classifier_name : str
        Legend label and key under which the score is stored in ``metrics``.
    ax : matplotlib Axes that receives the plot.

    Side effects
    ------------
    Stores the accuracy ratio in ``metrics[classifier_name]["acc_ratio"]``.

    Notes
    -----
    Assumes the test set holds at least two samples and at least one positive
    one; otherwise the normalisations below divide by zero.
    """
    from scipy import integrate

    # integrate.simps was removed in SciPy 1.14; integrate.simpson is the same
    # rule under its current name. Fall back to simps on very old SciPy.
    simpson = getattr(integrate, "simpson", None) or integrate.simps

    try:
        prob_pos = classifier.predict_proba(X_test)[:, 1]
    except (AttributeError, NotImplementedError):
        # No probability estimates available: use the decision function instead.
        prob_pos = classifier.decision_function(X_test)
        prob_pos = (prob_pos - prob_pos.min()) / (prob_pos.max() - prob_pos.min())
        ax.plot([], [], "white", alpha=0, label="*estimated with decision function")

    # Work on flat arrays so (n, 1) column vectors and 1-D vectors behave alike.
    y_values = np.asarray(y_test).ravel()
    scores = np.asarray(prob_pos).ravel()

    num_pos_obs = int(np.sum(y_values))
    num_count = len(y_values)
    rate_pos_obs = num_pos_obs / num_count
    ideal = pd.DataFrame({"x": [0, rate_pos_obs, 1], "y": [0, 1, 1]})
    xx = np.arange(num_count) / float(num_count - 1)

    # Rank samples by decreasing score; a stable sort keeps ties in their
    # original order so the curve is deterministic.
    ranking = np.argsort(-scores, kind="stable")
    captured = np.cumsum(y_values[ranking]) / float(num_pos_obs)
    # Prepend the origin (0, 0) and drop the last point so yy lines up with xx.
    yy = np.append([0], captured[: num_count - 1])

    # Share of positives captured within the top `percent` of the ranking.
    # np.interp does the linear interpolation on the actual bracketing points
    # and cannot index past the end of the arrays.
    percent = 0.5
    val = float(np.interp(percent, xx, yy))

    # Area under the perfect-model polyline drawn below: a triangle of width
    # rate_pos_obs followed by a rectangle of width (1 - rate_pos_obs).
    sigma_ideal = rate_pos_obs / 2.0 + (1.0 - rate_pos_obs)
    sigma_model = simpson(yy, x=xx)
    sigma_random = simpson(xx, x=xx)
    ar_value = (sigma_model - sigma_random) / (sigma_ideal - sigma_random)

    ax.plot(ideal["x"], ideal["y"], color="dimgrey", label="Perfect Model")
    ax.fill(ideal["x"], ideal["y"], "lightgrey")
    ax.plot(xx, yy, label="%s (AR = %0.3f)" % (classifier_name, ar_value))
    ax.fill(xx, yy, "lightblue")
    ax.plot(xx, xx, "r--", label="Random guess")
    ax.plot([percent, percent], [0.0, val], "g--", linewidth=1)
    ax.plot(
        [0, percent],
        [val, val],
        "g--",
        linewidth=1,
        label="%0.3f%% of positive obs at %0.1f%%" % (val * 100, percent * 100),
    )
    ax.fill([0, 1, 1], [0, 1, 0], "thistle")
    ax.set_xlim(-0.05, 1.05)
    ax.set_ylim(0, 1.05)
    ax.set_title("Cumulative accuracy profile")
    ax.set_xlabel("% of the data")
    ax.set_ylabel("% of positive obs")
    ax.legend(loc="lower right")

    # setdefault keeps the helper usable even when plot_metrics has not
    # created the entry for this classifier yet.
    metrics.setdefault(classifier_name, {})["acc_ratio"] = ar_value

In [None]:
from time import perf_counter


def classify_and_plot(classifier, trainining_set_only=True):
    """Fit a classifier, plot its four diagnostic charts and persist the results.

    The classifier is fitted on the notebook-level ``X_train`` / ``y_train``.
    A 1x4 figure is then filled with the confusion matrix, ROC curve,
    precision-recall curve and cumulative accuracy profile, and saved as
    ``img/<name>.png`` and ``img/<name>.svg``. The fitted estimator is dumped
    to ``models/<name>.pkl``.

    Parameters
    ----------
    classifier : unfitted estimator or Pipeline.
        For a Pipeline, the name of its last step is used as ``<name>``;
        otherwise the class name is used.
    trainining_set_only : bool
        Currently unused; kept (including its spelling) so existing calls
        keep working.

    Returns
    -------
    The fitted ``classifier``.

    Side effects
    ------------
    Adds ``fit_time``, ``plot_time`` (seconds) and ``model`` to
    ``metrics[<name>]``, next to the scores written by the plot helpers.
    """
    classifier_name = type(classifier).__name__
    if classifier_name == "Pipeline":
        classifier_name = classifier.steps[-1][0]

    # Neither savefig nor joblib.dump creates missing directories. Do this
    # before the timers start so it does not distort the measurements.
    for output_dir in ("img", "models"):
        os.makedirs(output_dir, exist_ok=True)

    fit_start = perf_counter()
    classifier.fit(X_train, y_train)
    fit_end = perf_counter()

    fig, axs = plt.subplots(1, 4, figsize=(28, 5))
    # plot_metrics must run first: it creates metrics[classifier_name].
    plot_metrics(classifier, classifier_name, axs[0])
    plot_roc_curve(classifier, classifier_name, axs[1])
    plot_precision_recall(classifier, classifier_name, axs[2])
    plot_cap_curve(classifier, classifier_name, axs[3])
    fig.suptitle(classifier_name, fontsize=18)
    # Save through the figure object rather than pyplot's "current figure".
    fig.savefig(
        "img/" + classifier_name + ".png",
        bbox_inches="tight",
        format="png",
        facecolor="white",
        dpi=150,
    )
    fig.savefig("img/" + classifier_name + ".svg", bbox_inches="tight", format="svg")
    plot_end = perf_counter()

    metrics[classifier_name]["fit_time"] = fit_end - fit_start
    metrics[classifier_name]["plot_time"] = plot_end - fit_end
    metrics[classifier_name]["model"] = classifier
    joblib.dump(classifier, "models/" + classifier_name + ".pkl")
    return classifier

# Classic Data Science Classifiers

## Dummy Classifier

In [None]:
from sklearn.dummy import DummyClassifier

pipeline = Pipeline(
    [
        ("Dummy_Stratified", DummyClassifier(strategy="stratified")),
    ]
)
classify_and_plot(pipeline)

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

pipeline = Pipeline(
    [
        ("select", feature_select),
        ("LogisticRegression_30", LogisticRegression(solver="liblinear")),
    ]
)
classify_and_plot(pipeline)

In [None]:
pipeline = Pipeline(
    [
        ("LogisticRegression_80", LogisticRegression(solver="liblinear")),
    ]
)
classify_and_plot(pipeline)

## K-nearest Neighbours

In [None]:
from sklearn.neighbors import KNeighborsClassifier

pipeline = Pipeline(
    [
        ("select", feature_select),
        ("scaler", StandardScaler()),
        ("KNeighborsClassifier_30", KNeighborsClassifier(n_jobs=-1)),
    ]
)
classify_and_plot(pipeline)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

pipeline = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("KNeighborsClassifier_80", KNeighborsClassifier(n_jobs=-1)),
    ]
)
classify_and_plot(pipeline)

## Gaussian Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

pipeline = Pipeline(
    [
        ("select", feature_select),
        ("GaussianNB_30", GaussianNB(priors=[prior_0, prior_1])),
    ]
)
classify_and_plot(pipeline)

In [None]:
from sklearn.naive_bayes import GaussianNB

pipeline = Pipeline(
    [
        ("GaussianNB_80", GaussianNB(priors=[prior_0, prior_1])),
    ]
)
classify_and_plot(pipeline)

## Linear Discriminant Analysis

In [None]:
from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis,
    QuadraticDiscriminantAnalysis,
)

pipeline = Pipeline(
    [
        ("select", feature_select),
        ("scaler", StandardScaler()),
        (
            "LinearDiscriminantAnalysis_30",
            LinearDiscriminantAnalysis(priors=[prior_0, prior_1]),
        ),
    ]
)
# The SVD decomposition inside LinearDiscriminantAnalysis intermittently fails
# on the first attempt, so retry exactly once. Catch Exception instead of using
# a bare except so that interrupting the kernel (KeyboardInterrupt) still stops
# the cell, and report the first failure instead of silently hiding it.
try:
    classify_and_plot(pipeline)
except Exception as first_error:
    print("First attempt failed (%r); retrying once." % (first_error,))
    classify_and_plot(pipeline)

In [None]:
from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis,
    QuadraticDiscriminantAnalysis,
)

pipeline = Pipeline(
    [
        ("scaler", StandardScaler()),
        (
            "LinearDiscriminantAnalysis_80",
            LinearDiscriminantAnalysis(priors=[prior_0, prior_1]),
        ),
    ]
)
# The SVD decomposition inside LinearDiscriminantAnalysis intermittently fails
# on the first attempt, so retry exactly once. Catch Exception instead of using
# a bare except so that interrupting the kernel (KeyboardInterrupt) still stops
# the cell, and report the first failure instead of silently hiding it.
try:
    classify_and_plot(pipeline)
except Exception as first_error:
    print("First attempt failed (%r); retrying once." % (first_error,))
    classify_and_plot(pipeline)

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

pipeline = Pipeline(
    [
        ("select", feature_select),
        ("DecisionTreeClassifier_30", DecisionTreeClassifier()),
    ]
)
classify_and_plot(pipeline)

If machine has Graphviz (https://www.graphviz.org/) installed, uncomment to render the decision tree as a PDF

In [None]:
#!{sys.executable} -m pip install pydotplus

'''from io import StringIO
from sklearn.tree import export_graphviz
import pydotplus

dot_data = StringIO()
feature_names = X_train.columns[feature_select.get_support()].tolist()
export_graphviz(
    pipeline.steps[-1][1],
    out_file=dot_data,
    feature_names=feature_names,
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_pdf("models/tree.pdf")'''

In [None]:
from sklearn.tree import DecisionTreeClassifier

pipeline = Pipeline(
    [
        ("DecisionTreeClassifier_80", DecisionTreeClassifier()),
    ]
)
classify_and_plot(pipeline)

# Ensemble Classifiers

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

pipeline = Pipeline(
    [
        ("select", feature_select),
        ("RandomForestClassifier_30", RandomForestClassifier(n_estimators=100)),
    ]
)
classify_and_plot(pipeline)

In [None]:
from sklearn.ensemble import RandomForestClassifier

pipeline = Pipeline(
    [
        ("RandomForestClassifier_80", RandomForestClassifier(n_estimators=100)),
    ]
)
classify_and_plot(pipeline)

## Extra Trees

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

pipeline = Pipeline(
    [
        ("select", feature_select),
        (
            "ExtraTreesClassifier_30",
            ExtraTreesClassifier(n_estimators=100),
        ),
    ]
)
classify_and_plot(pipeline)

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

pipeline = Pipeline(
    [
        ("ExtraTreesClassifier_80", ExtraTreesClassifier(n_estimators=100)),
    ]
)
classify_and_plot(pipeline)

## Bagging

In [None]:
from sklearn.ensemble import BaggingClassifier

pipeline = Pipeline(
    [
        ("select", feature_select),
        (
            "BaggingClassifier_30",
            BaggingClassifier(
                n_estimators=100, max_samples=0.5, max_features=0.5, n_jobs=-1
            ),
        ),
    ]
)
classify_and_plot(pipeline)

In [None]:
from sklearn.ensemble import BaggingClassifier

pipeline = Pipeline(
    [
        (
            "BaggingClassifier_80",
            BaggingClassifier(
                n_estimators=100, max_samples=0.5, max_features=0.5, n_jobs=-1
            ),
        ),
    ]
)
classify_and_plot(pipeline)

## Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

pipeline = Pipeline(
    [
        ("select", feature_select),
        ("GradientBoostingClassifier_30", GradientBoostingClassifier(n_estimators=100)),
    ]
)
classify_and_plot(pipeline)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

pipeline = Pipeline(
    [
        ("GradientBoostingClassifier_80", GradientBoostingClassifier(n_estimators=100)),
    ]
)
classify_and_plot(pipeline)

## AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

pipeline = Pipeline(
    [
        ("select", feature_select),
        ("AdaBoostClassifier_30", AdaBoostClassifier(n_estimators=100)),
    ]
)
classify_and_plot(pipeline)

In [None]:
from sklearn.ensemble import AdaBoostClassifier

pipeline = Pipeline(
    [
        ("AdaBoostClassifier_80", AdaBoostClassifier(n_estimators=100)),
    ]
)
classify_and_plot(pipeline)

## Histogram-Based Gradient Boosting

In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier

pipeline = Pipeline(
    [
        ("select", feature_select),
        ("HistGradientBoostingClassifier_30", HistGradientBoostingClassifier()),
    ]
)
classify_and_plot(pipeline)

In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier

pipeline = Pipeline(
    [
        ("HistGradientBoostingClassifier_80", HistGradientBoostingClassifier()),
    ]
)
classify_and_plot(pipeline)

## XGBoost

In [None]:
from xgboost.sklearn import XGBClassifier

pipeline = Pipeline(
    [
        ("select", feature_select),
        (
            "XGBClassifier_30",
            XGBClassifier(max_depth=100, nthread=-1, eval_metric="logloss"),
        ),
    ]
)
classify_and_plot(pipeline)

In [None]:
from xgboost.sklearn import XGBClassifier

pipeline = Pipeline(
    [
        (
            "XGBClassifier_80",
            XGBClassifier(max_depth=100, nthread=-1, eval_metric="logloss"),
        ),
    ]
)
classify_and_plot(pipeline)

## CatBoost

In [None]:
from catboost import CatBoostClassifier

pipeline = Pipeline(
    [
        ("select", feature_select),
        ("CatBoostClassifier_30", CatBoostClassifier(verbose=0, n_estimators=100)),
    ]
)
classify_and_plot(pipeline)

In [None]:
from catboost import CatBoostClassifier

pipeline = Pipeline(
    [
        ("CatBoostClassifier_80", CatBoostClassifier(verbose=0, n_estimators=100)),
    ]
)
classify_and_plot(pipeline)

## Microsoft LightGBM

In [None]:
from lightgbm.sklearn import LGBMClassifier

pipeline = Pipeline(
    [
        ("select", feature_select),
        ("LightGBM_30", LGBMClassifier()),
    ]
)
classify_and_plot(pipeline)

In [None]:
from lightgbm.sklearn import LGBMClassifier

pipeline = Pipeline(
    [
        ("LightGBM_80", LGBMClassifier()),
    ]
)
classify_and_plot(pipeline)

# Support Vector Machines

## Linear SVM

In [None]:
from sklearn.svm import LinearSVC

pipeline = Pipeline(
    [
        ("select", feature_select),
        ("scaler", StandardScaler()),
        ("LinearSVC_30", LinearSVC(C=0.2)),
    ]
)
classify_and_plot(pipeline)

In [None]:
from sklearn.svm import LinearSVC

pipeline = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("LinearSVC_80", LinearSVC(C=0.2)),
    ]
)
classify_and_plot(pipeline)

## SVC with Polynomial Kernel

In [None]:
from sklearn.svm import SVC

pipeline = Pipeline(
    [
        ("select", feature_select),
        ("scaler", StandardScaler()),
        ("SVC_poly", SVC(C=1.0, kernel="poly", gamma="scale")),
    ]
)
classify_and_plot(pipeline)

## SVC with Radial Basis Function Kernel

In [None]:
from sklearn.svm import SVC

pipeline = Pipeline(
    [
        ("select", feature_select),
        ("scaler", StandardScaler()),
        ("SVC_rbf", SVC(C=1.0, kernel="rbf", gamma="scale")),
    ]
)
classify_and_plot(pipeline)

# Neural Networks

## Multi-Layer Perceptron

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, LeakyReLU
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from keras.optimizers import Adam, Adadelta, SGD
from keras.metrics import Precision, Recall, Accuracy
from scikeras.wrappers import KerasClassifier

# import keras_metrics

In [None]:
from tensorflow.config import set_soft_device_placement

from tensorflow.config.threading import (
    set_inter_op_parallelism_threads,
    set_intra_op_parallelism_threads,
)

set_inter_op_parallelism_threads(0)
set_intra_op_parallelism_threads(0)
set_soft_device_placement(True)

Fully connected layers:
- 80 inputs
- Hidden layer with 40 neurons, ReLU as activation, and dropout rate of 10%
- Hidden layer with 20 neurons, ReLU as activation, and dropout rate of 10%
- 1 single output with sigmoid activation

In [None]:
def create_model():
    """Build and compile the multi-layer perceptron used by KerasClassifier.

    Architecture: one input per column of ``X_train``, two hidden Dense layers
    (40 and 20 units, ReLU activation) each followed by 10% dropout, and a
    single sigmoid output unit. All Dense layers use Glorot-normal
    initialisation. The network is compiled with binary cross-entropy loss and
    SGD with Nesterov momentum, and tracks accuracy, precision and recall.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    n_inputs = X_train.shape[1]
    initializer = "glorot_normal"

    network = Sequential(
        [
            Dense(
                40,
                kernel_initializer=initializer,
                activation="relu",
                input_dim=n_inputs,
            ),
            Dropout(rate=0.1),
            Dense(20, kernel_initializer=initializer, activation="relu"),
            Dropout(rate=0.1),
            Dense(1, kernel_initializer=initializer, activation="sigmoid"),
        ]
    )
    tracked_metrics = ["accuracy", "Precision", "Recall"]
    network.compile(
        loss="binary_crossentropy",
        optimizer=SGD(nesterov=True),
        metrics=tracked_metrics,
    )
    return network

Features must be scaled by subtracting the mean and scaling to unit variance.

Absolute maximum of 250 training epochs, with 3 callbacks:
1. Monitor the loss on the validation split (val_loss), and stop training once it has not improved for 50 epochs
2. Persist to disk the model state that yielded the best val_loss.
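3. TensorBoard callback (defined, but currently left out of the callback list to avoid an error when persisting the model with joblib)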

In [None]:
import datetime
import os

logs_dir = os.path.normpath(os.path.join(os.getcwd(), "logs/fit/"))
this_log_dir = os.path.normpath(
    os.path.join(logs_dir, datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
)
tensorboard = TensorBoard(log_dir=this_log_dir, histogram_freq=1)

# Persist the model state with the lowest validation loss seen so far.
# The file uses the ".h5" extension because Keras 3 only accepts ".keras" or
# ".h5" for full-model checkpoints and rejects ".hdf5" with a ValueError;
# ".h5" selects the same HDF5 format and is also accepted by Keras 2.
# NOTE(review): anything outside this notebook that loads
# "models/best_model.hdf5" needs to be pointed at the new file name.
checkpoint = ModelCheckpoint(
    "models/best_model.h5",
    monitor="val_loss",
    mode="min",
    verbose=0,
    save_best_only=True,
)
early_stop = EarlyStopping(
    monitor="val_loss", patience=50, verbose=1, mode="min", restore_best_weights=True
)


pipeline = Pipeline(
    [
        ("scaler", StandardScaler()),
        (
            "KerasClassifier",
            KerasClassifier(
                build_fn=create_model,
                epochs=250,
                batch_size=50,
                verbose=2,
                validation_split=0.10,
                callbacks=[
                    checkpoint,
                    early_stop,
                    # tensorboard, #Removed to avoid an error persisting with joblib
                ],
            ),
        ),
    ]
)

In [None]:
classify_and_plot(pipeline)

# Overall Classifier Ratings

In [None]:
metrics_df = pd.DataFrame.from_dict(metrics, orient="index")
styled_metrics_df = (
    metrics_df.drop(labels="model", axis=1)
    .sort_values(["f1", "accuracy", "precision", "recall"], ascending=False)
    .style.background_gradient(cmap="coolwarm")
    .format("{:.4f}")
    .set_properties(**{"min-width": "6em"})
)
styled_metrics_df

In [None]:
import dataframe_image as dfi

dfi.export(styled_metrics_df, "img/census_metrics_df.png", max_rows=-1)