In [None]:
!pip install requests black nb_black
%load_ext nb_black

In [None]:
import os
from pathlib import Path

from requests import get
import pandas as pd
import numpy as np
import torch

np.random.seed(0)

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from pytorch_tabnet.tab_model import TabNetClassifier

# Utilities

In [None]:
def download(url, out, force=False, verify=True):
    """
    Download `url` to the local file `out`, unless that file is already present.

    Parameters
    ----------
    url : str
        address of the file to fetch
    out : pathlib.Path
        destination file; missing parent directories are created
    force : bool
        if True, remove any existing file at `out` and download it again
    verify : bool
        forwarded to `requests.get` as its `verify` argument

    Returns
    -------
    None

    Raises
    ------
    requests.HTTPError
        if the server answers with an error status code
    """
    out.parent.mkdir(parents=True, exist_ok=True)
    # Only unlink when there is something to remove: a forced download on a
    # fresh checkout must not fail with FileNotFoundError.
    if force and out.exists():
        print(f"Removing file at {str(out)}")
        out.unlink()

    if out.exists():
        print("File already exists.")
        return
    print(f"Downloading {url} at {str(out)} ...")
    # Issue the request and check its status BEFORE creating the file, so a
    # failed request does not leave an empty file behind that a later call
    # would mistake for a completed download.
    response = get(url, verify=verify)
    response.raise_for_status()
    # open in binary mode
    with out.open(mode="wb") as file:
        for chunk in response.iter_content(100000):
            # write to file
            file.write(chunk)

In [None]:
# Sentinel category appended to every fitted encoder; values not seen during
# fit are mapped to it at transform time.
UNKNOWN_VALUE = ["Unkn0wnV@lue"]


class SafeLabelEncoder(LabelEncoder):
    """
    Safe label encoder, encoding every unknown value as Unkn0wnV@lue.

    All inputs are cast to strings before being fitted or transformed, so the
    fitted classes are always strings.
    """

    def fit(self, y):
        """
        Fit the label encoder, by casting the numpy array as a string, then adding the code for unknown.

        Parameters
        ----------
        y : numpy array
            the values to fit

        Returns
        -------
        SafeLabelEncoder
            itself, fitted
        """
        return super().fit(np.concatenate((y.astype("str"), UNKNOWN_VALUE)))

    def fit_transform(self, y):
        """
        Fit the encoder, then transform the input data and returns it.

        Parameters
        ----------
        y : numpy array
            the values to fit

        Returns
        -------
        numpy array
            the encoded data
        """
        self.fit(y)
        # The classes were fitted on the string form of y, so the lookup must
        # use the same string form; passing the raw values would make
        # non-string input look like unseen labels.
        return super().transform(y.astype("str"))

    def transform(self, y):
        """
        Transform the input data and returns it.

        Values that were not seen during fit are encoded as the unknown value.

        Parameters
        ----------
        y : numpy array
            the values to transform

        Returns
        -------
        numpy array
            the encoded data
        """
        as_text = y.astype("str")
        return super().transform(
            np.where(np.isin(as_text, self.classes_), as_text, UNKNOWN_VALUE)
        )



# Download census-income dataset

In [None]:
# Train and test files of the UCI Adult ("census income") dataset.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
url_test = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"

dataset_name = "census-income"
# Files are stored under ./data relative to the current working directory.
out = Path.cwd() / "data" / f"{dataset_name}.csv"
out_test = Path.cwd() / "data" / f"{dataset_name}_test.csv"

# Reuse files that are already on disk: forcing the download here made the
# notebook fail on a fresh checkout (nothing to remove yet) and re-fetched
# both files on every run. Pass force=True by hand to refresh a local copy.
download(url, out)
download(url_test, out_test)

# Load data and split

In [None]:
# Column names applied to the downloaded CSV files when they are read with
# `names=cols`; the last entry, "target", is the label column.
cols = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "target",
]

In [None]:
train = pd.read_csv(out, names=cols)
# adult.test starts with a single non-data banner line ("|1x3 Cross validator");
# skipping two rows also dropped the first real test record.
test = pd.read_csv(out_test, names=cols, skiprows=1)
target = "target"

# Remove the whitespace surrounding the raw labels.
train[target] = train[target].str.strip()
# Test has . in label, let's clean it
test[target] = test[target].str.strip().str.strip(".")

In [None]:
# Randomly tag each training row as "train" (p=0.8) or "valid" (p=0.2), unless
# a "Set" column is already there (e.g. when this cell is executed again).
if "Set" not in train.columns:
    split_labels = np.random.choice(["train", "valid"], p=[0.8, 0.2], size=len(train))
    train["Set"] = split_labels

# Row labels of each subset; they define the single split used by the search.
train_indices = train.loc[train["Set"] == "train"].index
valid_indices = train.loc[train["Set"] == "valid"].index

In [None]:
# Every column except the label and the split marker, in the DataFrame's own
# column order. Building this list from a set made the feature order (and so
# cat_idxs, cat_dims and the per-feature embedding sizes) change from one
# Python session to the next.
used_columns = [col for col in train.columns if col not in (target, "Set")]
used_columns

# Simple preprocessing

Label encode categorical features and fill empty cells.

In [None]:
# Number of distinct values and dtype of every feature column.
nunique = train[used_columns].nunique()
types = train[used_columns].dtypes

# A feature is treated as categorical when it is stored as object or has fewer
# than 200 distinct values; every remaining feature goes to other_cols.
cat_cols = nunique.index[(types == "object") | (nunique < 200)]
other_cols = nunique.index[~nunique.index.isin(cat_cols)]
print(cat_cols)
print(other_cols)

In [None]:
# Display the number of distinct values found in the "education" column.
nunique["education"]

In [None]:
# Fillna
# Same treatment for both frames: categorical columns are cast to strings and
# missing values in the other columns are replaced by the mean of the
# corresponding *train* column.
for frame in (train, test):
    frame[cat_cols] = frame[cat_cols].astype("str")
    frame[other_cols] = frame[other_cols].fillna(train[other_cols].mean())

In [None]:
# Display the total number of missing cells left in the training frame.
train.isnull().sum().sum()

In [None]:
# One fitted encoder per encoded column, keyed by column name. Each encoder is
# fitted on the train column only and then applied to the test column; the
# target column goes through exactly the same steps after the features.
enc = {}
for col in [*cat_cols, target]:
    enc[col] = SafeLabelEncoder()
    train[col] = enc[col].fit_transform(train[col])
    test[col] = enc[col].transform(test[col])

enc

# Define categorical features for categorical embeddings

In [None]:
unused_feat = ["Set"]

# Walk the feature list once and record, for every categorical feature, its
# position in used_columns and the number of classes of its fitted encoder.
cat_idxs = []
cat_dims = []
for position, feature in enumerate(used_columns):
    if feature in cat_cols:
        cat_idxs.append(position)
        cat_dims.append(len(enc[feature].classes_))
print(cat_idxs)
print(cat_dims)

# Training one model

In [None]:
# Feature matrix and label vector of the training frame ...
X, y = train[used_columns].to_numpy(), train[target].to_numpy()

# ... and the same two arrays for the held-out test frame.
X_test, y_test = test[used_columns].to_numpy(), test[target].to_numpy()

# Random Grid search

In [None]:
# Number of worker processes passed to TabNet's fit: one per CPU reported by
# the OS when CUDA is available, none otherwise.
if torch.cuda.is_available():
    num_workers = os.cpu_count()
else:
    num_workers = 0

In [None]:
class TabNetTuner(TabNetClassifier):
    """
    TabNetClassifier whose `fit` only needs (X, y), so it can be driven by a
    scikit-learn search object.
    """

    def fit(self, X, y, *args, **kwargs):
        """
        Hold out 20% of (X, y) as a validation set and fit on the remainder.

        Parameters
        ----------
        X : numpy array
            the features
        y : numpy array
            the labels, also used to stratify the internal split
        *args, **kwargs
            accepted but ignored

        Returns
        -------
        whatever the parent `fit` returns
        """
        # Dirty trick => would be better to add n_d in grid, or fix it in __init__ of tuner
        self.n_d = self.n_a

        # Fixed-seed, stratified 80/20 split.
        parts = train_test_split(
            X, y, test_size=0.2, random_state=0, shuffle=True, stratify=y
        )
        X_train, X_valid, y_train, y_valid = parts

        # Training settings are fixed here; num_workers is the module-level value.
        fit_options = dict(
            patience=20,
            X_valid=X_valid,
            y_valid=y_valid,
            num_workers=num_workers,
            max_epochs=1000,
            batch_size=1024,
            virtual_batch_size=128,
        )
        return super().fit(X_train, y_train, **fit_options)

In [None]:
# Base estimator passed to RandomizedSearchCV; only the categorical feature
# positions and cardinalities are fixed here.
clf = TabNetTuner(cat_idxs=cat_idxs, cat_dims=cat_dims)

In [None]:
# Let's generate embedding size based on cat dims
# One candidate per cap: each feature gets half its cardinality (integer
# division), limited to the cap.
cat_emb_dim_list = [
    [min(nb // 2, max_dim) for nb in cat_dims] for max_dim in (1, 5, 10, 20, 50)
]
cat_emb_dim_list

In [None]:
# Candidate values sampled by the random search, keyed by estimator parameter
# name. Single-element lists ("clip_value", "verbose") keep that parameter
# fixed for every sampled configuration.
grid = {
    "n_a": [3, 5, 8, 13, 21],
    # n_d is not searched: TabNetTuner.fit overwrites it with n_a.
    # "n_d": [8], #
    # One list of per-feature embedding sizes per candidate.
    "cat_emb_dim": cat_emb_dim_list,
    "n_independent": [0, 1, 2, 5],
    "n_shared": [0, 1, 2],
    "n_steps": [1, 3, 5, 8],
    "clip_value": [1],
    "gamma": [0.5, 1.3, 3],
    "momentum": [0.1, 0.05, 0.02, 0.005],
    "lambda_sparse": [0.1, 0.01, 0.001],
    "lr": [0.1, 0.02, 0.001],
    "verbose": [1],
    # optimizer_fn
}

In [None]:
# Random search over `grid`: 90 sampled configurations, each scored by ROC AUC
# on the single predefined (train_indices, valid_indices) split.
# The former `iid=False` argument is gone: scikit-learn removed that parameter
# (passing it raises TypeError), and with a single split it has no effect on
# the reported mean score.
search = RandomizedSearchCV(
    clf,
    grid,
    n_iter=90,
    scoring="roc_auc",
    n_jobs=1,
    # No refit on the whole data: the final models are retrained separately.
    refit=False,
    cv=[(train_indices, valid_indices)],
    verbose=1,
    pre_dispatch=0,
    random_state=0,
    return_train_score=False,
)

In [None]:
# Training arrays handed to the search: features first, then labels.
y = train[target].to_numpy()
X = train[used_columns].to_numpy()

In [None]:
# Run the random search on the training arrays, then display the
# best-scoring parameter combination.
search.fit(X, y)
search.best_params_

In [None]:
# Display the best parameter combination found by the search.
search.best_params_

After searching for the best parameters, we need to retrain a model; we chose to do that on 5 folds.
{'verbose': 0,
 'n_steps': 1,
 'n_shared': 2,
 'n_independent': 0,
 'n_a': 21,
 'momentum': 0.02,
 'lr': 0.1,
 'lambda_sparse': 0.01,
 'gamma': 0.5,
 'clip_value': 1,
 'cat_emb_dim': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
 {'verbose': 0,
 'n_steps': 5,
 'n_shared': 2,
 'n_independent': 1,
 'n_a': 5,
 'momentum': 0.02,
 'lr': 0.1,
 'lambda_sparse': 0.001,
 'gamma': 0.5,
 'clip_value': 1,
 'cat_emb_dim': [5, 3, 5, 1, 5, 5, 3, 5, 4, 5, 5, 5, 5]}

In [None]:
from sklearn.model_selection import StratifiedKFold

In [None]:
# 5 shuffled, stratified folds with a fixed seed, used to retrain the final models.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

In [None]:
# Retrain one model per fold with the best parameters found by the search.
# During the search TabNetTuner.fit forced n_d = n_a, and n_d is not part of
# the grid, so best_params_ does not carry it. Apply the same tie here;
# otherwise the retrained models would use the default n_d and would not match
# the configuration that was actually selected.
best_params = dict(search.best_params_)
if "n_a" in best_params:
    best_params.setdefault("n_d", best_params["n_a"])

models = []
for train_index, valid_index in skf.split(X, y):
    clf = TabNetClassifier(cat_idxs=cat_idxs, cat_dims=cat_dims, **best_params)
    # The held-out fold is passed as the validation set, with a patience of 20.
    clf.fit(
        X[train_index],
        y[train_index],
        patience=20,
        X_valid=X[valid_index],
        y_valid=y[valid_index],
    )
    models.append(clf)

In [None]:
# Average the second predict_proba column (class index 1) over all fold models.
preds = np.zeros(shape=y_test.shape)
for model in models:
    # Use the loop variable: calling `clf` here only ever queried the last
    # trained model, so the "average" was that single model's prediction.
    preds += model.predict_proba(X_test)[:, 1]
preds = preds / len(models)

In [None]:
# ROC AUC of the averaged predictions against the test labels.
test_auc = roc_auc_score(y_test, preds)

print(f"FINAL TEST SCORE FOR {dataset_name} : {test_auc}")