## Cross-validation

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
os.chdir("..")

In [3]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import torch.nn as nn

from sklearn.decomposition import PCA
from sklearn.manifold import Isomap
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression

from utils.data_preparation import concat_with_mi
from utils.cv import generate_all_combinations

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

from joblib import Parallel, delayed

In [4]:
from utils.cv import preprocess, compute_score

In [5]:
df_raw = pd.read_pickle("./input/submission/df_wide_raw_vals.df")
df_diff = pd.read_pickle("./input/submission/df_wide_diff.df")
df_diff_sign = pd.read_pickle("./input/submission/df_wide_diff_sign.df")

In [6]:
target = pd.read_csv("./input/submission/target.csv", index_col="loan_id")

In [7]:
for df_ in [df_raw, df_diff, df_diff_sign]:
    lids = df_.columns[df_.columns.str.contains("loan_id")]
    df_.drop(columns=lids, inplace=True)

Join to remove the removed loans in X from y

In [8]:
def create_prediction_layout(df, target, y_label: str = "default_at_9"):
    Xy = df.join(target.groupby(level=0).head(1))
    Xy.loc[:, "default_at_9"] = Xy.loc[:, "default_at_9"].astype(int)
    y = Xy[y_label]
    X = Xy.drop(y_label, axis=1)
    return X, y

In [9]:
X_raw, y = create_prediction_layout(df_raw, target)
X_diff, _ =  create_prediction_layout(df_diff, target)
X_diff_sign, _ =  create_prediction_layout(df_diff_sign, target)

Convert to proper dtypes

In [10]:
X_raw = X_raw.convert_dtypes()
X_diff = X_diff.convert_dtypes()
X_diff_sign = X_diff_sign.convert_dtypes()

In [11]:
data_params = {
    'X': [X_raw, X_diff, df_diff_sign],
    'add_mi': [True, False]
}

In [12]:
data_combinations = generate_all_combinations(data_params)

In [13]:
clf = LogisticRegression(max_iter=1000)

A pipeline could have been used instead of manually writing out the different steps. However, this is more difficult adapting to the different testing that is going to be done. Since we want to use the autoencoder output as a pass to a model, rather than validating the autoencoder itself

In [52]:
pca_params = {
    'n_components': [15, 20, 25, 30, 35, 40]
}
isomap_params = {
    'n_components': [10, 20, 30],
    'n_neighbors': [3, 5, 10]
}
ae_params = {
    'module__layers': [
        [200, 100, 50],
        [100, 50],
    ],
    'lr': [0.8, 0.6],
    'module__act_fn': [nn.LeakyReLU(), nn.ReLU()],
    'batch_size': [512],
    'max_epochs': [100],
    'sparse': [False, True]
}

In [53]:
pca_combinations = generate_all_combinations(pca_params)
isomap_combinations = generate_all_combinations(isomap_params)
ae_combinations = generate_all_combinations(ae_params)

In [49]:
from utils.ae import AE, AutoEncoder

In [17]:
def compute_score_sklearn(y, reducer, data_comb):
    X = data_comb['X']
    add_mi = data_comb['add_mi']
    if add_mi: X = concat_with_mi(X)
    return compute_score(reducer = reducer, X = X, y = y)

def compute_score_ae(y, ae_comb, data_comb):
    X = data_comb['X']
    add_mi = data_comb['add_mi']
    if add_mi: X = concat_with_mi(X)
    _, D = X.shape
    sparse = ae_comb['sparse']
    ae_comb_ = ae_comb.copy()
    ae_comb_.pop("sparse")
    ae = AE(
        AutoEncoder,
        module__D = D,
        train_split=None,
        verbose=0,
        **ae_comb_
    )
    return compute_score(reducer = ae, X = X, y = y, sparse=sparse)

#### PCA CV loop

In [152]:
pca_scores_grid = np.zeros((len(data_combinations), len(pca_combinations)))

In [44]:
for i, pca_comb in enumerate(pca_combinations):
    pca = PCA(**pca_comb)
    scores = Parallel(n_jobs=2)(delayed(compute_score_sklearn)(reducer = pca, data_comb=data_comb, y = y) for data_comb in data_combinations) 
    pca_scores_grid[:, i] = scores

In [45]:
col = np.argmax(np.max(pca_scores_grid, axis=0))
row = np.argmax(np.max(pca_scores_grid, axis=1))

In [46]:
pca_best_params = pca_combinations[col]
pca_best_data_params = data_combinations[row]

In [48]:
np.save("./input/submission/pca_best_params", pca_best_params)
np.save("./input/submission/pca_best_data_params", pca_best_data_params)

In [50]:
np.save("./input/submission/pca_scores_grid", pca_scores_grid)

#### Isomap CV loop

In [51]:
isomap_scores_grid = np.zeros((len(data_combinations), len(isomap_combinations)))

In [52]:
for i, isomap_comb in enumerate(isomap_combinations):
    isomap = Isomap(**isomap_comb)
    scores = Parallel(n_jobs=2)(delayed(compute_score_sklearn)(reducer = isomap, data_comb=data_comb, y = y) for data_comb in data_combinations) 
    isomap_scores_grid[:, i] = scores

In [53]:
col = np.argmax(np.max(isomap_scores_grid, axis=0))
row = np.argmax(np.max(isomap_scores_grid, axis=1))

In [54]:
isomap_best_params = isomap_combinations[col]
isomap_best_data_params = data_combinations[row]

In [55]:
np.save("./input/submission/isomap_best_params", isomap_best_params)
np.save("./input/submission/isomap_best_data_params", isomap_best_data_params)

In [56]:
np.save("./input/submission/isomap_scores_grid", isomap_scores_grid)

#### Autoencoder CV loop

In [54]:
ae_scores_grid = np.zeros((len(data_combinations), len(ae_combinations)))

In [55]:
for i, ae_comb in enumerate(ae_combinations):
    scores = [compute_score_ae(y = y, ae_comb=ae_comb, data_comb = data_comb) for data_comb in data_combinations]
    print(scores)
    ae_scores_grid[:, i] = scores

5it [00:52, 10.45s/it]
5it [00:43,  8.72s/it]
5it [00:47,  9.48s/it]
5it [00:39,  7.90s/it]
5it [00:45,  9.15s/it]
5it [00:41,  8.31s/it]
0it [00:00, ?it/s]

[0.37637142680000074, 0.28552158997114396, 0.36007092225832144, 0.2961940502145265, 0.380312999524282, 0.3736680484504626]


5it [00:49,  9.88s/it]
5it [00:41,  8.29s/it]
5it [00:46,  9.29s/it]
5it [00:41,  8.23s/it]
5it [00:48,  9.74s/it]
5it [00:43,  8.67s/it]
0it [00:00, ?it/s]

[0.2338629601469464, 0.13641927826732703, 0.23145448201383262, 0.2787287683597611, 0.39509352200732484, 0.36749967143795115]


5it [00:44,  8.97s/it]
5it [00:42,  8.46s/it]
5it [00:51, 10.21s/it]
5it [00:40,  8.06s/it]
5it [00:48,  9.70s/it]
5it [00:37,  7.40s/it]
0it [00:00, ?it/s]

[0.36096624905055197, 0.2363497759710119, 0.34515454803706863, 0.28266462143195054, 0.35859105540398345, 0.38206586644667306]


5it [00:45,  9.10s/it]
5it [00:42,  8.56s/it]
5it [00:45,  9.17s/it]
5it [00:39,  7.96s/it]
5it [00:43,  8.74s/it]
5it [00:35,  7.01s/it]
0it [00:00, ?it/s]

[0.4830415174860419, 0.4830415174860419, 0.4830415174860419, 0.4830415174860419, 0.4830415174860419, 0.4830415174860419]


5it [00:42,  8.42s/it]
5it [00:37,  7.40s/it]
5it [00:48,  9.66s/it]
5it [00:43,  8.67s/it]
5it [00:48,  9.61s/it]
5it [00:42,  8.51s/it]
0it [00:00, ?it/s]

[0.3470695372288187, 0.2525281033329447, 0.37136957639626306, 0.26447876071165655, 0.36885804906571806, 0.37338680697975285]


5it [00:52, 10.42s/it]
5it [00:44,  8.84s/it]
5it [00:46,  9.23s/it]
5it [00:42,  8.43s/it]
5it [00:48,  9.62s/it]
5it [00:41,  8.37s/it]
0it [00:00, ?it/s]

[0.3632144789097292, 0.17887059948664458, 0.29000126479989374, 0.1814783111849886, 0.28932364271354066, 0.2819718641998066]


5it [00:52, 10.49s/it]
5it [00:45,  9.13s/it]
5it [00:46,  9.32s/it]
5it [00:42,  8.46s/it]
5it [00:42,  8.59s/it]
5it [00:40,  8.13s/it]
0it [00:00, ?it/s]

[0.35646443278719503, 0.24193357067698348, 0.3451722323788661, 0.28252138083950645, 0.36791397199756865, 0.36298110987024684]


5it [00:51, 10.35s/it]
5it [00:43,  8.64s/it]
5it [00:48,  9.73s/it]
5it [00:42,  8.46s/it]
5it [00:47,  9.42s/it]
5it [00:45,  9.17s/it]
0it [00:00, ?it/s]

[0.4830415174860419, 0.4830415174860419, 0.4830415174860419, 0.4830415174860419, 0.4830415174860419, 0.4830415174860419]


5it [00:46,  9.31s/it]
5it [00:37,  7.49s/it]
5it [00:37,  7.60s/it]
5it [00:34,  6.84s/it]
5it [00:38,  7.66s/it]
5it [00:31,  6.40s/it]
0it [00:00, ?it/s]

[0.35986479266661736, 0.30931541311797683, 0.36989399638201814, 0.3297057530190015, 0.3756388292883618, 0.3884215202519007]


5it [00:44,  8.89s/it]
5it [00:40,  8.04s/it]
5it [00:41,  8.40s/it]
5it [00:41,  8.27s/it]
5it [00:41,  8.23s/it]
5it [00:39,  7.93s/it]
0it [00:00, ?it/s]

[0.2504566580095281, 0.17022889000190822, 0.2289966813696151, 0.12942551462895396, 0.3996830631975355, 0.3433881029090175]


5it [00:44,  8.85s/it]
5it [00:37,  7.43s/it]
5it [00:39,  7.84s/it]
5it [00:36,  7.38s/it]
5it [00:39,  7.98s/it]
5it [00:34,  6.97s/it]
0it [00:00, ?it/s]

[0.3625675756638035, 0.3262375794454523, 0.37635770024261467, 0.33317165747976873, 0.37874974802469286, 0.387278038924847]


5it [00:42,  8.48s/it]
5it [00:36,  7.26s/it]
5it [00:35,  7.05s/it]
5it [00:33,  6.67s/it]
5it [00:33,  6.69s/it]
5it [00:29,  5.94s/it]
0it [00:00, ?it/s]

[0.4830415174860419, 0.4830415174860419, 0.4830415174860419, 0.4830415174860419, 0.4830415174860419, 0.4830415174860419]


5it [00:34,  6.99s/it]
5it [00:32,  6.47s/it]
5it [00:39,  7.83s/it]
5it [00:35,  7.07s/it]
5it [00:42,  8.45s/it]
5it [00:36,  7.21s/it]
0it [00:00, ?it/s]

[0.3802945918759602, 0.32352312702848984, 0.3761277398711043, 0.32884094701456634, 0.3676264300767052, 0.37756940962256846]


5it [00:42,  8.47s/it]
5it [00:36,  7.39s/it]
5it [00:39,  7.85s/it]
5it [00:35,  7.02s/it]
5it [00:37,  7.57s/it]
5it [00:35,  7.04s/it]
0it [00:00, ?it/s]

[0.18938869008199305, 0.26395591925489814, 0.24947667626401068, 0.12477481037323042, 0.2850159889198541, 0.2689671844768894]


5it [00:41,  8.33s/it]
5it [00:36,  7.29s/it]
5it [00:39,  7.99s/it]
5it [00:36,  7.21s/it]
5it [00:41,  8.22s/it]
5it [00:35,  7.11s/it]
0it [00:00, ?it/s]

[0.3520070598193744, 0.30357305643632937, 0.3725932706231395, 0.3374885246904053, 0.3732831925398317, 0.3755571820340862]


5it [00:42,  8.50s/it]
5it [00:36,  7.33s/it]
5it [00:41,  8.36s/it]
5it [00:37,  7.40s/it]
5it [00:38,  7.79s/it]
5it [00:35,  7.08s/it]

[0.4830415174860419, 0.4830415174860419, 0.4830415174860419, 0.4830415174860419, 0.4830415174860419, 0.4830415174860419]





Get every other element of the combinations (containing sparse)

In [89]:
col = np.argmax(np.max(ae_scores_grid, axis=0))
row = np.argmax(np.max(ae_scores_grid, axis=1))

In [90]:
ae_best_params = ae_combinations[col]
ae_best_data_params = data_combinations[row]

In [91]:
ae_scores_grid

array([[0.37637143, 0.23386296, 0.36096625, 0.48304152, 0.34706954,
        0.36321448, 0.35646443, 0.48304152, 0.35986479, 0.25045666,
        0.36256758, 0.48304152, 0.38029459, 0.18938869, 0.35200706,
        0.48304152],
       [0.28552159, 0.13641928, 0.23634978, 0.48304152, 0.2525281 ,
        0.1788706 , 0.24193357, 0.48304152, 0.30931541, 0.17022889,
        0.32623758, 0.48304152, 0.32352313, 0.26395592, 0.30357306,
        0.48304152],
       [0.36007092, 0.23145448, 0.34515455, 0.48304152, 0.37136958,
        0.29000126, 0.34517223, 0.48304152, 0.369894  , 0.22899668,
        0.3763577 , 0.48304152, 0.37612774, 0.24947668, 0.37259327,
        0.48304152],
       [0.29619405, 0.27872877, 0.28266462, 0.48304152, 0.26447876,
        0.18147831, 0.28252138, 0.48304152, 0.32970575, 0.12942551,
        0.33317166, 0.48304152, 0.32884095, 0.12477481, 0.33748852,
        0.48304152],
       [0.380313  , 0.39509352, 0.35859106, 0.48304152, 0.36885805,
        0.28932364, 0.36791397, 

Notice how every row that correponds to the sparse case return the same high value. This is investigated further in main in the actual testing

In [102]:
not_sparse = list(range(len(ae_combinations)))[::2]

In [103]:
ae_scores_wo_sparse = ae_scores_grid[:, not_sparse]

In [104]:
col_ = np.argmax(np.max(ae_scores_wo_sparse, axis=0))
row_ = np.argmax(np.max(ae_scores_wo_sparse, axis=1))

In [105]:
ae_combinations_ = np.array(ae_combinations)[not_sparse]

In [106]:
ae_best_params_ = ae_combinations_[col_]
ae_best_data_params_ = data_combinations[row_]

In [61]:
np.save("./input/submission/ae_best_params_512", ae_best_params)
np.save("./input/submission/ae_best_data_params_512", ae_best_data_params)

In [108]:
np.save("./input/submission/ae_best_params_512_notsparse", ae_best_params_)
np.save("./input/submission/ae_best_data_params_512_notsparse", ae_best_data_params_)