# **Section:** Model Selection

In [18]:
import os
import itertools as it
import warnings
import time
import pickle

import numpy as np

import pandas as pd

%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import cm
import seaborn as sns

import joblib

import pathlib

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.exceptions import DataConversionWarning

from sklearn import metrics

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

import tqdm

from libs.container import Container
from libs.nearest import nearest
from libs.experiment import WithAnotherExperiment, roc, metrics
from libs.precstar import  prec_star

warnings.simplefilter("ignore", category=DataConversionWarning)

In [2]:
PATH = pathlib.Path(os.path.abspath(os.path.dirname("")))

DATA_PATH = PATH / "_data" / "s5k_scaled.pkl.bz2"

COLUMNS_NO_FEATURES = ['id', 'tile', 'cnt', 'ra_k', 'dec_k', 'vs_type', 'vs_catalog', 'cls'] 

In [3]:
sample = pd.read_pickle(DATA_PATH)

# the features
X_columns = [c for c in sample.columns if c not in COLUMNS_NO_FEATURES]
y_column = "cls"

sample[X_columns] =  sample[X_columns].astype(np.float32)

data = Container({k: v for k, v in sample.groupby("tile") if k in ["b234", "b360", "b278", "b261"]})

del sample

In [4]:
def score_func(y, y_prob, **kwargs):
    prec, rec, thr = metrics.precision_recall_curve(
            y, y_prob, sample_weight=None)
    idx = nearest(array=rec, value=.9)
    return prec[idx]


def grid_search(data, estimator, param_grid):
    print(f"Running {type(estimator)}")
    clf = GridSearchCV(
        estimator, 
        param_grid, 
        cv=5, scoring=metrics.make_scorer(score_func, needs_proba=True), n_jobs=-1)

    X, y = data[X_columns].values, data.cls.values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)

    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

    return clf

## SVM-Linear 

```
[{'kernel': ['linear'], 'C': [1, 10, 100, 1000]}] -> {'C': 1000, 'kernel': 'linear'}
[{'kernel': ['linear'], 'C': [500, 1000, 5000]}]  -> {'C': 5000, 'kernel': 'linear'}
[{'kernel': ['linear'], 'C': [2500, 5000, 7500]}] -> {'C': 5000, 'kernel': 'linear'}
[{'kernel': ['linear'], 'C': [4000, 4500, 5000]}] -> {'C': 4000, 'kernel': 'linear'}
[{'kernel': ['linear'], 'C': [3000, 3500, 4000]}] -> {'C': 4000, 'kernel': 'linear'}
[{'kernel': ['linear'], 'C': [3900, 4000, 4100]}] -> {'C': 3900, 'kernel': 'linear'}
[{'kernel': ['linear'], 'C': [3700, 3800, 3900]}] -> {'C': 3700, 'kernel': 'linear'}
[{'kernel': ['linear'], 'C': [3700, 3600, 3650]}] -> {'C': 3700, 'kernel': 'linear'}
```

In [5]:
%%time
# start = time.time()
# svc_linear1 = grid_search(
#     data=data.b278, 
#     estimator=SVC(probability=True),
#     param_grid=[{'kernel': ['linear'], 'C': [3000, 3500, 4000], "probability": True}])
# end = time.time() - start

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 8.82 µs


## SVM-RBF

In [6]:
gamma_range = np.logspace(-9, 3, 13)
gamma_range

array([1.e-09, 1.e-08, 1.e-07, 1.e-06, 1.e-05, 1.e-04, 1.e-03, 1.e-02,
       1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03])

In [7]:
%%time
start = time.time()
svc_rbf = grid_search(
    data=data.b278, 
    estimator=SVC(),
    param_grid=[{'kernel': ['rbf'], 'C': [3000, 3500, 4000], "gamma": gamma_range, "probability": [True]}])
end = time.time() - start

Running <class 'sklearn.svm._classes.SVC'>
Best parameters set found on development set:

{'C': 3500, 'gamma': 0.001, 'kernel': 'rbf', 'probability': True}

Grid scores on development set:

0.578 (+/-0.091) for {'C': 3000, 'gamma': 1e-09, 'kernel': 'rbf', 'probability': True}
0.588 (+/-0.092) for {'C': 3000, 'gamma': 1e-08, 'kernel': 'rbf', 'probability': True}
0.616 (+/-0.112) for {'C': 3000, 'gamma': 1e-07, 'kernel': 'rbf', 'probability': True}
0.657 (+/-0.104) for {'C': 3000, 'gamma': 1e-06, 'kernel': 'rbf', 'probability': True}
0.657 (+/-0.068) for {'C': 3000, 'gamma': 1e-05, 'kernel': 'rbf', 'probability': True}
0.667 (+/-0.153) for {'C': 3000, 'gamma': 0.0001, 'kernel': 'rbf', 'probability': True}
0.714 (+/-0.146) for {'C': 3000, 'gamma': 0.001, 'kernel': 'rbf', 'probability': True}
0.681 (+/-0.175) for {'C': 3000, 'gamma': 0.01, 'kernel': 'rbf', 'probability': True}
0.682 (+/-0.144) for {'C': 3000, 'gamma': 0.1, 'kernel': 'rbf', 'probability': True}
0.070 (+/-0.001) for {'C': 30

## KNN

In [8]:
k_range = np.arange(1, 30)
k_range

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29])

In [9]:
%%time
start = time.time()

knn = grid_search(
    data=data.b278, 
    estimator=KNeighborsClassifier(),
    param_grid=[{
        "weights": ['uniform', 'distance'], 
        "algorithm": ['auto', 'ball_tree', 'kd_tree', 'brute'],
        "p": [1, 2, 3],
        "n_neighbors": k_range}])

end = time.time() - start

Running <class 'sklearn.neighbors._classification.KNeighborsClassifier'>
Best parameters set found on development set:

{'algorithm': 'auto', 'n_neighbors': 29, 'p': 1, 'weights': 'distance'}

Grid scores on development set:

0.703 (+/-0.628) for {'algorithm': 'auto', 'n_neighbors': 1, 'p': 1, 'weights': 'uniform'}
0.703 (+/-0.628) for {'algorithm': 'auto', 'n_neighbors': 1, 'p': 1, 'weights': 'distance'}
0.520 (+/-0.724) for {'algorithm': 'auto', 'n_neighbors': 1, 'p': 2, 'weights': 'uniform'}
0.520 (+/-0.724) for {'algorithm': 'auto', 'n_neighbors': 1, 'p': 2, 'weights': 'distance'}
0.649 (+/-0.573) for {'algorithm': 'auto', 'n_neighbors': 1, 'p': 3, 'weights': 'uniform'}
0.649 (+/-0.573) for {'algorithm': 'auto', 'n_neighbors': 1, 'p': 3, 'weights': 'distance'}
0.631 (+/-0.554) for {'algorithm': 'auto', 'n_neighbors': 2, 'p': 1, 'weights': 'uniform'}
0.634 (+/-0.557) for {'algorithm': 'auto', 'n_neighbors': 2, 'p': 1, 'weights': 'distance'}
0.740 (+/-0.074) for {'algorithm': 'auto',

## Random Forest

In [10]:
%%time
start = time.time()

rf = grid_search(
    data=data.b278, 
    estimator=RandomForestClassifier(),
    param_grid=[{
        'max_features': ['auto', 'sqrt', "log2", None, 0.2, 0.5], 
        "min_samples_split": [2, 5, 10],
        "n_estimators": [500], 
        "criterion": ["entropy"], 
        "n_jobs": [10]}]
)

end = time.time() - start

Running <class 'sklearn.ensemble._forest.RandomForestClassifier'>
Best parameters set found on development set:

{'criterion': 'entropy', 'max_features': 0.2, 'min_samples_split': 5, 'n_estimators': 500, 'n_jobs': 10}

Grid scores on development set:

0.822 (+/-0.175) for {'criterion': 'entropy', 'max_features': 'auto', 'min_samples_split': 2, 'n_estimators': 500, 'n_jobs': 10}
0.825 (+/-0.165) for {'criterion': 'entropy', 'max_features': 'auto', 'min_samples_split': 5, 'n_estimators': 500, 'n_jobs': 10}
0.824 (+/-0.157) for {'criterion': 'entropy', 'max_features': 'auto', 'min_samples_split': 10, 'n_estimators': 500, 'n_jobs': 10}
0.822 (+/-0.173) for {'criterion': 'entropy', 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 500, 'n_jobs': 10}
0.828 (+/-0.147) for {'criterion': 'entropy', 'max_features': 'sqrt', 'min_samples_split': 5, 'n_estimators': 500, 'n_jobs': 10}
0.830 (+/-0.199) for {'criterion': 'entropy', 'max_features': 'sqrt', 'min_samples_split': 10, 'n_esti

## Run the classifiers with the selected parameters

In [11]:
CLFS_CLASSES = {
    "RF": RandomForestClassifier,
    "KNN": KNeighborsClassifier,
    "SVM-Linear": SVC,
    "SVM-RBF": SVC,
}

CLFS_PARAMS = {
    "RF":{
        'criterion': 'entropy',
         'max_features': 'log2',
         'min_samples_split': 5,
         'n_estimators': 500},
    
    "KNN": {
        'algorithm': 'auto', 
        'n_neighbors': 29, 
        'p': 1, 
        'weights': 
        'distance'},
    
    "SVM-Linear": {"probability": True, 'C': 3700, 'kernel': 'linear'},
    
    "SVM-RBF": {"probability": True, 'C': 3500, 'gamma': 0.001, 'kernel': 'rbf'}
}

In [12]:
def make_clf(tile_name, clf_name, df, X_columns):    
    X_train = df[X_columns].values
    y_train = df.cls.values
    
    clf_class = CLFS_CLASSES[clf_name]
    clf_params = CLFS_PARAMS[clf_name]
    
    clf = clf_class(**clf_params)
    clf.fit(X_train, y_train)
    return tile_name, clf


def get_clfs(clf_name, data, X_columns):
    with joblib.Parallel(n_jobs=-1) as jobs:
        clfs = jobs(
            joblib.delayed(make_clf)(tile_name, clf_name, df, X_columns)
            for tile_name, df in sorted(tqdm.tqdm(data.items())))
    return Container(clfs)


def get_combs(clf_name, data, X_columns):
    combs = []
    clfs = get_clfs(clf_name, data, X_columns)
    for train_name, clf in clfs.items():
        for test_name in clfs.keys():
            if train_name != test_name:
                test_sample = data[test_name]
                comb = Container({
                    "idx": len(combs), 
                    "train_name": train_name, "clf": clf,  
                    "test_name": test_name, "test_sample": test_sample, "X_columns": X_columns,
                    "clf_name": clf_name, "y_column": y_column})
                combs.append(comb)
    return combs

def execute_clf(idx, train_name, clf_name, clf, test_name, test_sample, X_columns, y_column):
    
    X_test = test_sample[X_columns].values
    y_test = test_sample[y_column].values
    
    predictions = clf.predict(X_test)
    probabilities = clf.predict_proba(X_test)
    
    fpr, tpr, thresholds = metrics.roc_curve(
        y_test, 1.-probabilities[:,0], pos_label=1)

    prec_rec_curve = metrics.precision_recall_curve(
        y_test, 1.- probabilities[:,0], pos_label=1)

    roc_auc = metrics.auc(fpr, tpr)
    
    result = Container({
        "idx": idx,
        "clf_name": clf_name,
        "train_name": train_name,
        "test_name": test_name,
        'fpr': fpr,
        'tpr': tpr,
        'thresh': thresholds,
        'roc_auc': roc_auc,
        'prec_rec_curve': prec_rec_curve,
        'real_cls': y_test,
        'predictions': predictions,
        'probabilities': probabilities,
        'confusion_matrix': metrics.confusion_matrix(y_test, predictions)})    
    return result

def train_and_run(clf_name, data, X_columns):
    combs = get_combs(clf_name, data, X_columns)
    print("Combinaciones: {}".format(len(combs)))    
    with joblib.Parallel(n_jobs=-1) as jobs:
        results = jobs(
            joblib.delayed(execute_clf)(**comb) for comb in tqdm.tqdm(combs))
    return results

In [13]:
%%time
rf_test = train_and_run("RF", data, X_columns)

100%|██████████| 4/4 [00:00<00:00, 10958.34it/s]
100%|██████████| 12/12 [00:00<00:00, 13733.06it/s]

Combinaciones: 12





CPU times: user 3.73 s, sys: 97.4 ms, total: 3.82 s
Wall time: 29.6 s


In [14]:
%%time
knn_test = train_and_run("KNN", data, X_columns)

100%|██████████| 4/4 [00:00<00:00, 10137.29it/s]
100%|██████████| 12/12 [00:00<00:00, 13206.94it/s]

Combinaciones: 12





CPU times: user 223 ms, sys: 47.3 ms, total: 270 ms
Wall time: 22.6 s


In [16]:
%%time
svml_test = train_and_run("SVM-Linear", data, X_columns)

100%|██████████| 4/4 [00:00<00:00, 7925.00it/s]
100%|██████████| 12/12 [00:00<00:00, 11848.32it/s]

Combinaciones: 12





CPU times: user 743 ms, sys: 466 ms, total: 1.21 s
Wall time: 4h 23min 28s


In [17]:
%%time
svmr_test = train_and_run("SVM-RBF", data, X_columns)

100%|██████████| 4/4 [00:00<00:00, 6487.71it/s]
100%|██████████| 12/12 [00:00<00:00, 11325.75it/s]

Combinaciones: 12





CPU times: user 195 ms, sys: 34.5 ms, total: 229 ms
Wall time: 16.1 s


In [23]:
joblib.dump({
    "rf_test": rf_test,
    "knn_test": knn_test,
    "svml_test": svml_test,
    "svmr_test": svmr_test,}, "_cache/model_select.pkl.bz2", compress=3)

['_cache/model_select.pkl.bz2']