# **Section:** Model Selection

In [1]:
import os
import itertools as it
import warnings
import time
import pickle

import numpy as np

import pandas as pd

%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import cm
import seaborn as sns

import joblib

import pathlib

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.exceptions import DataConversionWarning

from sklearn import metrics

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

import tqdm

from libs.container import Container
from libs.nearest import nearest
from libs.experiment import WithAnotherExperiment, roc, metrics
from libs.precstar import  prec_star

warnings.simplefilter("ignore", category=DataConversionWarning)



In [2]:
# Absolute path of the directory the notebook is executed from.
PATH = pathlib.Path(os.path.abspath(""))

# Location of the pickled sample, relative to the notebook directory.
DATA_PATH = PATH.joinpath("_data", "s5k_scaled.pkl.bz2")

# Columns of the sample that are excluded when the feature list is built.
COLUMNS_NO_FEATURES = [
    "id",
    "tile",
    "cnt",
    "ra_k",
    "dec_k",
    "vs_type",
    "vs_catalog",
    "cls",
]

In [3]:
# Load the pickled sample from disk.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
sample = pd.read_pickle(DATA_PATH)

# Name of the column holding the class label.
y_column = "cls"

# the features: every column that is not listed in COLUMNS_NO_FEATURES
X_columns = [name for name in sample.columns if name not in COLUMNS_NO_FEATURES]

# Store the feature columns as 32-bit floats.
sample[X_columns] = sample[X_columns].astype(np.float32)

# One data frame per tile, keeping only the four tiles used in this notebook.
data = Container({
    tile: tile_df
    for tile, tile_df in sample.groupby("tile")
    if tile in ("b234", "b360", "b278", "b261")
})

# The unsplit frame is not used after this point.
del sample

In [4]:
# Load the cached hyper-parameters, keyed by classifier ("rf", "knn", ...).
best_params = joblib.load("_cache/best_params.pkl.bz2")

# Drop a stored "n_jobs" entry from the random-forest parameters, if present.
best_params["rf"].pop("n_jobs", None)
best_params

{'rf': {'criterion': 'entropy',
  'max_features': 'auto',
  'min_samples_split': 10,
  'n_estimators': 500},
 'knn': {'algorithm': 'auto', 'n_neighbors': 2, 'p': 1, 'weights': 'distance'},
 'svml': {'C': 4, 'kernel': 'linear', 'probability': True},
 'svmr': {'C': 34, 'gamma': 0.01, 'kernel': 'rbf', 'probability': True}}

In [5]:
# Estimator class to instantiate for each classifier name.
CLFS_CLASSES = dict([
    ("RF", RandomForestClassifier),
    ("KNN", KNeighborsClassifier),
    ("SVM-Linear", SVC),
    ("SVM-RBF", SVC),
])

# Constructor keyword arguments for each classifier name, taken from the
# cached ``best_params`` (the values are the same dict objects, not copies).
CLFS_PARAMS = {
    clf_name: best_params[params_key]
    for clf_name, params_key in [
        ("RF", "rf"),
        ("KNN", "knn"),
        ("SVM-Linear", "svml"),
        ("SVM-RBF", "svmr"),
    ]
}

In [6]:
def make_clf(tile_name, clf_name, df, X_columns, y_column="cls"):
    """Fit one classifier on the data of a single tile.

    Parameters
    ----------
    tile_name : hashable
        Identifier of the tile. It is returned unchanged so the fitted
        model can be paired with the tile it was trained on.
    clf_name : str
        Key into ``CLFS_CLASSES`` and ``CLFS_PARAMS`` selecting the
        estimator class and its constructor arguments.
    df : pandas.DataFrame
        Training data for the tile.
    X_columns : list of str
        Names of the feature columns.
    y_column : str, optional
        Name of the label column. Defaults to ``"cls"``, the column this
        function previously read unconditionally.

    Returns
    -------
    tuple
        ``(tile_name, clf)`` where ``clf`` is the fitted estimator.

    Raises
    ------
    KeyError
        If ``clf_name`` is not a known classifier name or a requested
        column is missing from ``df``.
    """
    X_train = df[X_columns].values
    # The label column is a parameter so training and evaluation
    # (``execute_clf`` receives ``y_column``) can use the same column.
    y_train = df[y_column].values

    clf_class = CLFS_CLASSES[clf_name]
    clf_params = CLFS_PARAMS[clf_name]

    clf = clf_class(**clf_params)
    clf.fit(X_train, y_train)
    return tile_name, clf


def get_clfs(clf_name, data, X_columns):
    """Fit one ``clf_name`` classifier per tile, in parallel.

    Parameters
    ----------
    clf_name : str
        Classifier name understood by ``make_clf``.
    data : mapping
        Maps tile name to the data frame of that tile.
    X_columns : list of str
        Names of the feature columns.

    Returns
    -------
    Container
        Built from the ``(tile_name, fitted_clf)`` pairs returned by
        ``make_clf``, in ascending tile-name order.
    """
    # Sort first, then wrap in tqdm. Wrapping before sorting lets ``sorted``
    # exhaust the progress bar before any job is handed to joblib.
    # Sorting on the tile name alone also avoids ever comparing data frames.
    tiles = sorted(data.items(), key=lambda item: item[0])

    with joblib.Parallel(n_jobs=-1) as jobs:
        # The bar advances as joblib pulls tasks from the generator.
        clfs = jobs(
            joblib.delayed(make_clf)(tile_name, clf_name, df, X_columns)
            for tile_name, df in tqdm.tqdm(tiles))
    return Container(clfs)


def get_combs(clf_name, data, X_columns):
    """Pair every trained tile classifier with every *other* tile as test set.

    Parameters
    ----------
    clf_name : str
        Classifier name passed on to ``get_clfs``.
    data : mapping
        Maps tile name to the data frame of that tile.
    X_columns : list of str
        Names of the feature columns.

    Returns
    -------
    list of Container
        One entry per ordered (train tile, test tile) pair with different
        tiles. Each entry carries the keyword arguments ``execute_clf``
        expects; ``idx`` is the position of the entry in the list.

    Notes
    -----
    The label column name is read from the notebook-level ``y_column``.
    """
    clfs = get_clfs(clf_name, data, X_columns)

    # Same ordering as a nested loop: train tile outermost, test tile inside.
    pairings = [
        (train_name, clf, test_name)
        for train_name, clf in clfs.items()
        for test_name in clfs.keys()
        if train_name != test_name
    ]

    return [
        Container({
            "idx": idx,
            "train_name": train_name,
            "clf": clf,
            "test_name": test_name,
            "test_sample": data[test_name],
            "X_columns": X_columns,
            "clf_name": clf_name,
            "y_column": y_column,
        })
        for idx, (train_name, clf, test_name) in enumerate(pairings)
    ]

def execute_clf(idx, train_name, clf_name, clf, test_name, test_sample, X_columns, y_column):
    """Evaluate an already fitted classifier on one test tile.

    Parameters
    ----------
    idx : int
        Identifier of the train/test combination; copied into the result.
    train_name : hashable
        Name of the tile the classifier was trained on; copied into the result.
    clf_name : str
        Classifier name; copied into the result.
    clf : estimator
        Fitted estimator exposing ``predict`` and ``predict_proba``.
    test_name : hashable
        Name of the tile used for testing; copied into the result.
    test_sample : pandas.DataFrame
        Data of the test tile.
    X_columns : list of str
        Names of the feature columns.
    y_column : str
        Name of the label column.

    Returns
    -------
    Container
        Holds the identifiers above plus ``fpr``, ``tpr``, ``thresh``,
        ``roc_auc``, ``prec_rec_curve``, ``real_cls``, ``predictions``,
        ``probabilities`` and ``confusion_matrix``.
    """
    # Bind scikit-learn's metrics module explicitly. At notebook level the
    # name ``metrics`` is imported twice (from sklearn and from
    # libs.experiment), so the global name depends on import order.
    # NOTE(review): assumes libs.experiment.metrics is sklearn.metrics -- confirm.
    from sklearn import metrics as sk_metrics

    X_test = test_sample[X_columns].values
    y_test = test_sample[y_column].values

    predictions = clf.predict(X_test)
    probabilities = clf.predict_proba(X_test)

    # Score of the positive class, computed once and shared by both curves.
    # NOTE(review): assumes column 0 of ``predict_proba`` is the negative
    # class and label 1 is the positive class -- confirm.
    positive_score = 1. - probabilities[:, 0]

    fpr, tpr, thresholds = sk_metrics.roc_curve(
        y_test, positive_score, pos_label=1)

    prec_rec_curve = sk_metrics.precision_recall_curve(
        y_test, positive_score, pos_label=1)

    roc_auc = sk_metrics.auc(fpr, tpr)

    result = Container({
        "idx": idx,
        "clf_name": clf_name,
        "train_name": train_name,
        "test_name": test_name,
        'fpr': fpr,
        'tpr': tpr,
        'thresh': thresholds,
        'roc_auc': roc_auc,
        'prec_rec_curve': prec_rec_curve,
        'real_cls': y_test,
        'predictions': predictions,
        'probabilities': probabilities,
        'confusion_matrix': sk_metrics.confusion_matrix(y_test, predictions)})
    return result

def train_and_run(clf_name, data, X_columns):
    """Train ``clf_name`` on each tile and evaluate it on every other tile.

    Parameters
    ----------
    clf_name : str
        Classifier name passed on to ``get_combs``.
    data : mapping
        Maps tile name to the data frame of that tile.
    X_columns : list of str
        Names of the feature columns.

    Returns
    -------
    list
        One ``execute_clf`` result per train/test combination, as returned
        by ``joblib.Parallel``.
    """
    combinations = get_combs(clf_name, data, X_columns)
    n_combinations = len(combinations)
    print("Combinaciones: {}".format(n_combinations))

    with joblib.Parallel(n_jobs=-1) as pool:
        # Each combination is a mapping of ``execute_clf`` keyword arguments.
        tasks = (
            joblib.delayed(execute_clf)(**combination)
            for combination in tqdm.tqdm(combinations)
        )
        results = pool(tasks)
    return results

In [7]:
%%time
# Random forest: train on each tile, evaluate on every other tile.
rf_test = train_and_run("RF", data, X_columns)

100%|██████████| 4/4 [00:00<00:00, 7817.90it/s]
100%|██████████| 12/12 [00:00<00:00, 13255.64it/s]

Combinaciones: 12





CPU times: user 3.79 s, sys: 227 ms, total: 4.02 s
Wall time: 35 s


In [8]:
%%time
# K-nearest neighbours: train on each tile, evaluate on every other tile.
knn_test = train_and_run("KNN", data, X_columns)

100%|██████████| 4/4 [00:00<00:00, 16727.03it/s]
100%|██████████| 12/12 [00:00<00:00, 14012.15it/s]

Combinaciones: 12





CPU times: user 201 ms, sys: 54 ms, total: 255 ms
Wall time: 17.1 s


In [9]:
# Force a garbage-collection pass; the cell displays gc.collect()'s return value.
import gc; gc.collect()

478

In [10]:
%%time
# Linear-kernel SVM: train on each tile, evaluate on every other tile.
svml_test = train_and_run("SVM-Linear", data, X_columns)

100%|██████████| 4/4 [00:00<00:00, 10754.63it/s]
100%|██████████| 12/12 [00:00<00:00, 11423.43it/s]

Combinaciones: 12





CPU times: user 195 ms, sys: 23.3 ms, total: 219 ms
Wall time: 22.8 s


In [11]:
%%time
# RBF-kernel SVM: train on each tile, evaluate on every other tile.
svmr_test = train_and_run("SVM-RBF", data, X_columns)

100%|██████████| 4/4 [00:00<00:00, 22982.49it/s]
100%|██████████| 12/12 [00:00<00:00, 14169.95it/s]

Combinaciones: 12





CPU times: user 194 ms, sys: 15.6 ms, total: 209 ms
Wall time: 6.64 s


In [12]:
# Persist the results of all four classifier runs in one compressed file.
joblib.dump(
    value={
        "rf_test": rf_test,
        "knn_test": knn_test,
        "svml_test": svml_test,
        "svmr_test": svmr_test,
    },
    filename="_cache/model_select.pkl.bz2",
    compress=3,
)

['_cache/model_select.pkl.bz2']

In [13]:
# Display the current local time as a record of when this cell was executed.
import datetime
datetime.datetime.now()

datetime.datetime(2020, 1, 18, 20, 37, 21, 368036)