# **Section:** Model Selection

In [1]:
import os
import itertools as it
import warnings
import time
import pickle

import numpy as np

import pandas as pd

%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import cm
import seaborn as sns

import joblib

import pathlib

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.exceptions import DataConversionWarning

from sklearn import metrics

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

import tqdm

from libs.container import Container
from libs.nearest import nearest
from libs.experiment import WithAnotherExperiment, roc, metrics
from libs.precstar import  prec_star

warnings.simplefilter("ignore", category=DataConversionWarning)



In [2]:
PATH = pathlib.Path(os.path.abspath(os.path.dirname("")))

DATA_PATH = PATH / "_data" / "s20k_scaled.pkl.bz2"
DATA_PATH = PATH / "bkp" / "s20k_scaled.pkl.bz2"

COLUMNS_NO_FEATURES = ['id', 'tile', 'cnt', 'ra_k', 'dec_k', 'vs_type', 'vs_catalog', 'cls'] 

In [3]:
sample = pd.read_pickle(DATA_PATH)

# the features
X_columns = [c for c in sample.columns if c not in COLUMNS_NO_FEATURES]
y_column = "cls"

sample[X_columns] =  sample[X_columns].astype(np.float32)

data = Container({k: v for k, v in sample.groupby("tile") if k in ["b234", "b360", "b278", "b261"]})

del sample

In [5]:
len(X_columns)

62

In [4]:
def score_func(y, y_prob, **kwargs):
    prec, rec, thr = metrics.precision_recall_curve(
            y, y_prob, sample_weight=Nonbkp/
    idx = nearest(array=rec, value=.9)
    return prec[idx]


def grid_search(data, estimator, param_grid):
    print(f"Running {type(estimator)}")
    clf = GridSearchCV(
        estimator, 
        param_grid, 
        cv=5, scoring=metrics.make_scorer(score_func, needs_proba=True), n_jobs=-1)

    X, y = data[X_columns].values, data.cls.values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)

    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

    return clf

## SVM-RBF

In [5]:
gamma_range = np.array([1.e-4, 3.e-4, 1.e+3, 3.e-3])

In [6]:
%%time
svc_rbf = grid_search(
    data=data.b278, 
    estimator=SVC(),
    param_grid=[{
        'kernel': ['rbf'], 
        'C': [1, 10, 30, 50, 100], 
        "gamma": gamma_range, 
        "probability": [True]}])

Running <class 'sklearn.svm._classes.SVC'>
Best parameters set found on development set:

{'C': 10, 'gamma': 0.003, 'kernel': 'rbf', 'probability': True}

Grid scores on development set:

0.245 (+/-0.143) for {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf', 'probability': True}
0.251 (+/-0.151) for {'C': 1, 'gamma': 0.0003, 'kernel': 'rbf', 'probability': True}
0.022 (+/-0.000) for {'C': 1, 'gamma': 1000.0, 'kernel': 'rbf', 'probability': True}
0.343 (+/-0.161) for {'C': 1, 'gamma': 0.003, 'kernel': 'rbf', 'probability': True}
0.258 (+/-0.161) for {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf', 'probability': True}
0.298 (+/-0.220) for {'C': 10, 'gamma': 0.0003, 'kernel': 'rbf', 'probability': True}
0.022 (+/-0.000) for {'C': 10, 'gamma': 1000.0, 'kernel': 'rbf', 'probability': True}
0.388 (+/-0.144) for {'C': 10, 'gamma': 0.003, 'kernel': 'rbf', 'probability': True}
0.283 (+/-0.218) for {'C': 30, 'gamma': 0.0001, 'kernel': 'rbf', 'probability': True}
0.306 (+/-0.177) for {'C': 30, 'gamma': 0.00

## SVM-Linear 

In [8]:
%%time

Cs = [20, 30, 40,  50, 60, 100]

svc_linear = grid_search(
    data=data.b278, 
    estimator=SVC(probability=True),
    param_grid=[{'kernel': ['linear'], 'C': Cs, "probability": [True]}])


Running <class 'sklearn.svm._classes.SVC'>
Best parameters set found on development set:

{'C': 50, 'kernel': 'linear', 'probability': True}

Grid scores on development set:

0.247 (+/-0.199) for {'C': 20, 'kernel': 'linear', 'probability': True}
0.243 (+/-0.205) for {'C': 30, 'kernel': 'linear', 'probability': True}
0.247 (+/-0.198) for {'C': 40, 'kernel': 'linear', 'probability': True}
0.251 (+/-0.203) for {'C': 50, 'kernel': 'linear', 'probability': True}
0.251 (+/-0.203) for {'C': 60, 'kernel': 'linear', 'probability': True}
0.246 (+/-0.195) for {'C': 100, 'kernel': 'linear', 'probability': True}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4005
           1       0.90      0.77      0.83        83

    accuracy                           0.99      4088
   macro avg       0.95      0.88    

## KNN

In [20]:
k_range = [4, 5, 6]
k_range

[3, 5, 7]

In [22]:
%%time
knn = grid_search(
    data=data.b278, 
    estimator=KNeighborsClassifier(),
    param_grid=[{
        "weights": ['uniform', 'distance'], 
        "algorithm": ['auto'],
        "p": [1, 2, 3],
        "n_neighbors": k_range}])

Running <class 'sklearn.neighbors._classification.KNeighborsClassifier'>
Best parameters set found on development set:

{'algorithm': 'auto', 'n_neighbors': 5, 'p': 1, 'weights': 'uniform'}

Grid scores on development set:

0.493 (+/-0.473) for {'algorithm': 'auto', 'n_neighbors': 3, 'p': 1, 'weights': 'uniform'}
0.493 (+/-0.473) for {'algorithm': 'auto', 'n_neighbors': 3, 'p': 1, 'weights': 'distance'}
0.335 (+/-0.515) for {'algorithm': 'auto', 'n_neighbors': 3, 'p': 2, 'weights': 'uniform'}
0.335 (+/-0.515) for {'algorithm': 'auto', 'n_neighbors': 3, 'p': 2, 'weights': 'distance'}
0.327 (+/-0.499) for {'algorithm': 'auto', 'n_neighbors': 3, 'p': 3, 'weights': 'uniform'}
0.327 (+/-0.499) for {'algorithm': 'auto', 'n_neighbors': 3, 'p': 3, 'weights': 'distance'}
0.510 (+/-0.070) for {'algorithm': 'auto', 'n_neighbors': 5, 'p': 1, 'weights': 'uniform'}
0.510 (+/-0.070) for {'algorithm': 'auto', 'n_neighbors': 5, 'p': 1, 'weights': 'distance'}
0.438 (+/-0.042) for {'algorithm': 'auto', '

## Random Forest

In [23]:
%%time

rf = grid_search(
    data=data.b278, 
    estimator=RandomForestClassifier(),
    param_grid=[{
        'max_features': ['auto', 'sqrt', "log2", None, 0.2, 0.5], 
        "min_samples_split": [2, 5, 10],
        "n_estimators": [500], 
        "criterion": ["entropy"], 
        "n_jobs": [-1]}])

del rf.best_params_["n_jobs"]

Running <class 'sklearn.ensemble._forest.RandomForestClassifier'>
Best parameters set found on development set:

{'criterion': 'entropy', 'max_features': 'log2', 'min_samples_split': 2, 'n_estimators': 500, 'n_jobs': -1}

Grid scores on development set:

0.631 (+/-0.143) for {'criterion': 'entropy', 'max_features': 'auto', 'min_samples_split': 2, 'n_estimators': 500, 'n_jobs': -1}
0.644 (+/-0.201) for {'criterion': 'entropy', 'max_features': 'auto', 'min_samples_split': 5, 'n_estimators': 500, 'n_jobs': -1}
0.608 (+/-0.234) for {'criterion': 'entropy', 'max_features': 'auto', 'min_samples_split': 10, 'n_estimators': 500, 'n_jobs': -1}
0.629 (+/-0.194) for {'criterion': 'entropy', 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 500, 'n_jobs': -1}
0.615 (+/-0.261) for {'criterion': 'entropy', 'max_features': 'sqrt', 'min_samples_split': 5, 'n_estimators': 500, 'n_jobs': -1}
0.600 (+/-0.203) for {'criterion': 'entropy', 'max_features': 'sqrt', 'min_samples_split': 10, 'n_e

In [24]:
rf.best_params_

{'criterion': 'entropy',
 'max_features': 'log2',
 'min_samples_split': 2,
 'n_estimators': 500}

In [25]:
best_params = {
    "rf": rf.best_params_,
    "knn": knn.best_params_,
    "svml": svc_linear.best_params_,
    "svmr": svc_rbf.best_params_
}

joblib.dump(best_params, "_cache/best_params.pkl.bz2", compress=3)

['_cache/best_params.pkl.bz2']

In [26]:
best_params

{'rf': {'criterion': 'entropy',
  'max_features': 'log2',
  'min_samples_split': 2,
  'n_estimators': 500},
 'knn': {'algorithm': 'auto', 'n_neighbors': 5, 'p': 1, 'weights': 'uniform'},
 'svml': {'C': 50, 'kernel': 'linear', 'probability': True},
 'svmr': {'C': 10, 'gamma': 0.003, 'kernel': 'rbf', 'probability': True}}

In [27]:
import datetime
datetime.datetime.now()

datetime.datetime(2020, 1, 23, 9, 31, 24, 990982)