In [1]:
import numpy as np

import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# sns.set()

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import StandardScaler

import joblib

from libs.container import Container
import dataset

In [2]:
data = dataset.load_scaled()

df = pd.concat([data.b278, data.b261])

cls = {name: idx for idx, name in enumerate(df.tile.unique())}
df["cls"] = df.tile.apply(cls.get)

print(cls)

del data

Reading '/mnt/is0/jbcabral/denoise/dataset/full_scaled.pkl.bz2'
{'b278': 0, 'b261': 1}


In [3]:
def grid_search(data, estimator, score, param_grid):
    print(f"Running {type(estimator)}")
    
    clf = GridSearchCV(
        estimator, 
        param_grid, 
        cv=5, n_jobs=-2,
        scoring='%s_macro' % score)

    X, y = data[dataset.FEATURES].values, data.cls.values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)

    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

    return clf

In [4]:
%%time
svc_rbf = grid_search(
    data=df, 
    estimator=SVC(),
    score="precision",
    param_grid=[{
        'kernel': ['rbf'], 
        'C': [1, 10, 30, 50, 100], 
        "gamma": np.array([1.e-4, 3.e-4, 1.e+3, 3.e-3]), 
        "probability": [True]}])

Running <class 'sklearn.svm._classes.SVC'>
Best parameters set found on development set:

{'C': 50, 'gamma': 0.003, 'kernel': 'rbf', 'probability': True}

Grid scores on development set:

0.667 (+/-0.043) for {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf', 'probability': True}
0.700 (+/-0.031) for {'C': 1, 'gamma': 0.0003, 'kernel': 'rbf', 'probability': True}
0.253 (+/-0.001) for {'C': 1, 'gamma': 1000.0, 'kernel': 'rbf', 'probability': True}
0.752 (+/-0.012) for {'C': 1, 'gamma': 0.003, 'kernel': 'rbf', 'probability': True}
0.724 (+/-0.022) for {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf', 'probability': True}
0.741 (+/-0.016) for {'C': 10, 'gamma': 0.0003, 'kernel': 'rbf', 'probability': True}
0.253 (+/-0.001) for {'C': 10, 'gamma': 1000.0, 'kernel': 'rbf', 'probability': True}
0.775 (+/-0.021) for {'C': 10, 'gamma': 0.003, 'kernel': 'rbf', 'probability': True}
0.733 (+/-0.015) for {'C': 30, 'gamma': 0.0001, 'kernel': 'rbf', 'probability': True}
0.745 (+/-0.013) for {'C': 30, 'gamma': 0.00

In [5]:
%%time
rf = grid_search(
    data=df, 
    estimator=RandomForestClassifier(),
    score="precision",
    param_grid=[{
     'max_features': ['auto', 'sqrt', "log2", None, 0.2, 0.5], 
     "min_samples_split": [2, 5, 10],
     "n_estimators": [500], 
     "criterion": ["entropy"], 
     "n_jobs": [10]}])

Running <class 'sklearn.ensemble._forest.RandomForestClassifier'>
Best parameters set found on development set:

{'criterion': 'entropy', 'max_features': 'sqrt', 'min_samples_split': 5, 'n_estimators': 500, 'n_jobs': 10}

Grid scores on development set:

0.776 (+/-0.029) for {'criterion': 'entropy', 'max_features': 'auto', 'min_samples_split': 2, 'n_estimators': 500, 'n_jobs': 10}
0.773 (+/-0.030) for {'criterion': 'entropy', 'max_features': 'auto', 'min_samples_split': 5, 'n_estimators': 500, 'n_jobs': 10}
0.780 (+/-0.025) for {'criterion': 'entropy', 'max_features': 'auto', 'min_samples_split': 10, 'n_estimators': 500, 'n_jobs': 10}
0.776 (+/-0.030) for {'criterion': 'entropy', 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 500, 'n_jobs': 10}
0.783 (+/-0.026) for {'criterion': 'entropy', 'max_features': 'sqrt', 'min_samples_split': 5, 'n_estimators': 500, 'n_jobs': 10}
0.775 (+/-0.033) for {'criterion': 'entropy', 'max_features': 'sqrt', 'min_samples_split': 10, 'n_e

In [12]:
%%time
knn = grid_search(
    data=df, 
    estimator=KNeighborsClassifier(),
    score="precision",
    param_grid=[{
        "weights": ['uniform', 'distance'], 
        "algorithm": ['auto'],
        "p": [1, 2, 3],
        "n_neighbors": [5,  10, 40, 45, 50, 55, 56, 57, 58, 59, 60, 100]}])

Running <class 'sklearn.neighbors._classification.KNeighborsClassifier'>
Best parameters set found on development set:

{'algorithm': 'auto', 'n_neighbors': 55, 'p': 1, 'weights': 'distance'}

Grid scores on development set:

0.669 (+/-0.024) for {'algorithm': 'auto', 'n_neighbors': 5, 'p': 1, 'weights': 'uniform'}
0.669 (+/-0.024) for {'algorithm': 'auto', 'n_neighbors': 5, 'p': 1, 'weights': 'distance'}
0.612 (+/-0.024) for {'algorithm': 'auto', 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}
0.612 (+/-0.023) for {'algorithm': 'auto', 'n_neighbors': 5, 'p': 2, 'weights': 'distance'}
0.608 (+/-0.029) for {'algorithm': 'auto', 'n_neighbors': 5, 'p': 3, 'weights': 'uniform'}
0.609 (+/-0.028) for {'algorithm': 'auto', 'n_neighbors': 5, 'p': 3, 'weights': 'distance'}
0.672 (+/-0.026) for {'algorithm': 'auto', 'n_neighbors': 10, 'p': 1, 'weights': 'uniform'}
0.676 (+/-0.020) for {'algorithm': 'auto', 'n_neighbors': 10, 'p': 1, 'weights': 'distance'}
0.638 (+/-0.034) for {'algorithm': 'auto

In [20]:
%%time
svc_linear = grid_search(
    data=df, 
    estimator=SVC(),
    score="precision",
    param_grid=[{
        'kernel': ['linear'], 
        'C': [1, 10, 100, 104, 105, 106], 
        "probability": [True]}])

Running <class 'sklearn.svm._classes.SVC'>
Best parameters set found on development set:

{'C': 100, 'kernel': 'linear', 'probability': True}

Grid scores on development set:

0.808 (+/-0.044) for {'C': 1, 'kernel': 'linear', 'probability': True}
0.846 (+/-0.020) for {'C': 10, 'kernel': 'linear', 'probability': True}
0.854 (+/-0.025) for {'C': 100, 'kernel': 'linear', 'probability': True}
0.852 (+/-0.022) for {'C': 104, 'kernel': 'linear', 'probability': True}
0.852 (+/-0.023) for {'C': 105, 'kernel': 'linear', 'probability': True}
0.852 (+/-0.023) for {'C': 106, 'kernel': 'linear', 'probability': True}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

              precision    recall  f1-score   support

           0       0.91      0.83      0.86       422
           1       0.82      0.90      0.86       378

    accuracy                           0.86       800
   macro avg       0.87      0.87 

In [23]:
joblib.dump(
    value = {
        "svc_linear": svc_linear, "rf": rf,
        "svc_rbf": svc_rbf, "knn": knn},
    filename = "results/hp_selection.pkl.bz2",
    compress=3)

['results/hp_selection.pkl.bz2']

In [22]:
import datetime
datetime.datetime.now()

datetime.datetime(2020, 3, 18, 20, 50, 44, 707801)