In [1]:
import numpy as np

import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# sns.set()

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import StandardScaler

import joblib

from libs.container import Container
import dataset

In [2]:
# Number of CPUs as reported by joblib. Not referenced by any other visible cell.
cpu = joblib.cpu_count()

In [5]:
# Load the dataset through the project-local `dataset` module.
# The returned object exposes per-tile attributes (`b278` and `b261` are used below).
data = dataset.load_scaled()

Reading '/home/juan/proyectos/denoise/src/dataset/full_scaled.pkl.bz2'


In [6]:
# Stack the two tiles into a single frame.
df = pd.concat([data.b278, data.b261])

# Give every distinct tile name an integer code and store it as the target column.
cls = {tile_name: code for code, tile_name in enumerate(pd.unique(df["tile"]))}
df["cls"] = df["tile"].apply(cls.get)

print(cls)

{'b278': 0, 'b261': 1}


In [12]:
def grid_search(data, estimator, score, param_grid):
    """Tune ``estimator`` with a cross-validated grid search and print a report.

    The feature matrix is ``data[dataset.FEATURES].values`` and the target is
    ``data.cls.values``. A fixed 80/20 split (``random_state=42``) is made; the
    search (5 folds, ``n_jobs=-2``, scoring ``"<score>_macro"``) is fitted on
    the 80% part and a classification report is printed for the 20% part.

    Parameters
    ----------
    data
        Table providing the ``dataset.FEATURES`` columns and a ``cls`` column.
    estimator
        Estimator instance handed to ``GridSearchCV``.
    score : str
        Metric name; ``"_macro"`` is appended to build the scoring string.
    param_grid
        Parameter grid handed to ``GridSearchCV``.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    print(f"Running {type(estimator)}")

    search = GridSearchCV(
        estimator,
        param_grid,
        scoring='%s_macro' % score,
        cv=5,
        n_jobs=-2,
    )

    features = data[dataset.FEATURES].values
    labels = data.cls.values
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42)

    search.fit(X_train, y_train)

    # --- winning parameter combination ---------------------------------
    print("Best parameters set found on development set:")
    print()
    print(search.best_params_)

    # --- one line per candidate: mean score and two standard deviations --
    print()
    print("Grid scores on development set:")
    print()
    results = search.cv_results_
    rows = zip(
        results['mean_test_score'],
        results['std_test_score'],
        results['params'],
    )
    for mean_score, std_score, candidate in rows:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, std_score * 2, candidate))
    print()

    # --- report on the held-out 20% -------------------------------------
    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    predictions = search.predict(X_test)
    print(classification_report(y_test, predictions))
    print()

    return search

In [None]:
%%time
# RBF-kernel SVC: search over C and gamma, with probability estimates enabled.
# The gamma grid is a 1-3 log progression: 1e-4, 3e-4, 1e-3, 3e-3.
svc_rbf = grid_search(
    data=df,
    estimator=SVC(),
    score="precision",
    param_grid=[{
        'kernel': ['rbf'],
        'C': [1, 10, 30, 50, 100],
        "gamma": np.array([1.e-4, 3.e-4, 1.e-3, 3.e-3]),
        "probability": [True]}])

In [None]:
%%time
# Random forest: search over max_features and min_samples_split with a fixed
# forest size and split criterion.
# 'auto' is not listed in max_features: for classifiers it was an alias of
# 'sqrt' (already in the grid) and recent scikit-learn releases reject it.
rf = grid_search(
    data=df,
    estimator=RandomForestClassifier(),
    score="precision",
    param_grid=[{
        'max_features': ['sqrt', "log2", None, 0.2, 0.5],
        "min_samples_split": [2, 5, 10],
        "n_estimators": [500],
        "criterion": ["entropy"],
        "n_jobs": [10]}])

In [18]:
%%time
# k-nearest-neighbours: search over weighting scheme, Minkowski power p and k.
# `score` is a required argument of grid_search; "precision" matches the other
# model searches in this notebook.
# NOTE(review): `k_range` is not defined in any visible cell. Define it before
# running this cell, otherwise it raises NameError.
knn = grid_search(
    data=df,
    estimator=KNeighborsClassifier(),
    score="precision",
    param_grid=[{
        "weights": ['uniform', 'distance'],
        "algorithm": ['auto'],
        "p": [4, 5, 6],
        "n_neighbors": k_range}])

True

In [19]:
%%time
# Linear-kernel SVC: only C is searched; probability estimates are enabled.
svc_linear = grid_search(
    data=df,
    estimator=SVC(),
    score="precision",
    param_grid=[
        {
            "kernel": ["linear"],
            "C": [20, 30, 40, 50, 60, 100],
            "probability": [True],
        },
    ],
)

True

In [19]:
# Persist the four fitted searches in one compressed pickle, keyed by model name.
joblib.dump(
    {
        "svc_linear": svc_linear,
        "rf": rf,
        "svc_rbf": svc_rbf,
        "knn": knn,
    },
    "results/hp_selection.pkl.bz2",
    compress=3,
)

In [22]:
# Display the wall-clock time at which this cell was executed.
import datetime
datetime.datetime.now()

datetime.datetime(2020, 3, 18, 14, 13, 43, 689599)