In [15]:
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score, precision_score, make_scorer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import scipy
import numpy as np

cancer_data = load_breast_cancer()
X=cancer_data.data
y=cancer_data.target

## Решение 1 задачи
Максимизация $ recall $ при заданной вероятности, что текст помечен важным

In [16]:
alpha = 0.5

In [17]:
def custom_score_1 (y, probas, alpha):
    probas_sorted=np.sort(probas[:,1])
    i_treshold = round((1-alpha)*len(y))
    threshold = probas_sorted[i_treshold]
    y_preds = (probas[:,1] >= threshold).astype(int)
    recall = recall_score(y, y_preds)
    return recall

my_score_1 = make_scorer(custom_score_1, needs_proba = True, alpha=alpha)

In [26]:
param_grid = {
    #'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty' : ['l1', 'l2'],
    'fit_intercept': ['True','False'],
    'max_iter': [100, 200, 500, 1000],
    #'tol': [1e-2, 1e-4, 1e-6]
}

In [27]:
%%time

clf_grid_my_score_1 = GridSearchCV(LogisticRegression(), param_grid, scoring = my_score_1)
clf_grid_my_score_1.fit(X,y)

print(clf_grid_my_score_1.best_params_)
print(clf_grid_my_score_1.best_score_)

{'C': 1, 'fit_intercept': 'True', 'max_iter': 100, 'penalty': 'l1'}
0.795528053049
Wall time: 35.4 s


In [28]:
%%time

param_distributions = {
    'C': scipy.stats.uniform(0, 10),
    'penalty' : ['l1', 'l2'],
    'fit_intercept': ['True','False'],
    'max_iter': scipy.stats.randint(50, 200),
    #'tol': [1e-2, 1e-4, 1e-6]
}

clf_rand_my_score_1 = RandomizedSearchCV(LogisticRegression(), param_distributions, n_iter=100, scoring = my_score_1)
clf_rand_my_score_1.fit(X,y)

print(clf_rand_my_score_1.best_params_)
print(clf_rand_my_score_1.best_score_)

{'C': 8.3935330119323002, 'fit_intercept': 'True', 'max_iter': 125, 'penalty': 'l2'}
0.795528053049
Wall time: 52.7 s


In [29]:
%%time

clf_grid_accuracy_1 = GridSearchCV(LogisticRegression(), param_grid, scoring = 'accuracy')
clf_grid_accuracy_1.fit(X,y)

print(clf_grid_accuracy_1.best_params_)
print(clf_grid_accuracy_1.best_score_)

{'C': 100, 'fit_intercept': 'True', 'max_iter': 100, 'penalty': 'l1'}
0.96309314587
Wall time: 36 s


In [30]:
%%time

param_distributions = {
    'C': scipy.stats.uniform(50, 500),
    'penalty' : ['l1', 'l2'],
    'fit_intercept': ['True','False'],
    'max_iter': scipy.stats.randint(50, 200),
    #'tol': [1e-2, 1e-4, 1e-6]
}

clf_rand_accuracy_1 = RandomizedSearchCV(LogisticRegression(), param_distributions, n_iter=100, scoring = 'accuracy')
clf_rand_accuracy_1.fit(X,y)

print(clf_rand_accuracy_1.best_params_)
print(clf_rand_accuracy_1.best_score_)

{'C': 93.858194683080711, 'fit_intercept': 'True', 'max_iter': 66, 'penalty': 'l1'}
0.96309314587
Wall time: 1min 35s


## Решение 2 задачи
Максимизация $ precision $ при заданном $ recall $

In [31]:
given_recall = 0.99

In [32]:
def custom_score_2 (y, probas, given_recall):
    probas_sorted=np.sort(probas[:,1])
    n_treshold = round( sum(y)*(1-given_recall) ).astype(int)
    threshold = probas_sorted[n_treshold]
    y_preds = (probas[:,1] >= threshold).astype(int)
    precision = precision_score(y, y_preds)
    return precision

my_score_2 = make_scorer(custom_score_2, needs_proba = True, given_recall=given_recall)

In [33]:
%%time

clf_grid_my_score_2 = GridSearchCV(LogisticRegression(), param_grid, scoring = my_score_2)
clf_grid_my_score_2.fit(X,y)

print(clf_grid_my_score_2.best_params_)
print(clf_grid_my_score_2.best_score_)

{'C': 0.001, 'fit_intercept': 'True', 'max_iter': 100, 'penalty': 'l1'}
0.630742070268
Wall time: 36.9 s


In [34]:
%%time

param_distributions = {
    'C': scipy.stats.uniform(0, 0.0005),
    'penalty' : ['l1', 'l2'],
    'fit_intercept': ['True','False'],
    'max_iter': scipy.stats.randint(50, 150),
    #'tol': [1e-2, 1e-4, 1e-6]
}

clf_rand_my_score_2 = RandomizedSearchCV(LogisticRegression(), param_distributions, n_iter=100, scoring = my_score_2)
clf_rand_my_score_2.fit(X,y)

print(clf_rand_my_score_2.best_params_)
print(clf_rand_my_score_2.best_score_)

{'C': 0.00019257069845936232, 'fit_intercept': 'False', 'max_iter': 104, 'penalty': 'l2'}
0.630742070268
Wall time: 1.57 s
