# Imports

In [103]:
from collections import Counter
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.model_selection import cross_val_score, GridSearchCV
import pandas as pd

  return f(*args, **kwds)


# Load data

In [76]:
ds = load_breast_cancer()
X, y = ds.data, ds.target

# Metadata

In [77]:
n_samples = len(y)
print(f'Number of samples: {n_samples}')

Number of samples: 569


In [78]:
n_features = X.shape[1]
print(f'Number of features: {n_features}')

Number of features: 30


In [79]:
count = Counter(y)
print(f'IR: {count[1] / count[0]}')

IR: 1.6839622641509433


In [80]:
print(f'R: {n_samples / n_features}')

R: 18.966666666666665


# Modelling

### No oversampling

In [81]:
lr = LogisticRegression(solver='lbfgs', max_iter=5000)

In [82]:
score = cross_val_score(lr, X, y, scoring='f1', cv=5)

In [83]:
score.mean()

0.9640490336020795

### Oversampling

In [84]:
smote = SMOTE()
lr = LogisticRegression(solver='lbfgs', max_iter=5000)

In [85]:
lr_smote = make_pipeline(smote, lr)

In [86]:
score_over = cross_val_score(lr_smote, X, y, scoring='f1', cv=5)

In [87]:
score_over.mean()

0.959696031260024

### Grid search

In [99]:
smote = SMOTE()
lr = LogisticRegression(solver='lbfgs', max_iter=5000)

In [100]:
lr_smote = make_pipeline(smote, lr)

In [101]:
param_grid = {'smote__k_neighbors': [3, 4, 5], 'logisticregression__C': [5e4, 5e3, 5e2, 1]}

In [102]:
gscv = GridSearchCV(lr_smote, param_grid, cv=5, n_jobs=-1, scoring='f1')
gscv.fit(X, y)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('smote', SMOTE(k_neighbors=5, kind='deprecated', m_neighbors='deprecated', n_jobs=1,
   out_step='deprecated', random_state=None, ratio=None,
   sampling_strategy='auto', svm_estimator='deprecated')), ('logisticregression', LogisticRegression(C=1.0, class...nalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'smote__k_neighbors': [3, 4, 5], 'logisticregression__C': [50000.0, 5000.0, 500.0, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1', verbose=0)

In [104]:
pd.DataFrame(gscv.cv_results_)



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_logisticregression__C,param_smote__k_neighbors,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,2.948507,0.017999,0.00123,6.5e-05,50000,3,"{'logisticregression__C': 50000.0, 'smote__k_n...",0.958333,0.965517,0.979021,...,0.971932,0.009667,1,0.993007,0.987698,0.986063,0.991243,0.985965,0.988795,0.002843
1,2.942253,0.030465,0.001157,0.000103,50000,4,"{'logisticregression__C': 50000.0, 'smote__k_n...",0.957746,0.972222,0.972222,...,0.968981,0.005666,6,0.993007,0.984127,0.987826,0.989474,0.986014,0.98809,0.003039
2,2.826185,0.245662,0.001134,0.000107,50000,5,"{'logisticregression__C': 50000.0, 'smote__k_n...",0.958333,0.97931,0.958904,...,0.970725,0.010881,3,0.993007,0.987698,0.984293,0.991274,0.982456,0.987746,0.004001
3,2.911513,0.039451,0.001052,2.7e-05,5000,3,"{'logisticregression__C': 5000.0, 'smote__k_ne...",0.965035,0.972222,0.958904,...,0.969197,0.006653,5,0.991243,0.989437,0.986063,0.991243,0.987741,0.989145,0.002018
4,2.874463,0.016269,0.001055,8.1e-05,5000,4,"{'logisticregression__C': 5000.0, 'smote__k_ne...",0.958333,0.972222,0.958904,...,0.969292,0.010095,4,0.991243,0.992982,0.984348,0.991213,0.986014,0.98916,0.003353
5,2.879495,0.004609,0.001153,5.9e-05,5000,5,"{'logisticregression__C': 5000.0, 'smote__k_ne...",0.957746,0.965517,0.958904,...,0.967818,0.009126,7,0.987698,0.987698,0.984293,0.991274,0.986014,0.987395,0.002313
6,2.839231,0.080975,0.001086,6.5e-05,500,3,"{'logisticregression__C': 500.0, 'smote__k_nei...",0.958333,0.958333,0.965517,...,0.967798,0.009129,8,0.985915,0.985915,0.989547,0.989474,0.987741,0.987719,0.001608
7,2.801568,0.16892,0.001124,6e-05,500,4,"{'logisticregression__C': 500.0, 'smote__k_nei...",0.951049,0.965517,0.972222,...,0.967721,0.009378,9,0.987698,0.989437,0.978947,0.991274,0.987741,0.987019,0.004245
8,2.898466,0.035155,0.001075,3.5e-05,500,5,"{'logisticregression__C': 500.0, 'smote__k_nei...",0.965035,0.965517,0.979021,...,0.971898,0.006093,2,0.987698,0.985915,0.982456,0.98951,0.984293,0.985975,0.002477
9,1.489449,0.220143,0.001102,2.8e-05,1,3,"{'logisticregression__C': 1, 'smote__k_neighbo...",0.95302,0.958904,0.985714,...,0.959539,0.014022,11,0.970018,0.96831,0.966841,0.977233,0.970123,0.970505,0.003575
