In [1]:
import streamlit as st
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
import pickle
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
# Tuning
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV


In [4]:
data_one_hot = pd.read_csv('../data/data_one_hot.csv')

In [5]:
data_one_hot.tail()

Unnamed: 0.1,Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,diagnosis_M
564,564,21.56,22.39,142.0,1479.0,0.111,0.1159,0.2439,0.1389,0.1726,...,26.4,166.1,2027.0,0.141,0.2113,0.4107,0.2216,0.206,0.07115,1.0
565,565,20.13,28.25,131.2,1261.0,0.0978,0.1034,0.144,0.09791,0.1752,...,38.25,155.0,1731.0,0.1166,0.1922,0.3215,0.1628,0.2572,0.06637,1.0
566,566,16.6,28.08,108.3,858.1,0.08455,0.1023,0.09251,0.05302,0.159,...,34.12,126.7,1124.0,0.1139,0.3094,0.3403,0.1418,0.2218,0.0782,1.0
567,567,20.6,29.33,140.1,1265.0,0.1178,0.277,0.3514,0.152,0.2397,...,39.42,184.6,1821.0,0.165,0.8681,0.9387,0.265,0.4087,0.124,1.0
568,568,7.76,24.54,47.92,181.0,0.05263,0.04362,0.0,0.0,0.1587,...,30.37,59.16,268.6,0.08996,0.06444,0.0,0.0,0.2871,0.07039,0.0


In [6]:
y = data_one_hot["diagnosis_M"]
X = data_one_hot[["concave points_worst","radius_worst","texture_worst","perimeter_se"]] 

In [7]:
data = data_one_hot[["concave points_worst","radius_worst","texture_worst","perimeter_se", "diagnosis_M"]]

In [8]:
data.tail(15)

Unnamed: 0,concave points_worst,radius_worst,texture_worst,perimeter_se,diagnosis_M
554,0.06493,13.89,35.74,1.502,0.0
555,0.09127,10.84,34.91,1.437,0.0
556,0.02232,10.65,22.88,1.648,0.0
557,0.0,10.49,34.24,3.618,0.0
558,0.1105,15.48,27.27,2.224,0.0
559,0.09653,12.48,37.16,1.936,0.0
560,0.1048,15.3,33.17,2.888,0.0
561,0.0,11.92,38.3,2.041,0.0
562,0.2356,17.52,42.79,2.362,1.0
563,0.2542,24.29,29.41,8.758,1.0


In [64]:
#make_pipeline(RobustScaler(), LogisticRegression())
clf_pipeline = Pipeline(steps=[('robustscaler', RobustScaler()),
                ('logicticRegression', LogisticRegression())])


In [65]:
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=.3, random_state=0, stratify=y) 

In [66]:
clf_pipeline_train = clf_pipeline.fit(X_train, y_train)

In [67]:
# Cross val pipeline
cross_val_score(clf_pipeline, X_train, y_train, cv=41, scoring='recall').mean()

0.9410569105691058

In [68]:
clf_pipeline_train.get_params()

{'memory': None,
 'steps': [('robustscaler', RobustScaler()),
  ('logicticRegression', LogisticRegression())],
 'verbose': False,
 'robustscaler': RobustScaler(),
 'logicticRegression': LogisticRegression(),
 'robustscaler__copy': True,
 'robustscaler__quantile_range': (25.0, 75.0),
 'robustscaler__unit_variance': False,
 'robustscaler__with_centering': True,
 'robustscaler__with_scaling': True,
 'logicticRegression__C': 1.0,
 'logicticRegression__class_weight': None,
 'logicticRegression__dual': False,
 'logicticRegression__fit_intercept': True,
 'logicticRegression__intercept_scaling': 1,
 'logicticRegression__l1_ratio': None,
 'logicticRegression__max_iter': 100,
 'logicticRegression__multi_class': 'auto',
 'logicticRegression__n_jobs': None,
 'logicticRegression__penalty': 'l2',
 'logicticRegression__random_state': None,
 'logicticRegression__solver': 'lbfgs',
 'logicticRegression__tol': 0.0001,
 'logicticRegression__verbose': 0,
 'logicticRegression__warm_start': False}

In [None]:
# Export as pickle file
with open("pipeline_01.pkl", "wb") as file:
    pickle.dump(clf_pipeline_train, file)

# Load pipeline from pickle file
clf_pipeline_train = pickle.load(open("pipeline_01.pkl","rb"))

clf_pipeline_train.score(X_test, y_test)

In [69]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

#make_pipeline(RobustScaler(), LogisticRegression())
clf_pipeline = Pipeline(steps=[('robustscaler', RobustScaler()),
                ('lr', LogisticRegression())])

param_grid = {
    'lr__solver': ['newton-cg', 'lbfgs'],
    'lr__penalty': ['none', 'elasticnet', 'l1', 'l2'],
    'lr__max_iter': [30, 100, 300, 500],
    'lr__C': [0.001, 0.01, 0.1, 1, 10, 100]
}

grid_search = GridSearchCV(clf_pipeline, param_grid, cv=5, scoring='recall')
grid_search.fit(X_train, y_train)

grid_search.best_params_











480 fits failed out of a total of 960.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
120 fits failed with the following error:
Traceback (most recent call last):
  File "/home/laura/.pyenv/versions/3.8.12/envs/simplon/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/laura/.pyenv/versions/3.8.12/envs/simplon/lib/python3.8/site-packages/sklearn/pipeline.py", line 382, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/laura/.pyenv/versions/3.8.12/envs/simplon/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1091, in fit
    solver = _check_solver(self.solver, self.penalty, self

{'lr__C': 10,
 'lr__max_iter': 30,
 'lr__penalty': 'l2',
 'lr__solver': 'newton-cg'}

In [70]:
# Export as pickle file
with open("pipeline_vf.pkl", "wb") as file:
    pickle.dump(grid_search, file)

# Load pipeline from pickle file
grid_search = pickle.load(open("pipeline_vf.pkl","rb"))

grid_search.score(X_test, y_test)

0.9375

In [71]:
X_test

Unnamed: 0,concave points_worst,radius_worst,texture_worst,perimeter_se
510,0.10560,12.45,17.60,1.345
520,0.07262,10.57,17.84,2.388
311,0.05813,16.46,21.75,1.954
25,0.25500,22.25,21.40,7.276
530,0.10100,13.50,27.98,3.149
...,...,...,...,...
32,0.18470,20.88,32.09,3.999
377,0.05781,14.69,35.63,1.400
535,0.21480,24.30,25.48,4.706
236,0.25930,31.01,34.51,7.247


In [72]:
y_test

510    0.0
520    0.0
311    0.0
25     1.0
530    0.0
      ... 
32     1.0
377    0.0
535    1.0
236    1.0
354    0.0
Name: diagnosis_M, Length: 171, dtype: float64

In [73]:
grid_search.predict(X_test)

array([0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0.,
       1., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0.,
       0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 1.,
       0., 1., 0., 1., 1., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 1., 0., 1., 0., 1., 1., 1., 0., 0., 1., 1., 1., 1., 0., 1.,
       0., 0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 0., 1., 0., 0., 0., 1.,
       1., 1., 1., 0., 1., 0., 1., 0., 1., 0., 0., 1., 1., 0., 1., 1., 0.,
       0., 0., 1., 1., 1., 1., 0., 0., 1., 0., 0., 0., 1., 1., 0., 1., 1.,
       0.])

In [75]:
X_test.describe()

Unnamed: 0,concave points_worst,radius_worst,texture_worst,perimeter_se
count,171.0,171.0,171.0,171.0
mean,0.117702,16.352883,25.454678,2.814436
std,0.062371,4.968872,6.034097,1.585898
min,0.0,8.678,12.87,0.8439
25%,0.06806,12.835,21.09,1.593
50%,0.1105,15.05,25.4,2.344
75%,0.16195,19.005,28.725,3.497
max,0.2867,32.49,47.16,8.83


In [91]:
X_test.shape

(171, 4)

In [95]:
np.array([[0.12,12.47,17.63,1.32]]).shape

(1, 4)

In [96]:
df_test = pd.DataFrame(data=np.array([[0.12,12.47,17.63,1.32]]),columns=["concave points_worst","radius_worst","texture_worst","perimeter_se"])
grid_search.predict(df_test)

array([0.])