[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mbarbetti/LNHunter/blob/master/binary_classification.ipynb)

# Binary classification

## Data loading and preparation

In [1]:
import os
import pickle
from typing import Callable

import numpy as np
import pandas as pd

# Location of the pickled dataset, relative to the notebook's working directory.
data_dir = "./data"
data_file = "db_mediastinalbulky_v2-reduced.pkl"
file_path = os.path.join(data_dir, data_file)

# NOTE(review): unpickling can execute arbitrary code embedded in the file,
# so only load pickles that come from a trusted source.
with open(file_path, "rb") as file:
    data = pickle.load(file)

# Summarise columns, dtypes and non-null counts of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 119 entries, 0 to 118
Data columns (total 19 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   ID                               119 non-null    float64
 1   lymphoma_type                    119 non-null    float64
 2   age                              111 non-null    float64
 3   SUVmin (SUV)                     111 non-null    float64
 4   SUVmean (SUV)                    111 non-null    float64
 5   MTV (# vx)                       111 non-null    float64
 6   SMTV (mL/Kg) TOT                 111 non-null    float64
 7   CONVENTIONAL_SUVbwmin            115 non-null    float64
 8   CONVENTIONAL_SUVbwKurtosis       115 non-null    float64
 9   DISCRETIZED_HISTO_Kurtosis       110 non-null    float64
 10  SHAPE_Sphericity[onlyFor3DROI])  113 non-null    float64
 11  GLCM_Correlation                 113 non-null    float64
 12  GLRLM_SRE             

### Aggressive data cleaning 

In [2]:
# Discard every row that contains at least one missing value, then re-inspect.
data = data.dropna()
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 101 entries, 0 to 118
Data columns (total 19 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   ID                               101 non-null    float64
 1   lymphoma_type                    101 non-null    float64
 2   age                              101 non-null    float64
 3   SUVmin (SUV)                     101 non-null    float64
 4   SUVmean (SUV)                    101 non-null    float64
 5   MTV (# vx)                       101 non-null    float64
 6   SMTV (mL/Kg) TOT                 101 non-null    float64
 7   CONVENTIONAL_SUVbwmin            101 non-null    float64
 8   CONVENTIONAL_SUVbwKurtosis       101 non-null    float64
 9   DISCRETIZED_HISTO_Kurtosis       101 non-null    float64
 10  SHAPE_Sphericity[onlyFor3DROI])  101 non-null    float64
 11  GLCM_Correlation                 101 non-null    float64
 12  GLRLM_SRE             

### Splitting into train-set and test-set

In [3]:
# Features are every column from the third one onwards; the label column is
# addressed by name.
cols = list(data.columns)
X_cols = cols[2:]
y_cols = "lymphoma_type"

binary_class = data[y_cols] != 2   # exclude GZ class

# Keep only the rows of the two remaining classes and convert to NumPy arrays.
X = data.loc[binary_class, X_cols].to_numpy()
y = data.loc[binary_class, y_cols].to_numpy().flatten()

# Ratio between the number of label-3 and label-1 samples.
ratio = np.count_nonzero(y == 3) / np.count_nonzero(y == 1)
print("Label ratio on data-set : {:.1f}%".format(100 * ratio))

Label ratio on data-set : 41.5%


In [4]:
from sklearn.model_selection import train_test_split

# Plain random 80/20 split (no stratification requested), seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Compare the label-3 / label-1 ratio in each subset.
ratio_train = np.count_nonzero(y_train == 3) / np.count_nonzero(y_train == 1)
print("Label ratio on train-set : {:.1f}%".format(100 * ratio_train))

ratio_test = np.count_nonzero(y_test == 3) / np.count_nonzero(y_test == 1)
print("Label ratio on test-set  : {:.1f}%".format(100 * ratio_test))

Label ratio on train-set : 46.0%
Label ratio on test-set  : 26.7%


In [5]:
from sklearn.model_selection import StratifiedShuffleSplit

# Single stratified 80/20 shuffle split, seeded for repeatability.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# Only one split is generated, so take it directly from the iterator.
idx_train, idx_test = next(sss.split(X, y))
X_train, y_train = X[idx_train], y[idx_train]
X_test, y_test = X[idx_test], y[idx_test]

# Compare the label-3 / label-1 ratio in each subset.
ratio_train = np.count_nonzero(y_train == 3) / np.count_nonzero(y_train == 1)
print("Label ratio on train-set : {:.1f}%".format(100 * ratio_train))

ratio_test = np.count_nonzero(y_test == 3) / np.count_nonzero(y_test == 1)
print("Label ratio on test-set  : {:.1f}%".format(100 * ratio_test))

Label ratio on train-set : 40.4%
Label ratio on test-set  : 46.2%


## Simple binary classification

In [25]:
%matplotlib inline 
import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score

### Display trained model score
def display_score(model_name: str, scores: list):
    """Print a boxed title followed by the scores, their mean and their spread.

    Args:
        model_name: Text shown inside the title box; the box width follows its length.
        scores: Score values. They are printed as given, and their mean and
            standard deviation (computed with NumPy) are printed as percentages.
    """
    # Top and bottom edges of the title box are the same string.
    border = "+--" + "-" * len(model_name) + "--+"

    print(border)
    print("|  {}  |".format(model_name))
    print(border)

    print("| Scores : {}".format(scores))
    print("| Mean   : {:.1f}%".format(100 * np.mean(scores)))
    print("| Std    : {:.1f}%".format(100 * np.std(scores)))
    print("+--- - -")

### Gradient Boosted Decision Trees

In [26]:
from sklearn.ensemble import GradientBoostingClassifier

# Baseline gradient-boosted classifier with hand-picked hyperparameters.
# NOTE(review): recent scikit-learn releases renamed loss "deviance" to
# "log_loss" -- confirm against the installed version before re-running.
clf_gbdt = GradientBoostingClassifier(
    loss="deviance",
    learning_rate=0.3,
    n_estimators=100,
    criterion="friedman_mse",
    max_leaf_nodes=10,
    max_depth=5,
    # Fixed seed (same value the data splits use) so that the
    # cross-validation scores below are reproducible between runs.
    random_state=42,
)

# 4-fold cross-validated accuracy on the training set.
scores = cross_val_score(clf_gbdt, X_train, y_train, scoring="accuracy", cv=4)

display_score(model_name="GBDT Classifier", scores=scores)

+-------------------+
|  GBDT Classifier  |
+-------------------+
| Scores : [0.73684211 0.72222222 0.83333333 0.83333333]
| Mean   : 78.1%
| Std    : 5.2%
+--- - -


#### Automatic hyperparameter tuning

In [27]:
import optuna
optuna.logging.set_verbosity ( optuna.logging.ERROR )   # silence Optuna during trials study

### Return a set of trials studied by Optuna
def optuna_study(
    model_name: str,
    storage_dir: str,
    objective: Callable[..., float],
    n_trials: int = 10,
    direction: str = "minimize",
    load_if_exists: bool = False,
):
    """Create (or reload) an Optuna study backed by a SQLite file and optimise it.

    The study is stored in ``<storage_dir>/<model_name>.db``.

    Args:
        model_name: Study name; also used as the database file name.
        storage_dir: Directory holding the SQLite database. It is created
            when it does not exist yet.
        objective: Callable handed to ``study.optimize``; it is called once
            per trial and returns the value to optimise.
        n_trials: Number of trials to run.
        direction: Optimisation direction forwarded to ``optuna.create_study``
            (``"minimize"`` or ``"maximize"``).
        load_if_exists: When true, an existing database is kept and forwarded
            to ``optuna.create_study`` with ``load_if_exists=True``. When
            false, any existing database file is deleted first so the study
            starts from scratch.

    Returns:
        The study object after ``n_trials`` trials have been run.
    """
    storage_path = "{}/{}.db".format(storage_dir, model_name)
    storage_name = "sqlite:///{}".format(storage_path)

    # SQLite creates the database file on demand but not its parent directory,
    # so make sure the directory is there before the study is created.
    if storage_dir:
        os.makedirs(storage_dir, exist_ok=True)

    # A fresh study was requested: drop the database left by a previous run.
    if not load_if_exists and os.path.isfile(storage_path):
        os.remove(storage_path)

    study = optuna.create_study(
        study_name=model_name,
        storage=storage_name,
        load_if_exists=load_if_exists,
        direction=direction,
    )

    study.optimize(objective, n_trials=n_trials)

    return study


In [28]:
def obj_gbdt(trial):
    """Optuna objective for the gradient-boosted classifier.

    Samples one hyperparameter configuration from ``trial``, builds a
    ``GradientBoostingClassifier`` from it and scores it with 4-fold
    cross-validated accuracy on the notebook-level ``X_train`` / ``y_train``.

    Args:
        trial: Object exposing ``suggest_categorical``, ``suggest_float`` and
            ``suggest_int`` (the trial supplied by ``study.optimize``).

    Returns:
        Mean accuracy over the 4 folds.
    """
    # NOTE(review): recent scikit-learn releases renamed loss "deviance" to
    # "log_loss" and criterion "mse" to "squared_error" -- confirm against the
    # installed version before re-running.
    loss = trial.suggest_categorical("loss", ["deviance", "exponential"])
    lr = trial.suggest_float("lr", 0.05, 0.95, log=True)
    n_estims = trial.suggest_int("n_estims", 100, 500, log=True)
    criterion = trial.suggest_categorical("criterion", ["friedman_mse", "mse"])
    max_leaf = trial.suggest_int("max_leaf", 2, 20)
    max_depth = trial.suggest_int("max_depth", 1, 10)

    model = GradientBoostingClassifier(
        loss=loss,
        learning_rate=lr,
        n_estimators=n_estims,
        criterion=criterion,
        max_leaf_nodes=max_leaf,
        max_depth=max_depth,
        # Fixed seed (same value the data splits use): without it two trials
        # with identical parameters can score differently, which makes the
        # trial ranking noisy and the study non-reproducible.
        random_state=42,
    )

    scores = cross_val_score(model, X_train, y_train, scoring="accuracy", cv=4)

    return np.mean(scores)

In [29]:
# Run a fresh 50-trial study that maximises the value returned by obj_gbdt.
study_gbdt = optuna_study(
    model_name="clf_gbdt",
    storage_dir="./storage",
    objective=obj_gbdt,
    n_trials=50,
    direction="maximize",
    load_if_exists=False,
)

In [30]:
# Tabulate the sampled parameters and objective value of every trial,
# then show the five highest-valued trials.
df = study_gbdt.trials_dataframe(attrs=("params", "value"))
df.sort_values(by="value", ascending=False).head()

Unnamed: 0,params_criterion,params_loss,params_lr,params_max_depth,params_max_leaf,params_n_estims,value
25,friedman_mse,exponential,0.152128,2,15,200,0.834795
38,friedman_mse,exponential,0.199145,3,14,160,0.834795
45,friedman_mse,deviance,0.326869,3,14,186,0.834795
43,friedman_mse,exponential,0.300303,3,12,172,0.834795
10,friedman_mse,exponential,0.195587,1,2,194,0.834795


In [31]:
# Rebuild the classifier from the parameters of the study's best trial.
best_params = study_gbdt.best_params

clf_gbdt = GradientBoostingClassifier(
    loss=best_params["loss"],
    learning_rate=best_params["lr"],
    n_estimators=best_params["n_estims"],
    criterion=best_params["criterion"],
    max_leaf_nodes=best_params["max_leaf"],
    max_depth=best_params["max_depth"],
    # Fixed seed (same value the data splits use) so that the
    # cross-validation scores below are reproducible between runs.
    random_state=42,
)

# 4-fold cross-validated accuracy on the training set.
scores = cross_val_score(clf_gbdt, X_train, y_train, scoring="accuracy", cv=4)

display_score(model_name="GBDT Classifier", scores=scores)

+-------------------+
|  GBDT Classifier  |
+-------------------+
| Scores : [0.89473684 0.66666667 0.88888889 0.88888889]
| Mean   : 83.5%
| Std    : 9.7%
+--- - -


In [32]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

# `plot_confusion_matrix` was removed from scikit-learn in 1.2 and importing it
# makes this cell fail there. `ConfusionMatrixDisplay` is its replacement.
from sklearn.metrics import ConfusionMatrixDisplay

# Confusion-matrix inspection is currently disabled.
# NOTE(review): `model` is not defined in the visible cells -- bind it to the
# classifier to inspect before enabling the lines below.
#y_train_pred = cross_val_predict(model, X_train, y_train, cv=3)

#cm = confusion_matrix(y_train, y_train_pred)
#print ( cm )

#ConfusionMatrixDisplay (cm) . plot()
#plt.show()

### Support Vector Machines

In [33]:
from sklearn.svm import SVC

# Baseline RBF-kernel support vector classifier.
# NOTE(review): no feature-scaling step is visible in this notebook before the
# SVM is fitted -- consider standardising the features; TODO confirm.
clf_svm = SVC(C=1.0, kernel="rbf")

# 4-fold cross-validated accuracy on the training set.
scores = cross_val_score(clf_svm, X_train, y_train, scoring="accuracy", cv=4)

display_score(model_name="SVM Classifier", scores=scores)

+------------------+
|  SVM Classifier  |
+------------------+
| Scores : [0.68421053 0.72222222 0.72222222 0.72222222]
| Mean   : 71.3%
| Std    : 1.6%
+--- - -


#### Automatic hyperparameter tuning

In [34]:
def obj_svm(trial):
    """Optuna objective for the support vector classifier.

    Samples ``C``, ``kernel``, ``gamma`` and ``coef0`` from ``trial``, builds an
    ``SVC`` from them and returns the mean 4-fold cross-validated accuracy on
    the notebook-level ``X_train`` / ``y_train``.
    """
    # Dict entries are evaluated top to bottom, so parameters are requested
    # from the trial in the order C, kernel, gamma, coef0.
    svc_params = {
        "C": trial.suggest_float("C", 1e-2, 1e2, log=True),
        "kernel": trial.suggest_categorical("kernel", ["rbf", "sigmoid"]),
        "gamma": trial.suggest_categorical("gamma", ["scale", "auto"]),
        "coef0": trial.suggest_float("coef0", 0, 1),
    }
    model = SVC(**svc_params)

    fold_scores = cross_val_score(
        model, X_train, y_train, scoring="accuracy", cv=4
    )
    return np.mean(fold_scores)

In [35]:
# Run a fresh 50-trial study that maximises the value returned by obj_svm.
study_svm = optuna_study(
    model_name="clf_svm",
    storage_dir="./storage",
    objective=obj_svm,
    n_trials=50,
    direction="maximize",
    load_if_exists=False,
)

In [36]:
# Tabulate the sampled parameters and objective value of every trial,
# then show the five highest-valued trials.
df = study_svm.trials_dataframe(attrs=("params", "value"))
df.sort_values(by="value", ascending=False).head()

Unnamed: 0,params_C,params_coef0,params_gamma,params_kernel,value
0,0.149424,0.533525,auto,sigmoid,0.712719
1,0.018802,0.6512,auto,rbf,0.712719
27,71.427136,0.097037,auto,sigmoid,0.712719
28,2.579852,0.17374,auto,sigmoid,0.712719
29,6.932823,0.595131,auto,sigmoid,0.712719


In [37]:
# Rebuild the classifier from the parameters of the study's best trial.
best_params = study_svm.best_params

clf_svm = SVC(
    C=best_params["C"],
    kernel=best_params["kernel"],
    gamma=best_params["gamma"],
    coef0=best_params["coef0"],
)

# 4-fold cross-validated accuracy on the training set.
scores = cross_val_score(clf_svm, X_train, y_train, scoring="accuracy", cv=4)

display_score(model_name="SVM Classifier", scores=scores)

+------------------+
|  SVM Classifier  |
+------------------+
| Scores : [0.68421053 0.72222222 0.72222222 0.72222222]
| Mean   : 71.3%
| Std    : 1.6%
+--- - -


### Model evaluation

In [19]:
from sklearn.metrics import accuracy_score

#final_predictions = final_model . predict ( X_test )
#final_scores = accuracy_score ( y_test, final_predictions )

#display_score ( model_name = "Best Classifier", scores = final_scores )

### Model export

In [20]:
# Directory intended to receive the exported model file.
model_dir = "./models"
# Export is currently disabled.
# NOTE(review): `model` is not defined in the visible cells -- bind it to the
# classifier to export before enabling the lines below.
#exp_model_file = "model_test.pkl"
#exp_model_path = os.path.join ( model_dir, exp_model_file )

# A `with` block closes the file handle once the dump is done.
#with open ( exp_model_path, "wb" ) as file:
#  pickle . dump ( model, file )