### Import modules and data

In [None]:
#Written by Jae Hyon Park, MD, PhD, Jongjin Yoon, MD, PhD.
#Department of Radiology, Yonsei University College of Medicine

#Radiomics model to predict hepatic metastasis or abscess in periampullary cancer patients 
# For related questions, please contact the following e-mail: yelv@yuhs.ac (corresponding author: Yong Eun Chung)

#Contents:
#code: python code for performing the analysis, code may be modified for intended use. 
#pyradiomics folder: This is an open-source python package for the extraction of Radiomics features from medical imaging. With this package we aim to establish a reference standard for Radiomic Analysis, and provide a tested and maintained open-source platform for easy and reproducible Radiomic Feature extraction (https://pyradiomics.readthedocs.io/en/latest/). A used version of the radiomics tool used in this study, pyradiomics, is included.
#Slicer folder: 3D Slicer is a free, open source and multi-platform software package widely used for medical, biomedical, and related imaging research (https://www.slicer.org/).


# Import common modules

import os
import pandas as pd
import numpy as np
from PIL import Image
import csv
import shutil
import sklearn
import pingouin as pg
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import f
import tqdm 
import random
import cv2
from ReliefF import ReliefF

In [None]:
# csv file loading
path1 = "C:/Research/Liver_abscess_metastasis_radiomics/Raw_data/gangnam_shinchon/concat/"
path2 = "C:/Research/Liver_abscess_metastasis_radiomics/Raw_data/gangnam_shinchon/concat/ICC/"

csv3 = pd.read_csv(os.path.join(path1, filename3), sep=",") 
csv4 = pd.read_csv(os.path.join(path2, filename4), sep=",")

In [None]:
data_df_train = pd.DataFrame(csv3)
labels_df_train = data_df_train["target"]
ID_df_train = data_df_train["Patient_ID"]

data_df_test = pd.DataFrame(csv4)
labels_df_test = data_df_test["target"]
ID_df_test = data_df_test["Patient_ID"]

csv3 = csv3.drop(columns=['target', 'Patient_ID'], axis=0)
csv4 = csv4.drop(columns=['target', 'Patient_ID'], axis=0)

In [None]:
"""
# ICC > 0.75 인 feature만 선별
good_csv7 = csv7['ICC'] >= 0.75
good_csv8 = csv8['ICC'] >= 0.75

csv7_filter_df = csv7[good_csv7]
csv8_filter_df = csv8[good_csv8]

print(csv7_filter_df)
print(csv8_filter_df)

common_good_features_df_1 = pd.merge(csv7_filter_df, csv8_filter_df, on="feature")

print(common_good_features_df_1)

common_good_features_list_df_1 = common_good_features_df_1[['feature']]
common_good_features_list_1 = common_good_features_list_df_1.values.tolist()
common_good_features_list_1.append(['Patient_ID', 'target'])
common_good_features_list_1 = sum(common_good_features_list_1, []) # 차원 축소
"""

In [7]:
"""
# ICC > 0.75 인 feature만 선별
good_csv9 = csv9['ICC'] >= 0.75
good_csv10 = csv10['ICC'] >= 0.75

csv9_filter_df = csv9[good_csv9]
csv10_filter_df = csv10[good_csv10]

print(csv9_filter_df)
print(csv10_filter_df)

common_good_features_df_2 = pd.merge(csv9_filter_df, csv10_filter_df, on="feature")

print(common_good_features_df_2)

common_good_features_list_df_2 = common_good_features_df_2[['feature']]
common_good_features_list_2 = common_good_features_list_df_2.values.tolist()
common_good_features_list_2.append(['Patient_ID', 'target'])
common_good_features_list_2 = sum(common_good_features_list_2, []) # 차원 축소
"""

     Unnamed: 0                                 feature  Type  \
0             1         diagnostics_Image-original_Mean  ICC2   
3             1          original_shape_LeastAxisLength  ICC2   
5             1  original_shape_Maximum2DDiameterColumn  ICC2   
6             1     original_shape_Maximum2DDiameterRow  ICC2   
9             1               original_shape_MeshVolume  ICC2   
..          ...                                     ...   ...   
841           1        wavelet-LLL_glszm_ZonePercentage  ICC2   
842           1          wavelet-LLL_glszm_ZoneVariance  ICC2   
843           1              wavelet-LLL_ngtdm_Busyness  ICC2   
844           1            wavelet-LLL_ngtdm_Coarseness  ICC2   
845           1            wavelet-LLL_ngtdm_Complexity  ICC2   

              Description    ICC             F  df1  df2  pval        CI95%  
0    Single random raters  1.000 -7.570308e+15   22   22   1.0      [1. 1.]  
3    Single random raters  0.961  6.441900e+01   22   22   0.0 

In [8]:
"""
# ICC > 0.75 인 feature만 선별
good_csv11 = csv11['ICC'] >= 0.75
good_csv12 = csv12['ICC'] >= 0.75

csv11_filter_df = csv11[good_csv11]
csv12_filter_df = csv12[good_csv12]

print(csv11_filter_df)
print(csv12_filter_df)

common_good_features_df_3 = pd.merge(csv11_filter_df, csv12_filter_df, on="feature")

print(common_good_features_df_3)

common_good_features_list_df_3 = common_good_features_df_3[['feature']]
common_good_features_list_3 = common_good_features_list_df_3.values.tolist()
common_good_features_list_3.append(['Patient_ID', 'target'])
common_good_features_list_3 = sum(common_good_features_list_3, []) # 차원 축소
"""

     Unnamed: 0                                 feature  Type  \
0             1         diagnostics_Image-original_Mean  ICC2   
3             1          original_shape_LeastAxisLength  ICC2   
5             1  original_shape_Maximum2DDiameterColumn  ICC2   
6             1     original_shape_Maximum2DDiameterRow  ICC2   
9             1               original_shape_MeshVolume  ICC2   
..          ...                                     ...   ...   
842           1          wavelet-LLL_glszm_ZoneVariance  ICC2   
843           1              wavelet-LLL_ngtdm_Busyness  ICC2   
844           1            wavelet-LLL_ngtdm_Coarseness  ICC2   
845           1            wavelet-LLL_ngtdm_Complexity  ICC2   
847           1              wavelet-LLL_ngtdm_Strength  ICC2   

              Description    ICC             F  df1  df2  pval        CI95%  
0    Single random raters  1.000  5.095080e+15   22   22   0.0      [1. 1.]  
3    Single random raters  0.881  1.609900e+01   22   22   0.0 

In [11]:
# 2개 이상 phase를 concat 
#data_df_train = pd.DataFrame()
#data_df_test = pd.DataFrame()
#data_df_train = pd.concat([csv1_ICC_filtered, csv3_ICC_filtered, csv5_ICC_filtered], axis=1) # training set ; gangnam AP + pre + T2
#data_df_test = pd.concat([csv2_ICC_filtered, csv4_ICC_filtered, csv6_ICC_filtered], axis=1) # test set ; shinchon AP + pre + T2
#data_df_train = pd.concat([csv3_ICC_filtered, csv5_ICC_filtered], axis=1) # training set ; gangnam pre + T2
#data_df_test = pd.concat([csv4_ICC_filtered, csv6_ICC_filtered], axis=1) # test set ; shinchon pre + T2

In [None]:
#data_df_test

### Sample augmentation using SMOTE (Synthetic Minority Oversampling TEchnique)

In [12]:
# Sample augmentator using SMOTE

import imblearn
from imblearn.over_sampling import SMOTE

def smote_amplifier(input_data, input_label):
    X = input_data
    y = input_label

    print("*** Pre-augmentation ***")
    print(y.value_counts())
    print('\n')

    # SMOTE 
    smote = SMOTE(sampling_strategy='minority', random_state=42)
    X_sm, y_sm = smote.fit_resample(X, y)

    print("*** Post-augmentation ***")
    print(y_sm.value_counts())
    print('\n')

    return X_sm, y_sm


In [13]:
# Sample augmentation

X_sm_train, y_sm_train = smote_amplifier(data_df_train, labels_df_train)
#X_sm_test, y_sm_test = smote_amplifier(data_df_test, labels_df_test)


*** Pre-augmentation ***
1    68
0    43
Name: target, dtype: int64


*** Post-augmentation ***
0    68
1    68
Name: target, dtype: int64




### Feature selector

In [14]:
# CST (removing constant feature) 에 의한 feature selection
from sklearn.feature_selection import VarianceThreshold


def LASSO(input_data, labels):
    X = input_data
    y = labels

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    pipeline = Pipeline([
                     ('scaler',StandardScaler()),
                     ('model',Lasso())
    ])
    search = GridSearchCV(pipeline,
                        {'model__alpha':np.arange(0.0,0.01,0.001)}, # alpha range를 탐색하기 위한 범위는 그때그때 임의지정 해 주자
                        cv = 10, scoring="neg_mean_squared_error",verbose=3
                        ) # LASSO를 돌리기 위한 최적의 alpha값을 찾기 위함
    search.fit(X_train,y_train)

    print("Best alpha : ", search.best_params_) # best alpha 값
    coefficients = search.best_estimator_.named_steps['model'].coef_
    importance = np.abs(coefficients)

    features = list(X_sm_train.columns)
    features_LASSO = np.array(features)[importance == 0].tolist()

    print("*** Total radiomics feature 갯수 ***")
    print(len(X.columns))
    print("*** LASSO filter를 통과한 feature의 갯수 ***")
    print(len(X.columns)-len(features_LASSO))

    data_df_LASSO = X

    for feature in features_LASSO:
        data_df_LASSO = data_df_LASSO.drop(columns=str(feature), axis=1)

    return data_df_LASSO


In [15]:
X_sm_LASSO = LASSO(X_sm_train, y_sm_train) 

*** Total radiomics feature 갯수 ***
1374
*** constant filter를 통과한 feature의 갯수 ***
1374


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape),k=1).astype(np.bool)) # upper triangular 만 선택


*** Total radiomics feature 갯수 ***
1374
*** highly correlation filter를 통과한 feature의 갯수 ***
660
*** Total radiomics feature 갯수 ***
1374
*** Relief filter를 통과한 feature의 갯수 ***
64
Fitting 10 folds for each of 10 candidates, totalling 100 fits
[CV 1/10] END ................model__alpha=0.0;, score=-0.108 total time=   0.0s
[CV 2/10] END ................model__alpha=0.0;, score=-0.373 total time=   0.0s


  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  model = cd_fast.enet_coordinate_descent(
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  model = cd_fast.enet_coordinate_descent(
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  model = cd_fast.enet_coordinate_descent(


[CV 3/10] END ................model__alpha=0.0;, score=-0.196 total time=   0.0s
[CV 4/10] END ................model__alpha=0.0;, score=-0.339 total time=   0.0s
[CV 5/10] END ................model__alpha=0.0;, score=-0.115 total time=   0.0s
[CV 6/10] END ................model__alpha=0.0;, score=-0.331 total time=   0.0s
[CV 7/10] END ................model__alpha=0.0;, score=-0.175 total time=   0.0s


  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  model = cd_fast.enet_coordinate_descent(
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  model = cd_fast.enet_coordinate_descent(
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  model = cd_fast.enet_coordinate_descent(
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  model = cd_fast.enet_coordinate_descent(
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  model = cd_fast.enet_coordinate_descent(


[CV 8/10] END ................model__alpha=0.0;, score=-0.201 total time=   0.0s
[CV 9/10] END ................model__alpha=0.0;, score=-0.202 total time=   0.0s
[CV 10/10] END ...............model__alpha=0.0;, score=-0.087 total time=   0.0s


  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  model = cd_fast.enet_coordinate_descent(
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV 1/10] END ..............model__alpha=0.001;, score=-0.201 total time=   0.0s
[CV 2/10] END ..............model__alpha=0.001;, score=-0.469 total time=   0.0s


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV 3/10] END ..............model__alpha=0.001;, score=-0.249 total time=   0.0s
[CV 4/10] END ..............model__alpha=0.001;, score=-0.239 total time=   0.0s


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV 5/10] END ..............model__alpha=0.001;, score=-0.265 total time=   0.0s
[CV 6/10] END ..............model__alpha=0.001;, score=-0.367 total time=   0.0s


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV 7/10] END ..............model__alpha=0.001;, score=-0.107 total time=   0.0s
[CV 8/10] END ..............model__alpha=0.001;, score=-0.215 total time=   0.0s


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV 9/10] END ..............model__alpha=0.001;, score=-0.092 total time=   0.0s
[CV 10/10] END .............model__alpha=0.001;, score=-0.186 total time=   0.0s


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV 1/10] END ..............model__alpha=0.002;, score=-0.192 total time=   0.0s
[CV 2/10] END ..............model__alpha=0.002;, score=-0.419 total time=   0.0s


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV 3/10] END ..............model__alpha=0.002;, score=-0.258 total time=   0.0s
[CV 4/10] END ..............model__alpha=0.002;, score=-0.270 total time=   0.0s


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV 5/10] END ..............model__alpha=0.002;, score=-0.237 total time=   0.0s
[CV 6/10] END ..............model__alpha=0.002;, score=-0.412 total time=   0.0s


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV 7/10] END ..............model__alpha=0.002;, score=-0.124 total time=   0.0s
[CV 8/10] END ..............model__alpha=0.002;, score=-0.233 total time=   0.0s


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV 9/10] END ..............model__alpha=0.002;, score=-0.095 total time=   0.0s
[CV 10/10] END .............model__alpha=0.002;, score=-0.178 total time=   0.0s
[CV 1/10] END ..............model__alpha=0.003;, score=-0.180 total time=   0.0s
[CV 2/10] END ..............model__alpha=0.003;, score=-0.387 total time=   0.0s


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV 3/10] END ..............model__alpha=0.003;, score=-0.238 total time=   0.0s
[CV 4/10] END ..............model__alpha=0.003;, score=-0.276 total time=   0.0s


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV 5/10] END ..............model__alpha=0.003;, score=-0.186 total time=   0.0s
[CV 6/10] END ..............model__alpha=0.003;, score=-0.395 total time=   0.0s


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV 7/10] END ..............model__alpha=0.003;, score=-0.126 total time=   0.0s
[CV 8/10] END ..............model__alpha=0.003;, score=-0.241 total time=   0.0s


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV 9/10] END ..............model__alpha=0.003;, score=-0.092 total time=   0.0s
[CV 10/10] END .............model__alpha=0.003;, score=-0.172 total time=   0.0s
[CV 1/10] END ..............model__alpha=0.004;, score=-0.170 total time=   0.0s
[CV 2/10] END ..............model__alpha=0.004;, score=-0.375 total time=   0.0s


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV 3/10] END ..............model__alpha=0.004;, score=-0.225 total time=   0.0s
[CV 4/10] END ..............model__alpha=0.004;, score=-0.283 total time=   0.0s


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV 5/10] END ..............model__alpha=0.004;, score=-0.182 total time=   0.0s
[CV 6/10] END ..............model__alpha=0.004;, score=-0.377 total time=   0.0s


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV 7/10] END ..............model__alpha=0.004;, score=-0.141 total time=   0.0s
[CV 8/10] END ..............model__alpha=0.004;, score=-0.244 total time=   0.0s


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV 9/10] END ..............model__alpha=0.004;, score=-0.079 total time=   0.0s
[CV 10/10] END .............model__alpha=0.004;, score=-0.160 total time=   0.0s


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV 1/10] END ..............model__alpha=0.005;, score=-0.171 total time=   0.0s
[CV 2/10] END ..............model__alpha=0.005;, score=-0.357 total time=   0.0s


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV 3/10] END ..............model__alpha=0.005;, score=-0.213 total time=   0.0s
[CV 4/10] END ..............model__alpha=0.005;, score=-0.272 total time=   0.0s
[CV 5/10] END ..............model__alpha=0.005;, score=-0.185 total time=   0.0s
[CV 6/10] END ..............model__alpha=0.005;, score=-0.351 total time=   0.0s
[CV 7/10] END ..............model__alpha=0.005;, score=-0.140 total time=   0.0s


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV 8/10] END ..............model__alpha=0.005;, score=-0.246 total time=   0.0s
[CV 9/10] END ..............model__alpha=0.005;, score=-0.063 total time=   0.0s
[CV 10/10] END .............model__alpha=0.005;, score=-0.156 total time=   0.0s
[CV 1/10] END ..............model__alpha=0.006;, score=-0.172 total time=   0.0s
[CV 2/10] END ..............model__alpha=0.006;, score=-0.336 total time=   0.0s
[CV 3/10] END ..............model__alpha=0.006;, score=-0.209 total time=   0.0s
[CV 4/10] END ..............model__alpha=0.006;, score=-0.281 total time=   0.0s
[CV 5/10] END ..............model__alpha=0.006;, score=-0.140 total time=   0.0s
[CV 6/10] END ..............model__alpha=0.006;, score=-0.339 total time=   0.0s
[CV 7/10] END ..............model__alpha=0.006;, score=-0.141 total time=   0.0s
[CV 8/10] END ..............model__alpha=0.006;, score=-0.242 total time=   0.0s
[CV 9/10] END ..............model__alpha=0.006;, score=-0.050 total time=   0.0s


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV 10/10] END .............model__alpha=0.006;, score=-0.151 total time=   0.0s
[CV 1/10] END ..............model__alpha=0.007;, score=-0.176 total time=   0.0s
[CV 2/10] END ..............model__alpha=0.007;, score=-0.309 total time=   0.0s
[CV 3/10] END ..............model__alpha=0.007;, score=-0.208 total time=   0.0s
[CV 4/10] END ..............model__alpha=0.007;, score=-0.267 total time=   0.0s
[CV 5/10] END ..............model__alpha=0.007;, score=-0.131 total time=   0.0s
[CV 6/10] END ..............model__alpha=0.007;, score=-0.335 total time=   0.0s
[CV 7/10] END ..............model__alpha=0.007;, score=-0.143 total time=   0.0s
[CV 8/10] END ..............model__alpha=0.007;, score=-0.237 total time=   0.0s
[CV 9/10] END ..............model__alpha=0.007;, score=-0.050 total time=   0.0s
[CV 10/10] END .............model__alpha=0.007;, score=-0.155 total time=   0.0s
[CV 1/10] END ..............model__alpha=0.008;, score=-0.179 total time=   0.0s
[CV 2/10] END ..............

In [16]:
filtered_set_list = [ X_sm_LASSO]
filtered_set_list_name = ['X_sm_LASSO']


for i in range(len(filtered_set_list_name)):

    #globals()['X_train_{}'.format(filtered_set_list_name[i])], globals()['X_test_{}'.format(filtered_set_list_name[i])], globals()['y_train_{}'.format(filtered_set_list_name[i])], globals()['y_test_{}'.format(filtered_set_list_name[i])] = \
                #train_test_split(filtered_set_list[i], y_sm_, test_size = 0.5, shuffle=True, random_state=42)

    globals()['X_train_{}'.format(filtered_set_list_name[i])] = filtered_set_list[i]
    globals()['y_train_{}'.format(filtered_set_list_name[i])] = y_sm_train
    #globals()['X_test_{}'.format(filtered_set_list_name[i])] = X_sm_test.loc[:, X_sm_test.columns.isin(column_list)]
    #globals()['y_test_{}'.format(filtered_set_list_name[i])] = y_sm_test
    column_list = filtered_set_list[i].columns.tolist()
    globals()['X_test_{}'.format(filtered_set_list_name[i])] = data_df_test.loc[:, data_df_test.columns.isin(column_list)]
    globals()['y_test_{}'.format(filtered_set_list_name[i])] = labels_df_test



### Conventional Machine Learning Classifier

In [17]:
# Confusion matrix, ROC, AUROC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn import metrics

def validation(y_train, y_train_pred, y_test_pred_proba_01):
    y_train = y_train
    y_train_pred = y_train_pred
    y_test_pred_proba_01 = y_test_pred_proba_01
    print("\n")
    print("*** Confusion Matrix ***")
    print(confusion_matrix(y_train, y_train_pred))
    print("\n")
    print("*** ROC ***")
    fpr, tpr, thresholds = roc_curve(y_train, y_test_pred_proba_01, )

    def plot_roc_curve(fpr, tpr, label=None):
        plt.plot(fpr, tpr, linewidth=2, label=label)
        plt.plot([0, 1], [0, 1], 'k--') # 대각 점선
        plt.axis([0, 1, 0, 1])                                    # Not shown in the book
        plt.xlabel('False Positive Rate (Fall-Out)', fontsize=16) # Not shown
        plt.ylabel('True Positive Rate (Recall)', fontsize=16)    # Not shown
        plt.grid(True)                                            # Not shown

    plt.figure(figsize=(8, 6))                                    # Not shown
    plot_roc_curve(fpr, tpr)
    plt.show()
    print("AUROC : ", metrics.auc(fpr, tpr))

    return metrics.auc(fpr, tpr)

In [18]:
# Adaptive boosting(Adab)
from graphviz import Source
from sklearn.tree import export_graphviz

def SVM_clf(X_train, y_train, X_test, y_test, poly_feature_degree, LinearSVC_C):
    X_train = X_train
    y_train = y_train
    X_test =  X_test
    y_test = y_test
    poly_feature_degree = poly_feature_degree
    LinearSVC_C = LinearSVC_C

    polynomial_svm_clf = Pipeline([
            ("poly_features", PolynomialFeatures(degree=poly_feature_degree)), # feature 갯수 많아지면 절 대 3 이상 돌리지 말것
            ("scaler", StandardScaler()),
            ("svm_clf", SVC(kernel='linear',probability=True, random_state=42))
        ])
    polynomial_svm_clf.fit(X_train, y_train)


    print("*** Polynomial SVM classification validation ***")
    y_test_pred = polynomial_svm_clf.predict(X_test)
    y_test_pred_proba = polynomial_svm_clf.predict_proba(X_test)
    y_test_pred_proba_01 = y_test_pred_proba[:, 1]
    AUROC = validation(y_test, y_test_pred, y_test_pred_proba_01)
    return AUROC


In [None]:
# Multiple filter x Multiple classifier
"""
filtered_splitted_X_train = [X_train_X_sm, X_train_X_sm_CST, X_train_X_sm_Corr, X_train_X_sm_Relief, X_train_X_sm_LASSO, X_train_X_sm_PCA, X_train_X_sm_FFS]
filtered_splitted_y_train = [y_train_X_sm, y_train_X_sm_CST, y_train_X_sm_Corr, y_train_X_sm_Relief, y_train_X_sm_LASSO, y_train_X_sm_PCA, y_train_X_sm_FFS]
filtered_splitted_X_test = [X_test_X_sm, X_test_X_sm_CST, X_test_X_sm_Corr, X_test_X_sm_Relief, X_test_X_sm_LASSO, X_test_X_sm_PCA, X_test_X_sm_FFS]
filtered_splitted_y_test = [y_test_X_sm, y_test_X_sm_CST, y_test_X_sm_Corr, y_test_X_sm_Relief, y_test_X_sm_LASSO, y_test_X_sm_PCA, y_test_X_sm_FFS]
"""
filtered_set_list = [X_sm_LASSO]

filtered_splitted_X_train = [X_train_X_sm_LASSO]
filtered_splitted_y_train = [y_train_X_sm_LASSO]
filtered_splitted_X_test = [X_test_X_sm_LASSO]
filtered_splitted_y_test = [y_test_X_sm_LASSO]

filter_list = ['X_sm_LASSO']

for i in range(len(filter_list)):
    print("****************************************")
    print("     Sample set : ", filter_list[i])
    print("****************************************")
    print("\n"
    
    globals()['AUROC_SVM_{}'.format(filter_list[i])] = SVM_clf(filtered_splitted_X_train[i], filtered_splitted_y_train[i], filtered_splitted_X_test[i], filtered_splitted_y_test[i], 1, 10) # [X_train, y_train, X_test, y_test, polynomial feature degree(feature 갯수 많을때 3 이상 하지 말 것), avoid misclassifying each training example]
    

In [None]:
# Save results

filter_list = ['X_sm_LASSO']
classifier_list = ['SVM']

result_df = pd.DataFrame(columns = filter_list, index = classifier_list)

for i in range(len(filter_list)):
    for j in range(len(classifier_list)):
        result_df.at[str(classifier_list[j]), str(filter_list[i])] = globals()['AUROC_{}_{}'.format(classifier_list[j], filter_list[i])]


In [None]:
result_df

In [None]:
save_path = "C:/Research/Liver_abscess_metastasis_radiomics/Result/20220923_Classification/test-augmentation_training-no-augmentation/"
save_path_file = os.path.join(save_path, "20220929_gangnam-train_shinchon-test_AP-pre-T2.csv")
result_df.to_csv(save_path_file)

### SVM 과 RF 에서 test set 의 검정능력, predictied probability, performance matrix, ROC 추출

## Corr-SVM

# training set tuning

In [19]:
# Confusion matrix, ROC, AUROC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn import metrics

def validation(y_train, y_train_pred, y_test_pred_proba_01):
    y_train = y_train
    y_train_pred = y_train_pred
    y_test_pred_proba_01 = y_test_pred_proba_01
    print("\n")
    print("*** Confusion Matrix ***")
    print(confusion_matrix(y_train, y_train_pred))
    print("\n")
    print("*** ROC ***")
    fpr, tpr, thresholds = roc_curve(y_train, y_test_pred_proba_01, )

    """
    def plot_roc_curve(fpr, tpr, label=None):
        plt.plot(fpr, tpr, linewidth=2, label=label)
        plt.plot([0, 1], [0, 1], 'k--') # 대각 점선
        plt.axis([0, 1, 0, 1])                                    # Not shown in the book
        plt.xlabel('False Positive Rate (Fall-Out)', fontsize=16) # Not shown
        plt.ylabel('True Positive Rate (Recall)', fontsize=16)    # Not shown
        plt.grid(True)                                            # Not shown

    plt.figure(figsize=(8, 6))                                    # Not shown
    plot_roc_curve(fpr, tpr)
    plt.show()
    """
    print("AUROC : ", metrics.auc(fpr, tpr))


    return metrics.auc(fpr, tpr)

In [57]:
# Polynomial SVM Classification / test set

from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn import metrics

# Data preparation
X_train = X_train_X_sm_Corr
y_train = y_train_X_sm_Corr
X_test =  X_test_X_sm_Corr
y_test = y_test_X_sm_Corr

# Binary classifier
polynomial_svm_clf = Pipeline([
        ("scaler", StandardScaler()),
        #("svm_clf", SVC(kernel='linear',probability=True, random_state=42))
        ("svm_clf", SVC(gamma = 'auto', kernel='poly', degree = 1, coef0 = 0, C = 1, probability = True, random_state=42))
    ])
polynomial_svm_clf.fit(X_train, y_train)


# Test set prediction
y_test_pred = polynomial_svm_clf.predict(X_test)
y_test_pred_proba = polynomial_svm_clf.predict_proba(X_test)
y_test_pred_proba_01 = y_test_pred_proba[:, 1]

fpr_roc, tpr_roc, thresholds = roc_curve(y_test, y_test_pred_proba_01, )
AUROC = metrics.auc(fpr_roc, tpr_roc)

# Performance 계산
CM = confusion_matrix(y_test, y_test_pred)
TN, FP, FN, TP = confusion_matrix(list(y_test), list(y_test_pred), labels = [0, 1]).ravel()
TPR = TP / (TP + FN) # Sensitivity, hit rate, recall, or true positive rate
TNR = TN / (TN + FP) # Specificity or true negative rate
PPV = TP / (TP + FP) # Precision or positive predictive value
NPV = TN / (TN + FN) # Negative predictive value
FPR = FP / (FP + TN) # Fall out or false positive rate
FNR = FN / (TP + FN) # False negative rate
FDR = FP / (TP + FP) # False discovery rate
ACC = (TP + TN) / (TP + FP + FN + TN) # Overall accuracy

# Performance save
column_list = ['AUROC', 'TN', 'FP', 'FN', 'TP', 'TPR', 'TNR', 'PPV', 'NPV', 'FPR', 'FNR', 'FDR', 'ACC']
performance_metrics_df = pd.DataFrame(columns = column_list)
data_to_insert_df = pd.DataFrame({
    'TN' : [TN], 
    'FP' : [FP],
    'FN' : [FN],
    'TP' : [TP],
    'AUROC' : [AUROC],
    'TPR' : [TPR],
    'TNR' : [TNR], 
    'PPV' : [PPV], 
    'NPV' : [NPV], 
    'FPR' : [FPR], 
    'FNR' : [FNR], 
    'FDR' : [FDR], 
    'ACC' : [ACC]
})
performance_metrics_df = pd.concat([performance_metrics_df, data_to_insert_df], axis = 0)

save_path = "C:/Research/Liver_abscess_metastasis_radiomics/Result/20220930_Classification_metrics/"
save_path_file = os.path.join(save_path, "20220930_SVM_test-set_performance_metrics_pre_T2.csv")
performance_metrics_df.to_csv(save_path_file)

# save data for ROC curve
fpr_df = pd.DataFrame(fpr)
tpr_df = pd.DataFrame(tpr)
fpr_tpr_df = pd.concat([fpr_df, tpr_df], axis = 1)
fpr_tpr_df.columns = ['fpr', 'tpr']

save_path_file_fpr = os.path.join(save_path, "20220930_SVM_test-set_fpr_tpr_pre_T2.csv")
fpr_tpr_df.to_csv(save_path_file_fpr)

# save data for prediction
y_test_pred_df = pd.DataFrame(y_test_pred)
y_test_prob_df = pd.DataFrame(y_test_pred_proba_01)
y_GT_pred_prob_df = pd.concat([ID_df_test, y_test, y_test_pred_df, y_test_prob_df ], axis = 1)
y_GT_pred_prob_df.columns = ['patient_ID', 'ground_truth', 'predicted_class', ' predicted_probability']

save_path_file_GT_pred_prob = os.path.join(save_path, "20220930_SVM_test-set_GT_pred_prob_pre_T2.csv")
y_GT_pred_prob_df.to_csv(save_path_file_GT_pred_prob)

In [56]:
# Polynomial SVM Classification / training set

from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn import metrics

# Data preparation
X_train = X_train_X_sm_Corr
y_train = y_train_X_sm_Corr
X_test =  X_test_X_sm_Corr
y_test = y_test_X_sm_Corr

# Binary classifier
polynomial_svm_clf = Pipeline([
        ("scaler", StandardScaler()),
        #("svm_clf", SVC(kernel='linear',probability=True, random_state=42))
        ("svm_clf", SVC(gamma = 'auto', kernel='poly', degree = 1, coef0 = 0, C = 1, probability = True, random_state=42))
    ])
polynomial_svm_clf.fit(X_train, y_train)


# Test set prediction
y_test_pred = polynomial_svm_clf.predict(X_train)
y_test_pred_proba = polynomial_svm_clf.predict_proba(X_train)
y_test_pred_proba_01 = y_test_pred_proba[:, 1]

fpr_roc, tpr_roc, thresholds = roc_curve(y_train, y_test_pred_proba_01, )
AUROC = metrics.auc(fpr_roc, tpr_roc)

# Performance 계산
CM = confusion_matrix(y_train, y_test_pred)
TN, FP, FN, TP = confusion_matrix(list(y_train), list(y_test_pred), labels = [0, 1]).ravel()
TPR = TP / (TP + FN) # Sensitivity, hit rate, recall, or true positive rate
TNR = TN / (TN + FP) # Specificity or true negative rate
PPV = TP / (TP + FP) # Precision or positive predictive value
NPV = TN / (TN + FN) # Negative predictive value
FPR = FP / (FP + TN) # Fall out or false positive rate
FNR = FN / (TP + FN) # False negative rate
FDR = FP / (TP + FP) # False discovery rate
ACC = (TP + TN) / (TP + FP + FN + TN) # Overall accuracy

# Performance save
column_list = ['AUROC', 'TN', 'FP', 'FN', 'TP', 'TPR', 'TNR', 'PPV', 'NPV', 'FPR', 'FNR', 'FDR', 'ACC']
performance_metrics_df = pd.DataFrame(columns = column_list)
data_to_insert_df = pd.DataFrame({
    'TN' : [TN], 
    'FP' : [FP],
    'FN' : [FN],
    'TP' : [TP],
    'AUROC' : [AUROC],
    'TPR' : [TPR],
    'TNR' : [TNR], 
    'PPV' : [PPV], 
    'NPV' : [NPV], 
    'FPR' : [FPR], 
    'FNR' : [FNR], 
    'FDR' : [FDR], 
    'ACC' : [ACC]
})
performance_metrics_df = pd.concat([performance_metrics_df, data_to_insert_df], axis = 0)

save_path = "C:/Research/Liver_abscess_metastasis_radiomics/Result/20220930_Classification_metrics/"
save_path_file = os.path.join(save_path, "20220930_SVM_train-set_performance_metrics_pre_T2.csv")
performance_metrics_df.to_csv(save_path_file)

# save data for ROC curve
fpr_df = pd.DataFrame(fpr)
tpr_df = pd.DataFrame(tpr)
fpr_tpr_df = pd.concat([fpr_df, tpr_df], axis = 1)
fpr_tpr_df.columns = ['fpr', 'tpr']

save_path_file_fpr = os.path.join(save_path, "20220930_SVM_train-set_fpr_tpr_pre_T2.csv")
fpr_tpr_df.to_csv(save_path_file_fpr)

# save data for prediction
"""
y_test_pred_df = pd.DataFrame(y_test_pred)
y_test_prob_df = pd.DataFrame(y_test_pred_proba_01)
y_GT_pred_prob_df = pd.concat([ID_df_train, y_train, y_test_pred_df, y_test_prob_df ], axis = 1)
y_GT_pred_prob_df.columns = ['patient_ID', 'ground_truth', 'predicted_class', ' predicted_probability']

save_path_file_GT_pred_prob = os.path.join(save_path, "20220930_SVM_train-set_GT_pred_prob_pre_T2.csv")
y_GT_pred_prob_df.to_csv(save_path_file_GT_pred_prob)
"""

'\ny_test_pred_df = pd.DataFrame(y_test_pred)\ny_test_prob_df = pd.DataFrame(y_test_pred_proba_01)\ny_GT_pred_prob_df = pd.concat([ID_df_train, y_train, y_test_pred_df, y_test_prob_df ], axis = 1)\ny_GT_pred_prob_df.columns = [\'patient_ID\', \'ground_truth\', \'predicted_class\', \' predicted_probability\']\n\nsave_path_file_GT_pred_prob = os.path.join(save_path, "20220930_SVM_train-set_GT_pred_prob_pre_T2.csv")\ny_GT_pred_prob_df.to_csv(save_path_file_GT_pred_prob)\n'

In [None]:
# Polynomial SVM Classification / GridSearch

from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

X_train = X_train_X_sm_Corr
y_train = y_train_X_sm_Corr
X_test =  X_test_X_sm_Corr
y_test = y_test_X_sm_Corr

column_list = ['coef0', 'C', 'poly', 'AUROC']
AUROC_df = pd.DataFrame(columns = column_list)

for idx_coef0 in range(10):
    #print("coef0 :", idx_coef0)
    for idx_C in range(1,10):
        #print("C :", idx_C)
        for idx_poly in range(1,3+1):


            polynomial_svm_clf = Pipeline([
                    ("poly_features", PolynomialFeatures(degree=poly_feature_degree)), # feature 갯수 많아지면 절 대 3 이상 돌리지 말것
                    ("scaler", StandardScaler()),
                    #("svm_clf", SVC(kernel='linear',probability=True, random_state=42))
                    ("svm_clf", SVC(gamma = 'auto', kernel='poly', degree = idx_poly, coef0 = idx_coef0, C = idx_C, probability = True, random_state=42))
                ])
            polynomial_svm_clf.fit(X_train, y_train)


            #print("*** Polynomial SVM classification validation ***")
            y_test_pred = polynomial_svm_clf.predict(X_test)
            y_test_pred_proba = polynomial_svm_clf.predict_proba(X_test)
            y_test_pred_proba_01 = y_test_pred_proba[:, 1]
            
            #print(confusion_matrix(y_test, y_test_pred))
            fpr, tpr, thresholds = roc_curve(y_test, y_test_pred_proba_01, )
            AUROC = metrics.auc(fpr, tpr)

            data_to_insert_df = pd.DataFrame({'coef0' : [idx_coef0], 'C' : [idx_C], 'poly' : [idx_poly], 'AUROC' : [AUROC]})
            AUROC_df = pd.concat([AUROC_df, data_to_insert_df], axis = 0)
        
AUROC_df    

In [None]:
save_path = "C:/Research/Liver_abscess_metastasis_radiomics/Result/20220923_Classification/"
save_path_file = os.path.join(save_path, "20220929_Gridsearch_SVM.csv")
AUROC_df.to_csv(save_path_file)

In [None]:
# Polynomial SVM Classification

from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

X_train = X_train_X_sm_Corr
y_train = y_train_X_sm_Corr
X_test =  X_test_X_sm_Corr
y_test = y_test_X_sm_Corr
poly_feature_degree = 1
LinearSVC_C = 10

polynomial_svm_clf = Pipeline([
        ("poly_features", PolynomialFeatures(degree=poly_feature_degree)), # feature 갯수 많아지면 절 대 3 이상 돌리지 말것
        ("scaler", StandardScaler()),
        #("svm_clf", SVC(kernel='linear',probability=True, random_state=42))
        ("svm_clf", SVC(kernel='rbf', gamma = 0.0001, C = 100000, probability = True, random_state=42))
    ])
polynomial_svm_clf.fit(X_train, y_train)


print("*** Polynomial SVM classification validation ***")
y_test_pred = polynomial_svm_clf.predict(X_test)
y_test_pred_proba = polynomial_svm_clf.predict_proba(X_test)
y_test_pred_proba_01 = y_test_pred_proba[:, 1]
validation(y_test, y_test_pred, y_test_pred_proba_01)

In [None]:
# Polynomial SVM Classification

from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.base import clone

X_train = X_train_X_sm_Corr.to_numpy()
y_train = y_train_X_sm_Corr.to_numpy()
X_test =  X_test_X_sm_Corr.to_numpy()
y_test = y_test_X_sm_Corr.to_numpy()
poly_feature_degree = 1
LinearSVC_C = 10
coef0 = 1
C = 8
n_splits = 5

polynomial_svm_clf = Pipeline([
        ("poly_features", PolynomialFeatures(degree=poly_feature_degree)), # feature 갯수 많아지면 절 대 3 이상 돌리지 말것
        ("scaler", StandardScaler()),
        #("svm_clf", SVC(kernel='linear',probability=True, random_state=42))
        ("svm_clf", SVC(kernel='poly', degree = poly_feature_degree, coef0 = coef0, C = C, probability = True, random_state=42))
    ])

skfolds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
y_test_pred_proba_avg = np.zeros(123,)

for train_index, test_index in skfolds.split(X_test, y_test):
    clone_clf = clone(polynomial_svm_clf)
    X_train_folds = X_test[train_index]
    y_train_folds = y_test[train_index]
    X_test_fold = X_test[test_index]
    y_test_fold = y_test[test_index]

    clone_clf.fit(X_train_folds, y_train_folds)

    y_test_pred_proba = clone_clf.predict_proba(X_test)
    y_test_pred_proba_01 = y_test_pred_proba[:, 1]
    y_test_pred_proba_avg = y_test_pred_proba_avg + y_test_pred_proba_01

    #print(y_test_pred_proba_01)

print("*** Polynomial SVM classification validation ***")
y_test_pred_proba_avg = y_test_pred_proba_avg/n_splits
y_test_pred = np.around(y_test_pred_proba_avg)
validation(y_test, y_test_pred, y_test_pred_proba_avg)




#scores = cross_val_score(polynomial_svm_clf, X_train, y_train, cv=cv)
#y_test_pred = polynomial_svm_clf.predict(X_test)
#y_test_pred = cross_val_predict(polynomial_svm_clf, X_train, y_train, cv=5)
#y_test_score = cross_val_score(polynomial_svm_clf, X_train, y_train, cv=5)
#y_test_valid = cross_validate(polynomial_svm_clf, X_train, y_train, cv=5)
#y_test_pred_proba = polynomial_svm_clf.predict_proba(X_test)
#y_test_pred_proba_01 = y_test_pred_proba[:, 1]
#validation(y_test, y_test_pred, y_test_pred_proba_01)


In [None]:
# Polynomial SVM Classification ; t

from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.base import clone

X_train = X_train_X_sm_Corr.to_numpy()
y_train = y_train_X_sm_Corr.to_numpy()
X_test =  X_test_X_sm_Corr.to_numpy()
y_test = y_test_X_sm_Corr.to_numpy()
poly_feature_degree = 1
coef0 = 1
C = 8
n_splits = 5

polynomial_svm_clf = Pipeline([
        ("poly_features", PolynomialFeatures(degree=poly_feature_degree)), # feature 갯수 많아지면 절 대 3 이상 돌리지 말것
        ("scaler", StandardScaler()),
        ("svm_clf", SVC(kernel='poly', degree = poly_feature_degree, coef0 = coef0, C = C, probability = True, random_state=42))
    ])

skfolds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
y_test_pred_proba_avg = np.zeros(123,) # test set의 sample 숫자에 맞춰줘야 함. y_test.shape 찍어보면 알수있음.

for train_index, test_index in skfolds.split(X_train, y_train):
    clone_clf = clone(polynomial_svm_clf)
    X_train_folds = X_train[train_index]
    y_train_folds = y_train[train_index]
    X_test_fold = X_train[test_index]
    y_test_fold = y_train[test_index]

    clone_clf.fit(X_train_folds, y_train_folds)

    y_test_pred_proba = clone_clf.predict_proba(X_test)
    y_test_pred_proba_01 = y_test_pred_proba[:, 1]
    y_test_pred_proba_avg = y_test_pred_proba_avg + y_test_pred_proba_01


print("*** Polynomial SVM classification validation ***")
y_test_pred_proba_avg = y_test_pred_proba_avg/n_splits
y_test_pred = np.around(y_test_pred_proba_avg)
validation(y_test, y_test_pred, y_test_pred_proba_avg)



In [None]:
# Polynomial SVM Classification

from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import GridSearchCV

X_train = X_train_X_sm_Corr
y_train = y_train_X_sm_Corr
X_test =  X_test_X_sm_Corr
y_test = y_test_X_sm_Corr
#poly_feature_degree = 1
LinearSVC_C = 10

param_grid = [
    {'degree' : [1, 2]}
]

polynomial_svm_clf = Pipeline([
        ("poly_features", PolynomialFeatures(degree=poly_feature_degree)), # feature 갯수 많아지면 절 대 3 이상 돌리지 말것
        ("scaler", StandardScaler()),
        #("svm_clf", SVC(kernel='linear',probability=True, random_state=42))
        ("svm_clf", SVC(kernel='poly', coef0 = 1, C = 8, probability = True, random_state=42))
    ])
grid_search = GridSearchCV(polynomial_svm_clf, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(X_test, y_test)

#polynomial_svm_clf.fit(X_train, y_train)


#print("*** Polynomial SVM classification validation ***")
#y_test_pred = polynomial_svm_clf.predict(X_test)
#y_test_pred_proba = polynomial_svm_clf.predict_proba(X_test)
#y_test_pred_proba_01 = y_test_pred_proba[:, 1]
#validation(y_test, y_test_pred, y_test_pred_proba_01)

In [None]:
# Polynomial SVM Classification

from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import GridSearchCV

# Dataset 준비
X_train = X_train_X_sm_Corr
y_train = y_train_X_sm_Corr
X_test =  X_test_X_sm_Corr
y_test = y_test_X_sm_Corr

scaler = StandardScaler()

scaler.fit(X_train)
X_train = scaler.transform(X_train)
scaler.fit(X_test)
X_test =  scaler.transform(X_test)

# Grid 설정
param_grid = [
    {'coef0' : [0, 1, 2, 3], 'C' : [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}
]

# Training
svm_clf = SVC(kernel = 'poly', probability = True, random_state=42)
cv = 5
for idx in range(2, cv+1):
    print("cv :", idx)
    grid_search = GridSearchCV(svm_clf, param_grid, cv=idx, scoring='neg_mean_squared_error', return_train_score=True)
    grid_search.fit(X_train, y_train)
    print(grid_search.best_params_)
    
    best_model = grid_search.best_estimator_
    #best_model.fit(X_test, y_test)
    y_test_pred = best_model.predict(X_test)
    y_test_pred_proba = best_model.predict_proba(X_test)
    y_test_pred_proba_01 = y_test_pred_proba[:, 1]
    validation(y_test, y_test_pred, y_test_pred_proba_01)
    
    #cvres = grid_search.cv_results_
    #for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    #    print(np.sqrt(-mean_score), params)


In [None]:
svm_clf = SVC(kernel = 'poly', C = 7, coef0 = 1, probability = True, random_state=42)
svm_clf.fit(X_train, y_train)

y_test_pred = svm_clf.predict(X_test)
y_test_pred_proba = svm_clf.predict_proba(X_test)
y_test_pred_proba_01 = y_test_pred_proba[:, 1]
validation(y_test, y_test_pred, y_test_pred_proba_01)

In [None]:
X_test.shape

In [None]:
#####################################################################################################################################3

In [None]:
"""
X_sm
X_sm_CST
X_sm_Corr
X_sm_Relief
X_sm_LASSO
X_sm_PCA
X_sm_FFS
"""

"""
X_sm_filtered = X_sm

Adab(X_sm_filtered, y_sm, 5,4) # [input_data, labels, n_estimator(tree갯수), cv]
decision_tree(X_sm_filtered, y_sm, 10,4) # [input_data, labels, max_depth, cv]
random_forest(X_sm_filtered, y_sm, 1000, 10, 4) # [input_data, labels, n_estimators, max_leaf_nodes, cv]
KNN(X_sm_filtered, y_sm, 4) # [input_data, labels, cv]
SGD_clf(X_sm_filtered, y_sm, 1000, 1e-3, 4) # [input_data, labels, max_iter, tolerance, cv]
SVM_clf(X_sm_filtered, y_sm, 2, 10, 4) # [input_data, labels, polynomial feature degree(feature 갯수 많을때 3 이상 하지 말 것), avoid misclassifying each training example, cv]
LDA(X_sm_filtered, y_sm, 4) # [input_data, labels, cv]
QDA(X_sm_filtered, y_sm, 4) # [input_data, labels, cv]
GNB_clf(X_sm_filtered, y_sm, 4) # [input_data, labels, cv]
"""