In [1]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTENC
from sklearn.metrics import classification_report

data = pd.read_csv('../Data/MedIntel_BC_Vault42/JUNO_Protected_BC_Clinical.tsv', sep="\t")

data.drop(['Study ID', 'Sample ID'], axis=1, inplace=True)
data.dropna(subset=['Tumor Stage'], inplace=True)
data.dropna(subset=['Relapse Free Status'], inplace=True)

data['ER status measured by IHC'].replace({'Positve':'Positive'}, inplace=True)
data['Tumor Stage'].replace({0.0:0, 1.0:1, 2.0:2, 3.0:3, 4.0:4}, inplace=True)
data['Relapse Free Status'].replace({'0:Not Recurred':0, '1:Recurred':1}, inplace=True)
Cellularity_1 = ['Low', 'Moderate', 'High', np.nan] 
Chemotherapy_1 = ['NO', 'YES', np.nan] 
ER_status_measured_by_IHC_1 = ['Negative', 'Positive', np.nan] 
ER_Status_1 = ['Negative', 'Positive', np.nan] 
Neoplasm_Histologic_Grade_1 = [1, 2, 3, np.nan] 
HER2_Status_1 = ['Negative', 'Positive', np.nan] 
Hormone_Therapy_1 = ['NO', 'YES', np.nan] 
Inferred_Menopausal_State_1 = ['Pre', 'Post', np.nan] 
Integrative_Cluster_1 = ['1', '2', '3', '4ER-', '4ER+', '5', '6', '7', '8', '9', '10', np.nan] 
Primary_Tumor_Laterality_1 = ['Left', 'Right', np.nan] 
Overall_Survival_Status_1 = [0, 1, np.nan] 
PR_Status = ['Negative', 'Positive', np.nan] 
Radio_Therapy_1 = ['NO', 'YES', np.nan] 

numerical_feature = ['Age at Diagnosis',
                     'Lymph nodes examined positive', 
                     'Mutation Count', 
                     'Nottingham prognostic index', 
                     'Overall Survival (Months)',
                     'Relapse Free Status (Months)',
                     'Number of Samples Per Patient', 
                     'TMB (nonsynonymous)', 
                     'Tumor Size']
ordinal_feature = ['Cellularity', 
                   'Chemotherapy', 
                   'ER status measured by IHC', 
                   'ER Status', 
                   'Neoplasm Histologic Grade', 
                   'HER2 Status', 
                   'Hormone Therapy', 
                   'Inferred Menopausal State', 
                   'Integrative Cluster', 
                   'Primary Tumor Laterality', 
                   'Overall Survival Status', 
                   'PR Status', 
                   'Radio Therapy']

nominal_feature = ['Cancer Type', 
                   'Type of Breast Surgery', 
                   'Cancer Type Detailed', 
                   'Pam50 + Claudin-low subtype', 
                   'Cohort', 
                   'HER2 status measured by SNP6', 
                   'Tumor Other Histologic Subtype', 
                   'Oncotree Code', 
                   'Sample Type', 
                   'Sex', 
                   '3-Gene classifier subtype', 
                   "Patient's Vital Status"]
x = data.drop(['Patient ID', 'Tumor Stage', 'Relapse Free Status'], axis=1)
y_stage = data['Tumor Stage']
y_relapse = data['Relapse Free Status']
x_train, x_test, y_stage_train, y_stage_test, y_relapse_train, y_relapse_test = train_test_split(
    x, y_stage, y_relapse, test_size=0.2, random_state=42, stratify=y_stage
)
categorical_indices = [x.columns.get_loc(col) for col in nominal_feature if col in x.columns]
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(categories=[
            Cellularity_1,
            Chemotherapy_1,
            ER_status_measured_by_IHC_1,
            ER_Status_1,
            Neoplasm_Histologic_Grade_1,
            HER2_Status_1,
            Hormone_Therapy_1,
            Inferred_Menopausal_State_1,
            Integrative_Cluster_1,
            Primary_Tumor_Laterality_1,
            Overall_Survival_Status_1,
            PR_Status,
            Radio_Therapy_1]))
])

nominal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(transformers=[
    ('numerical_features', numerical_transformer, numerical_feature),
    ('ordinal_feature', ordinal_transformer, [
            'Cellularity',
            'Chemotherapy',
            'ER status measured by IHC',
            'ER Status',
            'Neoplasm Histologic Grade',
            'HER2 Status',
            'Hormone Therapy',
            'Inferred Menopausal State',
            'Integrative Cluster',
            'Primary Tumor Laterality',
            'Overall Survival Status',
            'PR Status',
            'Radio Therapy'
            ]
    ),
    ('nominal_feature', nominal_transformer, nominal_feature),    
])
# First, create a function to get categorical indices after transformation
def get_feature_names_and_categorical_indices(preprocessor, x):
    # Get feature names for all transformers
    feature_names = []
    categorical_indices = []
    current_idx = 0
    
    # Handle numerical features
    numerical_features = preprocessor.named_transformers_['numerical_features'].get_feature_names_out(numerical_feature)
    feature_names.extend(numerical_features)
    current_idx += len(numerical_features)
    
    # Handle ordinal features
    ordinal_features = preprocessor.named_transformers_['ordinal_feature'].get_feature_names_out([
        'Cellularity', 'Chemotherapy', 'ER status measured by IHC', 'ER Status',
        'Neoplasm Histologic Grade', 'HER2 Status', 'Hormone Therapy',
        'Inferred Menopausal State', 'Integrative Cluster', 'Primary Tumor Laterality',
        'Overall Survival Status', 'PR Status', 'Radio Therapy'
    ])
    feature_names.extend(ordinal_features)
    current_idx += len(ordinal_features)
    
    # Handle nominal features
    nominal_features = preprocessor.named_transformers_['nominal_feature'].get_feature_names_out(nominal_feature)
    # Add indices for all nominal features
    categorical_indices.extend(range(current_idx, current_idx + len(nominal_features)))
    feature_names.extend(nominal_features)
    
    return feature_names, categorical_indices

# Modified pipeline
def create_pipeline(x_train, categorical_indices):
    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('smote_nc', SMOTENC(
            categorical_features=categorical_indices,
            sampling_strategy='auto',
            random_state=42)
        ),
        ('feature_selector', SelectKBest(score_func=f_classif, k=15)),
        ('classifier', SVC())
    ])

# Fit the preprocessor first to get the transformed feature names
preprocessor.fit(x_train)
feature_names, categorical_indices = get_feature_names_and_categorical_indices(preprocessor, x_train)

# Create the pipeline with correct categorical indices
clf_stage = create_pipeline(x_train, categorical_indices)

# Rest of your code remains the same
param_stage = {
    'classifier__C': [0.1, 1, 10, 100],
    'classifier__gamma': ['scale', 'auto', 0.1, 1, 10],
    'classifier__kernel': ['rbf', 'linear', 'poly'],
    'classifier__class_weight': [None, 'balanced'],
}

model_stage = RandomizedSearchCV(
    estimator=clf_stage,
    param_distributions=param_stage,
    n_iter=100,
    scoring='f1_weighted',
    cv=6,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

# Fit the model
model_stage.fit(x_train, y_stage_train)

# Print results
print("Best score for Tumor Stage prediction:", model_stage.best_score_)
print("Best params for Tumor Stage prediction:", model_stage.best_params_)

y_stage_pred = model_stage.predict(x_test)
print(classification_report(y_stage_test, y_stage_pred))

  from pandas.core import (
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['ER status measured by IHC'].replace({'Positve':'Positive'}, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Tumor Stage'].replace({0.0:0, 1.0:1, 2.0:2, 3.0:3, 4.0:4}, inplace=True)
The behavior will change in pandas 3.0. This 

ValueError: Found unknown categories ['1:DECEASED', '0:LIVING'] in column 10 during fit

In [None]:
# Modified pipeline
def create_pipeline_relapse(x_train, categorical_indices):
    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('feature_selector', SelectKBest(score_func=f_classif, k=33)),
        ('classifier', SVC())
    ])

# Fit the preprocessor first to get the transformed feature names
preprocessor.fit(x_train)
feature_names, categorical_indices = get_feature_names_and_categorical_indices(preprocessor, x_train)

# Create the pipeline with correct categorical indices
clf_relapse = create_pipeline_relapse(x_train, categorical_indices)

param_relapse = {
    'classifier__C': [0.1, 1, 10, 100],
    'classifier__gamma': ['scale', 'auto', 0.1, 1, 10],
    'classifier__kernel': ['rbf', 'linear', 'poly'],
    'classifier__class_weight': [None, 'balanced'],
    # 'feature_selector__percentile':[75,80,85,90]
}

model_relapse = RandomizedSearchCV(
    estimator=clf_relapse,
    param_distributions=param_relapse,
    n_iter=100,
    scoring='f1',
    cv=6,
    verbose=1,
    n_jobs=1,
    random_state=42
)

model_relapse.fit(x_train, y_relapse_train)
print("Best score for Relapse Free Status prediction:", model_relapse.best_score_)
print("Best params for Relapse Free Status prediction:", model_relapse.best_params_)

y_relapse_pred = model_relapse.predict(x_test)
print(classification_report(y_relapse_test, y_relapse_pred))

In [83]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTENC, ADASYN
from imblearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from missforest import MissForest
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import chi2, SelectPercentile, f_classif, SelectKBest
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.metrics import classification_report
from sklearn.inspection import permutation_importance
from lazypredict.Supervised import LazyClassifier

In [84]:
data = pd.read_csv('../Data/MedIntel_BC_Vault42/JUNO_Protected_BC_Clinical.tsv', sep="\t")

### Drop

In [86]:
data.drop(['Study ID', 'Sample ID'], axis=1, inplace=True)
data.dropna(subset=['Tumor Stage'], inplace=True)
data.dropna(subset=['Relapse Free Status'], inplace=True)

### Unique values

In [95]:
# imblanced data
data['Tumor Stage'].value_counts()

Tumor Stage
2.00    976
1.00    624
3.00    143
0.00     13
4.00     11
Name: count, dtype: int64

In [96]:
data['Relapse Free Status'].value_counts()

Relapse Free Status
0:Not Recurred    1039
1:Recurred         728
Name: count, dtype: int64

### Replace

In [102]:
data['ER status measured by IHC'].replace({'Positve':'Positive'}, inplace=True)

In [103]:
data['Tumor Stage'].replace({0.0:0, 1.0:1, 2.0:2, 3.0:3, 4.0:4}, inplace=True)

In [104]:
# Relapse Free Status
# 0:Not Recurred
# 1:Recurred

data['Relapse Free Status'].replace({'0:Not Recurred':0, '1:Recurred':1}, inplace=True)

In [105]:
# Overall Survival Status
# 1:DECEASED    822
# 0:LIVING      643

data['Overall Survival Status'].replace({'1:DECEASED':1, '0:LIVING':0}, inplace=True)

### Features type

In [107]:
Cellularity_1 = ['Low', 'Moderate', 'High', np.nan]
Chemotherapy_1 = ['NO', 'YES', np.nan]
ER_status_measured_by_IHC_1 = ['Negative', 'Positive', np.nan]
ER_Status_1 = ['Negative', 'Positive', np.nan]
Neoplasm_Histologic_Grade_1 = [1, 2, 3, np.nan]
HER2_Status_1 = ['Negative', 'Positive', np.nan]
Hormone_Therapy_1 = ['NO', 'YES', np.nan]
Inferred_Menopausal_State_1 = ['Pre', 'Post', np.nan]
Integrative_Cluster_1 = ['1', '2', '3', '4ER-', '4ER+', '5', '6', '7', '8', '9', '10', np.nan]
Primary_Tumor_Laterality_1 = ['Left', 'Right', np.nan]
Overall_Survival_Status_1 = [0, 1, np.nan]
PR_Status = ['Negative', 'Positive', np.nan]
Radio_Therapy_1 = ['NO', 'YES', np.nan]
Relapse_Free_Status_1 = [0, 1]
Tumor_Stage_1 = [0, 1, 2, 3, 4]

In [108]:
numerical_feature = ['Age at Diagnosis',
                     'Lymph nodes examined positive', 
                     'Mutation Count', 
                     'Nottingham prognostic index', 
                     'Overall Survival (Months)',
                     'Relapse Free Status (Months)',
                     'Number of Samples Per Patient', 
                     'TMB (nonsynonymous)', 
                     'Tumor Size']

ordinal_feature = ['Cellularity', 
                   'Chemotherapy', 
                   'ER status measured by IHC', 
                   'ER Status', 
                   'Neoplasm Histologic Grade', 
                   'HER2 Status', 
                   'Hormone Therapy', 
                   'Inferred Menopausal State', 
                   'Integrative Cluster', 
                   'Primary Tumor Laterality', 
                   'Overall Survival Status', 
                   'PR Status', 
                   'Radio Therapy']

nominal_feature = ['Cancer Type', 
                   'Type of Breast Surgery', 
                   'Cancer Type Detailed', 
                   'Pam50 + Claudin-low subtype', 
                   'Cohort', 
                   'HER2 status measured by SNP6', 
                   'Tumor Other Histologic Subtype', 
                   'Oncotree Code', 
                   'Sample Type', 
                   'Sex', 
                   '3-Gene classifier subtype', 
                   "Patient's Vital Status"]


### Split by column

In [110]:
x = data.drop(['Patient ID', 'Tumor Stage', 'Relapse Free Status'], axis=1)
y_stage = data['Tumor Stage']
y_relapse = data['Relapse Free Status']

### Split by row

In [112]:
x_train, x_test, y_stage_train, y_stage_test, y_relapse_train, y_relapse_test = train_test_split(
    x, y_stage, y_relapse, test_size=0.2, random_state=42, stratify=y_stage
)

### Check data

### MissForest

In [115]:
# mf = MissForest()

# x_train = mf.fit_transform(
#     x=x_train,
#     categorical=numerical_feature + nominal_feature + ordinal_feature
# )

# x_test = mf.transform(x_test)

### ADASYN

In [119]:
# adasyn = ADASYN(
#     sampling_strategy = 'auto',
#     random_state = 42,
#     n_neighbors = 5
# )

# x_train, y_stage_train = adasyn.fit_resample(x_train, y_stage_train)

In [None]:
# Retrieve indices of nominal features
ordinal_indices = [x.columns.get_loc(col) for col in ordinal_feature if col in x.columns]

### Preprocessing

In [None]:
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(categories=[
            Cellularity_1,
            Chemotherapy_1,
            ER_status_measured_by_IHC_1,
            ER_Status_1,
            Neoplasm_Histologic_Grade_1,
            HER2_Status_1,
            Hormone_Therapy_1,
            Inferred_Menopausal_State_1,
            Integrative_Cluster_1,
            Primary_Tumor_Laterality_1,
            Overall_Survival_Status_1,
            PR_Status,
            Radio_Therapy_1]))
])

nominal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])


NameError: name 'PR_Status_1' is not defined

In [None]:
preprocessor = ColumnTransformer(transformers=[
    ('numerical_features', numerical_transformer, numerical_feature),
    ('ordinal_feature', ordinal_transformer, [
            'Cellularity',
            'Chemotherapy',
            'ER status measured by IHC',
            'ER Status',
            'Neoplasm Histologic Grade',
            'HER2 Status',
            'Hormone Therapy',
            'Inferred Menopausal State',
            'Integrative Cluster',
            'Primary Tumor Laterality',
            'Overall Survival Status',
            'PR Status',
            'Radio Therapy'
            ]
    ),
    ('nominal_feature', nominal_transformer, nominal_feature),    
])

### RandomizedSearchCV for Stage

In [None]:
# Stage
clf_stage = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote_nc', SMOTENC(
        categorical_features=categorical_indices,
        sampling_strategy='auto',
        random_state=42)
),
    ('feature_selector', SelectKBest(score_func=f_classif, k=15)),
    ('classifier', SVC())
])

param_stage = {
    'classifier__C': [0.1, 1, 10, 100],
    'classifier__gamma': ['scale', 'auto', 0.1, 1, 10],
    'classifier__kernel': ['rbf', 'linear', 'poly'],
    'classifier__class_weight': [None, 'balanced'],
}

# model_stage = GridSearchCV(
#     estimator=clf_stage,
#     param_grid=param_stage,
#     scoring='f1_weighted',
#     cv=6,
#     verbose=1,
#     n_jobs=1
# )

model_stage = RandomizedSearchCV(
    estimator=clf_stage,
    param_distributions=param_stage,
    n_iter=100,
    scoring='f1_weighted',
    cv=6,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

model_stage.fit(x_train, y_stage_train)
print("Best score for Tumor Stage prediction:", model_stage.best_score_)
print("Best params for Tumor Stage prediction:", model_stage.best_params_)

y_stage_pred = model_stage.predict(x_test)
print(classification_report(y_stage_test, y_stage_pred))

Fitting 6 folds for each of 100 candidates, totalling 600 fits


  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = ms

ValueError: could not broadcast input array from shape (9,429) into shape (10,429)

In [None]:
# Fill missing values by MissForest, SVM
'''
Best score for Tumor Stage prediction: 0.8505164966768697
Best params for Tumor Stage prediction: {'classifier__kernel': 'rbf', 'classifier__gamma': 'auto', 'classifier__class_weight': None, 'classifier__C': 10}
              precision    recall  f1-score   support

         0.0       1.00      0.67      0.80         3
         1.0       0.91      0.89      0.90       125
         2.0       0.87      0.93      0.90       195
         3.0       0.80      0.55      0.65        29
         4.0       0.00      0.00      0.00         2

    accuracy                           0.88       354
   macro avg       0.72      0.61      0.65       354
weighted avg       0.87      0.88      0.87       354
'''

"\nBest score for Tumor Stage prediction: 0.8505164966768697\nBest params for Tumor Stage prediction: {'classifier__kernel': 'rbf', 'classifier__gamma': 'auto', 'classifier__class_weight': None, 'classifier__C': 10}\n              precision    recall  f1-score   support\n\n         0.0       1.00      0.67      0.80         3\n         1.0       0.91      0.89      0.90       125\n         2.0       0.87      0.93      0.90       195\n         3.0       0.80      0.55      0.65        29\n         4.0       0.00      0.00      0.00         2\n\n    accuracy                           0.88       354\n   macro avg       0.72      0.61      0.65       354\nweighted avg       0.87      0.88      0.87       354\n"

In [None]:
# Fill mising values by median, mode (SVM)
'''
Best score for Tumor Stage prediction: 0.8497575825306799
Best params for Tumor Stage prediction: {'classifier__kernel': 'rbf', 'classifier__gamma': 'auto', 'classifier__class_weight': None, 'classifier__C': 10}
              precision    recall  f1-score   support

         0.0       1.00      0.67      0.80         3
         1.0       0.90      0.90      0.90       125
         2.0       0.87      0.93      0.90       195
         3.0       0.84      0.55      0.67        29
         4.0       0.00      0.00      0.00         2

    accuracy                           0.88       354
   macro avg       0.72      0.61      0.65       354
weighted avg       0.88      0.88      0.88       354
'''

"\nBest score for Tumor Stage prediction: 0.8497575825306799\nBest params for Tumor Stage prediction: {'classifier__kernel': 'rbf', 'classifier__gamma': 'auto', 'classifier__class_weight': None, 'classifier__C': 10}\n              precision    recall  f1-score   support\n\n         0.0       1.00      0.67      0.80         3\n         1.0       0.90      0.90      0.90       125\n         2.0       0.87      0.93      0.90       195\n         3.0       0.84      0.55      0.67        29\n         4.0       0.00      0.00      0.00         2\n\n    accuracy                           0.88       354\n   macro avg       0.72      0.61      0.65       354\nweighted avg       0.88      0.88      0.88       354\n"

### RandomizedSearchCV for Relapse

In [None]:
# Relapse
clf_relapse = Pipeline(steps=[
    ('preprocessor', preprocessor),
#     ('adasyn', ADASYN(
#     sampling_strategy = 'auto',
#     random_state = 42,
#     n_neighbors = 5
# )),
    # ('feature_selector', SelectPercentile()),
    ('feature_selector', SelectKBest(score_func=f_classif, k=33)),
    ('classifier', SVC())
])

param_relapse = {
    'classifier__C': [0.1, 1, 10, 100],
    'classifier__gamma': ['scale', 'auto', 0.1, 1, 10],
    'classifier__kernel': ['rbf', 'linear', 'poly'],
    'classifier__class_weight': [None, 'balanced'],
    # 'feature_selector__percentile':[75,80,85,90]
}

model_relapse = RandomizedSearchCV(
    estimator=clf_relapse,
    param_distributions=param_relapse,
    n_iter=100,
    scoring='f1',
    cv=6,
    verbose=1,
    n_jobs=1,
    random_state=42
)

model_relapse.fit(x_train, y_relapse_train)
print("Best score for Relapse Free Status prediction:", model_relapse.best_score_)
print("Best params for Relapse Free Status prediction:", model_relapse.best_params_)

y_relapse_pred = model_relapse.predict(x_test)
print(classification_report(y_relapse_test, y_relapse_pred))

Fitting 6 folds for each of 100 candidates, totalling 600 fits
Best score for Relapse Free Status prediction: 0.9169176522458726
Best params for Relapse Free Status prediction: {'classifier__kernel': 'rbf', 'classifier__gamma': 'auto', 'classifier__class_weight': None, 'classifier__C': 10}
              precision    recall  f1-score   support

           0       0.95      0.93      0.94       213
           1       0.90      0.92      0.91       141

    accuracy                           0.93       354
   macro avg       0.93      0.93      0.93       354
weighted avg       0.93      0.93      0.93       354



In [None]:
# Fill missing values by MissForest (SVM)
'''
Fitting 6 folds for each of 100 candidates, totalling 600 fits
Best score for Relapse Free Status prediction: 0.9158707252858679
Best params for Relapse Free Status prediction: {'classifier__kernel': 'rbf', 'classifier__gamma': 'scale', 'classifier__class_weight': 'balanced', 'classifier__C': 10}
              precision    recall  f1-score   support

           0       0.94      0.92      0.93       213
           1       0.88      0.91      0.90       141

    accuracy                           0.92       354
   macro avg       0.91      0.91      0.91       354
weighted avg       0.92      0.92      0.92       354
'''

"\nFitting 6 folds for each of 100 candidates, totalling 600 fits\nBest score for Relapse Free Status prediction: 0.9158707252858679\nBest params for Relapse Free Status prediction: {'classifier__kernel': 'rbf', 'classifier__gamma': 'scale', 'classifier__class_weight': 'balanced', 'classifier__C': 10}\n              precision    recall  f1-score   support\n\n           0       0.94      0.92      0.93       213\n           1       0.88      0.91      0.90       141\n\n    accuracy                           0.92       354\n   macro avg       0.91      0.91      0.91       354\nweighted avg       0.92      0.92      0.92       354\n"

In [None]:
# Fill mising values by median, mode (SVM)
'''
Fitting 6 folds for each of 100 candidates, totalling 600 fits
Best score for Relapse Free Status prediction: 0.9271202868407643
Best params for Relapse Free Status prediction: {'classifier__kernel': 'rbf', 'classifier__gamma': 'scale', 'classifier__class_weight': 'balanced', 'classifier__C': 10}
              precision    recall  f1-score   support

           0       0.94      0.95      0.94       213
           1       0.92      0.91      0.91       141

    accuracy                           0.93       354
   macro avg       0.93      0.93      0.93       354
weighted avg       0.93      0.93      0.93       354
'''

"\nFitting 6 folds for each of 100 candidates, totalling 600 fits\nBest score for Relapse Free Status prediction: 0.9271202868407643\nBest params for Relapse Free Status prediction: {'classifier__kernel': 'rbf', 'classifier__gamma': 'scale', 'classifier__class_weight': 'balanced', 'classifier__C': 10}\n              precision    recall  f1-score   support\n\n           0       0.94      0.95      0.94       213\n           1       0.92      0.91      0.91       141\n\n    accuracy                           0.93       354\n   macro avg       0.93      0.93      0.93       354\nweighted avg       0.93      0.93      0.93       354\n"

### Feature importance

In [None]:
# Stage
result_stage = permutation_importance(model_stage.best_estimator_, x_test, y_stage_test, n_repeats=10, random_state=42)
sorted_idx_stage = result_stage.importances_mean.argsort()[::-1]

print("Feature importance for Tumor Stage:")
for i in sorted_idx_stage:
    print(f"{x.columns[i]}: {result_stage.importances_mean[i]:.3f}")

Feature importance for Tumor Stage:
Tumor Size: 0.188
Lymph nodes examined positive: 0.154
Nottingham prognostic index: 0.099
Patient's Vital Status: 0.047
Overall Survival Status: 0.046
Cohort: 0.042
Hormone Therapy: 0.028
ER Status: 0.014
Oncotree Code: 0.007
Cancer Type Detailed: 0.007
Pam50 + Claudin-low subtype: 0.002
Cellularity: 0.000
Chemotherapy: 0.000
ER status measured by IHC: 0.000
Tumor Other Histologic Subtype: 0.000
Neoplasm Histologic Grade: 0.000
Cancer Type: 0.000
HER2 status measured by SNP6: 0.000
Type of Breast Surgery: 0.000
HER2 Status: 0.000
Integrative Cluster: 0.000
Inferred Menopausal State: 0.000
Primary Tumor Laterality: 0.000
Mutation Count: 0.000
Overall Survival (Months): 0.000
PR Status: 0.000
Relapse Free Status (Months): 0.000
Number of Samples Per Patient: 0.000
Sample Type: 0.000
Sex: 0.000
3-Gene classifier subtype: 0.000
TMB (nonsynonymous): 0.000
Age at Diagnosis: 0.000
Radio Therapy: -0.010


In [None]:
'''
Feature importance for Tumor Stage:
Tumor Size: 0.212
Nottingham prognostic index: 0.170
Lymph nodes examined positive: 0.126
Cohort: 0.060
Neoplasm Histologic Grade: 0.055
Chemotherapy: 0.026
Hormone Therapy: 0.020
Overall Survival (Months): 0.010
Patient's Vital Status: 0.004
Type of Breast Surgery: 0.003
HER2 Status: 0.000
HER2 status measured by SNP6: 0.000
ER Status: 0.000
ER status measured by IHC: 0.000
Pam50 + Claudin-low subtype: 0.000
Cellularity: 0.000
Cancer Type: 0.000
Tumor Other Histologic Subtype: 0.000
Integrative Cluster: 0.000
Inferred Menopausal State: 0.000
Primary Tumor Laterality: 0.000
Mutation Count: 0.000
Overall Survival Status: 0.000
PR Status: 0.000
...
TMB (nonsynonymous): 0.000
Age at Diagnosis: 0.000
Oncotree Code: -0.000
Cancer Type Detailed: -0.000
'''

"\nFeature importance for Tumor Stage:\nTumor Size: 0.212\nNottingham prognostic index: 0.170\nLymph nodes examined positive: 0.126\nCohort: 0.060\nNeoplasm Histologic Grade: 0.055\nChemotherapy: 0.026\nHormone Therapy: 0.020\nOverall Survival (Months): 0.010\nPatient's Vital Status: 0.004\nType of Breast Surgery: 0.003\nHER2 Status: 0.000\nHER2 status measured by SNP6: 0.000\nER Status: 0.000\nER status measured by IHC: 0.000\nPam50 + Claudin-low subtype: 0.000\nCellularity: 0.000\nCancer Type: 0.000\nTumor Other Histologic Subtype: 0.000\nIntegrative Cluster: 0.000\nInferred Menopausal State: 0.000\nPrimary Tumor Laterality: 0.000\nMutation Count: 0.000\nOverall Survival Status: 0.000\nPR Status: 0.000\n...\nTMB (nonsynonymous): 0.000\nAge at Diagnosis: 0.000\nOncotree Code: -0.000\nCancer Type Detailed: -0.000\n"

In [None]:
# Relapse
result_relapse = permutation_importance(model_relapse.best_estimator_, x_test, y_relapse_test, n_repeats=10, random_state=42)
sorted_idx_relapse = result_relapse.importances_mean.argsort()[::-1]

print("Feature importance for Relapse Free Status:")
for i in sorted_idx_relapse:
    print(f"{x.columns[i]}: {result_relapse.importances_mean[i]:.3f}")

Feature importance for Relapse Free Status:
Relapse Free Status (Months): 0.340
Patient's Vital Status: 0.224
Overall Survival (Months): 0.131
Overall Survival Status: 0.021
Cohort: 0.013
Nottingham prognostic index: 0.006
Pam50 + Claudin-low subtype: 0.003
Inferred Menopausal State: 0.001
Radio Therapy: 0.000
HER2 Status: 0.000
Lymph nodes examined positive: 0.000
Chemotherapy: 0.000
Cellularity: 0.000
Tumor Other Histologic Subtype: 0.000
Cancer Type: 0.000
Type of Breast Surgery: 0.000
ER status measured by IHC: 0.000
ER Status: 0.000
HER2 status measured by SNP6: 0.000
Integrative Cluster: 0.000
Hormone Therapy: 0.000
Primary Tumor Laterality: 0.000
Mutation Count: 0.000
PR Status: 0.000
Number of Samples Per Patient: 0.000
Sample Type: 0.000
Sex: 0.000
3-Gene classifier subtype: 0.000
TMB (nonsynonymous): 0.000
Oncotree Code: -0.001
Cancer Type Detailed: -0.001
Tumor Size: -0.001
Neoplasm Histologic Grade: -0.003
Age at Diagnosis: -0.005


In [None]:
'''
Feature importance for Relapse Free Status:
Patient's Vital Status: 0.299
Overall Survival Status: 0.040
Nottingham prognostic index: 0.027
Age at Diagnosis: 0.015
Tumor Size: 0.008
3-Gene classifier subtype: 0.007
Lymph nodes examined positive: 0.007
Neoplasm Histologic Grade: 0.005
Cohort: 0.004
Overall Survival (Months): 0.003
ER status measured by IHC: 0.002
HER2 Status: 0.001
PR Status: 0.001
Chemotherapy: 0.001
Inferred Menopausal State: 0.001
HER2 status measured by SNP6: 0.001
Tumor Other Histologic Subtype: 0.000
Cellularity: 0.000
Cancer Type Detailed: 0.000
ER Status: 0.000
Cancer Type: 0.000
Sex: 0.000
Hormone Therapy: 0.000
Sample Type: 0.000
...
Number of Samples Per Patient: 0.000
Integrative Cluster: 0.000
Type of Breast Surgery: -0.000
Pam50 + Claudin-low subtype: -0.001
'''

"\nFeature importance for Relapse Free Status:\nPatient's Vital Status: 0.299\nOverall Survival Status: 0.040\nNottingham prognostic index: 0.027\nAge at Diagnosis: 0.015\nTumor Size: 0.008\n3-Gene classifier subtype: 0.007\nLymph nodes examined positive: 0.007\nNeoplasm Histologic Grade: 0.005\nCohort: 0.004\nOverall Survival (Months): 0.003\nER status measured by IHC: 0.002\nHER2 Status: 0.001\nPR Status: 0.001\nChemotherapy: 0.001\nInferred Menopausal State: 0.001\nHER2 status measured by SNP6: 0.001\nTumor Other Histologic Subtype: 0.000\nCellularity: 0.000\nCancer Type Detailed: 0.000\nER Status: 0.000\nCancer Type: 0.000\nSex: 0.000\nHormone Therapy: 0.000\nSample Type: 0.000\n...\nNumber of Samples Per Patient: 0.000\nIntegrative Cluster: 0.000\nType of Breast Surgery: -0.000\nPam50 + Claudin-low subtype: -0.001\n"

### Lazy Prediction

In [None]:
clf_stage = LazyClassifier(verbose=2, ignore_warnings=True, custom_metric=None)
models_stage, predictions_stage = clf_stage.fit(x_train, x_test, y_stage_train, y_stage_test)
models_stage

  7%|▋         | 2/29 [00:00<00:04,  5.78it/s]

{'Model': 'AdaBoostClassifier', 'Accuracy': 0.4632768361581921, 'Balanced Accuracy': 0.35767285587975245, 'ROC AUC': None, 'F1 Score': 0.45237102898953413, 'Time taken': 0.22502827644348145}
{'Model': 'BaggingClassifier', 'Accuracy': 0.8389830508474576, 'Balanced Accuracy': 0.4176339522546419, 'ROC AUC': None, 'F1 Score': 0.8287520290196472, 'Time taken': 0.13560795783996582}
{'Model': 'BernoulliNB', 'Accuracy': 0.652542372881356, 'Balanced Accuracy': 0.5881740053050397, 'ROC AUC': None, 'F1 Score': 0.6611408588122717, 'Time taken': 0.033622026443481445}


 14%|█▍        | 4/29 [00:02<00:14,  1.70it/s]

{'Model': 'CalibratedClassifierCV', 'Accuracy': 0.7401129943502824, 'Balanced Accuracy': 0.339746065428824, 'ROC AUC': None, 'F1 Score': 0.7136395110610888, 'Time taken': 1.6335129737854004}
{'Model': 'DecisionTreeClassifier', 'Accuracy': 0.827683615819209, 'Balanced Accuracy': 0.41767922192749773, 'ROC AUC': None, 'F1 Score': 0.8219972238811024, 'Time taken': 0.036805152893066406}
{'Model': 'DummyClassifier', 'Accuracy': 0.5508474576271186, 'Balanced Accuracy': 0.2, 'ROC AUC': None, 'F1 Score': 0.3913124015930351, 'Time taken': 0.019508838653564453}
{'Model': 'ExtraTreeClassifier', 'Accuracy': 0.5903954802259888, 'Balanced Accuracy': 0.3015313881520778, 'ROC AUC': None, 'F1 Score': 0.5966586771671519, 'Time taken': 0.02172112464904785}


 41%|████▏     | 12/29 [00:02<00:02,  6.02it/s]

{'Model': 'ExtraTreesClassifier', 'Accuracy': 0.731638418079096, 'Balanced Accuracy': 0.35268611847922193, 'ROC AUC': None, 'F1 Score': 0.7117327613092239, 'Time taken': 0.25832605361938477}
{'Model': 'GaussianNB', 'Accuracy': 0.1016949152542373, 'Balanced Accuracy': 0.36235048629531386, 'ROC AUC': None, 'F1 Score': 0.05448169135178577, 'Time taken': 0.028068065643310547}
{'Model': 'KNeighborsClassifier', 'Accuracy': 0.6779661016949152, 'Balanced Accuracy': 0.3935434129089302, 'ROC AUC': None, 'F1 Score': 0.663132143783563, 'Time taken': 0.034979820251464844}
{'Model': 'LabelPropagation', 'Accuracy': 0.655367231638418, 'Balanced Accuracy': 0.3391603890362511, 'ROC AUC': None, 'F1 Score': 0.6445995716341552, 'Time taken': 0.11128425598144531}


 48%|████▊     | 14/29 [00:02<00:02,  6.02it/s]

{'Model': 'LabelSpreading', 'Accuracy': 0.655367231638418, 'Balanced Accuracy': 0.3391603890362511, 'ROC AUC': None, 'F1 Score': 0.6445995716341552, 'Time taken': 0.17744088172912598}
{'Model': 'LinearDiscriminantAnalysis', 'Accuracy': 0.7203389830508474, 'Balanced Accuracy': 0.44460053050397874, 'ROC AUC': None, 'F1 Score': 0.7171785653141585, 'Time taken': 0.151580810546875}


 52%|█████▏    | 15/29 [00:03<00:02,  4.74it/s]

{'Model': 'LinearSVC', 'Accuracy': 0.731638418079096, 'Balanced Accuracy': 0.40932979664014146, 'ROC AUC': None, 'F1 Score': 0.7128310449829357, 'Time taken': 0.4477219581604004}


 66%|██████▌   | 19/29 [00:04<00:01,  5.52it/s]

{'Model': 'LogisticRegression', 'Accuracy': 0.7853107344632768, 'Balanced Accuracy': 0.45861856763925735, 'ROC AUC': None, 'F1 Score': 0.7742781809404361, 'Time taken': 0.5811638832092285}
{'Model': 'NearestCentroid', 'Accuracy': 0.5790960451977402, 'Balanced Accuracy': 0.6476328912466843, 'ROC AUC': None, 'F1 Score': 0.598129380610283, 'Time taken': 0.06233525276184082}
{'Model': 'PassiveAggressiveClassifier', 'Accuracy': 0.6666666666666666, 'Balanced Accuracy': 0.4321329796640141, 'ROC AUC': None, 'F1 Score': 0.6683951815699066, 'Time taken': 0.0812380313873291}


 72%|███████▏  | 21/29 [00:04<00:01,  6.22it/s]

{'Model': 'Perceptron', 'Accuracy': 0.6440677966101694, 'Balanced Accuracy': 0.3323529619805482, 'ROC AUC': None, 'F1 Score': 0.5976085332280233, 'Time taken': 0.06608700752258301}
{'Model': 'QuadraticDiscriminantAnalysis', 'Accuracy': 0.12429378531073447, 'Balanced Accuracy': 0.21997948717948718, 'ROC AUC': None, 'F1 Score': 0.09241144304819163, 'Time taken': 0.15408778190612793}


 76%|███████▌  | 22/29 [00:04<00:01,  5.00it/s]

{'Model': 'RandomForestClassifier', 'Accuracy': 0.844632768361582, 'Balanced Accuracy': 0.3969032714412025, 'ROC AUC': None, 'F1 Score': 0.8189822055354484, 'Time taken': 0.37737488746643066}
{'Model': 'RidgeClassifier', 'Accuracy': 0.6864406779661016, 'Balanced Accuracy': 0.3097931034482758, 'ROC AUC': None, 'F1 Score': 0.6586647855822313, 'Time taken': 0.06453895568847656}


 86%|████████▌ | 25/29 [00:05<00:00,  6.15it/s]

{'Model': 'RidgeClassifierCV', 'Accuracy': 0.6949152542372882, 'Balanced Accuracy': 0.3134443854995579, 'ROC AUC': None, 'F1 Score': 0.6666861566617848, 'Time taken': 0.14592409133911133}
{'Model': 'SGDClassifier', 'Accuracy': 0.7146892655367232, 'Balanced Accuracy': 0.3750974358974359, 'ROC AUC': None, 'F1 Score': 0.6807290573912255, 'Time taken': 0.13666296005249023}


 90%|████████▉ | 26/29 [00:05<00:00,  5.16it/s]

{'Model': 'SVC', 'Accuracy': 0.7824858757062146, 'Balanced Accuracy': 0.4371083996463307, 'ROC AUC': None, 'F1 Score': 0.7619430812314794, 'Time taken': 0.30628204345703125}


 97%|█████████▋| 28/29 [00:06<00:00,  3.39it/s]

{'Model': 'XGBClassifier', 'Accuracy': 0.8587570621468926, 'Balanced Accuracy': 0.5186603006189212, 'ROC AUC': None, 'F1 Score': 0.8534193011259618, 'Time taken': 0.9180681705474854}


100%|██████████| 29/29 [00:09<00:00,  3.02it/s]

{'Model': 'LGBMClassifier', 'Accuracy': 0.8700564971751412, 'Balanced Accuracy': 0.5186150309460654, 'ROC AUC': None, 'F1 Score': 0.8612237917319644, 'Time taken': 3.330993890762329}





Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
NearestCentroid,0.58,0.65,,0.6,0.06
BernoulliNB,0.65,0.59,,0.66,0.03
XGBClassifier,0.86,0.52,,0.85,0.92
LGBMClassifier,0.87,0.52,,0.86,3.33
LogisticRegression,0.79,0.46,,0.77,0.58
LinearDiscriminantAnalysis,0.72,0.44,,0.72,0.15
SVC,0.78,0.44,,0.76,0.31
PassiveAggressiveClassifier,0.67,0.43,,0.67,0.08
DecisionTreeClassifier,0.83,0.42,,0.82,0.04
BaggingClassifier,0.84,0.42,,0.83,0.14


In [None]:
clf_relapse = LazyClassifier(verbose=2, ignore_warnings=True, custom_metric=None)
models_relapse, predictions_relapse = clf_relapse.fit(x_train, x_test, y_relapse_train, y_relapse_test)
models_relapse

  0%|          | 0/29 [00:00<?, ?it/s]

  7%|▋         | 2/29 [00:05<01:05,  2.44s/it]

{'Model': 'AdaBoostClassifier', 'Accuracy': 0.9124293785310734, 'Balanced Accuracy': 0.9056537808410748, 'ROC AUC': 0.9056537808410748, 'F1 Score': 0.9121469802023984, 'Time taken': 5.63745379447937}
{'Model': 'BaggingClassifier', 'Accuracy': 0.9322033898305084, 'Balanced Accuracy': 0.9256817500749175, 'ROC AUC': 0.9256817500749176, 'F1 Score': 0.9319372799475968, 'Time taken': 0.19000005722045898}
{'Model': 'BernoulliNB', 'Accuracy': 0.8305084745762712, 'Balanced Accuracy': 0.8064129457596644, 'ROC AUC': 0.8064129457596644, 'F1 Score': 0.8263930329100111, 'Time taken': 0.029970884323120117}


 21%|██        | 6/29 [00:06<00:14,  1.63it/s]

{'Model': 'CalibratedClassifierCV', 'Accuracy': 0.8983050847457628, 'Balanced Accuracy': 0.8927180101887924, 'ROC AUC': 0.8927180101887924, 'F1 Score': 0.8981794935620375, 'Time taken': 0.6338169574737549}
{'Model': 'DecisionTreeClassifier', 'Accuracy': 0.8954802259887006, 'Balanced Accuracy': 0.8939666366996304, 'ROC AUC': 0.8939666366996303, 'F1 Score': 0.8957700132406706, 'Time taken': 0.07264208793640137}
{'Model': 'DummyClassifier', 'Accuracy': 0.6016949152542372, 'Balanced Accuracy': 0.5, 'ROC AUC': 0.5, 'F1 Score': 0.45206707918572325, 'Time taken': 0.050186872482299805}
{'Model': 'ExtraTreeClassifier', 'Accuracy': 0.7768361581920904, 'Balanced Accuracy': 0.7558185995405055, 'ROC AUC': 0.7558185995405053, 'F1 Score': 0.7735392787757972, 'Time taken': 0.03584909439086914}


 41%|████▏     | 12/29 [00:07<00:04,  4.08it/s]

{'Model': 'ExtraTreesClassifier', 'Accuracy': 0.8898305084745762, 'Balanced Accuracy': 0.862900809109979, 'ROC AUC': 0.862900809109979, 'F1 Score': 0.8859136084056203, 'Time taken': 0.34836411476135254}
{'Model': 'GaussianNB', 'Accuracy': 0.4152542372881356, 'Balanced Accuracy': 0.5128858255918489, 'ROC AUC': 0.512885825591849, 'F1 Score': 0.2671213968267367, 'Time taken': 0.028701305389404297}
{'Model': 'KNeighborsClassifier', 'Accuracy': 0.864406779661017, 'Balanced Accuracy': 0.8525621816002398, 'ROC AUC': 0.8525621816002398, 'F1 Score': 0.8634692253514685, 'Time taken': 0.03623485565185547}
{'Model': 'LabelPropagation', 'Accuracy': 0.8135593220338984, 'Balanced Accuracy': 0.7947258016182199, 'ROC AUC': 0.79472580161822, 'F1 Score': 0.8109825927622538, 'Time taken': 0.11011099815368652}
{'Model': 'LabelSpreading', 'Accuracy': 0.8135593220338984, 'Balanced Accuracy': 0.7947258016182199, 'ROC AUC': 0.79472580161822, 'F1 Score': 0.8109825927622538, 'Time taken': 0.1361219882965088}


 52%|█████▏    | 15/29 [00:08<00:04,  3.39it/s]

{'Model': 'LinearDiscriminantAnalysis', 'Accuracy': 0.8926553672316384, 'Balanced Accuracy': 0.8844271301568275, 'ROC AUC': 0.8844271301568276, 'F1 Score': 0.892234026583695, 'Time taken': 0.8874900341033936}
{'Model': 'LinearSVC', 'Accuracy': 0.9011299435028248, 'Balanced Accuracy': 0.8962641094795725, 'ROC AUC': 0.8962641094795725, 'F1 Score': 0.9010697910352717, 'Time taken': 0.16100311279296875}


 55%|█████▌    | 16/29 [00:09<00:05,  2.38it/s]

{'Model': 'LogisticRegression', 'Accuracy': 0.8983050847457628, 'Balanced Accuracy': 0.8927180101887924, 'ROC AUC': 0.8927180101887924, 'F1 Score': 0.8981794935620375, 'Time taken': 0.9350478649139404}
{'Model': 'NearestCentroid', 'Accuracy': 0.807909604519774, 'Balanced Accuracy': 0.8032164618919189, 'ROC AUC': 0.8032164618919189, 'F1 Score': 0.8085384718859295, 'Time taken': 0.07129025459289551}


 62%|██████▏   | 18/29 [00:09<00:03,  2.96it/s]

{'Model': 'NuSVC', 'Accuracy': 0.9124293785310734, 'Balanced Accuracy': 0.8984616921386475, 'ROC AUC': 0.8984616921386475, 'F1 Score': 0.9113007584852241, 'Time taken': 0.30626988410949707}
{'Model': 'PassiveAggressiveClassifier', 'Accuracy': 0.8757062146892656, 'Balanced Accuracy': 0.8571571271601239, 'ROC AUC': 0.8571571271601239, 'F1 Score': 0.8734994878003353, 'Time taken': 0.07038712501525879}


 69%|██████▉   | 20/29 [00:09<00:02,  3.86it/s]

{'Model': 'Perceptron', 'Accuracy': 0.9067796610169492, 'Balanced Accuracy': 0.9021576266107283, 'ROC AUC': 0.9021576266107283, 'F1 Score': 0.9067229458332563, 'Time taken': 0.13995909690856934}


 72%|███████▏  | 21/29 [00:10<00:02,  3.53it/s]

{'Model': 'QuadraticDiscriminantAnalysis', 'Accuracy': 0.4915254237288136, 'Balanced Accuracy': 0.5750674258315852, 'ROC AUC': 0.5750674258315852, 'F1 Score': 0.41024054474132193, 'Time taken': 0.38138604164123535}


 76%|███████▌  | 22/29 [00:10<00:02,  3.31it/s]

{'Model': 'RandomForestClassifier', 'Accuracy': 0.9124293785310734, 'Balanced Accuracy': 0.8948656477874337, 'ROC AUC': 0.8948656477874338, 'F1 Score': 0.9107840077980881, 'Time taken': 0.3698239326477051}
{'Model': 'RidgeClassifier', 'Accuracy': 0.8926553672316384, 'Balanced Accuracy': 0.8844271301568275, 'ROC AUC': 0.8844271301568276, 'F1 Score': 0.892234026583695, 'Time taken': 0.07842087745666504}


 83%|████████▎ | 24/29 [00:11<00:01,  3.67it/s]

{'Model': 'RidgeClassifierCV', 'Accuracy': 0.8898305084745762, 'Balanced Accuracy': 0.8820797123164519, 'ROC AUC': 0.8820797123164519, 'F1 Score': 0.8894752331578558, 'Time taken': 0.3719167709350586}
{'Model': 'SGDClassifier', 'Accuracy': 0.8728813559322034, 'Balanced Accuracy': 0.882379382679053, 'ROC AUC': 0.882379382679053, 'F1 Score': 0.8740946452397136, 'Time taken': 0.08368301391601562}


 90%|████████▉ | 26/29 [00:11<00:00,  4.08it/s]

{'Model': 'SVC', 'Accuracy': 0.9124293785310734, 'Balanced Accuracy': 0.9044550993906703, 'ROC AUC': 0.9044550993906704, 'F1 Score': 0.9120226890188139, 'Time taken': 0.3033111095428467}


 97%|█████████▋| 28/29 [00:11<00:00,  4.88it/s]

{'Model': 'XGBClassifier', 'Accuracy': 0.9322033898305084, 'Balanced Accuracy': 0.9316751573269404, 'ROC AUC': 0.9316751573269403, 'F1 Score': 0.9323561810508553, 'Time taken': 0.25110292434692383}


100%|██████████| 29/29 [00:12<00:00,  2.35it/s]

{'Model': 'LGBMClassifier', 'Accuracy': 0.9293785310734464, 'Balanced Accuracy': 0.9257316951353511, 'ROC AUC': 0.9257316951353511, 'F1 Score': 0.9293355650251941, 'Time taken': 0.5667948722839355}





Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
XGBClassifier,0.93,0.93,0.93,0.93,0.25
LGBMClassifier,0.93,0.93,0.93,0.93,0.57
BaggingClassifier,0.93,0.93,0.93,0.93,0.19
AdaBoostClassifier,0.91,0.91,0.91,0.91,5.64
SVC,0.91,0.9,0.9,0.91,0.3
Perceptron,0.91,0.9,0.9,0.91,0.14
NuSVC,0.91,0.9,0.9,0.91,0.31
LinearSVC,0.9,0.9,0.9,0.9,0.16
RandomForestClassifier,0.91,0.89,0.89,0.91,0.37
DecisionTreeClassifier,0.9,0.89,0.89,0.9,0.07
