In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from datetime import datetime

In [2]:
# Seed handed to the cross-validation splitter and to every classifier below.
RANDOM_STATE = 42
# How many leading rows of the biomarker matrix are kept (top genes per subtype).
THRESHOLD = 10
# Cohort whose data folder is loaded; only the names checked below are supported.
COHORT = 'TCGA_BRCA'
assert COHORT in ('TCGA_BRCA', 'TCGA_LUNG', 'TCGA_CRC', 'METABRIC')

In [3]:
# Dataset splits and omic layers that are read from disk.
LIST_TYPE_DATA = ['train', 'test']
LIST_OMICS = ['GE', 'CNA']
# Experiments to run: a name containing '_' is a multi-omics combination,
# any other name is a single omic.
LIST_EXP_OMICS = ['GE_CNA', 'GE', 'CNA']

ROOT_DATA_FOLDER = f'../input/deepssc-omics-pretrained-model-and-biomarkers/DeepSSC_data/{COHORT}/'
# One data directory per split, all below the cohort root.
DATA_FOLDER = {split: ROOT_DATA_FOLDER + f'data/{split}/' for split in LIST_TYPE_DATA}
BIOMARKER_FILE = ROOT_DATA_FOLDER + 'biomarkers/matrix_biomarkers.csv'
if COHORT == 'METABRIC':
    # Symbol-check table used to match gene names between the two cohorts.
    HGNC_SYMBOL_CHECK_FILE = ROOT_DATA_FOLDER + 'biomarkers/hgnc_symbol_check.csv'

In [4]:
# Read the label table and one feature table per omic for every split.
# dict_df_label[split]       -> DataFrame indexed by sampleID with a 'disease_subtypes' column
# dict_df_data[split][omic]  -> DataFrame indexed by sampleID, one column per gene feature
dict_df_label = {}
dict_df_data = {}
for type_data in LIST_TYPE_DATA:
    dict_df_label[type_data] = pd.read_csv(DATA_FOLDER[type_data] + f'df_label_{type_data}.csv', index_col='sampleID')

    dict_df_omics = {}
    for omic in LIST_OMICS:
        dict_df_omics[omic] = pd.read_csv(DATA_FOLDER[type_data] + f'df_{type_data}_{omic}_labeled.csv', index_col='sampleID')
    dict_df_data[type_data] = dict_df_omics

# Class names in alphabetical order; position i is the name of numeric label i.
LABEL_MAPPING_NAME = dict_df_label['train']['disease_subtypes'].astype('category').cat.categories

# Convert categorical labels to numerical labels.
# Every split is encoded against the TRAIN categories so that a given code means
# the same subtype everywhere. Encoding each split on its own would shift the
# codes whenever a split lacks a class or contains a different set of classes.
for type_data in LIST_TYPE_DATA:
    label_codes = pd.Categorical(
        dict_df_label[type_data]['disease_subtypes'],
        categories=LABEL_MAPPING_NAME,
    ).codes
    if (label_codes < 0).any():
        # -1 marks a missing label or a subtype that never occurs in the train split.
        raise ValueError(
            f"'{type_data}' labels contain missing values or subtypes absent from the train split"
        )
    # Plain column assignment replaces the column (and its dtype) instead of
    # writing integer codes into the existing string column in place.
    dict_df_label[type_data]['disease_subtypes'] = label_codes

#---------------------------------------------------------------------------------------
# Keep only biomarker genes found from TCGA data
print('-'*100)
print('KEEP ONLY BIOMARKER GENES FOUND FROM TCGA DATA')
# Only the first THRESHOLD rows of the biomarker matrix are used.
score_genes = pd.read_csv(BIOMARKER_FILE).iloc[:THRESHOLD]

# Flatten every cell into one list of gene symbols. Symbols are upper-cased
# BEFORE de-duplication so that case variants of one gene are counted once, and
# empty cells (NaN) are skipped instead of failing on `.upper()`.
# Sorting makes the list order reproducible between runs.
flat_gene_cells = score_genes.to_numpy(copy=True).reshape(-1)
top_genes = sorted({str(gene).upper() for gene in flat_gene_cells if pd.notna(gene)})
print(f'Top {THRESHOLD} from TCGA have {len(top_genes)} unique genes')

if COHORT == 'METABRIC':
    # Gene symbols selected from TCGA may be named differently in METABRIC.
    # This branch reports which selected genes are absent, replaces the ones
    # that have an approved symbol in the HGNC check table, reports again,
    # and finally overwrites `top_genes` with the corrected list.

    # Gene symbols available per omic: upper-cased train column names,
    # keeping only the part before the first '|'.
    origin_unique_gene = {}
    for omic in LIST_OMICS:
        origin_unique_gene[omic] = list(dict_df_data['train'][omic].columns.str.upper().str.split(r'\|').str[0].unique())

    print('\nBefore matching alias gene names ')
    # Selected genes that do not appear among each omic's available symbols.
    missing_genes = {}
    for omic in LIST_OMICS: 
        print(f'\tmissing {omic}:')
        missing_genes[omic] = list(set(top_genes) - set(origin_unique_gene[omic]))
        print('\t\t Number of missing',len(missing_genes[omic]))

    # Only genes missing from BOTH omics are candidates for alias replacement.
    print('\tmissing both GE and CNA:')
    missing_genes_both_GE_CNA = list(set(missing_genes['GE']).intersection(set(missing_genes['CNA'])))
    print('\t\t Number of missing ',len(missing_genes_both_GE_CNA))
    print('\t\t',missing_genes_both_GE_CNA)


    print('\nAfter matching alias gene names ')
    # Symbol-check table: indexed by the queried symbol ('Input'); both the index
    # and the 'Approved symbol' column are upper-cased to match `top_genes`.
    alias_gene_names = pd.read_csv(HGNC_SYMBOL_CHECK_FILE,index_col='Input')
    alias_gene_names.index = alias_gene_names.index.str.upper()
    alias_gene_names['Approved symbol'] = alias_gene_names['Approved symbol'].str.upper()

    # Keep only rows for genes missing from both omics that have a non-null approved symbol.
    alias_gene_names = alias_gene_names.loc[alias_gene_names.index.isin(missing_genes_both_GE_CNA)]
    alias_gene_names = alias_gene_names.loc[alias_gene_names['Approved symbol'].notna(),['Approved symbol']]

    # Match TCGA alias gene names against METABRIC
    top_genes_fixed = []
    list_input_alias_gene_names =  alias_gene_names.index.tolist()
    for gene_name in top_genes:
        if gene_name in list_input_alias_gene_names:
            # Replace gene_name with its approved symbol(s) that can be used in METABRIC.
            alias_names = alias_gene_names.loc[gene_name]['Approved symbol']
            if isinstance(alias_names, str):
                # The input symbol occurs once in the table: a single replacement.
                top_genes_fixed.append(alias_names)
            else:
                # A gene can have more than one suitable alias (the input symbol
                # occurs on several rows), so every approved symbol is added.
                top_genes_fixed.extend(alias_names.tolist())
        else:
            # The gene name either exists in METABRIC already or has no suitable
            # alias that can be used in METABRIC; keep it unchanged.
            top_genes_fixed.append(gene_name)
    print(f'Total top_genes_fixed {len(top_genes_fixed)}')

    # Repeat the missing-gene report with the corrected gene list.
    missing_genes = {}
    for omic in LIST_OMICS: 
        print(f'\tmissing {omic}:')
        missing_genes[omic] = list(set(top_genes_fixed) - set(origin_unique_gene[omic]))
        print('\t\t Number of missing',len(missing_genes[omic]))

    print('\tmissing both GE and CNA:')
    missing_genes_both_GE_CNA = list(set(missing_genes['GE']).intersection(set(missing_genes['CNA'])))
    print('\t\t Number of missing ',len(missing_genes_both_GE_CNA))
    print('\t\t',missing_genes_both_GE_CNA)

    # Downstream feature selection uses the corrected symbols.
    top_genes = top_genes_fixed

# GENE[omic] lists the train columns whose gene symbol (upper-cased text before
# the first '|') is one of the selected biomarkers. Every split is then reduced
# to exactly those columns, in train-column order.
GENE = {}
print("\nNum features/genes:")
for omic in LIST_OMICS:
    train_columns = dict_df_data['train'][omic].columns
    column_symbols = train_columns.str.upper().str.split(r'\|').str[0]
    is_biomarker = column_symbols.isin(top_genes)
    GENE[omic] = train_columns[is_biomarker].to_numpy(copy=True).tolist()

    print(f'\twith {omic} TOP {THRESHOLD}:', len(GENE[omic]))

    for type_data in LIST_TYPE_DATA:
        full_frame = dict_df_data[type_data][omic]
        # Deep copy so the reduced frame owns its data.
        dict_df_data[type_data][omic] = full_frame[GENE[omic]].copy(deep=True)

if COHORT == 'METABRIC':
    # Fill missing values with the mode of the same column, computed within the
    # same split and omic. Progress and null counts are printed before and after.
    for type_data in LIST_TYPE_DATA:
        print(type_data)
        for omic in LIST_OMICS:
            print(omic)
            df_omic = dict_df_data[type_data][omic]

            print('\tBefore imputing:')
            # (row, column) positions of every missing cell; `isna()` also works
            # for non-float columns, where `np.isnan` would raise.
            iloc_null = np.argwhere(df_omic.isna().to_numpy())
            iloc_gene_col_null = np.unique(iloc_null[:,1])
            print(f'\t\tNumber of null: {len(iloc_null)}')
            print(f'\t\tNumber of genes have null: {len(iloc_gene_col_null)}')

            if len(iloc_null) > 0:
                print('\tImputing:')
                for index in iloc_gene_col_null:
                    column = df_omic.iloc[:, index]
                    column_modes = column.mode()
                    if column_modes.empty:
                        # A column with no observed value has no mode; it is left
                        # as-is and shows up in the "After imputing" null count.
                        print(f'\t\tCannot impute {df_omic.columns[index]}: no observed values')
                        continue
                    # Assign the filled values back to the frame explicitly.
                    # Calling `fillna(..., inplace=True)` on `df.iloc[:, i]` acts on a
                    # temporary column object and is not guaranteed to update `df`.
                    df_omic.iloc[:, index] = column.fillna(column_modes.iloc[0]).to_numpy()
                print('\tAfter imputing:')
                iloc_null = np.argwhere(df_omic.isna().to_numpy())
                print(f'\t\tNumber of null: {len(iloc_null)}')

----------------------------------------------------------------------------------------------------
KEEP ONLY BIOMARKER GENES FOUND FROM TCGA DATA
Top 10 from TCGA have 46 unique genes

Num features/genes:
	with GE TOP 10: 46
	with CNA TOP 10: 15


In [5]:
# dict_X[split][experiment] -> 2-D feature array
# dict_y[split][experiment] -> 1-D array of numeric subtype labels
dict_X = {split: {} for split in LIST_TYPE_DATA}
dict_y = {split: {} for split in LIST_TYPE_DATA}

for type_omic in LIST_EXP_OMICS:
    # An experiment name containing '_' combines several omics.
    is_multi_omics = '_' in type_omic
    if is_multi_omics:
        print(f'Creating data for multi-omics experiment: {type_omic}')
    else:
        print(f'Creating data for single omic experiment: {type_omic}')

    for type_data in LIST_TYPE_DATA:
        split_frames = dict_df_data[type_data]
        if is_multi_omics:
            # Place the per-omic feature tables side by side (column-wise).
            dict_X[type_data][type_omic] = np.concatenate(
                tuple(split_frames[single_omic] for single_omic in type_omic.split('_')),
                axis=1,
            )
        else:
            dict_X[type_data][type_omic] = split_frames[type_omic].to_numpy(copy=True)

        # The labels are the same for every experiment; each gets its own copy.
        dict_y[type_data][type_omic] = dict_df_label[type_data]['disease_subtypes'].to_numpy(copy=True)

Creating data for multi-omics experiment: GE_CNA
Creating data for single omic experiment: GE
Creating data for single omic experiment: CNA


In [6]:
from IPython.display import Markdown, display
# Show every column, keep wide frames on one line, and never truncate cell text,
# so that result tables are displayed in full.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.expand_frame_repr', False),
    ('max_colwidth', None),
):
    pd.set_option(option_name, option_value)
def printmd(string, color=None):
    """Display ``string`` as Markdown in the notebook output.

    Parameters
    ----------
    string : object
        Text to render; it is converted with ``str`` formatting.
    color : str or None, optional
        CSS colour for the text. When given, the text is wrapped in a
        ``<span style='color:...'>`` element. When ``None`` (the default) the
        text is rendered without a wrapper, rather than with the invalid
        style ``color:None``.
    """
    if color is None:
        display(Markdown("{}".format(string)))
        return
    colorstr = "<span style='color:{}'>{}</span>".format(color, string)
    display(Markdown(colorstr))

In [7]:
def tuning_and_eval(gridcvs, X_train, y_train, X_test, y_test,
                    scoring, refit, is_binary_problem,
                    result_on_dataset, rank_hparams_info):
    """Fit every grid search on the train data and report its results.

    For each entry of ``gridcvs`` (processed in name order) the search is fitted
    on the train split, the best cross-validation score and parameters are
    printed, and the refitted best model is evaluated on each requested split.

    Parameters
    ----------
    gridcvs : dict
        Maps a display name to an unfitted grid-search object.
    X_train, y_train, X_test, y_test
        Features and labels of the train and test splits.
    scoring : list of str
        Metric names used by the searches; selects the ``cv_results_`` columns
        shown when ``rank_hparams_info`` is true.
    refit : str
        Metric the searches were refitted on; used to pick the reported
        cross-validation standard deviation and to sort the ranking table.
    is_binary_problem : bool
        If true, report binary F1 and ROC AUC (from ``predict_proba``);
        otherwise report macro and weighted F1.
    result_on_dataset : collection of str
        Splits to evaluate on ('train' and/or 'test'); must contain 'test'.
    rank_hparams_info : bool
        If true, display the ten best parameter settings per search.

    Returns
    -------
    None
        Everything is printed or displayed.
    """
    assert 'test' in result_on_dataset
    assert isinstance(rank_hparams_info, bool)

    started_at = datetime.now()
    features = {'train': X_train, 'test': X_test}
    targets = {'train': y_train, 'test': y_test}

    # Names are unique dictionary keys, so sorting the keys gives the same order
    # as sorting the (name, estimator) pairs.
    for name in sorted(gridcvs):
        search = gridcvs[name]
        printmd(f'{name} classifier:\n', color="blue")
        search.fit(features['train'], targets['train'])

        best_mean = search.best_score_ * 100
        best_std = search.cv_results_[f'std_test_{refit}'][search.best_index_] * 100
        print(f'%s | best {refit} score  %.2f%% +/- %.2f' %
              (name, best_mean, best_std))

        print('best parmams:', search.best_params_)
        if rank_hparams_info:
            print('\n')
            # rank / mean / std column for each metric, followed by the parameters.
            select_result_cols = [
                prefix + metric
                for metric in scoring
                for prefix in ('rank_test_', 'mean_test_', 'std_test_')
            ]
            select_result_cols.append('params')

            all_results = pd.DataFrame(search.cv_results_)
            ranked_results = all_results.loc[:, select_result_cols].sort_values(
                by=f'mean_test_{refit}', ascending=False)
            display(ranked_results[:10])

        for type_data in result_on_dataset:
            print('\n')
            print(f'Result on {type_data} dataset with best hyperparameters:')
            y_predict = search.predict(features[type_data])
            y_true = targets[type_data]

            acc = accuracy_score(y_true=y_true, y_pred=y_predict)
            print(f'{type_data}_acc: {acc * 100:.2f}')
            if not is_binary_problem:
                f1_macro = f1_score(y_true=y_true, y_pred=y_predict, average='macro')
                f1_weighted = f1_score(y_true=y_true, y_pred=y_predict, average='weighted')
                print(f'{type_data}_f1_macro: {f1_macro * 100:.2f}')
                print(f'{type_data}_f1_weighted: {f1_weighted * 100:.2f}')
            else:
                f1 = f1_score(y_true=y_true, y_pred=y_predict, average='binary')
                # Column 1 holds the probability of the class labelled 1.
                y_score = search.predict_proba(features[type_data])[:, 1]
                roc_auc = roc_auc_score(y_true=y_true, y_score=y_score)
                print(f'{type_data}_f1: {f1 * 100:.2f}')
                print(f'{type_data}_roc_auc: {roc_auc * 100:.2f}')

            report = classification_report(y_true=y_true,
                                           y_pred=y_predict,
                                           digits=4, output_dict=True)
            display(pd.DataFrame(report))
            # Confusion table: rows are true labels, columns are predictions.
            display(pd.crosstab(y_true, y_predict, margins=True))
            print('-'*100)
    print(f'Total Time: {datetime.now()-started_at}')

In [8]:
def validate_biomarker(dict_X_train, dict_y_train, dict_X_test, dict_y_test,
                       omics=['GE_CNA', 'GE','CNA'], random_state = RANDOM_STATE,
                       result_on_dataset = ['train','test'], rank_hparams_info = True,
                       is_binary_problem=False):
    """Tune three classifiers per experiment and report their performance.

    A logistic-regression (softmax), an RBF SVM and a random-forest classifier
    are each wrapped in a grid search with 5-fold, 10-times-repeated stratified
    cross-validation. For every name in ``omics`` the searches are fitted and
    evaluated through ``tuning_and_eval``.

    Parameters
    ----------
    dict_X_train, dict_y_train, dict_X_test, dict_y_test : dict
        Features / labels of the train and test splits, keyed by experiment name.
    omics : list of str
        Experiment names to validate; each must be a key of the four dicts.
    random_state : int
        Seed for the classifiers and the cross-validation splitter.
    result_on_dataset : list of str
        Splits to report on; must contain 'test'.
    rank_hparams_info : bool
        Whether the ten best parameter settings are displayed per search.
    is_binary_problem : bool
        Selects binary metrics (F1, accuracy, ROC AUC; refit on F1) instead of
        multi-class ones (macro F1, weighted F1, accuracy; refit on macro F1).

    Returns
    -------
    None
    """
    assert 'test' in result_on_dataset
    assert isinstance(rank_hparams_info, bool)

    if is_binary_problem:
        scoring, refit = ['f1', 'accuracy', 'roc_auc'], 'f1'
    else:
        scoring, refit = ['f1_macro', 'f1_weighted', 'accuracy'], 'f1_macro'

    # (display name, estimator, parameter grid) for each algorithm.
    # The numeric name prefixes fix the order in which the searches are run.
    candidates = [
        ('1_Softmax',
         LogisticRegression(random_state=random_state),
         [{
             'penalty': ['l2'],
             'multi_class': ["multinomial"],
             'solver': ["newton-cg"],
             'class_weight': ["balanced"],
             'C': np.power(10., np.arange(-4, 3)),
         }]),
        ('2_SVM',
         # Probability estimates are only enabled in the binary case, where they
         # are needed for ROC AUC; they slow down the k-fold fits.
         SVC(random_state=random_state, probability=is_binary_problem),
         [{
             'kernel': ['rbf'],
             'class_weight': ["balanced"],
             'C': np.power(10., np.arange(-4, 3)),
             'gamma': list(np.power(10., np.arange(-4, 0))) + ['scale'],
         }]),
        ('3_RandomForest',
         RandomForestClassifier(random_state=random_state),
         [{
             'n_estimators': [50, 100, 150],
             'max_features': ["sqrt", "log2"],
             'max_depth': list(range(1, 10)) + [None],
             'criterion': ["gini", "entropy"],
             'class_weight': ["balanced", "balanced_subsample"],
         }]),
    ]

    # One GridSearchCV per algorithm, all sharing the same splitter.
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=random_state)
    gridcvs = {
        name: GridSearchCV(estimator=est,
                           param_grid=pgrid,
                           scoring=scoring,
                           n_jobs=-1,
                           cv=cv,
                           verbose=1,
                           refit=refit)
        for name, est, pgrid in candidates
    }

    for omic in omics:
        print('-'*100)
        printmd(f'Validate on {omic} data:\n', color="red")

        X_train, y_train = dict_X_train[omic], dict_y_train[omic]
        print('Train dist: ', np.unique(y_train, return_counts=True ))

        X_test, y_test = dict_X_test[omic], dict_y_test[omic]
        print('Test dist', np.unique(y_test, return_counts=True ),'\n')

        # The same search objects are refitted for every experiment.
        tuning_and_eval(gridcvs, X_train, y_train, X_test, y_test,
                        scoring, refit, is_binary_problem,
                        result_on_dataset, rank_hparams_info)

In [9]:
# Run the full tuning and evaluation for every experiment in LIST_EXP_OMICS.
# Only test-split results are reported and the hyperparameter ranking table is
# switched off. The problem is treated as binary only when the train labels
# contain exactly two classes.
# NOTE: this cell is slow; the captured run below reports 18-32 minutes per experiment.
validate_biomarker(dict_X['train'], dict_y['train'], dict_X['test'], dict_y['test'],
                   omics=LIST_EXP_OMICS, random_state=RANDOM_STATE,
                   result_on_dataset= ['test'], rank_hparams_info =False,
                   is_binary_problem = (len(LABEL_MAPPING_NAME)==2))

----------------------------------------------------------------------------------------------------


<span style='color:red'>Validate on GE_CNA data:
</span>

Train dist:  (array([0, 1, 2, 3, 4], dtype=int8), array([106,  55, 336, 142,  17]))
Test dist (array([0, 1, 2, 3, 4], dtype=int8), array([28, 12, 73, 45,  5])) 



<span style='color:blue'>1_Softmax classifier:
</span>

Fitting 50 folds for each of 7 candidates, totalling 350 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 320 tasks      | elapsed:   42.6s
[Parallel(n_jobs=-1)]: Done 350 out of 350 | elapsed:   58.1s finished


1_Softmax | best f1_macro score  80.87% +/- 5.33
best parmams: {'C': 0.1, 'class_weight': 'balanced', 'multi_class': 'multinomial', 'penalty': 'l2', 'solver': 'newton-cg'}


Result on test dataset with best hyperparameters:
test_acc: 88.96
test_f1_macro: 86.62
test_f1_weighted: 88.96


Unnamed: 0,0,1,2,3,4,accuracy,macro avg,weighted avg
precision,1.0,0.666667,0.905405,0.894737,0.8,0.889571,0.853362,0.8979
recall,1.0,1.0,0.917808,0.755556,0.8,0.889571,0.894673,0.889571
f1-score,1.0,0.8,0.911565,0.819277,0.8,0.889571,0.866168,0.889642
support,28.0,12.0,73.0,45.0,5.0,0.889571,163.0,163.0


col_0,0,1,2,3,4,All
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,28,0,0,0,0,28
1,0,12,0,0,0,12
2,0,2,67,3,1,73
3,0,4,7,34,0,45
4,0,0,0,1,4,5
All,28,18,74,38,5,163


----------------------------------------------------------------------------------------------------


<span style='color:blue'>2_SVM classifier:
</span>

Fitting 50 folds for each of 35 candidates, totalling 1750 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done 876 tasks      | elapsed:   23.4s


2_SVM | best f1_macro score  80.62% +/- 4.77
best parmams: {'C': 100.0, 'class_weight': 'balanced', 'gamma': 0.0001, 'kernel': 'rbf'}


Result on test dataset with best hyperparameters:
test_acc: 91.41
test_f1_macro: 89.73
test_f1_weighted: 91.63


[Parallel(n_jobs=-1)]: Done 1750 out of 1750 | elapsed:   37.2s finished


Unnamed: 0,0,1,2,3,4,accuracy,macro avg,weighted avg
precision,1.0,0.666667,0.971014,0.863636,1.0,0.91411,0.900264,0.924832
recall,1.0,1.0,0.917808,0.844444,0.8,0.91411,0.912451,0.91411
f1-score,1.0,0.8,0.943662,0.853933,0.888889,0.91411,0.897297,0.916311
support,28.0,12.0,73.0,45.0,5.0,0.91411,163.0,163.0


col_0,0,1,2,3,4,All
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,28,0,0,0,0,28
1,0,12,0,0,0,12
2,0,1,67,5,0,73
3,0,5,2,38,0,45
4,0,0,0,1,4,5
All,28,18,69,44,4,163


----------------------------------------------------------------------------------------------------


<span style='color:blue'>3_RandomForest classifier:
</span>

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 50 folds for each of 240 candidates, totalling 12000 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   26.9s
[Parallel(n_jobs=-1)]: Done 738 tasks      | elapsed:   58.1s
[Parallel(n_jobs=-1)]: Done 1088 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1538 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 2088 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 2738 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 3488 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done 4338 tasks      | elapsed:  7.8min
[Parallel(n_jobs=-1)]: Done 5288 tasks      | elapsed: 10.7min
[Parallel(n_jobs=-1)]: Done 6338 tasks      | elapsed: 13.7min
[Parallel(n_jobs=-1)]: Done 7488 tasks      | elapsed: 15.8min
[Parallel(n_jobs=-1)]: Done 8738 tasks      | elapsed: 18.4min
[Parallel(n_jobs=-1)]: Done 10088 tasks      | elapsed: 21.7min
[Parallel(n_jobs=-1)]: Done 11538 tasks      | elapsed: 26.9min
[Parallel(n_jobs=-1)]: Done 12000 out of 12000 | elapsed

3_RandomForest | best f1_macro score  77.03% +/- 5.48
best parmams: {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 3, 'max_features': 'log2', 'n_estimators': 150}


Result on test dataset with best hyperparameters:
test_acc: 87.73
test_f1_macro: 84.04
test_f1_weighted: 87.73


Unnamed: 0,0,1,2,3,4,accuracy,macro avg,weighted avg
precision,0.875,0.714286,0.953846,0.816327,1.0,0.877301,0.871892,0.886116
recall,1.0,0.833333,0.849315,0.888889,0.6,0.877301,0.834307,0.877301
f1-score,0.933333,0.769231,0.898551,0.851064,0.75,0.877301,0.840436,0.877339
support,28.0,12.0,73.0,45.0,5.0,0.877301,163.0,163.0


col_0,0,1,2,3,4,All
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,28,0,0,0,0,28
1,2,10,0,0,0,12
2,1,2,62,8,0,73
3,0,2,3,40,0,45
4,1,0,0,1,3,5
All,32,14,65,49,3,163


----------------------------------------------------------------------------------------------------
Total Time: 0:30:16.762230
----------------------------------------------------------------------------------------------------


<span style='color:red'>Validate on GE data:
</span>

Train dist:  (array([0, 1, 2, 3, 4], dtype=int8), array([106,  55, 336, 142,  17]))
Test dist (array([0, 1, 2, 3, 4], dtype=int8), array([28, 12, 73, 45,  5])) 



<span style='color:blue'>1_Softmax classifier:
</span>

Fitting 50 folds for each of 7 candidates, totalling 350 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 128 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done 350 out of 350 | elapsed:  1.3min finished


1_Softmax | best f1_macro score  80.29% +/- 5.10
best parmams: {'C': 0.1, 'class_weight': 'balanced', 'multi_class': 'multinomial', 'penalty': 'l2', 'solver': 'newton-cg'}


Result on test dataset with best hyperparameters:
test_acc: 91.41
test_f1_macro: 90.05
test_f1_weighted: 91.43


Unnamed: 0,0,1,2,3,4,accuracy,macro avg,weighted avg
precision,1.0,0.705882,0.932432,0.9,1.0,0.91411,0.907663,0.920479
recall,1.0,1.0,0.945205,0.8,0.8,0.91411,0.909041,0.91411
f1-score,1.0,0.827586,0.938776,0.847059,0.888889,0.91411,0.900462,0.914256
support,28.0,12.0,73.0,45.0,5.0,0.91411,163.0,163.0


col_0,0,1,2,3,4,All
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,28,0,0,0,0,28
1,0,12,0,0,0,12
2,0,1,69,3,0,73
3,0,4,5,36,0,45
4,0,0,0,1,4,5
All,28,17,74,40,4,163


----------------------------------------------------------------------------------------------------


<span style='color:blue'>2_SVM classifier:
</span>

Fitting 50 folds for each of 35 candidates, totalling 1750 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:    9.6s
[Parallel(n_jobs=-1)]: Done 876 tasks      | elapsed:   21.6s


2_SVM | best f1_macro score  78.61% +/- 4.57
best parmams: {'C': 10.0, 'class_weight': 'balanced', 'gamma': 0.001, 'kernel': 'rbf'}


Result on test dataset with best hyperparameters:
test_acc: 91.41
test_f1_macro: 90.66
test_f1_weighted: 91.51


[Parallel(n_jobs=-1)]: Done 1750 out of 1750 | elapsed:   34.3s finished


Unnamed: 0,0,1,2,3,4,accuracy,macro avg,weighted avg
precision,1.0,0.75,0.956522,0.847826,1.0,0.91411,0.91087,0.920112
recall,1.0,1.0,0.90411,0.866667,0.8,0.91411,0.914155,0.91411
f1-score,1.0,0.857143,0.929577,0.857143,0.888889,0.91411,0.90655,0.915097
support,28.0,12.0,73.0,45.0,5.0,0.91411,163.0,163.0


col_0,0,1,2,3,4,All
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,28,0,0,0,0,28
1,0,12,0,0,0,12
2,0,1,66,6,0,73
3,0,3,3,39,0,45
4,0,0,0,1,4,5
All,28,16,69,46,4,163


----------------------------------------------------------------------------------------------------


<span style='color:blue'>3_RandomForest classifier:
</span>

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 50 folds for each of 240 candidates, totalling 12000 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done 256 tasks      | elapsed:   19.5s
[Parallel(n_jobs=-1)]: Done 506 tasks      | elapsed:   41.6s
[Parallel(n_jobs=-1)]: Done 856 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1306 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 1856 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 2506 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 3256 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done 4106 tasks      | elapsed:  7.7min
[Parallel(n_jobs=-1)]: Done 5056 tasks      | elapsed: 10.8min
[Parallel(n_jobs=-1)]: Done 6106 tasks      | elapsed: 14.4min
[Parallel(n_jobs=-1)]: Done 7256 tasks      | elapsed: 16.4min
[Parallel(n_jobs=-1)]: Done 8506 tasks      | elapsed: 18.9min
[Parallel(n_jobs=-1)]: Done 9856 tasks      | elapsed: 22.0min
[Parallel(n_jobs=-1)]: Done 11306 tasks      | elapsed: 26.9min
[Parallel(n_jobs=-1)]: Done 12000 out of 12000 | elapsed: 

3_RandomForest | best f1_macro score  78.58% +/- 4.86
best parmams: {'class_weight': 'balanced_subsample', 'criterion': 'entropy', 'max_depth': 3, 'max_features': 'sqrt', 'n_estimators': 150}


Result on test dataset with best hyperparameters:
test_acc: 86.50
test_f1_macro: 84.12
test_f1_weighted: 86.57


Unnamed: 0,0,1,2,3,4,accuracy,macro avg,weighted avg
precision,0.903226,0.75,0.952381,0.769231,0.8,0.865031,0.834968,0.873801
recall,1.0,0.75,0.821918,0.888889,0.8,0.865031,0.852161,0.865031
f1-score,0.949153,0.75,0.882353,0.824742,0.8,0.865031,0.84125,0.865653
support,28.0,12.0,73.0,45.0,5.0,0.865031,163.0,163.0


col_0,0,1,2,3,4,All
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,28,0,0,0,0,28
1,2,9,0,1,0,12
2,1,1,60,10,1,73
3,0,2,3,40,0,45
4,0,0,0,1,4,5
All,31,12,63,52,5,163


----------------------------------------------------------------------------------------------------
Total Time: 0:31:35.520637
----------------------------------------------------------------------------------------------------


<span style='color:red'>Validate on CNA data:
</span>

Train dist:  (array([0, 1, 2, 3, 4], dtype=int8), array([106,  55, 336, 142,  17]))
Test dist (array([0, 1, 2, 3, 4], dtype=int8), array([28, 12, 73, 45,  5])) 



<span style='color:blue'>1_Softmax classifier:
</span>

Fitting 50 folds for each of 7 candidates, totalling 350 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 200 tasks      | elapsed:    2.3s


1_Softmax | best f1_macro score  41.21% +/- 3.95
best parmams: {'C': 10.0, 'class_weight': 'balanced', 'multi_class': 'multinomial', 'penalty': 'l2', 'solver': 'newton-cg'}


Result on test dataset with best hyperparameters:
test_acc: 44.17
test_f1_macro: 40.12
test_f1_weighted: 46.51


[Parallel(n_jobs=-1)]: Done 350 out of 350 | elapsed:    5.1s finished


Unnamed: 0,0,1,2,3,4,accuracy,macro avg,weighted avg
precision,0.529412,0.318182,0.673469,0.458333,0.088235,0.441718,0.413526,0.545222
recall,0.642857,0.583333,0.452055,0.244444,0.6,0.441718,0.504538,0.441718
f1-score,0.580645,0.411765,0.540984,0.318841,0.153846,0.441718,0.401216,0.46508
support,28.0,12.0,73.0,45.0,5.0,0.441718,163.0,163.0


col_0,0,1,2,3,4,All
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,18,0,2,2,6,28
1,1,7,1,1,2,12
2,8,5,33,10,17,73
3,7,10,11,11,6,45
4,0,0,2,0,3,5
All,34,22,49,24,34,163


----------------------------------------------------------------------------------------------------


<span style='color:blue'>2_SVM classifier:
</span>

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 50 folds for each of 35 candidates, totalling 1750 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done 876 tasks      | elapsed:   15.0s


2_SVM | best f1_macro score  45.02% +/- 4.33
best parmams: {'C': 1.0, 'class_weight': 'balanced', 'gamma': 0.1, 'kernel': 'rbf'}


Result on test dataset with best hyperparameters:
test_acc: 50.92
test_f1_macro: 45.22
test_f1_weighted: 52.86


[Parallel(n_jobs=-1)]: Done 1750 out of 1750 | elapsed:   27.7s finished


Unnamed: 0,0,1,2,3,4,accuracy,macro avg,weighted avg
precision,0.583333,0.368421,0.72,0.472222,0.090909,0.509202,0.446977,0.582938
recall,0.75,0.583333,0.493151,0.377778,0.4,0.509202,0.520852,0.509202
f1-score,0.65625,0.451613,0.585366,0.419753,0.148148,0.509202,0.452226,0.528563
support,28.0,12.0,73.0,45.0,5.0,0.509202,163.0,163.0


col_0,0,1,2,3,4,All
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,21,0,2,4,1,28
1,2,7,0,3,0,12
2,6,4,36,12,15,73
3,7,8,9,17,4,45
4,0,0,3,0,2,5
All,36,19,50,36,22,163


----------------------------------------------------------------------------------------------------


<span style='color:blue'>3_RandomForest classifier:
</span>

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 50 folds for each of 240 candidates, totalling 12000 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   25.3s
[Parallel(n_jobs=-1)]: Done 876 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 1576 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 2476 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 3208 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 3858 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 4608 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 5458 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done 6408 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 7458 tasks      | elapsed: 10.1min
[Parallel(n_jobs=-1)]: Done 8608 tasks      | elapsed: 12.0min
[Parallel(n_jobs=-1)]: Done 9858 tasks      | elapsed: 14.0min
[Parallel(n_jobs=-1)]: Done 11208 tasks      | elapsed: 16.2min
[Parallel(n_jobs=-1)]: Done 12000 out of 12000 | elapsed: 17.7min finished


3_RandomForest | best f1_macro score  47.66% +/- 4.99
best parmams: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 9, 'max_features': 'sqrt', 'n_estimators': 150}


Result on test dataset with best hyperparameters:
test_acc: 54.60
test_f1_macro: 46.05
test_f1_weighted: 54.25


Unnamed: 0,0,1,2,3,4,accuracy,macro avg,weighted avg
precision,0.689655,0.4,0.607595,0.545455,0.076923,0.546012,0.463926,0.572975
recall,0.714286,0.666667,0.657534,0.266667,0.2,0.546012,0.501031,0.546012
f1-score,0.701754,0.5,0.631579,0.358209,0.111111,0.546012,0.460531,0.542511
support,28.0,12.0,73.0,45.0,5.0,0.546012,163.0,163.0


col_0,0,1,2,3,4,All
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,20,0,6,2,0,28
1,1,8,1,2,0,12
2,4,4,48,6,11,73
3,4,8,20,12,1,45
4,0,0,4,0,1,5
All,29,20,79,22,13,163


----------------------------------------------------------------------------------------------------
Total Time: 0:18:16.001334


In [10]:
# Record the library versions this notebook was run with (pandas, scikit-learn, numpy).
for library in (pd, sklearn, np):
    print(library.__version__)

1.3.2
0.23.2
1.19.5
