In [1]:
# Libraries import

import os, gc
import numpy as np
#import pickle
import joblib
import pandas as pd
import sys

from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import SVC
from sklearn.metrics import balanced_accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

np.set_printoptions(suppress=True, precision = 3)

# Notes 

## Model architecture 
We train a random forest classifier, using a 5-fold cross-validation. The validation folds are designed using over-sampling so that in each fold, the validation set contains at least one tumor from each class, but possibly more so that each class has at least 15% of its data in validation. 

We select the most important features from the random forest model for experimental validation. We also use them to train a support vector classifier model, to boost the quality of the model. 

In [2]:
"""
DATA DESCRIPTION
Indicate the file names for the pre-processed data and for the tumor/non-tumor map. 
The pre-processed data is a list of dictionaries, one for each sample, each dictionary has the 2D x 1738 data
plus its methylation label, sample name, patient ID, , t-SNE and PCA decompositions (datastructure described below).
The tumor/non-tumor map is a list of numpy arrays, one for each sample, of the same 2D shape as the 
corresponding sample, coming in the same order as the samples in the pre-processed file. 
Each entry in the 2D array is 0/1, with 1 indicating that the corresponding spot in the pre-processed data 
was labeled as tumor. 
The map is to be used as filter: only those spot labeled as tumor are to be analysed further. 
We also use the map to extract the most discriminative features between tumor and non-tumor. 
"""

PREPROC_DATA_PATH = "../data/preprocessed.npy"
TUMOR_MAP_PATH = "../data/tumor_maps.npy"
SEPARATION_CLASSIFIER_PATH = "./TumorSeparationModels/" 

# Make the directory if it doesn't exist
if not os.path.exists(SEPARATION_CLASSIFIER_PATH[:-1]):
    os.mkdir(SEPARATION_CLASSIFIER_PATH[:-1])

SAMPLE_DICT_DATA_KEY = "data"
SAMPLE_DICT_NAME_KEY = "name"
SAMPLE_DICT_CLASS_ID_KEY = "class_id"
SAMPLE_DICT_PATIENT_ID_KEY = "patient_id"
SAMPLE_DICT_TSNE_KEY = "tsne"
SAMPLE_DICT_PCA_KEY = "pca"

# Define the samples to be excluded from the analysis (use their full name)
ignore = ["HF-1887_via-t_2_1.h5_3"]

In [3]:
"""
CROSS-VALIDATION DESIGN: 5-fold
All samples (except 1887) included. Assigned to the CV folds in order of their names, 
trying to have about the same amount of validation data in each fold.
Date: February 2023. 
"""

CV5folds=[ 
    [#Cv fold 1
        #LGm1 
        "HF-448_V5B_1.h5_3",
        #LGm2
        "HF-305_v4b_1_1.h5_6", "HF-615_V5BB_1.h5_9",  
        #LGm3
        "HF-2104_#5_1.h5_0", "HF-2104_#9_1.h5_1", "HF-2104_V1T_1.h5_2",
        #LGm4
        "HF-442_V4BB_1.h5_12", "HF-1002_V1AT_1.h5_0", "HF-1002_V2AT_1.h5_1", 
        #LGm5
        "HF-682_V3AT_1.h5_9", "HF-682_V3BB_1.h5_10", "HF-894_9_1.h5_11", "HF-894_V1BB_1.h5_12", 
        #LGm6
        "HF-592_V3T_1.h5_4", 
    ],
    [#Cv fold 2
        #LGm1 
        "HF-868_1_2.h5_4",
        #LGm2
        "HF-901_V2T_2.h5_10", "HF-960_VIAT_2.h5_11", 
        #LGm3
        "HF-2614_V1B_1.h5_3", 
        #LGm4
        "HF-1825_V2B_1.h5_2", "HF-2102_V2BB_1.h5_3", "HF-2102_V3AM_1.h5_4", "HF-2102_V3AM_2.h5_5", 
        #LGm5
        "HF-988_V1-T_1.h5_13", "HF-988_V1B_1.h5_14", "HF-1043_V1AM_1.h5_0", 
        #LGm6
        "HF-2106_V3AM_1.h5_0", 
    ], 
    [#Cv fold 3
        #LGm1 
        "HF-1293_13_1.h5_0",
        #LGm2
        "HF-1010_V1T_1.h5_0", "HF-1016_IAT_2.h5_1", "HF-1334_V58-B_2_1.h5_2", 
        #LGm3
        "HF-2849_VIT2_1.h5_4", "HF-2849_VIT2_1.h5_5", "HF-2849_VIT2_2.h5_6", "HF-2849_VIT_2_new2021.h5_7",
        #LGm4
        "HF-2454_V1AT_1.h5_6", "HF-2548_V1T_1.h5_7", 
        #LGm5
        "HF-1086_#1_1.h5_1", "HF-2355_V2AM_1.h5_2", 
        #LGm6
        "HF-2493_V1T_1.h5_1", "HF-2493_V1T_2.h5_2", 
    ], 
    [#Cv fold 4
        #LGm1 
        "HF-1295_V3AM_2.h5_1",
        #LGm2
        "HF-2070_V1T_1.h5_4", "HF-2776_V2B_2.h5_5",        
        #LGm3 
        "HF-2852_VIT_2_2.h5_8", 
        #LGm4
        "HF-2715_VIL_1.h5_8", "HF-2802_V3T_1.h5_9", 
        #LGm5
        "HF-2485_V1B_1.h5_3", "HF-2600_V1B_1.h5_4", "HF-2608_V1T_1.h5_5", 
        #LGm6
        "HF-2544_V1B_1.h5_3", 
    ],
    [#Cv fold 5
        #LGm1 
        "HF-2534_V2B_1.h5_2",
        #LGm2
        "HF-3271_VIB_2.h5_7", "HF-3337_V3T_1.h5_8", 
        #LGm3 same as fold 1
        "HF-2104_#5_1.h5_0", "HF-2104_#9_1.h5_1", "HF-2104_V1T_1.h5_2",
        #LGm4
        "HF-2876_V1T_1.h5_10", "HF-2898_V1T_1.h5_11",
        #LGm5
        "HF-2619_V1T_1.h5_6", "HF-2619_V4T_1.h5_7", "HF-2666_V2B_1.h5_8",
        #LGm6 same as fold 1
        "HF-592_V3T_1.h5_4", 
    ],
]

In [4]:
def reset_seed(SEED = 0):
    os.environ['PYTHONHASHSEED']=str(SEED)
    np.random.seed(SEED)
    
reset_seed(2022)

In [5]:
def CrossValidation(data, labels, CV_folds=CV5folds, verbose=False):
    """ 
    This is the cross-validation training. 
    In each CV fold, the model is first trained, then the data reduced to its top 20 features
    and the model is re-trained with an SVC. 
    This is in the hope of reducing the noise and getting a better fit.
    We save all RF and SVC models, as well as their metrics. 
    """
    
    results = []  
    RFmetrics=[]
    SVCmetrics=[]
    index = 0
    
    # Loop through the CV folds
    for valid_fold in CV_folds:
        
        index = index+1
        if verbose: 
            print("Validation fold", index)
        CV_fold_name=str(index)

        # Get the train/validation data for this CV fold
        train_X=np.empty( (0,data[0][SAMPLE_DICT_DATA_KEY].shape[1]) )
        train_y=np.empty( 0 )
        valid_names=[]
        valid_X=np.empty( (0,data[0][SAMPLE_DICT_DATA_KEY].shape[1]) )
        valid_y=np.empty( 0 )
        for (sample,label) in zip(data,labels): 
            if sample[SAMPLE_DICT_NAME_KEY] in valid_fold:
                valid_names.append(sample[SAMPLE_DICT_NAME_KEY])
                valid_X = np.append(valid_X, sample[SAMPLE_DICT_DATA_KEY], axis=0)
                valid_y = np.append(valid_y, label, axis=0)
            else: 
                train_X = np.append(train_X, sample[SAMPLE_DICT_DATA_KEY], axis=0)
                train_y = np.append(train_y, label, axis=0)
        if len(valid_names) != len(valid_fold):
            print("Error in the design of CV fold (data not found):", valid_fold)
            sys.exit(-1)     
        
        # Create list of indices
        shuffle = np.arange(len(train_X))
        
        # Numpy shuffle method performs shuffle in place
        np.random.shuffle(shuffle)
        
        # Shuffle the training data
        train_X = np.squeeze(train_X[shuffle])
        train_y = np.squeeze(train_y[shuffle])
        
        # Create RF model and train to extract features
        RFclf = RandomForestClassifier(n_estimators = 300,
                            bootstrap = True,
                            max_depth = 10,
                            random_state=0,
                            criterion = "entropy",
                            class_weight = "balanced_subsample",
                            warm_start = False,
                            n_jobs = -1,
                            min_samples_leaf = 50,
                            min_samples_split = 50,
                            max_features = "sqrt")
                            

        RFclf.fit(train_X, train_y)
        
        # Get accuracy on training and validation data
        acc_train = RFclf.score(train_X, train_y)
        acc_val = RFclf.score(valid_X, valid_y)
        if verbose: 
            print("\t RF model. Accuracy on train and valid: ", acc_train, acc_val)
        
        # Get prediction on the validation data
        pred = RFclf.predict(valid_X)
        
        # Gather RF metrics
        RFacc = balanced_accuracy_score(valid_y, pred)
        RFroc_auc = roc_auc_score(valid_y, pred, average =None)
        RFprec = precision_score(valid_y, pred, average =None)
        RFrec = recall_score(valid_y, pred, average =None)
        RFf1 = f1_score(valid_y, pred, average =None)
        RFconf = confusion_matrix(valid_y, pred, normalize='true')
        RFmetrics.append([RFacc, RFroc_auc, RFprec, RFrec, RFf1, RFconf])
        
        if verbose: 
            print("\t RF accuracy: ", RFacc)
            print("\t RF AUROC:    ", RFroc_auc)
            print("\t RF precision:", RFprec)
            print("\t RF recall:   ", RFrec)
            print("\t RF F1:       ", RFf1)
            print("\t RF confusion matrix:\n", RFconf)
        
        # Save the model
        joblib.dump(RFclf, SEPARATION_CLASSIFIER_PATH+"RF_" + CV_fold_name + ".RFmod")
        
    return RFmetrics

In [6]:
# Read the pre-processed data and the tumor map of each sample
samples_0 = np.load(PREPROC_DATA_PATH, allow_pickle=True)
tumor_map_0 = np.load(TUMOR_MAP_PATH, allow_pickle=True)

samples = []
tumor_map = []

# Flatten the data (no spatial info needed here) and skip the samples on the "ignore" list
for (sample, label) in zip(samples_0, tumor_map_0):
    if sample[SAMPLE_DICT_NAME_KEY] in ignore:
        print("Sample ignored:", sample[SAMPLE_DICT_NAME_KEY])
        continue
        
    sample[SAMPLE_DICT_DATA_KEY] = sample[SAMPLE_DICT_DATA_KEY].reshape(-1, sample[SAMPLE_DICT_DATA_KEY].shape[2])
    label.resize(np.product(label.shape))    
    samples.append(sample)
    tumor_map.append(label)
    
del samples_0
del tumor_map_0
gc.collect()

Sample ignored: HF-1887_via-t_2_1.h5_3


11

In [7]:
reset_seed(2022)
RFmetrics = CrossValidation(data=samples, labels=tumor_map, CV_folds=CV5folds, verbose=True)

Validation fold 1
	 RF model. Accuracy on train and valid:  0.9923969421126085 0.9985002286236854
	 RF accuracy:  0.9927493924833478
	 RF AUROC:     0.9927493924833478
	 RF precision: [0.997 0.999]
	 RF recall:    [0.986 1.   ]
	 RF F1:        [0.991 0.999]
	 RF confusion matrix:
 [[0.986 0.014]
 [0.    1.   ]]
Validation fold 2
	 RF model. Accuracy on train and valid:  0.9918656407425298 0.9812899735895886
	 RF accuracy:  0.9841547271140232
	 RF AUROC:     0.9841547271140233
	 RF precision: [0.848 0.999]
	 RF recall:    [0.988 0.981]
	 RF F1:        [0.912 0.99 ]
	 RF confusion matrix:
 [[0.988 0.012]
 [0.019 0.981]]
Validation fold 3
	 RF model. Accuracy on train and valid:  0.9992323831943094 0.9587131232838927
	 RF accuracy:  0.9286171136808127
	 RF AUROC:     0.9286171136808127
	 RF precision: [0.994 0.947]
	 RF recall:    [0.859 0.998]
	 RF F1:        [0.922 0.972]
	 RF confusion matrix:
 [[0.859 0.141]
 [0.002 0.998]]
Validation fold 4
	 RF model. Accuracy on train and valid:  0

In [8]:
del samples
del tumor_map
gc.collect()

48

In [9]:
results = np.mean(np.asanyarray(RFmetrics, dtype=object), axis=0)
print("\t Mean RF accuracy: ", results[0])
print("\t Mean RF ROCAUC:   ", results[1])
print("\t Mean RF precision:", results[2])
print("\t Mean RF recall:   ", results[3])
print("\t Mean RF F1:       ", results[4])
print("\t Mean RF confusion matrix:\n", results[5])

	 Mean RF accuracy:  0.9795591572000705
	 Mean RF ROCAUC:    0.9795591572000705
	 Mean RF precision: [0.957 0.989]
	 Mean RF recall:    [0.964 0.995]
	 Mean RF F1:        [0.958 0.992]
	 Mean RF confusion matrix:
 [[0.964 0.036]
 [0.005 0.995]]
