In [1]:
import pandas as pd
import numpy as np
from collections import Counter

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,cohen_kappa_score, matthews_corrcoef, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import PCA

pd.options.mode.chained_assignment = None 

In [2]:
def train_test_data_split(raw_data, train_mapping, test_mapping):
    train_data = pd.DataFrame(columns=raw_data.columns)
    test_data = pd.DataFrame(columns=raw_data.columns)
    
    for case_id in raw_data.index:
        case_general_id = case_id[0:20]
        if case_general_id in train_mapping.values[:,1]:
            train_data.loc[case_id]= raw_data.loc[case_id, :]
        elif case_general_id in test_mapping.values[:,1]:
            test_data.loc[case_id]= raw_data.loc[case_id, :]
            
    return train_data, test_data

In [3]:
def split_modalities(input_data):
    
    flair= pd.DataFrame(columns=input_data.columns)
    t1c = pd.DataFrame(columns=input_data.columns)
    t1= pd.DataFrame(columns=input_data.columns)
    t2 = pd.DataFrame(columns=input_data.columns)
    
    for case_id in input_data.index:
        if "FLAIR" in case_id:
            flair.loc[case_id]= input_data.loc[case_id, :]
        elif "T1C" in case_id:
            t1c.loc[case_id]= input_data.loc[case_id, :]
        elif "T1" in case_id:
            t1.loc[case_id]= input_data.loc[case_id, :]
        elif "T2" in case_id:
            t2.loc[case_id]= input_data.loc[case_id, :]
            
            
    labels = (flair.loc[:, 'Grade']).to_numpy()
    
    flair.drop('Grade',axis='columns', inplace=True)
    t1c.drop('Grade',axis='columns', inplace=True)
    t1.drop('Grade',axis='columns', inplace=True)
    t2.drop('Grade',axis='columns', inplace=True)
    
    return flair, t1c, t1, t2, labels

In [4]:
def prepare_scaler(dataset):
    dataset_values = dataset.loc[:, dataset.columns != "Grade"]
    standard_scaler = StandardScaler().fit(dataset_values)

    return standard_scaler

In [5]:
def apply_pca(dataset, scaler):
    dataset_values = dataset.loc[:, dataset.columns != "Grade"]
    dataset_labels = dataset.loc[:, "Grade"]
    dataset_index = dataset.index
    
    dataset_values[dataset_values.columns] = scaler.transform(dataset_values[dataset_values.columns])
    
    pca = PCA(.95)
    
    pca.fit(dataset_values)
    
    dataset_values = pca.transform(dataset_values)
    
    dataset_pca = pd.DataFrame(dataset_values, index=dataset_index)
    dataset_pca["Grade"] = dataset_labels
    
    return dataset_pca, pca

In [6]:
def equalize_mapping(mapping):
    is_HGG =  mapping['Grade']=='HGG'
    is_LGG =  mapping['Grade']=='LGG'
    hggs = mapping[is_HGG]
    lggs = mapping[is_LGG]
    hggs = hggs.sample(n=len(lggs))
    equalized_mapping = pd.concat([hggs, lggs])
    return equalized_mapping

In [7]:
def equalize_dataset(mapping, dataset):
    mapping_ids = mapping["ID"].tolist()
    dataset.index = dataset.index.astype('str')
    equalized_train_dataset = pd.DataFrame()
    for id in mapping_ids:   
        case_df = dataset[dataset.index.str.contains(id)]
        equalized_train_dataset = pd.concat([equalized_train_dataset, case_df])
    return equalized_train_dataset

In [8]:
def preprocess_test_dataset(dataset, scaler, pca):
    dataset_values = dataset.loc[:, dataset.columns != "Grade"]
    dataset_labels = dataset.loc[:, "Grade"]
    dataset_index = dataset.index

    dataset_values[dataset_values.columns] = scaler.transform(dataset_values[dataset_values.columns])
    
    dataset_values = pca.transform(dataset_values)
    
    preprocessed_dataset = pd.DataFrame(dataset_values, index=dataset_index)
    preprocessed_dataset["Grade"] = dataset_labels
    
    return preprocessed_dataset

In [9]:
def check_majority_proba(proba_arr):
    
    hgg_proba = 0
    lgg_proba = 0
    
    for modality_proba in proba_arr:
        hgg_proba += modality_proba[0]
        lgg_proba += modality_proba[1]
        
    if  hgg_proba >= lgg_proba:
        return "HGG"
    else:
        return "LGG"

In [10]:
def prepare_metrics(overall_acc, overall_kappa, overall_mcc, overall_prc, overall_rec): 
    general_accuracy = pd.DataFrame({'Accuracy': overall_acc})
    general_kappa = pd.DataFrame({'Kappa': overall_kappa})
    general_mcc = pd.DataFrame({'MCC': overall_mcc})
    general_prc = pd.DataFrame({'MCC': overall_prc})
    general_rec = pd.DataFrame({'MCC': overall_rec})
        
    accuracy_mean = float(general_accuracy.mean())
    accuracy_median = float(general_accuracy.median())
    accuracy_Q1 = general_accuracy.Accuracy.quantile([0.25]).to_numpy()[0]
    accuracy_Q3 = general_accuracy.Accuracy.quantile([0.75]).to_numpy()[0]
    accuracy_IQR = accuracy_Q3 - accuracy_Q1

    kappa_mean = float(general_kappa.mean())
    kappa_median = float(general_kappa.median())
    kappa_Q1 = general_kappa.Kappa.quantile([0.25]).to_numpy()[0]
    kappa_Q3 = general_kappa.Kappa.quantile([0.75]).to_numpy()[0]
    kappa_IQR = kappa_Q3 - kappa_Q1

    mcc_mean = float(general_mcc.mean())
    mcc_median = float(general_mcc.median())
    mcc_Q1 = general_mcc.MCC.quantile([0.25]).to_numpy()[0]
    mcc_Q3 = general_mcc.MCC.quantile([0.75]).to_numpy()[0]
    mcc_IQR = mcc_Q3 - mcc_Q1
    
    prc_mean = float(general_prc.mean())
    prc_median = float(general_prc.median())
    prc_Q1 = general_prc.MCC.quantile([0.25]).to_numpy()[0]
    prc_Q3 = general_prc.MCC.quantile([0.75]).to_numpy()[0]
    prc_IQR = prc_Q3 - prc_Q1
    
    rec_mean = float(general_rec.mean())
    rec_median = float(general_rec.median())
    rec_Q1 = general_rec.MCC.quantile([0.25]).to_numpy()[0]
    rec_Q3 = general_rec.MCC.quantile([0.75]).to_numpy()[0]
    rec_IQR = rec_Q3 - rec_Q1

    accuracy_list = [accuracy_mean, accuracy_median, accuracy_Q1, accuracy_Q3, accuracy_IQR]
    kappa_list = [kappa_mean, kappa_median, kappa_Q1, kappa_Q3, kappa_IQR]
    mcc_list = [mcc_mean, mcc_median, mcc_Q1, mcc_Q3, mcc_IQR]
    prc_list = [prc_mean, prc_median, prc_Q1, prc_Q3, prc_IQR]
    rec_list = [rec_mean, rec_median, rec_Q1, rec_Q3, rec_IQR]


    rows_labels = ["Mean", "Median", "Q1", "Q3", "IQR"]
    column_labels = ["Accuracy", "Kappa", "MCC", "Precision", "Recall"]
    metrics = pd.DataFrame(index=rows_labels, columns=column_labels)

    metrics["Accuracy"] = accuracy_list
    metrics["Kappa"] = kappa_list
    metrics["MCC"] = mcc_list
    metrics["Precision"] = prc_list
    metrics["Recall"] = rec_list

    rounded_metrics = metrics.round(3)
    rounded_metrics.to_csv("metrics.csv")
    return rounded_metrics

In [None]:
dataset = pd.read_csv("output/features.csv", index_col=0)
name_mapping = pd.read_csv("mapping/name_mapping.csv")

skf = StratifiedKFold(n_splits=5)

X = name_mapping.values[:,1]
y = name_mapping.values[:,0]
fold_iterator = 1

overall_acc = []
overall_kappa = []
overall_mcc = []
overall_prc = []
overall_rec = []

for train_index, test_index in skf.split(X, y):
    print("\n")
    print("FOLD NUMBER: ", fold_iterator)
    print("\n")
    fold_iterator += 1
    
    scaler = None
    pca = None
     
    X_train, X_test = X[train_index], X[test_index] 
    y_train, y_test = y[train_index], y[test_index]
    
    # print("Train labels distribution:", Counter(y_train))
    # print("Test labels distribution:",Counter(y_test))
    # print("\n")

    train_mapping = pd.DataFrame({'Grade': y_train, 'ID': X_train})    
    test_mapping = pd.DataFrame({'Grade': y_test, 'ID': X_test})
    equalized_train_mapping = equalize_mapping(train_mapping)

    (train_dataset, test_dataset) = train_test_data_split(dataset, train_mapping, test_mapping)
    
    scaler = prepare_scaler(train_dataset)
    
    pca_train_dataset, pca = apply_pca(train_dataset, scaler)
    
    equalized_train_dataset = equalize_dataset(equalized_train_mapping, pca_train_dataset)
    
    (flair_data, t1c_data, t1_data, t2_data, labels) = split_modalities(equalized_train_dataset)
    
    flair_classifier = RandomForestClassifier()
    t1c_classifier = RandomForestClassifier()
    t1_classifier = RandomForestClassifier()
    t2_classifier = RandomForestClassifier()
    
    flair_classifier.fit(flair_data.to_numpy(), labels)
    t1c_classifier.fit(t1c_data.to_numpy(), labels)
    t1_classifier.fit(t1_data.to_numpy(), labels)
    t2_classifier.fit(t2_data.to_numpy(), labels)
    
    preprocessed_test_dataset = preprocess_test_dataset(test_dataset, scaler, pca)
    
    (test_flair_data, test_t1c_data, test_t1_data, test_t2_data, test_labels) = split_modalities(preprocessed_test_dataset)
    
    number_of_test_cases = len(test_mapping)
    
    test_results = []

    for i in range(number_of_test_cases):
        probability_array = []

        flair_result = flair_classifier.predict_proba(test_flair_data.iloc[i].to_numpy().reshape(1,-1))[0]
        probability_array.append(flair_result)
            
        t1c_result = t1c_classifier.predict_proba(test_t1c_data.iloc[i].to_numpy().reshape(1,-1))[0]
        probability_array.append(t1c_result)
            
        t1_result = t1_classifier.predict_proba(test_t1_data.iloc[i].to_numpy().reshape(1,-1))[0]
        probability_array.append(t1_result)
            
        t2_result = t2_classifier.predict_proba(test_t2_data.iloc[i].to_numpy().reshape(1,-1))[0]
        probability_array.append(t2_result)
                
        final_prediction = check_majority_proba(probability_array)

        test_results.append(final_prediction)
        
    test_results = np.array(test_results)
    
    print(confusion_matrix(test_labels, test_results))
    print(classification_report(test_labels, test_results))
    
    acc = accuracy_score(test_labels, test_results)
    kappa = cohen_kappa_score(test_labels, test_results)
    mcc = matthews_corrcoef(test_labels, test_results)
    # prc = precision_score(test_labels, test_results)
    # rec = recall_score(test_labels, test_results)
    
    print("Accuracy: {:.5f}".format(acc))
    print("Cohen's Kappa: {:.5f}".format(kappa))
    print("MCC: {:.5f}".format(mcc))
    # print("Precision: {:.5f}".format(prc))
    # print("Recall: {:.5f}".format(rec))

    
    overall_acc.append(acc)
    overall_kappa.append(kappa)
    overall_mcc.append(mcc)
    # overall_prc.append(prc)
    # overall_rec.append(rec)

metrics = prepare_metrics(overall_acc, overall_kappa, overall_mcc, overall_prc, overall_rec)



FOLD NUMBER:  1


[[58  1]
 [ 5 10]]
              precision    recall  f1-score   support

         HGG       0.92      0.98      0.95        59
         LGG       0.91      0.67      0.77        15

    accuracy                           0.92        74
   macro avg       0.91      0.82      0.86        74
weighted avg       0.92      0.92      0.91        74

Accuracy: 0.91892
Cohen's Kappa: 0.72146
MCC: 0.73423


FOLD NUMBER:  2


[[58  1]
 [ 7  8]]
              precision    recall  f1-score   support

         HGG       0.89      0.98      0.94        59
         LGG       0.89      0.53      0.67        15

    accuracy                           0.89        74
   macro avg       0.89      0.76      0.80        74
weighted avg       0.89      0.89      0.88        74

Accuracy: 0.89189
Cohen's Kappa: 0.60691
MCC: 0.63514


FOLD NUMBER:  3




In [12]:
metrics

Unnamed: 0,Accuracy,Kappa,MCC,Precision,Recall
Mean,0.686,0.368,0.427,,
Median,0.662,0.344,0.432,,
Q1,0.507,0.189,0.323,,
Q3,0.865,0.536,0.545,,
IQR,0.358,0.347,0.222,,
