In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,cohen_kappa_score, matthews_corrcoef
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import statistics
from sklearn.decomposition import PCA

pd.options.mode.chained_assignment = None 

In [2]:
def train_test_data_split(raw_data, train_mapping, test_mapping):
    train_data = pd.DataFrame(columns=raw_data.columns)
    test_data = pd.DataFrame(columns=raw_data.columns)
    
    for case_id in raw_data.index:
        case_general_id = case_id[0:20]
        if case_general_id in train_mapping.values[:,1]:
            train_data.loc[case_id]= raw_data.loc[case_id, :]
        elif case_general_id in test_mapping.values[:,1]:
            test_data.loc[case_id]= raw_data.loc[case_id, :]
            
    return train_data, test_data

In [3]:
def equalize_mapping(mapping):
    is_HGG =  mapping['Grade']=='HGG'
    is_LGG =  mapping['Grade']=='LGG'
    hggs = mapping[is_HGG]
    lggs = mapping[is_LGG]
    hggs = hggs.sample(n=len(lggs))
    equalized_mapping = pd.concat([hggs, lggs])
    return equalized_mapping

In [4]:
def equalize_dataset(mapping, dataset):
    mapping_ids = mapping["ID"].tolist()
    dataset.index = dataset.index.astype('str')
    equalized_train_dataset = pd.DataFrame()
    for id in mapping_ids:   
        case_df = dataset[dataset.index.str.contains(id)]
        equalized_train_dataset = pd.concat([equalized_train_dataset, case_df])
    return equalized_train_dataset

In [5]:
def split_modalities(input_data):
    
    flair= pd.DataFrame(columns=input_data.columns)
    t1c = pd.DataFrame(columns=input_data.columns)
    t1= pd.DataFrame(columns=input_data.columns)
    t2 = pd.DataFrame(columns=input_data.columns)
    
    for case_id in input_data.index:
        if "FLAIR" in case_id:
            flair.loc[case_id]= input_data.loc[case_id, :]
        elif "T1C" in case_id:
            t1c.loc[case_id]= input_data.loc[case_id, :]
        elif "T1" in case_id:
            t1.loc[case_id]= input_data.loc[case_id, :]
        elif "T2" in case_id:
            t2.loc[case_id]= input_data.loc[case_id, :]
            
            
    labels = (flair.loc[:, 'Grade']).to_numpy()
    
    flair.drop('Grade',axis='columns', inplace=True)
    t1c.drop('Grade',axis='columns', inplace=True)
    t1.drop('Grade',axis='columns', inplace=True)
    t2.drop('Grade',axis='columns', inplace=True)
    
    return flair, t1c, t1, t2, labels

In [6]:
def select_features(dataset, number_of_features):
    X = dataset.loc[:, dataset.columns != "Grade"]
    y = dataset.loc[:, "Grade"]
    
    standard_scaler = StandardScaler().fit(X)
    X[X.columns] = standard_scaler.fit_transform(X[X.columns])
    
    select_k_best = SelectKBest(f_classif).fit(X, Y)
    
    scored_features = pd.DataFrame({'Feature':list(X.columns), 'Score':select_k_best.scores_})
    scored_features.sort_values(by='Score', ascending=False, inplace=True)
    scored_features.to_csv("output/scored_features.csv")
    
    best_features = scored_features.nlargest(number_of_features,'Score')
    best_features = best_features.loc[:, "Feature"]
    # print("Selected features: ")
    # print(best_features)
    best_features.to_csv("output/best_features.csv")

    selected_features = X.loc[dataset.index, best_features]
    selected_features["Grade"] = Y
    
    return selected_features

In [11]:
def check_majority_proba(proba_arr):
    
    hgg_proba = 0
    lgg_proba = 0
    
    for modality_proba in proba_arr:
        hgg_proba += modality_proba[0]
        lgg_proba += modality_proba[1]
        
    if  hgg_proba >= lgg_proba:
        return "HGG"
    else:
        return "LGG"

In [7]:
def prepare_scaler(dataset):
    dataset_values = dataset.loc[:, dataset.columns != "Grade"]
    standard_scaler = StandardScaler().fit(dataset_values)
    return standard_scaler

In [8]:
def apply_pca(dataset, scaler):
    dataset_values = dataset.loc[:, dataset.columns != "Grade"]
    dataset_labels = dataset.loc[:, "Grade"]
    dataset_index = dataset.index

    dataset_values[dataset_values.columns] = scaler.transform(dataset_values[dataset_values.columns])
    
    pca = PCA(.95)
    
    pca.fit(dataset_values)
    
    dataset_values = pca.transform(dataset_values)
    
    dataset_pca = pd.DataFrame(dataset_values, index=dataset_index)
    dataset_pca["Grade"] = dataset_labels
    
    return dataset_pca, pca

In [9]:
def preprocess_dataset(dataset, scaler, pca):
    dataset_values = dataset.loc[:, dataset.columns != "Grade"]
    dataset_labels = dataset.loc[:, "Grade"]
    dataset_index = dataset.index

    dataset_values[dataset_values.columns] = scaler.transform(dataset_values[dataset_values.columns])
    
    dataset_values = pca.transform(dataset_values)
    
    preprocessed_dataset = pd.DataFrame(dataset_values, index=dataset_index)
    preprocessed_dataset["Grade"] = dataset_labels
    
    return preprocessed_dataset

In [13]:
dataset = pd.read_csv("output/features.csv", index_col=0)
name_mapping = pd.read_csv("mapping/name_mapping.csv")

skf = StratifiedKFold(n_splits=5)

X = name_mapping.values[:,1]
y = name_mapping.values[:,0]

for train_index, test_index in skf.split(X, y):
     
    X_train, X_test = X[train_index], X[test_index] 
    y_train, y_test = y[train_index], y[test_index]
        
    print("Train labels distribution:", Counter(y_train))
    print("Test labels distribution:",Counter(y_test))
    print("\n")

    train_mapping = pd.DataFrame({'Grade': y_train, 'ID': X_train})    
    test_mapping = pd.DataFrame({'Grade': y_test, 'ID': X_test})
    equalized_train_mapping = equalize_mapping(train_mapping)
        
    (train_dataset, test_dataset) = train_test_data_split(dataset, train_mapping, test_mapping)
    
    scaler = prepare_scaler(train_dataset)
    
    pca_train_dataset, pca = apply_pca(train_dataset, scaler)
    
    equalized_train_dataset = equalize_dataset(equalized_train_mapping, pca_train_dataset)
    
    (flair_data, t1c_data, t1_data, t2_data, labels) = split_modalities(equalized_train_dataset)
    
    flair_classifier = RandomForestClassifier()
    t1c_classifier = RandomForestClassifier()
    t1_classifier = RandomForestClassifier()
    t2_classifier = RandomForestClassifier()
    
    flair_classifier.fit(flair_data.to_numpy(), labels)
    t1c_classifier.fit(t1c_data.to_numpy(), labels)
    t1_classifier.fit(t1_data.to_numpy(), labels)
    t2_classifier.fit(t2_data.to_numpy(), labels)
    
    preprocessed_test_dataset = preprocess_dataset(test_dataset, scaler, pca)
    
    (test_flair_data, test_t1c_data, test_t1_data, test_t2_data, test_labels) = split_modalities(preprocessed_test_dataset)
    
    number_of_test_cases = len(test_mapping)
    
    test_results = []

    for i in range(number_of_test_cases):
        probability_array = []

        flair_result = flair_classifier.predict_proba(test_flair_data.iloc[i].to_numpy().reshape(1,-1))[0]
        probability_array.append(flair_result)
            
        t1c_result = t1c_classifier.predict_proba(test_t1c_data.iloc[i].to_numpy().reshape(1,-1))[0]
        probability_array.append(t1c_result)
            
        t1_result = t1_classifier.predict_proba(test_t1_data.iloc[i].to_numpy().reshape(1,-1))[0]
        probability_array.append(t1_result)
            
        t2_result = t2_classifier.predict_proba(test_t2_data.iloc[i].to_numpy().reshape(1,-1))[0]
        probability_array.append(t2_result)
                
        final_prediction = check_majority_proba(probability_array)

        test_results.append(final_prediction)
        
    test_results = np.array(test_results)
    
    print(confusion_matrix(test_labels, test_results))
    print(classification_report(test_labels, test_results))
    
    acc = accuracy_score(test_labels, test_results)
    kappa = cohen_kappa_score(test_labels, test_results)
    mcc = matthews_corrcoef(test_labels, test_results)
    
    print("Accuracy: {:.5f}".format(acc))
    print("Cohen's Kappa: {:.5f}".format(kappa))
    print("MCC: {:.5f}".format(mcc))

    

Train labels distribution: Counter({'HGG': 233, 'LGG': 61})
Test labels distribution: Counter({'HGG': 59, 'LGG': 15})


[[57  2]
 [ 8  7]]
              precision    recall  f1-score   support

         HGG       0.88      0.97      0.92        59
         LGG       0.78      0.47      0.58        15

    accuracy                           0.86        74
   macro avg       0.83      0.72      0.75        74
weighted avg       0.86      0.86      0.85        74

Accuracy: 0.86486
Cohen's Kappa: 0.50863
MCC: 0.53229
Train labels distribution: Counter({'HGG': 233, 'LGG': 61})
Test labels distribution: Counter({'HGG': 59, 'LGG': 15})


[[58  1]
 [ 8  7]]
              precision    recall  f1-score   support

         HGG       0.88      0.98      0.93        59
         LGG       0.88      0.47      0.61        15

    accuracy                           0.88        74
   macro avg       0.88      0.72      0.77        74
weighted avg       0.88      0.88      0.86        74

Accuracy: 0.87