In [1]:
import os
import pandas as pd
import numpy as np
from collections import Counter
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,cohen_kappa_score
from sklearn.ensemble import RandomForestClassifier

In [2]:
def train_test_mapping_split(name_mapping, random_state):
    """Stratified train/test split of the case-to-grade mapping.

    Column 0 of ``name_mapping`` is used as the label and column 1 as the
    sample identifier (both taken by position). The split is stratified on
    the labels and uses scikit-learn's default test fraction.

    Args:
        name_mapping: DataFrame whose first two columns are label and ID.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        ``(train_mapping, test_mapping)``: two DataFrames with the columns
        ``'Grade'`` and ``'ID'``.
    """
    table = name_mapping.values
    grades, case_ids = table[:, 0], table[:, 1]

    split = sklearn.model_selection.train_test_split(
        case_ids,
        grades,
        random_state=random_state,
        stratify=grades,
    )
    ids_train, ids_test, grades_train, grades_test = split

    # Report class balance before and after the split.
    print("Labels distribution:", Counter(grades))
    print("Train labels distribution:", Counter(grades_train))
    print("Test labels distribution:", Counter(grades_test))

    mapping_train = pd.DataFrame({'Grade': grades_train, 'ID': ids_train})
    mapping_test = pd.DataFrame({'Grade': grades_test, 'ID': ids_test})
    return mapping_train, mapping_test

In [3]:
def train_test_data_split(raw_data, train_mapping, test_mapping):
    train_data = pd.DataFrame(columns=raw_data.columns)
    test_data = pd.DataFrame(columns=raw_data.columns)
    
    for case_id in raw_data.index:
        case_general_id = case_id[0:20]
        if case_general_id in train_mapping.values[:,1]:
            train_data.loc[case_id]= raw_data.loc[case_id, :]
        elif case_general_id in test_mapping.values[:,1]:
            test_data.loc[case_id]= raw_data.loc[case_id, :]
            
    return train_data, test_data

In [4]:
def _modality_of(case_id):
    """Return the first modality tag contained in ``case_id``, or None."""
    # Order matters: "T1" is a substring of "T1C", so "T1C" is tested first.
    for tag in ("FLAIR", "T1C", "T1", "T2"):
        if tag in case_id:
            return tag
    return None


def split_modalities(input_data):
    """Split a feature table into one frame per modality.

    A row is assigned to the first of FLAIR, T1C, T1, T2 whose tag occurs as
    a substring of its index label; rows matching none are dropped. Labels
    are read from the ``'Grade'`` column of the FLAIR rows only, and that
    column is then removed from all four frames. ``input_data`` itself is
    not modified.

    Args:
        input_data: DataFrame indexed by string case identifiers, with a
            ``'Grade'`` column.

    Returns:
        ``[flair, t1c, t1, t2, labels]``: four feature DataFrames followed by
        a NumPy array of the FLAIR rows' grades.
        NOTE(review): callers index the four frames by position against
        ``labels``, which assumes every modality has the same cases in the
        same order. Confirm upstream.
    """
    tags = [_modality_of(case_id) for case_id in input_data.index]

    frames = []
    for wanted in ("FLAIR", "T1C", "T1", "T2"):
        mask = np.array([tag == wanted for tag in tags], dtype=bool)
        # One boolean selection per modality keeps column dtypes and avoids
        # growing a frame row by row.
        frames.append(input_data.loc[mask])

    labels = frames[0].loc[:, 'Grade'].to_numpy()

    # drop() returns new frames, so nothing is mutated in place.
    flair, t1c, t1, t2 = (frame.drop('Grade', axis='columns') for frame in frames)

    return [flair, t1c, t1, t2, labels]

In [5]:
def initialize_classifiers(random_state=None):
    """Create one untrained random forest per modality.

    Args:
        random_state: Optional seed passed to every forest so that training
            is repeatable. The default ``None`` leaves the forests unseeded,
            which is the estimator's own default.

    Returns:
        ``(flair_cls, t1c_cls, t1_cls, t2_cls)``: four independent
        ``RandomForestClassifier`` instances.
    """
    flair_cls = RandomForestClassifier(random_state=random_state)
    t1c_cls = RandomForestClassifier(random_state=random_state)
    t1_cls = RandomForestClassifier(random_state=random_state)
    t2_cls = RandomForestClassifier(random_state=random_state)

    return flair_cls, t1c_cls, t1_cls, t2_cls

In [6]:
def train_classifiers(train_data_list, classifiers_list):
    """Fit each modality classifier on its own feature frame.

    Classifier ``i`` (for ``i`` in 0..3) is fitted on
    ``train_data_list[i]`` with the shared labels ``train_data_list[4]``.
    The classifiers are fitted in place; nothing is returned.

    Args:
        train_data_list: Sequence of four feature DataFrames followed by the
            label array at position 4.
        classifiers_list: Sequence of four estimators exposing ``fit(X, y)``,
            in the same order as the feature frames.
    """
    labels = train_data_list[4]
    # Use the classifiers that were passed in, not same-named notebook
    # globals, so the function works on a fresh kernel and with any list.
    for position in range(4):
        features = train_data_list[position].to_numpy()
        classifiers_list[position].fit(features, labels)

In [7]:
def checkMajorityElement(voting_arr):
    """Return the winning grade among the votes in ``voting_arr``.

    Counts the occurrences of "LGG" and "HGG" using the sequence's
    ``count`` method. "HGG" wins only with strictly more votes; a tie,
    including an empty vote list, resolves to "LGG".
    """
    low_grade_votes, high_grade_votes = (
        voting_arr.count(grade) for grade in ("LGG", "HGG")
    )
    return "HGG" if high_grade_votes > low_grade_votes else "LGG"

In [8]:
def predict(test_data_list, classifiers_list):
    """Predict one grade per test case by voting across the four modalities.

    Classifier ``k`` (for ``k`` in 0..3) predicts on ``test_data_list[k]``.
    For each row position the four predictions are combined with
    ``checkMajorityElement``. The number of cases is the length of the label
    array at ``test_data_list[4]``; only that many leading rows of each
    frame are used.

    Args:
        test_data_list: Sequence of four feature DataFrames followed by the
            label array at position 4.
        classifiers_list: Sequence of four fitted estimators exposing
            ``predict(X)``, in the same order as the feature frames.

    Returns:
        NumPy array with one voted grade per test case (empty if there are
        no test cases).
    """
    n_samples = len(test_data_list[4])
    if n_samples == 0:
        # Nothing to vote on; also avoids handing an empty matrix to the
        # estimators.
        return np.array([])

    # One batched predict call per modality instead of one call per sample.
    votes_by_modality = [
        classifiers_list[k].predict(test_data_list[k].iloc[:n_samples].to_numpy())
        for k in range(4)
    ]

    results = [
        checkMajorityElement([votes[i] for votes in votes_by_modality])
        for i in range(n_samples)
    ]

    return np.array(results)

In [9]:
# Feature table: the first CSV column becomes the row index (the case identifiers
# that the split helpers slice and match on).
raw_data = pd.read_csv("output/selected_features.csv", index_col=0)
# Mapping table: column 0 is read as the label and column 1 as the case ID downstream.
name_mapping = pd.read_csv("data/name_mapping.csv")

In [10]:
# Stratified split of the case/label mapping; 35 is the seed passed as random_state.
(train_mapping, test_mapping) = train_test_mapping_split(name_mapping, 35)

Labels distribution: Counter({'HGG': 292, 'LGG': 76})
Train labels distribution: Counter({'HGG': 219, 'LGG': 57})
Test labels distribution: Counter({'HGG': 73, 'LGG': 19})


In [11]:
# Route each feature row to train or test according to which mapping lists its case ID.
(train_data, test_data) = train_test_data_split(raw_data, train_mapping, test_mapping)

In [12]:
# Each list is [flair, t1c, t1, t2, labels]: four feature frames plus the label array.
train_data_list = split_modalities(train_data)
test_data_list = split_modalities(test_data)

In [13]:
# One random forest per modality, kept in FLAIR, T1C, T1, T2 order to match the
# positions of the feature frames in the data lists.
(flair_classifier, t1c_classifier, t1_classifier, t2_classifier) = initialize_classifiers()
classifiers_list = [flair_classifier, t1c_classifier, t1_classifier, t2_classifier]

In [14]:
# Fit the four modality classifiers on the training features and labels.
train_classifiers(train_data_list, classifiers_list)

In [15]:
# One voted grade per test case, combined across the four modality classifiers.
results = predict(test_data_list, classifiers_list)

In [16]:
# Compare the voted predictions with the true test labels (element 4 of test_data_list).
print(confusion_matrix(test_data_list[4], results))
print(classification_report(test_data_list[4], results))

# Summary scores, printed to five decimal places.
accuracy = accuracy_score(test_data_list[4], results) 
print("Accuracy: {:.5f}".format(accuracy))
print("Cohen's Kappa: {:.5f}".format(cohen_kappa_score(test_data_list[4], results)))

[[66  7]
 [ 7 12]]
              precision    recall  f1-score   support

         HGG       0.90      0.90      0.90        73
         LGG       0.63      0.63      0.63        19

    accuracy                           0.85        92
   macro avg       0.77      0.77      0.77        92
weighted avg       0.85      0.85      0.85        92

Accuracy: 0.84783
Cohen's Kappa: 0.53569
