In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,cohen_kappa_score
pd.options.mode.chained_assignment = None 

In [2]:
dataset = pd.read_csv("output/features.csv", index_col=0)
name_mapping = pd.read_csv("mapping/name_mapping.csv")

In [3]:
def train_test_mapping_split(name_mapping):
    labels = name_mapping.values[:,0]
    samples = name_mapping.values[:,1]

    (train_samples, test_samples, train_labels, test_labels)= sklearn.model_selection.train_test_split(samples, labels, test_size=0.2, stratify=labels)
    
    train_mapping = pd.DataFrame({'Grade': train_labels, 'ID': train_samples})    
    test_mapping = pd.DataFrame({'Grade': test_labels, 'ID': test_samples})

    print("Labels distribution:", Counter(labels))
    print("Train labels distribution:", Counter(train_labels))
    print("Test labels distribution:",Counter(test_labels))
    
    return train_mapping, test_mapping

In [4]:
def train_test_data_split(raw_data, train_mapping, test_mapping):
    train_data = pd.DataFrame(columns=raw_data.columns)
    test_data = pd.DataFrame(columns=raw_data.columns)
    
    for case_id in raw_data.index:
        case_general_id = case_id[0:20]
        if case_general_id in train_mapping.values[:,1]:
            train_data.loc[case_id]= raw_data.loc[case_id, :]
        elif case_general_id in test_mapping.values[:,1]:
            test_data.loc[case_id]= raw_data.loc[case_id, :]
            
    print("Splitting finished")
            
    return train_data, test_data

In [5]:
def select_features(dataset):
    X = dataset.loc[:, dataset.columns != "Grade"]
    Y = dataset.loc[:, "Grade"]
    
    standard_scaler = StandardScaler().fit(X)
    X[X.columns] = standard_scaler.fit_transform(X[X.columns])
    
    select_k_best = SelectKBest(f_classif).fit(X, Y)
    
    scored_features = pd.DataFrame({'Feature':list(X.columns), 'Score':select_k_best.scores_})
    scored_features.sort_values(by='Score', ascending=False, inplace=True)
    
    best_features = scored_features.nlargest(10,'Score')
    best_features = best_features.loc[:, "Feature"]
    print(best_features)
    best_features.to_csv("output/selected_features.csv")

    selected_features = X.loc[dataset.index, best_features]
    selected_features["Grade"] = Y
    
    return selected_features

In [6]:
def split_modalities(input_data):
    
    flair= pd.DataFrame(columns=input_data.columns)
    t1c = pd.DataFrame(columns=input_data.columns)
    t1= pd.DataFrame(columns=input_data.columns)
    t2 = pd.DataFrame(columns=input_data.columns)
    
    for case_id in input_data.index:
        if "FLAIR" in case_id:
            flair.loc[case_id]= input_data.loc[case_id, :]
        elif "T1C" in case_id:
            t1c.loc[case_id]= input_data.loc[case_id, :]
        elif "T1" in case_id:
            t1.loc[case_id]= input_data.loc[case_id, :]
        elif "T2" in case_id:
            t2.loc[case_id]= input_data.loc[case_id, :]
            
            
    labels = (flair.loc[:, 'Grade']).to_numpy()
    
    flair.drop('Grade',axis='columns', inplace=True)
    t1c.drop('Grade',axis='columns', inplace=True)
    t1.drop('Grade',axis='columns', inplace=True)
    t2.drop('Grade',axis='columns', inplace=True)
    
    print("Modalities splitted")
    
    return flair, t1c, t1, t2, labels

In [7]:
def preprocess_test_dataset(dataset):
    selected_features = pd.read_csv("output/best_features.csv", index_col=0)
    features_list = selected_features["Feature"].to_list()
    
    X = dataset.loc[:, dataset.columns != "Grade"]
    Y = dataset.loc[:, "Grade"]
    
    reduced_X = X.loc[:, features_list]
    
    standard_scaler = StandardScaler().fit(X)
    reduced_X[reduced_X.columns] = standard_scaler.fit_transform(reduced_X[reduced_X.columns])
    
    reduced_dataset = reduced_X
    reduced_dataset["Grade"] = Y
    
    return reduced_dataset

In [8]:
def checkMajority(voting_arr):
    lgg_occurences = voting_arr.count("LGG")
    hgg_occurences = voting_arr.count("HGG")
    if  hgg_occurences >= lgg_occurences:
        return "HGG"
    else:
        return "LGG"

In [9]:
(train_mapping, test_mapping) = train_test_mapping_split(name_mapping)

Labels distribution: Counter({'HGG': 292, 'LGG': 76})
Train labels distribution: Counter({'HGG': 233, 'LGG': 61})
Test labels distribution: Counter({'HGG': 59, 'LGG': 15})


In [10]:
(train_dataset, test_dataset) = train_test_data_split(dataset, train_mapping, test_mapping)

Splitting finished


In [11]:
reduced_train_dataset = select_features(train_dataset)

8                         original_shape_MeshVolume
13                       original_shape_VoxelVolume
79            original_glrlm_RunLengthNonUniformity
10                        original_shape_Sphericity
57            original_gldm_DependenceNonUniformity
2                    original_shape_LeastAxisLength
70            original_glrlm_GrayLevelNonUniformity
9                    original_shape_MinorAxisLength
91    original_glszm_LargeAreaHighGrayLevelEmphasis
60             original_gldm_GrayLevelNonUniformity
Name: Feature, dtype: object


In [12]:
(flair_data, t1c_data, t1_data, t2_data, labels) = split_modalities(reduced_train_dataset)

Modalities splitted


In [13]:
flair_classifier = RandomForestClassifier()
t1c_classifier = RandomForestClassifier()
t1_classifier = RandomForestClassifier()
t2_classifier = RandomForestClassifier()

In [14]:
flair_classifier.fit(flair_data.to_numpy(), labels)
t1c_classifier.fit(t1c_data.to_numpy(), labels)
t1_classifier.fit(t1_data.to_numpy(), labels)
t2_classifier.fit(t2_data.to_numpy(), labels)

RandomForestClassifier()

In [15]:
preprocessed_test_dataset = preprocess_test_dataset(test_dataset)

In [16]:
(test_flair_data, test_t1c_data, test_t1_data, test_t2_data, test_labels) = split_modalities(preprocessed_test_dataset)

Modalities splitted


In [17]:
number_of_test_cases = len(test_mapping)

In [18]:
test_results = []

for i in range(number_of_test_cases):
        voting_arr = []

        flair_result = flair_classifier.predict(test_flair_data.iloc[i].to_numpy().reshape(1,-1))[0]
        voting_arr.append(flair_result)

        t1c_result = t1c_classifier.predict(test_t1c_data.iloc[i].to_numpy().reshape(1,-1))[0]
        voting_arr.append(t1c_result)

        t1_result = t1_classifier.predict(test_t1_data.iloc[i].to_numpy().reshape(1,-1))[0]
        voting_arr.append(t1_result)

        t2_result = t2_classifier.predict(test_t2_data.iloc[i].to_numpy().reshape(1,-1))[0]
        voting_arr.append(t2_result)

        final_prediction = checkMajority(voting_arr)

        test_results.append(final_prediction)
        
test_results = np.array(test_results)


In [19]:
print(confusion_matrix(test_labels, test_results))
print(classification_report(test_labels, test_results))

accuracy = accuracy_score(test_labels, test_results)
print("Accuracy: {:.5f}".format(accuracy))
print("Cohen's Kappa: {:.5f}".format(cohen_kappa_score(test_labels, test_results)))

[[55  4]
 [ 9  6]]
              precision    recall  f1-score   support

         HGG       0.86      0.93      0.89        59
         LGG       0.60      0.40      0.48        15

    accuracy                           0.82        74
   macro avg       0.73      0.67      0.69        74
weighted avg       0.81      0.82      0.81        74

Accuracy: 0.82432
Cohen's Kappa: 0.37935
