In [None]:
import pandas as pd
import numpy as np
from collections import Counter
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,cohen_kappa_score, matthews_corrcoef
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import statistics

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None 

In [None]:
def train_test_data_split(raw_data, train_mapping, test_mapping):
    """Split a feature table into train and test rows according to case mappings.

    Every row label of ``raw_data`` is reduced to its first 20 characters and
    that prefix is looked up in the second column (position 1) of each mapping.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Feature table; the index holds the row labels that are matched.
    train_mapping, test_mapping : pandas.DataFrame
        Mapping tables whose column at position 1 holds the case identifiers.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_data, test_data)``. Rows keep their original order and the
        columns keep their original dtypes. A row whose prefix appears in both
        mappings goes to the train set only; rows matching neither mapping are
        left out of both results.
    """
    # Length of the label prefix that is compared against the mapping IDs.
    case_id_length = 20
    case_general_ids = raw_data.index.astype(str).str[:case_id_length]

    # Mappings are read by position, not by column name.
    train_ids = train_mapping.values[:, 1]
    test_ids = test_mapping.values[:, 1]

    in_train = case_general_ids.isin(train_ids)
    # Train membership wins when an ID is listed in both mappings.
    in_test = case_general_ids.isin(test_ids) & ~in_train

    # Boolean selection instead of appending row by row: linear time, and the
    # numeric columns are not degraded to object dtype.
    train_data = raw_data.loc[in_train].copy()
    test_data = raw_data.loc[in_test].copy()

    return train_data, test_data

In [None]:
def equalize_mapping(mapping, random_state=None):
    """Balance a case mapping so that it holds as many HGG rows as LGG rows.

    The larger of the two classes is randomly downsampled (without
    replacement) to the size of the smaller one. When the classes are already
    the same size the HGG rows are still passed through ``sample`` and may
    therefore come back in a different order.

    Parameters
    ----------
    mapping : pandas.DataFrame
        Table with a ``'Grade'`` column containing ``'HGG'`` / ``'LGG'``.
        Rows with any other grade are not included in the result.
    random_state : int, numpy.random.Generator or None, optional
        Forwarded to ``DataFrame.sample`` so the selection can be reproduced.
        The default ``None`` gives a fresh random draw on every call.

    Returns
    -------
    pandas.DataFrame
        The HGG rows followed by the LGG rows, with equal counts.
    """
    hggs = mapping[mapping['Grade'] == 'HGG']
    lggs = mapping[mapping['Grade'] == 'LGG']

    # Sampling more rows than a class has raises ValueError, so always shrink
    # the larger class instead of assuming HGG is the majority.
    if len(hggs) >= len(lggs):
        hggs = hggs.sample(n=len(lggs), random_state=random_state)
    else:
        lggs = lggs.sample(n=len(hggs), random_state=random_state)

    return pd.concat([hggs, lggs])

In [None]:
def equalize_dataset(mapping, dataset):
    """Keep only the dataset rows whose label contains an ID from ``mapping``.

    Parameters
    ----------
    mapping : pandas.DataFrame
        Table with an ``'ID'`` column of case identifiers (strings).
    dataset : pandas.DataFrame
        Feature table; its index labels are compared as strings. The frame
        passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        Matching rows grouped in the order of ``mapping['ID']``, with a
        string index. If the mapping has no IDs, an empty frame with the
        columns of ``dataset`` is returned.
    """
    # Compare against a string view of the labels without overwriting the
    # caller's index.
    row_labels = dataset.index.astype('str')

    # regex=False: identifiers are literal text, so characters such as '.'
    # must not be interpreted as regular-expression syntax.
    matches = [
        dataset[row_labels.str.contains(case_id, regex=False)]
        for case_id in mapping["ID"].tolist()
    ]

    # Concatenate once at the end instead of re-copying the result per ID.
    if matches:
        equalized_train_dataset = pd.concat(matches)
    else:
        equalized_train_dataset = dataset.iloc[0:0].copy()

    equalized_train_dataset.index = equalized_train_dataset.index.astype('str')
    return equalized_train_dataset

In [None]:
def split_modalities(input_data):
    """Split a feature table into one table per modality tag found in the row labels.

    A row is assigned to the first tag, in the order FLAIR, T1C, T1, T2, that
    occurs as a substring of its label; T1C is tested before T1 because every
    label containing "T1C" also contains "T1". Rows containing none of the
    tags are left out of every result.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Feature table with a ``'Grade'`` column; the index holds the labels.

    Returns
    -------
    tuple
        ``(flair, t1c, t1, t2, labels)``. The four frames hold the rows of
        each modality without the ``'Grade'`` column, in original row order
        and with original dtypes. ``labels`` is a NumPy array with the
        ``'Grade'`` values of the FLAIR rows only.
        NOTE(review): using the FLAIR grades for the other modalities assumes
        all four tables list the same cases in the same order - confirm.
    """
    tags = ("FLAIR", "T1C", "T1", "T2")

    def first_matching_tag(case_id):
        # Tag order encodes the priority described in the docstring.
        return next((tag for tag in tags if tag in case_id), None)

    # One tag (or None) per row, computed in a single pass over the labels.
    assigned = np.array(
        [first_matching_tag(str(case_id)) for case_id in input_data.index],
        dtype=object,
    )

    labels = input_data.loc[assigned == "FLAIR", 'Grade'].to_numpy()

    # Boolean selection returns new frames, so the input is left untouched and
    # numeric columns are not degraded to object dtype.
    features = input_data.drop('Grade', axis='columns')
    flair, t1c, t1, t2 = (features[assigned == tag] for tag in tags)

    return flair, t1c, t1, t2, labels

In [None]:
def select_features(dataset, number_of_features):
    """Standardise the features and keep the ones with the highest ANOVA F-score.

    Side effects: writes ``output/scored_features.csv`` (all features with
    their score, best first) and ``output/best_features.csv`` (names of the
    selected features). The ``output`` directory must already exist.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table with a ``'Grade'`` target column; every other column is treated
        as a numeric feature. The frame passed in is not modified.
    number_of_features : int
        How many top-scoring features to keep.

    Returns
    -------
    pandas.DataFrame
        The standardised values of the selected features, ordered by
        decreasing score, plus the original ``'Grade'`` column.
    """
    features = dataset.loc[:, dataset.columns != "Grade"]
    Y = dataset.loc[:, "Grade"]

    # Fit the scaler once and put the result into a new frame, rather than
    # fitting twice and writing back into a slice of the caller's data.
    X = pd.DataFrame(
        StandardScaler().fit_transform(features),
        index=features.index,
        columns=features.columns,
    )

    # Only the per-feature scores are used, so k="all" avoids the default
    # k=10, which is invalid when there are fewer than 10 features.
    select_k_best = SelectKBest(f_classif, k="all").fit(X, Y)

    scored_features = pd.DataFrame({'Feature': list(X.columns), 'Score': select_k_best.scores_})
    scored_features = scored_features.sort_values(by='Score', ascending=False)
    scored_features.to_csv("output/scored_features.csv")

    best_features = scored_features.nlargest(number_of_features, 'Score').loc[:, "Feature"]
    best_features.to_csv("output/best_features.csv")

    # Select columns only; X already has exactly the rows of `dataset`.
    selected_features = X.loc[:, best_features].copy()
    selected_features["Grade"] = Y

    return selected_features

In [None]:
# Load the feature table (first CSV column becomes the index) and the case mapping.
dataset = pd.read_csv("output/features.csv", index_col=0)
name_mapping = pd.read_csv("mapping/name_mapping.csv")

# Two-fold stratified split; neither shuffle nor random_state is passed here.
skf = StratifiedKFold(n_splits=2)

# Mapping columns are taken by position: column 1 is split as the samples and
# column 0 is used as the stratification labels - presumably case IDs and
# grades respectively; confirm against the CSV header.
X = name_mapping.values[:,1]
y = name_mapping.values[:,0]

# Number of top-scoring features requested from select_features.
number_of_features = 10

for train_index, test_index in skf.split(X, y):
     
    X_train, X_test = X[train_index], X[test_index] 
    y_train, y_test = y[train_index], y[test_index]
        
    # Show the class balance of this fold.
    print("Train labels distribution:", Counter(y_train))
    print("Test labels distribution:",Counter(y_test))
    print("\n")

    # Per-fold mappings with 'Grade' at position 0 and 'ID' at position 1,
    # the positional layout that train_test_data_split reads.
    train_mapping = pd.DataFrame({'Grade': y_train, 'ID': X_train})    
    test_mapping = pd.DataFrame({'Grade': y_test, 'ID': X_test})
    # NOTE(review): equalized_train_mapping is not used in the lines visible
    # here; the split below receives the unbalanced train_mapping - confirm
    # whether that is intended.
    equalized_train_mapping = equalize_mapping(train_mapping)
        
    (train_dataset, test_dataset) = train_test_data_split(dataset, train_mapping, test_mapping)
    
    # Feature scoring and selection use the training rows of this fold only.
    reduced_train_dataset = select_features(train_dataset, number_of_features)