In [None]:
import pandas as pd
import numpy as np
from collections import Counter
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,cohen_kappa_score, matthews_corrcoef
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import statistics

pd.options.mode.chained_assignment = None 

In [None]:
def train_test_data_split(raw_data, train_mapping, test_mapping):
    train_data = pd.DataFrame(columns=raw_data.columns)
    test_data = pd.DataFrame(columns=raw_data.columns)
    
    for case_id in raw_data.index:
        case_general_id = case_id[0:20]
        if case_general_id in train_mapping.values[:,1]:
            train_data.loc[case_id]= raw_data.loc[case_id, :]
        elif case_general_id in test_mapping.values[:,1]:
            test_data.loc[case_id]= raw_data.loc[case_id, :]
            
    return train_data, test_data

In [None]:
def select_features(dataset, number_of_features):
    X = dataset.loc[:, dataset.columns != "Grade"]
    Y = dataset.loc[:, "Grade"]
    
    standard_scaler = StandardScaler().fit(X)
    X[X.columns] = standard_scaler.fit_transform(X[X.columns])
    
    select_k_best = SelectKBest(f_classif).fit(X, Y)
    
    scored_features = pd.DataFrame({'Feature':list(X.columns), 'Score':select_k_best.scores_})
    scored_features.sort_values(by='Score', ascending=False, inplace=True)
    scored_features.to_csv("output/scored_features.csv")
    
    best_features = scored_features.nlargest(number_of_features,'Score')
    best_features = best_features.loc[:, "Feature"]
    # print("Selected features: ")
    # print(best_features)
    best_features.to_csv("output/best_features.csv")

    selected_features = X.loc[dataset.index, best_features]
    selected_features["Grade"] = Y
    
    return selected_features

In [134]:
def equalize_dataset(dataset):
    is_HGG =  dataset['Grade']=='HGG'
    is_LGG =  dataset['Grade']=='LGG'
    hggs = dataset[is_HGG]
    lggs = dataset[is_LGG]
    hggs = hggs.sample(n=len(lggs))
    equalized_mapping = pd.concat([hggs, lggs])
    return equalized_mapping

In [137]:
dataset = pd.read_csv("output/features.csv", index_col=0)
name_mapping = pd.read_csv("mapping/name_mapping.csv")

skf = StratifiedKFold(n_splits=5)

X = name_mapping.values[:,1]
y = name_mapping.values[:,0]

number_of_features = 10

etd = None

for train_index, test_index in skf.split(X, y):
     
    X_train, X_test = X[train_index], X[test_index] 
    y_train, y_test = y[train_index], y[test_index]
        
    print("Train labels distribution:", Counter(y_train))
    print("Test labels distribution:",Counter(y_test))
    print("\n")

    train_mapping = pd.DataFrame({'Grade': y_train, 'ID': X_train})    
    test_mapping = pd.DataFrame({'Grade': y_test, 'ID': X_test})
        
    (train_dataset, test_dataset) = train_test_data_split(dataset, train_mapping, test_mapping)
    
    reduced_train_dataset = select_features(train_dataset, number_of_features)
    
    equalized_train_dataset = equalize_dataset(reduced_train_dataset)
    
    print("Size of equalized dataset: ", (len(equalized_train_dataset)/4))

Train labels distribution: Counter({'HGG': 233, 'LGG': 61})
Test labels distribution: Counter({'HGG': 59, 'LGG': 15})


Size of equalized dataset:  122.0
Train labels distribution: Counter({'HGG': 233, 'LGG': 61})
Test labels distribution: Counter({'HGG': 59, 'LGG': 15})




KeyboardInterrupt: 

In [138]:
rds

Unnamed: 0,original_glcm_Idn,original_gldm_LargeDependenceHighGrayLevelEmphasis,original_glcm_InverseVariance,original_glrlm_RunEntropy,original_gldm_SmallDependenceLowGrayLevelEmphasis,original_glcm_SumAverage,original_glcm_JointAverage,original_glcm_Idmn,original_glszm_LargeAreaHighGrayLevelEmphasis,original_shape_LeastAxisLength,Grade
BraTS20_Training_001_FLAIR,0.677054,0.210019,0.524749,-0.001946,-0.915236,-0.061814,-0.061814,0.825877,1.365458,1.419114,HGG
BraTS20_Training_001_T1,1.221874,0.378783,0.486353,0.356023,-1.069613,-0.025435,-0.025435,1.180603,2.311673,1.419114,HGG
BraTS20_Training_001_T1C,0.292287,-0.093218,0.421201,-0.132266,-0.842367,0.031282,0.031282,0.635376,0.126898,1.419114,HGG
BraTS20_Training_001_T2,0.821166,-0.118250,-0.319973,0.012178,-0.795128,-0.295421,-0.295421,0.677323,1.977581,1.419114,HGG
BraTS20_Training_002_FLAIR,-1.000550,-0.279260,0.305009,0.333196,0.376283,-0.149899,-0.149899,-0.708334,-0.358986,0.059756,HGG
...,...,...,...,...,...,...,...,...,...,...,...
BraTS20_Training_296_T2,1.103441,1.114477,0.515131,0.347981,-1.122091,0.341558,0.341558,1.187376,2.335453,1.824657,LGG
BraTS20_Training_297_FLAIR,0.548815,0.486037,0.484225,-0.242740,-0.956670,0.234500,0.234500,0.716939,-0.183283,0.110791,LGG
BraTS20_Training_297_T1,0.617123,0.931837,-0.414453,1.151393,-0.987561,0.766157,0.766157,0.848403,-0.327857,0.110791,LGG
BraTS20_Training_297_T1C,0.096448,0.472833,-1.664356,1.457248,-0.965526,1.152712,1.152712,0.552979,-0.396218,0.110791,LGG
