# Group 3/4 Combined

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline 
import scipy
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn import metrics
from collections import Counter

from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.metrics import concordance_index_censored
from sksurv.util import Surv

import pymrmre

In [2]:
# Read the Excel workbook into `df`.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not run elsewhere as-is.
excel_path = "/Users/samwwong/Desktop/Michael Zhang/mb_t1_t2_3way.xlsx"
df = pd.read_excel(excel_path)

In [3]:
# Build the cohort frame used by every later cell.
# .copy() makes df_group34 an independent frame, so the column assignments below do not
# write into a slice of `df` (the pattern that triggers pandas' SettingWithCopyWarning).
df_group34 = df[df['molecular'] == 'group3'].copy()

# Keep rows where at least one of the two outcome columns is filled in.
has_outcome = df_group34['alive'].notnull() | df_group34['os'].notnull()
df_group34 = df_group34[has_outcome].copy()

# Recode the status labels to 0 (alive) / 1 (deceased); a missing status becomes 0.
df_group34['alive'] = df_group34['alive'].replace({'Alive ': 0, "alive": 0, "Alive": 0, 'Deceased': 1, "deceased": 1})
df_group34['alive'] = df_group34['alive'].fillna(0)

# Division by 1.0 — presumably to coerce this column to floating point.
df_group34['t2_original_shape_VoxelVolume'] = df_group34['t2_original_shape_VoxelVolume'] / 1.0

# Keep only rows whose `os` entry is a plain int or float, then require os > 1.
# NaN is a float, so it passes the type check and is removed by the comparison instead.
is_numeric_os = df_group34['os'].apply(lambda x: type(x) == int or type(x) == float)
df_group34 = df_group34[is_numeric_os]
df_group34 = df_group34[df_group34['os'] > 1].reset_index(drop=True)

In [4]:
# Outcome columns: 'alive' is used as the event indicator and 'os' as the time.
y_cols = ['alive', 'os']
y_name = ['alive', 'os']

# Every column from position 9 onward is a candidate feature.
# should we include sex and age as well?
x_cols = df_group34.columns[9:]
feature_cols = x_cols

# How many features are requested from mRMR and kept for each outer fold.
num_features = 5

# Per-fold results, filled in by the outer CV loop.
Cs, feats, folds = [], [], []

In [5]:
# Outer loop of nested CV: cut the frame into five consecutive row blocks.
# The first four blocks have fold_size rows each; the fifth takes whatever is left.
fold_size = len(df_group34) // 5
_bounds = [k * fold_size for k in range(5)] + [len(df_group34)]
fold1, fold2, fold3, fold4, fold5 = (
    df_group34.iloc[lo:hi] for lo, hi in zip(_bounds[:-1], _bounds[1:])
)

In [6]:
# Outer loop of the nested CV: hold out each fold once, pick features on the other four
# folds, fit a Cox model on them and score it on the held-out fold.

# Re-create the accumulators here so that re-running this cell starts from empty lists
# instead of appending another five rows to the previous run's results.
Cs = []
feats = []
folds = []

for i in range(5):
    all_folds = [fold1, fold2, fold3, fold4, fold5]
    # reset_index(drop=True) returns new frames, so fold1..fold5 are not modified in place.
    test = all_folds.pop(i).reset_index(drop=True)
    train = pd.concat(all_folds).reset_index(drop=True)
    X_train = train[x_cols]
    y_train = train[y_cols]
    X_test = test[x_cols]
    y_test = test[y_cols]

    # Inner K-fold: run mRMR feature selection on the training part of each inner split.
    # The validation part of each inner split is not used.
    kf_inner = KFold(n_splits=5)
    best_features = []

    for train_index_inner, _ in kf_inner.split(X_train):
        X_train_inner = X_train.iloc[train_index_inner][feature_cols]
        y_train_inner = y_train.iloc[train_index_inner][y_name]

        surv = Surv.from_arrays(y_train_inner["alive"].to_numpy(), y_train_inner["os"].to_numpy())

        solutions = pymrmre.mrmr.mrmr_ensemble(features=X_train_inner,
                                               targets=pd.DataFrame(surv).astype('double'),
                                               solution_length=num_features, solution_count=5)

        # Flatten the solutions held in the first row of the result and drop any "time"
        # entry (presumably the survival-time target column rather than a real feature).
        flat_solution = [item for sublist in solutions.iloc[0] for item in sublist]
        flat_solution = [name for name in flat_solution if name != "time"]

        best_features.append(flat_solution)

    # Keep the num_features names that were selected most often across the inner splits.
    flat_features = [item for sublist in best_features for item in sublist]
    chosen_features = [name for name, _count in Counter(flat_features).most_common(num_features)]

    y = Surv.from_arrays(y_train['alive'].to_numpy(), y_train['os'].to_numpy())

    estimator = CoxPHSurvivalAnalysis().fit(X_train[chosen_features].values, y)
    pred = estimator.predict(X_test[chosen_features].values)

    # Event flags and times of the held-out fold, in row order.
    event_indicators = [bool(flag) for flag in y_test['alive']]
    event_time = list(y_test['os'])

    C = concordance_index_censored(event_indicators, event_time, pred)

    # C[0] is the first element of the returned tuple; it is stored as the fold's concordance.
    Cs.append(C[0])
    feats.append(chosen_features)
    folds.append(i+1)

  causality_dropped = np.where(np.array(self._causality_list.loc[target_index]) > causality_threshold)
  mi_dropped = np.where(-.5 * np.log(1 - np.square(self._mi_matrix[:, target_index])) < mi_threshold)
  causality_dropped = np.where(np.array(self._causality_list.loc[target_index]) > causality_threshold)
  mi_dropped = np.where(-.5 * np.log(1 - np.square(self._mi_matrix[:, target_index])) < mi_threshold)
  causality_dropped = np.where(np.array(self._causality_list.loc[target_index]) > causality_threshold)
  mi_dropped = np.where(-.5 * np.log(1 - np.square(self._mi_matrix[:, target_index])) < mi_threshold)
  causality_dropped = np.where(np.array(self._causality_list.loc[target_index]) > causality_threshold)
  mi_dropped = np.where(-.5 * np.log(1 - np.square(self._mi_matrix[:, target_index])) < mi_threshold)
  causality_dropped = np.where(np.array(self._causality_list.loc[target_index]) > causality_threshold)
  mi_dropped = np.where(-.5 * np.log(1 - np.square(self._mi_matrix[:, target_

In [7]:
# Collect the per-fold results into one table: one row per outer fold.
d = dict(Concordance=Cs, Features=feats, Fold=folds)
result_df = pd.DataFrame(data=d)

In [8]:
# Display the per-fold table (Concordance, Features, Fold) as the cell output.
result_df

Unnamed: 0,Concordance,Features,Fold
0,0.736842,"[t1_wavelet-HHH_firstorder_Median, t1_wavelet-...",1
1,0.925,[t1_wavelet-HHH_glszm_SizeZoneNonUniformityNor...,2
2,0.552083,[t2_wavelet-HHH_glszm_SmallAreaLowGrayLevelEmp...,3
3,0.521739,[t1_wavelet-HHH_glszm_SmallAreaLowGrayLevelEmp...,4
4,0.432203,"[t1_wavelet-HLH_firstorder_Mean, t2_wavelet-HH...",5
