In [1]:
import sys
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '6' 
from os.path import abspath
import numpy as np
import pandas as pd
from utils.generate_network import generate_network
from utils.prepare_data import prepare_data
from utils.popphy_io import get_config, save_params, load_params
from utils.popphy_io import get_stat, get_stat_dict
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from models.PopPhy import PopPhyCNN
from models.CNN1D import CNN1D
from models.MLPNN import MLPNN
from models.RF import RF
from models.SVM import SVM
from models.LASSO import LASSO
import warnings
from datetime import datetime
import json
import warnings
warnings.filterwarnings("ignore")



In [2]:
# Read the evaluation settings from the project configuration file.
config = get_config()

# Filtering threshold as stored in the config (kept as the raw config value).
filt_thresh = config.get('Evaluation', 'FilterThresh')
# Name of the dataset to evaluate, e.g. T2D or Cirrhosis.
dataset = config.get('Evaluation', 'DataSet')
# Number of repeated cross-validation runs and number of folds per run.
num_runs, num_test = (
    int(config.get('Evaluation', option))
    for option in ('NumberRuns', 'NumberTestSplits')
)

# Input data for the selected dataset lives under ../data/<dataset>.
path = "../data/" + dataset

print(dataset)

T2D


In [3]:
# Directory that receives this notebook's result files for the selected dataset.
results_dir = "../results/notebook_results/" + dataset

# Remember whether the directory was already there so the message below is
# accurate on re-runs of the notebook.
results_dir_existed = os.path.isdir(results_dir)

# exist_ok=True keeps this cell idempotent: an already existing directory is
# not an error, so only genuine failures (permissions, a regular file in the
# way, ...) reach the except branch.
try:
    os.makedirs(results_dir, exist_ok=True)
except OSError as err:
    print("Creation of the directory %s failed: %s" % (results_dir, err))
else:
    if results_dir_existed:
        print("Directory %s already exists" % results_dir)
    else:
        print("Successfully created the directory %s" % results_dir)

Creation of the directory ../results/notebook_results/T2D failed


In [4]:
# Load every data view for the configured dataset in a single call.
(my_maps, raw_x, tree_x, raw_features, tree_features,
 labels, label_set, g, feature_df) = prepare_data(path, config)

# Two-class problems are reported with AUC, anything else with MCC.
num_class = len(np.unique(labels))
metric = "AUC" if num_class == 2 else "MCC"

# Shuffle all sample-indexed arrays in place. Re-seeding the global RNG with
# the same value before each shuffle makes every array receive the same
# permutation (provided they share the same first-axis length), which keeps
# the samples aligned across the different views and the labels.
seed = np.random.randint(100)
for sample_array in (my_maps, raw_x, tree_x, labels):
    np.random.seed(seed)
    np.random.shuffle(sample_array)

# One-hot encode the integer labels: row i of the identity matrix for label i.
n_values = np.max(labels) + 1
labels_oh = np.eye(n_values)[labels]

# Second and third dimensions of the tree maps.
tree_row, tree_col = my_maps.shape[1], my_maps.shape[2]

print("There are %d classes...%s" % (num_class, ", ".join(label_set)))

# One column label per (run, fold) pair, plus one split seed per run.
cv_list = [
    "Run_" + str(run_id) + "_CV_" + str(fold_id)
    for run_id in range(num_runs)
    for fold_id in range(num_test)
]
seeds = np.random.randint(1000, size=num_runs)

There are 216 raw features...
Building tree structure...
Found tree file...
Populating trees...
There are 406 tree features...
There are 2 classes...n, t2d


In [5]:
# Per-fold evaluation metrics of the CNN-1D model on the tree features:
# one row per metric, one column per (run, fold) pair.
cnn1d_tree_stat_df = pd.DataFrame(index=["AUC", "MCC", "Precision", "Recall", "F1"], columns=cv_list)

# Class weights depend only on the full label vector, so they are computed
# once here instead of once per fold:
#     weight[c] = len(labels) / (2 * number of samples with label c)
# The list is indexed by the label value itself, so labels are expected to be
# the integers 0..k-1.
class_ids = np.unique(labels)
c_prob = [0] * len(class_ids)
for class_id in class_ids:
    c_prob[int(class_id)] = float(len(labels)) / (2.0 * float(np.sum(labels == class_id)))
c_prob = np.array(c_prob).reshape(-1)

run = 0
for seed in seeds:
    # A fresh stratified split per run, driven by that run's seed.
    skf = StratifiedKFold(n_splits=num_test, shuffle=True, random_state=seed)
    fold = 0
    for train_index, test_index in skf.split(my_maps, labels):
        # Column of the results table that belongs to this run/fold.
        cv_key = "Run_" + str(run) + "_CV_" + str(fold)

        train_x, test_x = tree_x[train_index,:], tree_x[test_index,:]
        train_y_oh, test_y_oh = labels_oh[train_index,:], labels_oh[test_index,:]
        train_y, test_y = labels[train_index], labels[test_index]

        # log(x + 1) transform of the features.
        train_x = np.log(train_x + 1)
        test_x = np.log(test_x + 1)

        # Per-sample weight = weight of the sample's class.
        train_weights = c_prob[np.argmax(train_y_oh, 1)]

        # Scale with statistics from the training fold only; clip so test
        # values outside the training range stay within [0, 1].
        scaler = MinMaxScaler().fit(train_x)
        train_x = np.clip(scaler.transform(train_x), 0, 1)
        test_x = np.clip(scaler.transform(test_x), 0, 1)

        train_oh = [train_x, train_y_oh]
        test_oh = [test_x, test_y_oh]

        train = [train_x, train_y]
        test = [test_x, test_y]

        cnn1D_model = CNN1D(train_x.shape[1], num_class, config)

        # Print the architecture and the table header once, on the very first fold.
        if fold + run == 0:
            print("CNN-1D")
            print(cnn1D_model.model.summary())
            print("\n\n Run\tFold\tRF %s\t\tSVM %s\t\tLASSO %s\tMLPNN %s\tCNN-1D %s" % (metric, metric, 
                                                                                   metric, metric, metric))

        cnn1D_model.train(train_oh, train_weights)
        preds, cnn1d_stats = cnn1D_model.test(test_oh)

        # Write with a single .loc[row, column] call. The chained form
        # df.loc[row][column] = value assigns into an intermediate Series and
        # is not guaranteed to update the DataFrame (it never does under
        # pandas Copy-on-Write).
        if num_class == 2:
            # AUC is only recorded for two-class problems.
            cnn1d_tree_stat_df.loc["AUC", cv_key] = cnn1d_stats["AUC"]
        for stat_name in ("MCC", "Precision", "Recall", "F1"):
            cnn1d_tree_stat_df.loc[stat_name, cv_key] = cnn1d_stats[stat_name]

        cnn1D_model.destroy()

        fold += 1
    run += 1

CNN-1D
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gaussian_noise (GaussianNois (None, 1, 406, 1)         0         
_________________________________________________________________
conv_0 (Conv2D)              (None, 1, 397, 32)        352       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 1, 198, 32)        0         
_________________________________________________________________
conv_1 (Conv2D)              (None, 1, 189, 32)        10272     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 1, 94, 32)         0         
_________________________________________________________________
flatten (Flatten)            (None, 3008)              0         
_________________________________________________________________
dropout (Dropout)            (None, 3008)        

In [6]:
# Save the per-fold metrics into the results directory, then display them.
# The file name is passed without a leading backslash: "\c" is an invalid
# escape sequence, and a component starting with a separator makes
# os.path.join treat it as rooted on Windows (dropping results_dir), while on
# POSIX it would put a literal backslash into the file name.
cnn1d_tree_stat_df.to_csv(os.path.join(results_dir, "cnn1d_tree_evaluation.csv"))
cnn1d_tree_stat_df

Unnamed: 0,Run_0_CV_0,Run_0_CV_1,Run_0_CV_2,Run_0_CV_3,Run_0_CV_4,Run_0_CV_5,Run_0_CV_6,Run_0_CV_7,Run_0_CV_8,Run_0_CV_9
AUC,0.702479,0.588843,0.752066,0.613636,0.654959,0.630165,0.681818,0.726708,0.581781,0.521739
MCC,0.09245,-0.045883,0.318511,0.227508,0.091287,0.140028,0.321182,0.362319,0.089027,0.093168
Precision,0.547009,0.476842,0.65942,0.613872,0.545833,0.571895,0.662105,0.681818,0.545455,0.547713
Recall,0.545455,0.477273,0.659091,0.613636,0.545455,0.568182,0.659091,0.681818,0.545455,0.545455
F1,0.541667,0.474831,0.658915,0.613437,0.544513,0.562533,0.657499,0.681818,0.545455,0.545455


In [7]:
# Average every metric across all run/fold columns (one value per metric row).
# NOTE(review): despite the "rf_" prefix this holds the CNN-1D summary; the
# name is kept unchanged in case other cells refer to it.
rf_tree_stat_df_mean = cnn1d_tree_stat_df.mean(axis=1)

rf_tree_stat_df_mean


AUC          0.645419
MCC          0.168960
Precision    0.585196
Recall       0.584091
F1           0.582612
dtype: float64