In [1]:
# Notebook setup: standard-library, scientific-stack and project imports,
# plus global logging/warning configuration.
import sys
import os
# Must be assigned before the model modules below are imported.
# NOTE(review): presumably silences TensorFlow's native (C++) logging -- confirm
# that the models package is what reads this variable.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '4' 
from os.path import abspath
import numpy as np
import pandas as pd
# Project helpers: network generation, data preparation, config/params/stat I/O.
from utils.generate_network import generate_network
from utils.prepare_data import prepare_data
from utils.popphy_io import get_config, save_params, load_params
from utils.popphy_io import get_stat, get_stat_dict
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Project model wrappers.
from models.PopPhy import PopPhyCNN
from models.CNN1D import CNN1D
from models.MLPNN import MLPNN
from models.RF import RF
from models.SVM import SVM
from models.LASSO import LASSO
import warnings
from datetime import datetime
import json
# NOTE(review): duplicate of the `import warnings` a few lines above (harmless).
import warnings
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")



In [2]:
# Pull the evaluation settings out of the project configuration.
# All values come from the 'Evaluation' section; the two counts are cast to int.
config = get_config()
section = 'Evaluation'
filt_thresh = config.get(section, 'FilterThresh')
dataset = config.get(section, 'DataSet')
num_runs = int(config.get(section, 'NumberRuns'))
num_test = int(config.get(section, 'NumberTestSplits'))

# Input data for the chosen dataset lives under ../data/<dataset>.
path = "../data/%s" % dataset

print(dataset)

T2D


In [3]:
# Directory that receives every result file written by this notebook.
results_dir = "../results/notebook_results/" + dataset

# Re-running the notebook is the normal case, so an existing directory must not
# be reported as a failure. Check first only to pick the right message;
# exist_ok=True makes the creation itself idempotent.
already_present = os.path.isdir(results_dir)
try:
    os.makedirs(results_dir, exist_ok=True)
except OSError as err:
    # Genuine failure (permissions, a file in the way, ...): show the reason
    # instead of a bare "failed".
    print ("Creation of the directory %s failed: %s" % (results_dir, err))
else:
    if already_present:
        print ("Directory %s already exists" % results_dir)
    else:
        print ("Successfully created the directory %s" % results_dir)
    

Creation of the directory ../results/notebook_results/T2D failed


In [4]:
# Load the dataset: tree-shaped abundance maps, raw/tree feature matrices,
# feature names, labels, the label names, the tree object and a feature frame.
my_maps, raw_x, tree_x, raw_features, tree_features, labels, label_set, g, feature_df = prepare_data(path, config)

# Binary problems are reported with AUC, everything else with MCC.
num_class = len(np.unique(labels))
metric = "AUC" if num_class == 2 else "MCC"

# The seed is drawn at random, so print it: without a record of it the sample
# order (and the per-run CV seeds derived below) cannot be reproduced.
seed = np.random.randint(100)
print("Shuffle seed: %d" % seed)

# Re-seeding before every in-place shuffle applies the same permutation to each
# array, keeping samples aligned across them.
# NOTE(review): this relies on all four having the same length along axis 0 -- confirm.
for sample_array in (my_maps, raw_x, tree_x, labels):
    np.random.seed(seed)
    np.random.shuffle(sample_array)

# One-hot encode the labels (row i of the identity matrix encodes class i).
n_values = np.max(labels) + 1
labels_oh = np.eye(n_values)[labels]

# Spatial dimensions of a single tree map.
tree_row = my_maps.shape[1]
tree_col = my_maps.shape[2]

print("There are %d classes...%s" % (num_class, ", ".join(label_set)))

# One result column per (run, fold) pair, and one CV seed per run.
cv_list = ["Run_" + str(x) + "_CV_" + str(y) for x in range(num_runs) for y in range(num_test)]
seeds = np.random.randint(1000, size=num_runs)

There are 216 raw features...
Building tree structure...
Found tree file...
Populating trees...
There are 406 tree features...
There are 2 classes...n, t2d


In [5]:
# Repeated stratified cross-validation of the PopPhy CNN.
# For every (run, fold) pair: log-transform and min-max scale the tree maps,
# train a fresh model with per-sample class weights, store the test statistics
# in popphy_stat_df and the per-class feature scores in feature_scores.
popphy_stat_df = pd.DataFrame(index=["AUC", "MCC", "Precision", "Recall", "F1"], columns=cv_list)

# One frame of feature scores per class label; a column is added per fold.
feature_scores = {}
for l in label_set:
    feature_scores[l] = pd.DataFrame(index=tree_features)

# Per-class weights depend only on the full label vector, so compute them once
# instead of once per fold.
# NOTE(review): the factor 2.0 looks like a hard-coded class count (binary
# case); confirm whether num_class is intended for multi-class data.
c_prob = [0] * len(np.unique(labels))
for l in np.unique(labels):
    a = float(len(labels))
    b = 2.0 * float((np.sum(labels==l)))
    c_prob[int(l)] = a/b
c_prob = np.array(c_prob).reshape(-1)

run = 0
for seed in seeds:
    skf = StratifiedKFold(n_splits=num_test, shuffle=True, random_state=seed)
    fold = 0
    for train_index, test_index in skf.split(my_maps, labels):
        # Column key shared by the statistics frame and the feature-score frames.
        cv_key = "Run_" + str(run) + "_CV_" + str(fold)

        train_x, test_x = my_maps[train_index,:,:], my_maps[test_index,:,:]
        train_y, test_y = labels_oh[train_index,:], labels_oh[test_index,:]

        train_x = np.log(train_x + 1)
        test_x = np.log(test_x + 1)

        # Weight of each training sample = weight of its class.
        train_weights = c_prob[np.argmax(train_y, 1)]

        # Fit the scaler on the training fold only, apply it to both folds and
        # clip to [0, 1] so test values outside the training range stay bounded.
        scaler = MinMaxScaler().fit(train_x.reshape(-1, tree_row * tree_col))
        train_x = np.clip(scaler.transform(train_x.reshape(-1, tree_row * tree_col)), 0, 1).reshape(-1, tree_row, tree_col)
        test_x = np.clip(scaler.transform(test_x.reshape(-1, tree_row * tree_col)), 0, 1).reshape(-1, tree_row, tree_col)

        train = [train_x, train_y]
        test = [test_x, test_y]

        popphy_model = PopPhyCNN((tree_row, tree_col), num_class, config)

        if fold + run == 0:
            # summary() prints the table itself and returns None, so wrapping
            # it in print() only adds a stray "None" line.
            popphy_model.model.summary()
            print("\n\n Run\tFold\t%s" % (metric))

        popphy_model.train(train, train_weights)
        preds, stats = popphy_model.test(test)

        # Single-step .loc[row, col] assignment: the chained form
        # df.loc[row][col] = v may write to a temporary copy and be lost.
        if num_class == 2:
            popphy_stat_df.loc["AUC", cv_key] = stats["AUC"]
        for stat_name in ("MCC", "Precision", "Recall", "F1"):
            popphy_stat_df.loc[stat_name, cv_key] = stats[stat_name]

        if metric == "AUC":
            print("# %d\t%d\t%.3f" % (run, fold, stats["AUC"]))
        if metric == "MCC":
            print("# %d\t%d\t%.3f\t" % (run, fold, stats["MCC"]))

        # Column l of the returned score matrix is stored under label_set[l].
        scores = popphy_model.get_feature_scores(train, g, label_set, tree_features, config)
        for label_idx, lab in enumerate(label_set):
            feature_scores[lab][cv_key] = scores[:, label_idx]

        popphy_model.destroy()
        fold += 1
    run += 1

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gaussian_noise (GaussianNois (None, 10, 156, 1)        0         
_________________________________________________________________
conv_0 (Conv2D)              (None, 8, 147, 32)        992       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 4, 73, 32)         0         
_________________________________________________________________
flatten (Flatten)            (None, 9344)              0         
_________________________________________________________________
dropout (Dropout)            (None, 9344)              0         
_________________________________________________________________
fc_0 (Dense)                 (None, 32)                299040    
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0

In [6]:
# Persist the per-fold statistics, then display the frame.
# The file name must not start with a backslash: "\p" is an invalid escape
# sequence, on POSIX it becomes part of the file name, and on Windows it makes
# os.path.join drop results_dir and write to the drive root.
popphy_stat_df.to_csv(os.path.join(results_dir, "popphy_tree_evaluation.csv"))
popphy_stat_df

Unnamed: 0,Run_0_CV_0,Run_0_CV_1,Run_0_CV_2,Run_0_CV_3,Run_0_CV_4,Run_0_CV_5,Run_0_CV_6,Run_0_CV_7,Run_0_CV_8,Run_0_CV_9
AUC,0.501035,0.569358,0.635611,0.760331,0.626033,0.741736,0.599174,0.741736,0.665289,0.737603
MCC,0.136505,-0.089027,0.197516,0.46225,0.182574,0.27735,0.09245,0.377964,0.09759,0.3698
Precision,0.569215,0.456428,0.603387,0.735043,0.591667,0.641026,0.547009,0.696429,0.552381,0.688034
Recall,0.568182,0.454545,0.590909,0.727273,0.590909,0.636364,0.545455,0.681818,0.545455,0.681818
F1,0.568405,0.454545,0.585795,0.725,0.590062,0.633333,0.541667,0.675789,0.529915,0.679167


In [7]:
# Average every statistic across all run/fold columns (row-wise mean).
popphy_stat_df.mean(axis=1)

AUC          0.657790
MCC          0.210497
Precision    0.608062
Recall       0.602273
F1           0.598368
dtype: float64