In [2]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import tensorflow as tf
import numpy as np
import pandas as pd
from pandas import DataFrame as df
import os
import sys
from tensorflow.contrib.layers.python.layers import batch_norm as batch_norm

In [3]:
# Runtime environment: GPU selection and TensorFlow session options.
# Only the GPU with index 2 is made visible to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

# Session options: 44 threads for both intra-op and inter-op parallelism, and
# GPU memory that grows on demand instead of being reserved up front.
# These settings only take effect for sessions created with `config=config`.
config = tf.ConfigProto(
	intra_op_parallelism_threads=44,
	inter_op_parallelism_threads=44,
	gpu_options=tf.GPUOptions(allow_growth=True),
)

In [4]:
def fc_bn(_x, _output, _phase, _scope):
	"""Linear fully-connected layer followed by batch normalisation.

	Args:
		_x: input tensor passed to the dense layer.
		_output: number of output units of the dense layer.
		_phase: boolean (tensor) forwarded as `is_training` to batch norm.
		_scope: variable-scope name wrapping both layers; the sub-scopes are
			'dense' and 'bn', and both are created with `tf.AUTO_REUSE`.

	Returns:
		The batch-normalised, not yet activated, output tensor.
	"""
	layers = tf.contrib.layers
	with tf.variable_scope(_scope):
		# No activation here: callers apply their own non-linearity afterwards.
		# The L2 regulariser (scale 0.01) is attached to the dense weights.
		dense_out = layers.fully_connected(
			_x,
			_output,
			activation_fn=None,
			weights_initializer=layers.variance_scaling_initializer(),
			weights_regularizer=layers.l2_regularizer(0.01),
			reuse=tf.AUTO_REUSE,
			scope='dense',
		)
		# updates_collections=None: per the tf.contrib docs the moving-average
		# updates then run in place, so no separate UPDATE_OPS step is needed.
		normalised = layers.batch_norm(
			dense_out,
			decay=0.9,
			center=True,
			scale=True,
			is_training=_phase,
			updates_collections=None,
			fused=True,
			reuse=tf.AUTO_REUSE,
			scope='bn',
		)
	return normalised

In [5]:
def build_discriminator(X, _phase, _keep_prob) :
	"""Build the sequence classifier graph and return its outputs.

	The input sequence is encoded by a bidirectional SimpleRNN (32 units per
	direction, outputs concatenated), then passed through a 16-unit
	dense + batch-norm block with leaky ReLU and dropout, and finally through
	a 2-unit dense + batch-norm block that produces the logits.

	Args:
		X: input sequence tensor fed to the recurrent layer.
		_phase: boolean (tensor) used as `is_training` for batch norm.
		_keep_prob: keep probability passed to `tf.nn.dropout`.

	Returns:
		(predicted_value, logits): the softmax of the logits, and the logits.
	"""
	# Recurrent encoder: note this is a SimpleRNN, not an LSTM.
	recurrent_cell = tf.keras.layers.SimpleRNN(32)
	encoder = tf.keras.layers.Bidirectional(recurrent_cell, merge_mode = 'concat')
	encoded = encoder(X)

	# Hidden block: dense + batch norm -> leaky ReLU -> dropout.
	hidden = fc_bn(encoded, 16, _phase, "discriminator_fc1")
	hidden = tf.nn.leaky_relu(hidden)
	hidden = tf.nn.dropout(hidden, _keep_prob)

	# Output block: two batch-normalised class scores.
	logits = fc_bn(hidden, 2, _phase, "logits")
	return tf.nn.softmax(logits), logits

##### Before running the cells below:
1. Set **dirname** to the directory that contains the 5-fold cross-validation datasets you want to use.
2. Set **allergy** to the allergy type you want to predict ("egg", "milk", or "peanut").

In [6]:
# User settings for the experiment.
# Allergy outcome to predict: one of "milk", "egg", or "peanut".
allergy = "egg"
# Directory holding the 5-fold cross-validation CSV files (keep the trailing slash,
# because file names are appended to it by plain string concatenation).
dirname = "./5cv_dataset/"

To train the classifier without the additional imputed subjects, comment out the active **x_data_file** and **y_data_file** assignments (the `train_additional_group_…` lines) in the training cell below, and uncomment the line directly beneath each of them.

For example, after this change the two active lines should read:
*   x_data_file = dirname + "train_group_" + str(group) + "_X.csv"
*   y_data_file = dirname + "train_group_" + str(group) + "_" + allergy + "_Y.csv"

In [7]:
# Install the "ignore DeprecationWarning" filter again; this repeats the call
# made in the import cell at the top of the notebook.
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [8]:
# Train and evaluate the classifier: 5 repeated experiments x 5 cross-validation folds.
# For every (experiment, fold) pair the test-fold predictions and labels of the
# best evaluated epoch are written to ./v<ver>/.

# Hyper-parameters shared by all runs (loop-invariant, so defined once).
num_timepoint = 8        # time steps per subject in the reshaped input
n_classes = 2            # width of the one-hot label files
learning_rate_d = 0.001  # Adam learning rate
num_epoch = 4000         # full-batch training steps per fold
dp_rate = 0.5            # dropout keep probability used while training

for ver in range(1,6) : # Run the experiment five times
	out_dir = "v" + str(ver)
	# Only an already existing directory is tolerated; any other failure of
	# os.mkdir (permissions, a plain file with that name, ...) is raised
	# instead of being reported as "already exist".
	if os.path.isdir(out_dir) :
		print("Dir already exist")
	else :
		os.mkdir(out_dir)

	for group in range(1,6) :
		# Each fold gets a fresh graph so variables do not leak between folds.
		tf.reset_default_graph()

		x_data_file = dirname + "train_additional_group_" + str(group) + "_X.csv"
		#x_data_file = dirname + "train_group_" + str(group) + "_X.csv"

		y_data_file = dirname + "train_additional_group_" + str(group) + "_" + allergy + "_Y.csv"
		#y_data_file = dirname + "train_group_" + str(group) + "_" + allergy + "_Y.csv"

		x_test_file = dirname + "test_group_" + str(group) + "_X.csv"
		y_test_file = dirname + "test_group_" + str(group) + "_" + allergy + "_Y.csv"

		x_data = pd.read_csv(x_data_file) #Row: each sample, Column: Features
		y_data = pd.read_csv(y_data_file).values
		x_test = pd.read_csv(x_test_file)
		y_test = pd.read_csv(y_test_file).values

		# Reshape the flat tables to (subjects, time points, features).
		num_feature = len(x_data.columns)
		x_data = x_data.values.reshape(-1, num_timepoint, num_feature)
		x_test = x_test.values.reshape(-1, num_timepoint, num_feature)
		num_samples = len(x_data)

		# Fail early, with a readable message, if inputs and labels do not line up.
		if len(y_data) != num_samples :
			raise ValueError("Train X/Y mismatch for group %d: %d subjects vs %d label rows" % (group, num_samples, len(y_data)))
		if len(y_test) != len(x_test) :
			raise ValueError("Test X/Y mismatch for group %d: %d subjects vs %d label rows" % (group, len(x_test), len(y_test)))

		tf_X = tf.placeholder(tf.float32, [None, num_timepoint, num_feature])
		tf_Y = tf.placeholder(tf.float32, [None, n_classes])
		keep_prob = tf.placeholder(tf.float32)
		phase = tf.placeholder(tf.bool, name='phase')

		d_pred, d_logits = build_discriminator(tf_X, phase, keep_prob)
		# NOTE(review): fc_bn attaches an L2 weights_regularizer, but the resulting
		# regularization losses are not added to d_loss, so the penalty appears to
		# have no effect on training. Left unchanged to keep results comparable;
		# confirm whether tf.losses.get_regularization_loss() should be added.
		d_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = d_logits, labels = tf_Y))

		d_train_step = tf.train.AdamOptimizer(learning_rate_d).minimize(d_loss)
		pred = tf.argmax(d_pred, 1)
		label = tf.argmax(tf_Y, 1)
		correct_pred = tf.equal(pred, label)
		accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

		max_acc = 0.0

		train_feed = {tf_X: x_data, tf_Y: y_data, phase : True, keep_prob: dp_rate}
		test_feed = {tf_X: x_test, tf_Y: y_test, phase : False, keep_prob: 1.0}

		# Pass `config` so the GPU memory-growth and thread settings defined in the
		# environment cell are actually applied to this session.
		with tf.Session(config=config) as sess:
			sess.run(tf.global_variables_initializer())
			for i in range(num_epoch):
				_, d_loss_print, d_acc = sess.run([d_train_step, d_loss, accuracy], feed_dict=train_feed)
				if i % 10 == 0:
					test_acc, test_pred, test_label = sess.run([accuracy, pred, label], feed_dict=test_feed)
					print('Epoch: %d, cost: %f, train_acc:%.4f' % (i, d_loss_print, d_acc))
					# NOTE(review): the epoch is selected by test-fold accuracy, which
					# makes the saved predictions optimistically biased; consider a
					# separate validation split. `>=` keeps the latest best epoch.
					if test_acc >= max_acc :
						max_acc = test_acc
						max_pred = test_pred
						max_label = test_label

		# Persist the predicted and true class indices of the best evaluated epoch.
		np.savetxt(os.path.join(out_dir, "prediction_group_" + str(group) + "_" + allergy + ".csv"),  max_pred, fmt="%.0f", delimiter=",")
		np.savetxt(os.path.join(out_dir, "label_group_" + str(group) + "_" + allergy + ".csv"),  max_label, fmt="%.0f", delimiter=",")

Dir already exist
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.

Epoch: 0, cost: 0.888369, train_acc:0.4746
Epoch: 10, cost: 0.718884, train_acc:0.5678
Epoch: 20, cost: 0.578257, train_acc:0.7542
Epoch: 30, cost: 0.455016, train_acc:0.8136
Epoch: 40, cost: 0.456066, tr

In [9]:
from sklearn.metrics import roc_auc_score

In [10]:
# Summarise the saved predictions: one ROC AUC per (experiment, fold), arranged as
# a table with one column per experiment, one row per fold, and a final 'avg' row.
# NOTE(review): the prediction files hold hard class indices, not probabilities,
# so each AUC is computed from a single operating point; confirm this is intended.
final_list = []          # every AUC value, in (experiment, fold) order
auc_by_experiment = {}   # column label -> list of the five per-fold AUCs
for v in range(1,6) :
	# A dedicated name is used here on purpose: assigning to `dirname` would
	# overwrite the dataset directory configured earlier in the notebook and
	# break a later re-run of the training cell.
	result_dir = "./v" + str(v) + "/"
	result_list = []
	for i in range(1, 6) :
		label_file = result_dir + "label_group_" + str(i) + "_" + allergy + ".csv"
		pred_file = result_dir + "prediction_group_" + str(i) + "_" + allergy + ".csv"
		fold_labels = pd.read_csv(label_file, header = None)
		fold_preds = pd.read_csv(pred_file, header = None)
		# Compute the score once and record it in both collections.
		fold_auc = roc_auc_score(fold_labels, fold_preds)
		result_list.append(fold_auc)
		final_list.append(fold_auc)
	auc_by_experiment["Expriment" + str(v)] = result_list

# Build the table in one step instead of concatenating inside the loop.
final_df = pd.DataFrame(auc_by_experiment)
final_df.loc['avg'] = final_df.mean()

In [11]:
# Display the AUC table: one column per experiment; the 'avg' row holds the column means.
final_df

Unnamed: 0,Expriment1,Expriment2,Expriment3,Expriment4,Expriment5
0,0.542857,0.628571,0.571429,0.642857,0.528571
1,0.421053,0.594737,0.573684,0.689474,0.494737
2,0.607843,0.553922,0.436275,0.745098,0.661765
3,0.591667,0.783333,0.808333,0.9,0.808333
4,0.598684,0.697368,0.473684,0.572368,0.394737
avg,0.552421,0.651586,0.572681,0.709959,0.577629
