In [None]:
%load_ext autoreload
%autoreload 1

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

In [None]:
%%time
# Build the per-sample label: 1 if the sample's transcript prep method is 'PolyA', else 0.

# Clinical table for the compendium, indexed by Treehouse sample id.
clinical = (
    pd.read_csv('/data/archive/compendium/v5/clinical.tsv', sep='\t')
    .rename(columns={'th_sampleid': 'THid'})
    .set_index('THid')
)

# Ribo-depleted samples with a known diagnosis; every row in this file is tagged RiboMinus.
ribodDiagnosis = (
    pd.read_csv('data/riboDepleted_samples_that_passedQC_and_have_known_diagnosis.tsv', sep='\t')
    .rename(columns={'Treehouse SAMPLE identifier': 'THid',
                     'Diagnosis/Disease': 'disease'})
    .set_index('THid')
)
ribodDiagnosis['TR_method'] = 'RiboMinus'

# Per-sample transcript method table.
methods = (
    pd.read_csv('data/TranscriptMethod_THPEDv1.csv')
    .rename(columns={'Treehouse SAMPLE identifier': 'THid'})
    .set_index('THid')
)

clinicalIdTissue = clinical[['anat_sample', 'disease']]
# Outer join keeps samples that appear in only one of the two tables; the overlapping
# 'disease' column becomes disease_x (clinical) / disease_y (ribo-depleted).
label_df = pd.merge(clinicalIdTissue, ribodDiagnosis, how='outer', left_index=True, right_index=True)
# The overlapping 'TR_method' column becomes TR_method_x (set above) / TR_method_y (methods file).
label_df = pd.merge(label_df, methods, how='left', left_index=True, right_index=True)

# Assign the filled columns back explicitly. Calling fillna(inplace=True) on a column
# selection is chained assignment: it warns on recent pandas and does not update the
# frame at all under Copy-on-Write.
label_df['disease_y'] = label_df['disease_y'].fillna(label_df['disease_x'])
label_df['TR_method_y'] = label_df['TR_method_y'].fillna(label_df['TR_method_x'])
label_df = label_df.rename(columns={'TR_method_y': 'TR_method', 'disease_y': 'disease',
                                    'anat_sample': 'tissue'})
label_df = label_df.drop(columns=['disease_x', 'TR_method_x'])
# here I do not use the tissue label, so I reduce label_df to only the prep type
label_df = label_df['TR_method'].dropna()

# Binary target: 1 for PolyA, 0 for any other prep method.
label_df = (label_df == 'PolyA').astype('int')

In [None]:
%%time
# Read the compendium expression HDF5 file and transpose it in one expression.
gene_df = pd.read_hdf(
    '/data/archive/compendium/v5/v5_hugo_log2tpm.11340x58581.2018-02-03.hd5'
).T

In [None]:
# Preview the first rows of the labels (1 where TR_method == 'PolyA', otherwise 0).
label_df.head()

In [None]:
# Preview the first rows of the transposed expression table.
gene_df.head()

# Now, select some data to put in the variables train_data, train_labels, test_data, test_labels.

In [None]:
# Template placeholders: replace each None with your chosen split before running the
# cells below.
# NOTE(review): later cells slice these along the first axis (from_tensor_slices) and feed
# the data batches into a (None, N_GENES) placeholder — confirm shapes when filling in.
train_data, train_labels = None, None
test_data, test_labels = None, None

# Then the following cells will produce the optimal weight vector

In [None]:
# Clear TensorFlow's default graph so that re-running the graph-building cells below
# starts from an empty graph instead of adding duplicate ops to the existing one.
tf.reset_default_graph()

In [None]:
# Mini-batch schedule for the training loop.
batch_size = 32

# Derive the training-set size from the data itself so this cell runs on a fresh kernel.
if train_data is None:
    raise ValueError('train_data is None: fill in the data-selection cell before running this one')
n_train = len(train_data)

# Number of full mini-batches per pass over the training set (any remainder is dropped).
n_batches = n_train // batch_size
n_epochs = 4

In [None]:
%%time
# Training input pipeline: one (features, label) element per sample, shuffled with a
# 1000-element buffer, repeated indefinitely and grouped into mini-batches.
train_dataset = tf.data.Dataset.from_tensor_slices((train_data, train_labels))
train_batches = (
    train_dataset
    .shuffle(1000)
    .repeat()
    .batch(batch_size)
)
# Tensor pair that yields the next mini-batch each time it is evaluated in a session.
train_iterator = train_batches.make_one_shot_iterator()
train_next_batch = train_iterator.get_next()

In [None]:
%%time
# Test input pipeline: the whole test set is served as a single batch.

# Derive the test-set size from the data itself so this cell runs on a fresh kernel.
if test_data is None:
    raise ValueError('test_data is None: fill in the data-selection cell before running this one')
n_test = len(test_data)

test_dataset = tf.data.Dataset.from_tensor_slices((test_data, test_labels))
# A batch size of n_test means every batch holds as many elements as the test set.
test_batches = test_dataset.shuffle(1000).repeat().batch(n_test)
test_next_batch = test_batches.make_one_shot_iterator().get_next()

In [None]:
# Number of input features (genes) per sample; presumably matches the 58581 dimension
# in the expression file's name — TODO confirm against gene_df.shape.
N_GENES = 58581

In [None]:
# Graph inputs: a batch of expression vectors and the matching 0/1 labels as a column.
x = tf.placeholder(dtype=tf.float32, shape=(None, N_GENES), name='gene_set')
y = tf.placeholder(dtype=tf.float32, shape=(None, 1), name='prep_type')

# Single weight column with no bias term, initialised with stddev 1/sqrt(N_GENES).
initial_weights = tf.random_normal(shape=(N_GENES, 1), stddev=1/np.sqrt(N_GENES))
w = tf.Variable(initial_weights, name='weight')

# Logits: one scalar per sample.
h = tf.matmul(x, w)

In [None]:
# Sigmoid cross-entropy computed directly on the logits h, averaged over the batch.
per_sample_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=h, labels=y)
loss = tf.reduce_mean(per_sample_loss)

# Plain gradient descent with a fixed learning rate of 0.01.
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
update_step = optimizer.minimize(loss)

In [None]:
# Train the weight vector with mini-batch gradient descent and print the result.
init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())

# Mini-batch loss for every update step, in order, so training progress can be inspected.
losses = []

with tf.Session() as sess:
    init_op.run()

    for step in range(n_epochs * n_batches):
        trn_x, trn_y = sess.run(train_next_batch)
        # [:, None] adds a trailing axis so the labels match y's (None, 1) shape.
        # Fetching loss in the same run call records it for the batch being trained on.
        _, step_loss = sess.run([update_step, loss],
                                feed_dict={x: trn_x, y: trn_y[:, None]})
        losses.append(step_loss)

    print('Final learned weight vector')
    w_vec = w.eval()
    print(w_vec)