In [49]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [2]:
import tensorflow as tf
import pandas as pd
import numpy as np

In [14]:
# --- File locations ---------------------------------------------------------
# input_file: table to run predictions on.
# train_file: table the mutation vocabulary is built from.
# model_file: checkpoint that is restored into the network.
# cost_stats_file: path handed to TFNet as `cost_stats_file`.
input_file = '~/amino_acid_genotypes_to_brightness_reuse.tsv'
train_file = '~/amino_acid_genotypes_to_brightness.tsv'
model_file = '/Users/katya/net_1_1_3_iteration_07020.ckpt'
cost_stats_file = '/Users/katya/Desktop/cost_stats.txt'

# --- Optimisation settings --------------------------------------------------
learning_rate = 0.1
batch_size = 628
optimizer_method = 'tf.train.AdagradOptimizer'

# --- Network layout ---------------------------------------------------------
# `line` is tab-separated: a leading 'net_structure' tag followed by one
# '<neurons>,<activation>' field per layer. Each field becomes the two-element
# list stored under 'layer1', 'layer2', ... in `net_structure`.
line = 'net_structure\t1,tf.tanh\t3,tf.tanh\t1,tf.tanh'
layer_specs = line.split('\t')[1:]
net_structure = {
    'layer' + str(position): spec.split(',')
    for position, spec in enumerate(layer_specs, 1)
}

In [15]:
# Build the mutation vocabulary from the training table. Rows with a missing
# genotype are given an empty string so they can be split like the others.
train_data = pd.read_table(train_file)
train_data['aaMutations'] = train_data['aaMutations'].fillna('')

# Genotypes are ':'-separated lists of mutations. Empty tokens (produced by
# rows without mutations) are skipped while collecting, instead of being
# removed afterwards: set.remove('') raised KeyError whenever the table had
# no such row.
unique_mutations = sorted({
    mutation
    for genotype in train_data['aaMutations']
    for mutation in genotype.split(':')
    if mutation
})

# Table the restored model is applied to, with the same NaN handling.
data = pd.read_table(input_file)
data['aaMutations'] = data['aaMutations'].fillna('')

In [16]:
import matplotlib
matplotlib.use('Agg')
import tensorflow as tf
import numpy as np
from matplotlib import pyplot as plt
from scipy.stats import gaussian_kde

# Functions for plotting. The first is to actually plot, the second is to make the plot readable.
def density_plot(x, y):
    ''' Scatter observed against predicted values, coloured by point density.

    :param x: observed values (1-D array-like)
    :param y: predicted values (1-D array-like, same length as x)
    :return: the matplotlib Axes the scatter was drawn on, so it can be
             handed to format_plot

    Pairs in which either value is NaN are dropped before plotting.
    '''
    x = np.asarray(x)
    y = np.asarray(y)

    # Build the mask once, from the unfiltered inputs. Filtering x first and
    # then recomputing the mask for y compared arrays of different lengths
    # and failed as soon as a single NaN was present.
    keep = ~(np.isnan(x) | np.isnan(y))
    x, y = x[keep], y[keep]

    # Calculate the point density
    xy = np.vstack([x, y])
    z = gaussian_kde(xy)(xy)

    # Sort the points by density, so that the densest points are plotted last
    idx = z.argsort()
    x, y, z = x[idx], y[idx], z[idx]

    fig, ax = plt.subplots()
    # 'none' is matplotlib's documented value for "draw no marker edge";
    # an empty string is not a documented colour specification.
    ax.scatter(x, y, c=z, s=10, edgecolor='none')
    return ax


def format_plot(ax, iteration_number, costs):
    ''' Apply the common styling to an observed-vs-predicted plot.

    :param ax: matplotlib Axes to style
    :param iteration_number: iteration shown in the title
    :param costs: cost value shown in the title (formatted with 7 decimals)
    :return: the same Axes, after styling

    All settings are applied to `ax` itself rather than to pyplot's current
    axes, so the limits, labels and spines always end up on the same plot.
    '''
    ax.set_xlim(-1, 1)
    ax.set_ylim(-1, 1)
    ax.set_xlabel('Observed brightness')
    ax.set_ylabel('Predicted brightness')
    ax.set_title('Iteration %s: cost=%.7f' % (iteration_number, costs))

    # Real booleans instead of "on"/"off": a non-empty string is truthy, so
    # "off" only hides the ticks on matplotlib versions that still convert
    # those strings.
    ax.tick_params(axis="both", which="both", bottom=False, top=False,
                   labelbottom=True, left=False, right=False, labelleft=True)

    # Keep only the bottom and left spines, drawn in gray.
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)
    for side in ("bottom", "left"):
        ax.spines[side].set_visible(True)
        ax.spines[side].set_color('gray')

    ax.xaxis.grid(True)
    ax.yaxis.grid(True)
    return ax


# Function for shaping the data. It performs reshuffling of the data,
# brightness normalization, and extracts genotype and brightness to separate matrices.
def format_data(data, unique_mutations):
    ''' Shuffle the rows of `data` and convert them to network-ready arrays.

    :param data: DataFrame with `aaMutations` and `medianBrightness` columns
    :param unique_mutations: sequence of mutation strings; each one becomes a
                             column of the genotype matrix
    :return: (genotypes, brightness) in shuffled row order. `genotypes` is a
             (rows, mutations) array of 0/1 flags; `brightness` is the
             medianBrightness column rescaled so that its minimum maps to -1
             and its maximum to 1.
    '''
    # Random row order, drawn from numpy's global random state.
    shuffled = data.reindex(np.random.permutation(data.index))
    print('Normalizing data...')

    raw_brightness = shuffled.medianBrightness.values
    genotype_matrix = np.zeros((len(shuffled), len(unique_mutations)))
    for column, mutation in enumerate(unique_mutations):
        # NOTE(review): str.contains interprets `mutation` as a regular
        # expression and matches it anywhere in the genotype string --
        # confirm that mutation names never contain regex metacharacters.
        carries_mutation = shuffled.aaMutations.str.contains(mutation)
        genotype_matrix[:, column] = carries_mutation.astype(np.float32)

    # Min-max rescale onto [-1, 1].
    offset = raw_brightness - min(raw_brightness)
    scaled_brightness = offset / max(offset) * 2 - 1
    return genotype_matrix, scaled_brightness


# Function for generating batches from the data.
def get_batches(nn_genotypes_values, nn_brightness_values, batch_size, unique_mutations):
    ''' Cut the genotype matrix and brightness vector into equally sized batches.

    :param nn_genotypes_values: 2-D array, one row per sample
    :param nn_brightness_values: 1-D array, one value per sample
    :param batch_size: number of samples per batch
    :param unique_mutations: sequence whose length is the genotype width
    :return: list of (genotypes, brightness) tuples shaped
             (batch_size, 1, len(unique_mutations)) and (batch_size, 1, 1).
             Rows that do not fill a complete final batch are left out.
    '''
    print('Creating batches...')
    n_features = len(unique_mutations)
    n_full_batches = int(nn_genotypes_values.shape[0] / batch_size)

    # One row window per complete batch.
    windows = [slice(batch_size * k, batch_size * (k + 1)) for k in range(n_full_batches)]
    return [
        (
            nn_genotypes_values[rows, :].reshape(batch_size, 1, n_features),
            nn_brightness_values[rows].reshape(batch_size, 1, 1),
        )
        for rows in windows
    ]


# Function for broadcasting a tensor (used before multiplication with weights).
def broadcast(tensor, batch_size):
    ''' Tile `tensor` `batch_size` times along its first axis.

    The other two axes are not repeated (tf.tile multiples of 1).
    '''
    multiples = (batch_size, 1, 1)
    return tf.tile(tensor, multiples)


In [17]:
class Data():
    """Shuffled, batched view of a brightness table plus the TF placeholders
    the batches are fed through."""

    def __init__(self, input_file, batch_size):
        """
        :param input_file: path to the input_file
        :param batch_size: size of the batches to use with this data

        The mutation vocabulary is taken from the module-level
        `unique_mutations` list.
        """
        self.input_file = input_file
        self.batch_size = batch_size

        # Load the table from `input_file` rather than reading the
        # module-level `data`: the notebook later rebinds that name to the
        # Data instance itself, so constructing a second Data object (or
        # re-running the cell) would otherwise receive the wrong object.
        table = pd.read_table(input_file)
        table['aaMutations'] = table['aaMutations'].fillna('')
        self.data = table
        self.unique_mutations = unique_mutations

        self._build_batches()

        # Placeholders sized for exactly one batch.
        self.nn_genotypes = tf.placeholder(tf.float32, shape=[self.batch_size, 1, len(self.unique_mutations)])
        self.nn_brightness = tf.placeholder(tf.float32, shape=[self.batch_size, 1, 1])

    def _build_batches(self):
        """Shuffle and normalise the table, then rebuild every batch-derived attribute."""
        self.nn_genotypes_values, self.nn_brightness_values = format_data(self.data, self.unique_mutations)
        self.batches = get_batches(self.nn_genotypes_values, self.nn_brightness_values, self.batch_size,
                                   self.unique_mutations)
        self.batch_number = len(self.batches)
        # Observed values restricted to the rows that made it into a full batch.
        self.to_plot_observed = self.nn_brightness_values[0:(self.batch_number * self.batch_size)]

    def reshuffle(self):
        """Draw a new random row order and rebuild the batches from it."""
        self._build_batches()


# Neural network class. Extracts neural net structure from the parameter file.
# Contains all the details of the neural network to be used.
class TFNet(object):
    '''Feed-forward network assembled from a `net_structure` description.'''

    def __init__(self, net_structure, input_data, optimizer_method, learning_rate, batch_size, cost_stats_file):
        '''
            :param net_structure: dict mapping 'layer1', 'layer2', ... to a
                                [neuron_count, activation] pair given as strings, e.g.
                                {'layer1': ['1', 'tf.tanh'],
                                 'layer2': ['3', 'tf.tanh'],
                                 'layer3': ['1', 'tf.tanh']}
            :param input_data: object providing `unique_mutations`, `nn_genotypes`
                               and `nn_brightness` (the Data class in this notebook)
            :param optimizer_method: dotted name of the optimizer class,
                                     e.g. 'tf.train.AdagradOptimizer'
            :param learning_rate: learning rate passed to the optimizer
            :param batch_size: number of samples per batch
            :param cost_stats_file: path stored on the instance as `cost_stats_file`
            :raises ValueError: if `net_structure` is empty

            https://www.tensorflow.org/versions/r0.9/api_docs/python/nn.html#activation-functions

            '''

        self.number_of_layers = len(net_structure)
        self.structure = net_structure
        if self.number_of_layers == 0:
            raise ValueError('net_structure must describe at least one layer')

        self.neurons = {}
        self.weights = {}
        self.biases = {}
        self.input = {}
        self.output = {}

        # Chain the layers: each one consumes the previous layer's output and
        # its weight matrix has as many rows as that output is wide.
        # Previously every layer was fed the raw genotypes and sized to the
        # input width, so only the last layer influenced the cost.
        # NOTE: this changes the variable shapes of layers 2..n; a checkpoint
        # written by the old, unchained graph will not restore into this one.
        previous_output = input_data.nn_genotypes
        previous_width = len(input_data.unique_mutations)
        for i in range(self.number_of_layers):
            layer = 'layer' + str(i + 1)
            width = int(self.structure[layer][0])
            self.neurons[layer] = width
            self.weights[layer] = tf.Variable(
                tf.random_normal([1, previous_width, width]),
                name=layer + '_weights')
            self.biases[layer] = tf.Variable(tf.random_normal([1, 1, width]), name=layer + '_biases')
            self.input[layer] = tf.add(
                tf.batch_matmul(previous_output, broadcast(self.weights[layer], batch_size)),
                broadcast(self.biases[layer], batch_size))
            # NOTE(security): eval executes whatever the structure string
            # contains; only use it with trusted configuration.
            activation = eval(self.structure[layer][1])
            self.output[layer] = activation(self.input[layer])
            previous_output = self.output[layer]
            previous_width = width

        # Squared error of the final layer, summed and divided by batch_size.
        final_layer = 'layer' + str(self.number_of_layers)
        self.cost = tf.reduce_sum(tf.pow(self.output[final_layer] - input_data.nn_brightness, 2)) / batch_size
        # NOTE(security): same caveat as above -- `optimizer_method` is evaluated as code.
        self.optimizer = eval(optimizer_method)(learning_rate).minimize(self.cost)

        self.init = tf.initialize_all_variables()
        self.saver = tf.train.Saver()

        self.cost_stats_file = cost_stats_file

    def __str__(self):
        '''Return a text summary with the neuron count of every layer.

        The text is returned rather than printed: __str__ must return a
        string, otherwise str(net) and print(net) raise TypeError.
        '''
        lines = ['Net structure:\n']
        for i in range(self.number_of_layers):
            layer_number = str(i + 1)
            lines.append('%s neurons in layer_' % (self.neurons['layer' + layer_number]) + layer_number + '\n')
        return '\n'.join(lines)

In [18]:
# NOTE: up to this point `data` named the raw input DataFrame; from here on
# the name refers to the Data wrapper.
data = Data(input_file=input_file, batch_size=batch_size)
net = TFNet(
    net_structure=net_structure,
    input_data=data,
    optimizer_method=optimizer_method,
    learning_rate=learning_rate,
    batch_size=batch_size,
    cost_stats_file=cost_stats_file,
)

Normalizing data...
Creating batches...


In [19]:
# Show the TensorFlow name assigned to the first layer's weight variable.
net.weights['layer1'].name

'layer1_weights:0'

In [None]:
with tf.Session() as sess:
    sess.run(net.init)
    # Overwrite the freshly initialised variables with the saved model.
    net.saver.restore(sess, model_file)
    # `sess` is the session opened above (the name `session` was undefined).
    print(sess.run(tf.all_variables()))

    # Predictions come from the last layer, whatever the network depth.
    final_layer = 'layer' + str(net.number_of_layers)

    # A single pass over the batches is enough; the former outer loop
    # repeated the whole pass once per batch and reallocated the result.
    to_plot_predicted = np.zeros(data.batch_number * batch_size)
    for index, (batch_genotypes, _) in enumerate(data.batches):
        # Each batch is a (genotypes, brightness) tuple; only the genotypes
        # are fed to the genotype placeholder.
        predictions = sess.run(net.output[final_layer],
                               feed_dict={data.nn_genotypes: batch_genotypes})
        # Flatten the network output so it fits the 1-D result slice.
        to_plot_predicted[(index * batch_size):((index + 1) * batch_size)] = predictions.reshape(-1)