## Attention Only Analysis

In [1]:
from __future__ import print_function
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import *
from keras.models import Model
from keras import backend as K

Using TensorFlow backend.


In [2]:
import os
import numpy as np
import Bio
from Bio import SeqIO
import seaborn as sns
import pandas as pd
import Bio.motifs
%matplotlib inline
from sklearn import model_selection
import seaborn as sns
from matplotlib import pyplot as plt
import sklearn
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
import scipy
sns.set_context('notebook')

In [3]:
if not os.path.isdir('/home/jtao/analysis/genomic_grammar_analysis/'):
    os.mkdir('/home/jtao/analysis/genomic_grammar_analysis')
os.chdir('/home/jtao/analysis/genomic_grammar_analysis')

## Functions

In [4]:
def convert_sequences_to_array(sequences):
    '''
    inputs: sequence of nucleotides represented as a string composed of A, C, G, T
    outputs: a list of numpy array representations of a sequence with:
             A = [1, 0, 0, 0]
             C = [0, 1, 0, 0]
             G = [0, 0, 1, 0]
             T = [0, 0, 0, 1]
             
    '''

    nucleotide_array_dict = {'A': [1, 0, 0, 0],
                             'C': [0, 1, 0, 0],
                             'G': [0, 0, 1, 0],
                             'T': [0, 0, 0, 1],
                             'N': [0.25,0.25,0.25,0.25]}

    sequence_array_list = []
    for seq in sequences:
        seq_array = []
        for nuc in seq:
            seq_array.append(nucleotide_array_dict[nuc])
        seq_array = np.array(seq_array, dtype=np.float16)
        sequence_array_list.append(seq_array)
    sequence_array_list = np.array(sequence_array_list,dtype=np.float16)
    return sequence_array_list

In [5]:
def quantile_normalize_df(df_input):
    df = df_input.copy()
    #compute rank
    dic = {}
    for col in df:
        dic.update({col : sorted(df[col])})
    sorted_df = pd.DataFrame(dic)
    rank = sorted_df.mean(axis = 1).tolist()
    #sort
    for col in df:
        t = np.searchsorted(np.sort(df[col]), df[col])
        df[col] = [rank[i] for i in t]
    return df

In [6]:
positive_seqRecords = list(SeqIO.parse('./peak_sequences/c57bl6_kla-1h_peaks.fasta', 'fasta'))
negative_seqRecords = list(SeqIO.parse('./background_files/c57bl6_kla-1h_background.fasta', 'fasta'))[:len(positive_seqRecords)]

fasta_seq = [str(x.seq[:200]) for x in positive_seqRecords] + [str(x[:200].seq) for x in negative_seqRecords]

fasta_rc_seq = [str(x[:200].reverse_complement().seq) for x in positive_seqRecords] + \
    [str(x[:200].reverse_complement().seq) for x in negative_seqRecords]

sequence_arrays = convert_sequences_to_array(fasta_seq)

sequence_rc_arrays = convert_sequences_to_array(fasta_rc_seq)


labels = [1 for x in positive_seqRecords] + [0 for x in negative_seqRecords]
labels = np.array(labels)

x_train, x_test, x_rc_train, x_rc_test, y_train, y_test = model_selection.train_test_split(sequence_arrays, sequence_rc_arrays, labels, test_size=0.2)

num_classes = 2
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

## Additive Attention

In [266]:
def get_additiveAttention_model(total_seq_length,
                        seq_size=150,
                        num_motifs=32, 
                        motif_size=10,
                        adjacent_bp_pool_size=10,
                        attention_dim=32,
                        attention_hops=1,
                        num_dense_neurons=32,
                        dropout_rate=0.5):
    input_fwd = Input(shape=(total_seq_length,4), name='input_fwd')
    input_rev = Input(shape=(total_seq_length,4), name='input_rev')

    ### find motifs ###
    convolution_layer = Conv1D(filters=num_motifs, 
        kernel_size=motif_size,
        activation='relu',
        input_shape=(total_seq_length,4),
        name='convolution_layer',
        padding = 'same'
        )
    forward_motif_scores = convolution_layer(input_fwd)
    reverse_motif_scores = convolution_layer(input_rev)

    ### crop motif scores to avoid parts of sequence where motif score is computed in only one direction ###
    to_crop = int((total_seq_length - seq_size)/2)
    crop_layer = Cropping1D(cropping=(to_crop, to_crop), 
        name='crop_layer')
    cropped_fwd_scores = crop_layer(forward_motif_scores)
    cropped_rev_scores = crop_layer(reverse_motif_scores)

    ### flip motif scores ###
    flip_layer = Lambda(lambda x: K.reverse(x,axes=0),
        output_shape=(seq_size, num_motifs),
        name='flip_layer')
    flipped_rev_scores = flip_layer(cropped_rev_scores)

    ### concatenate motif scores ###
    concatenate_layer = keras.layers.Concatenate(axis=2, name='concatenate_layer')
    concatenated_motif_scores = concatenate_layer([cropped_fwd_scores, flipped_rev_scores])

    ### normalize motif scores ###
    motif_score_norm_layer = BatchNormalization(name='motif_score_norm_layer')
    normed_motif_scores = motif_score_norm_layer(concatenated_motif_scores)
    
    ### pool across length of sequence ###
    sequence_pooling_layer = MaxPool1D(pool_size=adjacent_bp_pool_size, 
        strides=adjacent_bp_pool_size,
        name='sequence_pooling_layer')
    pooled_scores = sequence_pooling_layer(normed_motif_scores)
    

    
    ### bidirectional LSTM ###
    forward_lstm_layer = LSTM(units=int(seq_size/adjacent_bp_pool_size),
        return_sequences=True,
        input_shape = (int(seq_size/adjacent_bp_pool_size), 2*num_motifs),
        name = 'forward_lstm_layer'
        )
    forward_hidden_states = forward_lstm_layer(pooled_scores)

    reverse_lstm_layer = LSTM(units=int(seq_size/adjacent_bp_pool_size),
        return_sequences=True,
        input_shape = (int(seq_size/adjacent_bp_pool_size), 2*num_motifs),
        name = 'reverse_lstm_layer',
        go_backwards=True,
        )
    reverse_hidden_states = reverse_lstm_layer(pooled_scores)
    
    ### concatenate lstm hidden states ###
    lstm_concatenate_layer = Concatenate(axis=2)
    bilstm_hidden_states = lstm_concatenate_layer([forward_hidden_states, reverse_hidden_states])
    
    ### normalize lstm states ###
    lstm_norm_layer = BatchNormalization(name='lstm_norm_layer')
    normed_bilistm_hidden_states = lstm_norm_layer(bilstm_hidden_states)
    
    ### attention tanh layer ###
    attention_tanh_layer = Dense(attention_dim,
        activation='tanh',
        use_bias=False,
        name = 'attention_tanh_layer')
    attention_tanh_layer_out = attention_tanh_layer(normed_bilistm_hidden_states)

    ### outer layer ###
    attention_outer_layer = Dense(attention_hops,
        activation='linear',
        use_bias=False,
        name = 'attention_outer_layer')
    attention_outer_layer_out = attention_outer_layer(attention_tanh_layer_out)

    ### apply softmax ###
    softmax_layer = Softmax(axis=1, name='attention_softmax_layer')
    attention_softmax_layer_out = softmax_layer(attention_outer_layer_out)

    ### attend to hidden states ###
    attending_layer = Dot(axes=(1,1),
        name='attending_layer')

    attended_states = attending_layer([attention_softmax_layer_out, normed_bilistm_hidden_states])
    
    ### normalize attended states ###
    attention_norm_layer = BatchNormalization(name='attention_norm_layer')
    normed_attended_states = attention_norm_layer(attended_states)
    
    ### fully connected layer ###
    dense_layer = Dense(num_dense_neurons, 
        activation='relu', 
        name = 'dense_layer'
        )

    dense_output = dense_layer(normed_attended_states)
    
    # drop out
    drop_out = Dropout(dropout_rate,name='dense_dropout')(dense_output)
    
    # make prediction
    flattened = Flatten(name='flatten')(drop_out)
    
    predictions = Dense(num_classes,
                        name='predictions',
                        activation = 'sigmoid', 
                       )(flattened)
    
    # define and compile model
    model = Model(inputs=[input_fwd, input_rev], outputs=predictions)

    model.compile(loss=keras.losses.categorical_crossentropy,
                  optimizer=keras.optimizers.Adam(),
                  metrics=['accuracy'])
    return model

In [267]:
additiveAttention_model = get_additiveAttention_model(200,
    seq_size=150,
    num_motifs=100, 
    motif_size=10,
    adjacent_bp_pool_size=10,
    attention_dim=1,
    attention_hops=1,
    num_dense_neurons=32,
    dropout_rate=0.75
    )
additiveAttention_model.count_params()

32149

In [None]:
additiveAttention_model.summary()

In [None]:
additiveAttention_model.fit([x_train, x_rc_train], y_train,
              batch_size=128,
              epochs=10,
              verbose=1,
              validation_data=([x_test, x_rc_test], y_test))

probs = additiveAttention_model.predict([x_test, x_rc_test])

roc = sklearn.metrics.roc_auc_score([y[1] for y in y_test], probs[:,1], )
precision = sklearn.metrics.precision_score([y[1] for y in y_test], [1 if x > 0.5 else 0 for x in probs[:,1]])
acc = sklearn.metrics.accuracy_score([y[1] for y in y_test], [1 if x > 0.5 else 0 for x in probs[:,1]])
print(roc, precision, acc)

Train on 34841 samples, validate on 8711 samples
Epoch 1/10


## Dot Product Attention

In [262]:
def get_dotProductAttention_model(total_seq_length,
    seq_size=150,
    num_motifs=32, 
    motif_size=10,
    adjacent_bp_pool_size=10,
    num_dense_neurons=32,
    dropout_rate=0.5):
    input_fwd = Input(shape=(total_seq_length,4), name='input_fwd')
    input_rev = Input(shape=(total_seq_length,4), name='input_rev')

    ### find motifs ###
    convolution_layer = Conv1D(filters=num_motifs, 
        kernel_size=motif_size,
        activation='relu',
        input_shape=(total_seq_length,4),
        name='convolution_layer',
        padding = 'same'
        )
    forward_motif_scores = convolution_layer(input_fwd)
    reverse_motif_scores = convolution_layer(input_rev)
    print('forward_motif_scores', forward_motif_scores.get_shape())

    ### crop motif scores to avoid parts of sequence where motif score is computed in only one direction ###
    to_crop = int((total_seq_length - seq_size)/2)
    crop_layer = Cropping1D(cropping=(to_crop, to_crop), 
        name='crop_layer')
    cropped_fwd_scores = crop_layer(forward_motif_scores)
    cropped_rev_scores = crop_layer(reverse_motif_scores)
    print('cropped_fwd_scores', cropped_fwd_scores.get_shape())

    ### flip motif scores ###
    flip_layer = Lambda(lambda x: K.reverse(x,axes=0),
        output_shape=(seq_size, num_motifs),
        name='flip_layer')
    flipped_rev_scores = flip_layer(cropped_rev_scores)
    print('flipped_rev_scores', flipped_rev_scores.get_shape())

    ### concatenate motif scores ###
    concatenate_layer = keras.layers.Concatenate(axis=2, name='concatenate_layer')
    concatenated_motif_scores = concatenate_layer([cropped_fwd_scores, flipped_rev_scores])
    print('concatenated_motif_scores', concatenated_motif_scores.get_shape())

    ### pool across length of sequence ###
    sequence_pooling_layer = MaxPool1D(pool_size=adjacent_bp_pool_size, 
        strides=adjacent_bp_pool_size,
        name='sequence_pooling_layer')
    pooled_scores = sequence_pooling_layer(concatenated_motif_scores)
    print('pooled_scores', pooled_scores.get_shape())
    
    ### normalize motif scores ###
    motif_score_norm_layer = BatchNormalization(name='motif_score_norm_layer')
    normed_pooled_scores = motif_score_norm_layer(pooled_scores)
    print('normed_pooled_scores', normed_pooled_scores.shape)
        
    ### compute attention ###

    # reshape motif scores
    linear_projection_reshaper = Reshape((int(normed_pooled_scores.shape[1]), 
       int(normed_pooled_scores.shape[2]),1), name='linear_projection_reshaper')
    
    reshaped_normed_pooled_scores = linear_projection_reshaper(normed_pooled_scores)    
    print('reshaped_normed_pooled_scores', reshaped_normed_pooled_scores.shape)
    
    ### weight queries ###
    query_transformer = Conv2D(filters=1, 
        kernel_size=1,
        activation='linear',
        input_shape=(seq_size/adjacent_bp_pool_size,num_motifs*2),
        name='query_transformer',
        data_format='channels_last',
        padding = 'same'
    )
    
    weighted_queries = query_transformer(reshaped_normed_pooled_scores)
    print('weighted_queries', weighted_queries.shape)
    
    ### weight keys ###
    key_transformer = Conv2D(filters=1, 
        kernel_size=1,
        activation='linear',
        input_shape=(seq_size/adjacent_bp_pool_size,num_motifs*2),
        name='key_transformer',
        data_format='channels_last',
        padding = 'same'
    )
    
    weighted_keys = key_transformer(reshaped_normed_pooled_scores)
    print('weighted_keys', weighted_keys.shape)
    
    ### calculate unnormalized attention weights ###
    dot_product_reshaper = Reshape((int(normed_pooled_scores.shape[1]), 
       int(normed_pooled_scores.shape[2])), name = 'dot_product_reshaper')
    
    reshaped_weighted_queries = dot_product_reshaper(weighted_queries)
    reshaped_weighted_keys = dot_product_reshaper(weighted_keys)
    print('reshaped weighted queries and keys', reshaped_weighted_queries.shape, reshaped_weighted_keys.shape)
    
    dot_product = Dot(axes=(2,2),name='dot_product')
    attention_weights = dot_product([reshaped_weighted_queries, reshaped_weighted_keys])
    print('attention_weights', attention_weights.shape)
    ### apply softmax ###
    softmax_layer = Softmax(axis=1, name='attention_softmax_layer')
    attention_softmax_layer_out = softmax_layer(attention_weights)
    print('attention_softmax_layer_out',attention_softmax_layer_out.shape)
    
    ### weight values ###
    value_transformer = Conv2D(filters=1, 
        kernel_size=1,
        activation='linear',
        input_shape=(seq_size/adjacent_bp_pool_size,num_motifs*2),
        name='value_transformer',
        data_format='channels_last',
        padding = 'same'
    )
    
    weighted_values = value_transformer(reshaped_normed_pooled_scores)
    weighted_values = Reshape((int(weighted_values.shape[1]), 
       int(weighted_values.shape[2])))(weighted_values)
    print('weighted_values', weighted_values.shape)
    
    ### attend to hidden states ###
    attending_layer = Dot(axes=(1,1),
        name='attending_layer')
    attended_states = attending_layer([attention_softmax_layer_out, weighted_values])
    print('atteneded_states', attended_states.shape)
    
    ### normalize attended states ###
#     attention_norm_layer = BatchNormalization(name='attention_norm_layer')
#     normed_attended_states = attention_norm_layer(attended_states)
    
    ### fully connected layer ###
    dense_layer = Dense(num_dense_neurons, 
        activation='relu', 
        name = 'dense_layer'
        )

#     dense_output = dense_layer(normed_attended_states)
    dense_output = dense_layer(attended_states)
    
    # drop out
    drop_out = Dropout(dropout_rate,name='dense_dropout')(dense_output)
    
    # make prediction
    flattened = Flatten(name='flatten')(drop_out)
    
    predictions = Dense(num_classes,
                        name='predictions',
                        activation = 'sigmoid', 
                       )(flattened)
    
    # define and compile model
    model = Model(inputs=[input_fwd, input_rev], outputs=predictions)

    model.compile(loss=keras.losses.categorical_crossentropy,
                  optimizer=keras.optimizers.Adam(),
                  metrics=['accuracy'])
    return model

In [263]:
dotProductAttention_model = get_dotProductAttention_model(200,
    seq_size=150,
    num_motifs=100, 
    motif_size=5,
    adjacent_bp_pool_size=2,
    num_dense_neurons=32,
    dropout_rate=0.5)

forward_motif_scores (?, 200, 100)
cropped_fwd_scores (?, 150, 100)
flipped_rev_scores (?, 150, 100)
concatenated_motif_scores (?, 150, 200)
pooled_scores (?, 75, 200)
normed_pooled_scores (?, 75, 200)
reshaped_normed_pooled_scores (?, 75, 200, 1)
weighted_queries (?, 75, 200, 1)
weighted_keys (?, 75, 200, 1)
reshaped weighted queries and keys (?, 75, 200) (?, 75, 200)
attention_weights (?, 75, 75)
attention_softmax_layer_out (?, 75, 75)
weighted_values (?, 75, 200)
atteneded_states (?, 75, 200)


In [264]:
dotProductAttention_model.count_params()

14140

In [265]:
dotProductAttention_model.fit([x_train, x_rc_train], y_train,
              batch_size=128,
              epochs=10,
              verbose=1,
              validation_data=([x_test, x_rc_test], y_test))

probs = dotProductAttention_model.predict([x_test, x_rc_test])

roc = sklearn.metrics.roc_auc_score([y[1] for y in y_test], probs[:,1], )
precision = sklearn.metrics.precision_score([y[1] for y in y_test], [1 if x > 0.5 else 0 for x in probs[:,1]])
acc = sklearn.metrics.accuracy_score([y[1] for y in y_test], [1 if x > 0.5 else 0 for x in probs[:,1]])
print(roc, precision, acc)

Train on 34841 samples, validate on 8711 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.8626752102100756 0.8175513470258736 0.7676500975777752
