# Exploring the GR library

In [1]:
import os
# Expose only CUDA devices 2 and 3 to libraries loaded later in this process.
os.environ["CUDA_VISIBLE_DEVICES"]="2,3"
# Dataset identifier; later cells splice it into the peak, model and modisco paths.
target = "gr_2hrs"

In [2]:
import os
import re
import json
import gzip
import codecs
import math
from math import log, ceil
import modisco
import modisco.tfmodisco_workflow.workflow
from modisco.tfmodisco_workflow import workflow
import h5py
import pandas as pd
import modisco.util
from collections import Counter
from modisco.visualization import viz_sequence
import modisco.affinitymat.core
import modisco.cluster.phenograph.core
import modisco.cluster.phenograph.cluster
import modisco.cluster.core
import modisco.aggregator
from numpy.polynomial.polynomial import polyfit
from sklearn.linear_model import LinearRegression as LR
from sklearn.metrics import mean_squared_error
from scipy.stats import spearmanr, pearsonr, gaussian_kde
import keras_genomics
from keras_genomics.layers.convolutional import RevCompConv1D
import keras
import keras.layers as kl
from keras import backend as K 
import tensorflow as tf
import tensorflow_probability as tfp
import numpy as np
from seqdataloader.batchproducers import coordbased
from seqdataloader.batchproducers.coordbased import coordstovals
from seqdataloader.batchproducers.coordbased import coordbatchproducers
from seqdataloader.batchproducers.coordbased import coordbatchtransformers
from keras.models import load_model
from keras.utils import CustomObjectScope
import matplotlib
from matplotlib import pyplot as plt
from deeplift.dinuc_shuffle import dinuc_shuffle
%matplotlib inline
font = {'weight' : 'bold', 'size'   : 14}

TF-MoDISco is using the TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


In [3]:
class CalibratorFactory(object):
    """Interface for objects that build a calibration function from validation data."""

    def __call__(self, valid_preacts, valid_labels):
        """Subclasses return a callable mapping raw scores to calibrated values."""
        raise NotImplementedError()


class LinearRegression(CalibratorFactory):
    """Calibrator factory that fits a one-feature linear regression (sklearn `LR`)."""

    def __init__(self, verbose=True):
        # Stored for callers; nothing in this class reads it.
        self.verbose = verbose

    def __call__(self, valid_preacts, valid_labels):
        """Fit labels against the scores and return the fitted predictor.

        Args:
            valid_preacts: array of raw scores; reshaped to a single feature column.
            valid_labels: target values passed to the regression fit.

        Returns:
            A function that reshapes its input to one column and returns the
            fitted model's predictions for it.
        """
        features = valid_preacts.reshape(-1, 1)
        fitted = LR().fit(features, valid_labels)

        def calibration_func(preact):
            column = preact.reshape(-1, 1)
            return fitted.predict(column)

        return calibration_func

In [4]:
# extending by 3bp on either side to let matrix slide for alignment (so total is 22bp when using a matrix)
# Collect the ddG values of every library sequence from both replicate files.
# seqToDdg maps the 22bp variable region -> list of ddG values in file order.
seqToDdg = {}
replicate_files = [
    "/oak/stanford/groups/akundaje/amr1/pho4_final/lite_data/in-vitro/GR/GR_bindingcurves_WT_1_out.csv",
    "/oak/stanford/groups/akundaje/amr1/pho4_final/lite_data/in-vitro/GR/GR_bindingcurves_WT_2_out.csv",
]
for replicate_file in replicate_files:
    with open(replicate_file) as inp:
        next(inp, None)  # skip the header row
        for line in inp:
            Oligo,Kd_mean,Kd_sdev,ddG,Motif,Sequence = line.strip().split(',')
            oligo_seq = Sequence.upper()
            seq = oligo_seq[11:33]
            pre = oligo_seq[:11]
            post = oligo_seq[33:]
            # Report (but do not reject) oligos whose constant flanks differ from the expected ones.
            if pre != "CGCAATTGCGA":
                print(pre)
                print("CGCAATTGCGA")
            if post != "ACCTTCCTCTCCGGCGGTATGAC":
                print(post)
                print("ACCTTCCTCTCCGGCGGTATGAC")
            # setdefault handles sequences first seen in either file; the second
            # file previously raised KeyError for a sequence absent from the first.
            seqToDdg.setdefault(seq, []).append(float(ddG))

# Per-sequence label is the mean ddG over the available measurements.
seqs = list(seqToDdg)
seqToLabel = {seq: np.mean(ddgs) for seq, ddgs in seqToDdg.items()}
all_xvals = [seqToLabel[seq] for seq in seqs]

In [5]:
# Replicate concordance: scatter the first against the second ddG measurement
# for every sequence that has more than one value.
rep1 = []
rep2 = []
for seq in seqToDdg:
    if len(seqToDdg[seq]) == 1: continue
    rep1.append(seqToDdg[seq][0])
    rep2.append(seqToDdg[seq][1])
xvals = rep1
yvals = rep2
xy = np.vstack([xvals,yvals])
z = gaussian_kde(xy)(xy)  # point density, used only to colour the scatter
plt.figure()
matplotlib.rc('font', **font)
min_lim = min(np.min(xvals), np.min(yvals))
max_lim = max(np.max(xvals), np.max(yvals))
plt.xlim(min_lim-0.5, max_lim+0.5)
plt.ylim(min_lim-0.5, max_lim+0.5)
plt.gca().set_aspect('equal', adjustable='box')
# 'none' is the documented way to draw markers without an edge; '' is not a valid colour.
plt.scatter(xvals, yvals,  c=z, edgecolor='none', alpha=0.9)
plt.plot([min_lim-0.5, max_lim+0.5], [min_lim-0.5, max_lim+0.5], color="black")  # y = x reference
plt.title("Reps -- spearman:"+str(spearmanr(xvals, yvals)[0])+", pearson:"+str(pearsonr(xvals, yvals)[0]))
plt.xlabel("Rep1")
plt.ylabel("Rep2")
plt.savefig('comparison_figs/gr_library_analysis/mitomi_reps.png', dpi=300, format='png')
# Close instead of clf(): clf() leaves an empty figure open that is then rendered inline.
plt.close()

<Figure size 432x288 with 0 Axes>

In [6]:
# Load the genome FASTA into memory: GenomeDict maps header text (after '>') -> sequence string.
fastapath = "/users/amr1/pho4/data/genome/hg38/hg38.genome.fa"
GenomeDict={}
chrm = None
sequence = []
with open(fastapath) as inputdatafile:  # context manager so the handle is closed
    for line in inputdatafile:
        if line.startswith('>'):
            # A new record starts: store the record collected so far, if any.
            if chrm is not None:
                GenomeDict[chrm] = ''.join(sequence)
            chrm = line.strip().split('>')[1]
            sequence = []
        else:
            sequence.append(line.strip())
if chrm is not None:
    GenomeDict[chrm] = ''.join(sequence)

# Extract a seq_len window centred on the midpoint of every test-set interval.
seq_len = 1346
out_pred_len = 1000
peaks = []
with gzip.open("/oak/stanford/groups/akundaje/amr1/pho4_final/lite_data/in-vivo/"+target+"/test_1k_around_summits.bed.gz", 'rt') as inp:
    for line in inp:
        fields = line.strip().split('\t')
        chrm = fields[0]
        center = (int(fields[1]) + int(fields[2])) // 2
        start = center - seq_len // 2
        end = start + seq_len
        if start < 0:
            # A negative start would wrap around the end of the chromosome string.
            continue
        candidate = GenomeDict[chrm][start:end].upper()
        # Windows truncated by the chromosome end are dropped.
        if len(candidate) == seq_len: peaks.append(candidate)

In [7]:
def multinomial_nll(true_counts, logits):
    """Compute the multinomial negative log-likelihood
    Args:
      true_counts: observed count values
      logits: predicted logit values
    Returns:
      The negative summed log-probability of `true_counts` under a multinomial
      parameterised by `logits`, divided by the size of the first dimension
      of `true_counts`.
    """
    # Total count of each example is the sum over the last axis.
    counts_per_example = tf.reduce_sum(true_counts, axis=-1)
    dist = tfp.distributions.Multinomial(total_count=counts_per_example,
                                         logits=logits)
    # tf.to_float is deprecated (TensorFlow warns "Use `tf.cast` instead").
    num_examples = tf.cast(tf.shape(true_counts)[0], tf.float32)
    return -tf.reduce_sum(dist.log_prob(true_counts)) / num_examples

#from https://github.com/kundajelab/basepair/blob/cda0875571066343cdf90aed031f7c51714d991a/basepair/losses.py#L87
class MultichannelMultinomialNLL(object):
    """Callable loss that adds up `multinomial_nll` over the last-axis channels."""

    def __init__(self, n):
        # n: number of channels (indices 0..n-1 on the last axis) to accumulate.
        self.n = n
        self.__name__ = "MultichannelMultinomialNLL"

    def __call__(self, true_counts, logits):
        """Return the sum of the per-channel multinomial NLL values."""
        for channel in range(self.n):
            channel_loss = multinomial_nll(true_counts[..., channel],
                                           logits[..., channel])
            if channel > 0:
                total += channel_loss
            else:
                total = channel_loss
        return total

    def get_config(self):
        """Return the constructor arguments as a dict."""
        return {"n": self.n}

# Load the saved Keras model for `target`. The custom loss class and the
# RevCompConv1D layer are registered by name inside the scope so load_model
# can resolve them.
with CustomObjectScope({'MultichannelMultinomialNLL': MultichannelMultinomialNLL,'RevCompConv1D': RevCompConv1D}):
    model = load_model("/oak/stanford/groups/akundaje/amr1/pho4_final/models/example_models/"+target+".h5")

# Base -> one-hot row in A, C, G, T order; 'N'/'n' maps to an all-zero row.
ltrdict = {
    'a': [1,0,0,0], 'c': [0,1,0,0], 'g': [0,0,1,0], 't': [0,0,0,1], 'n': [0,0,0,0],
    'A': [1,0,0,0], 'C': [0,1,0,0], 'G': [0,0,1,0], 'T': [0,0,0,1], 'N': [0,0,0,0],
}
def getOneHot(ISM_sequences):
    """One-hot encode a list of sequences.

    Each character is looked up in `ltrdict`, so a character outside
    a/c/g/t/n (either case) raises KeyError. Returns a numpy array built from
    the nested per-sequence, per-position rows.
    """
    return np.array([[ltrdict[base] for base in seq] for seq in ISM_sequences])

def fill_into_center(seq, insert):
    """Overwrite the middle of `seq` with `insert`, keeping the total length.

    The left edge of the replaced region is half the length difference,
    truncated to an integer.
    """
    left = int((len(seq)/2.0)-(len(insert)/2.0))
    right = left + len(insert)
    return seq[:left] + insert + seq[right:]

# For every library sequence: insert it into the centre of dinucleotide-shuffled
# test peaks and record how the model's first output changes relative to the
# un-modified shuffled background.
seqToDeltaLogCounts = {}
pred_dict = {}  # key is oligo and val is 100 preds structured ([before], [after], final)
num_backgrounds = 100  # shuffled backgrounds scored per library sequence
for curr_seq in seqs:
    pre_seqs = []
    post_seqs = []
    # Distinct peaks are drawn without replacement (needs len(peaks) >= num_backgrounds).
    indices = np.random.choice(len(peaks), num_backgrounds, replace=False)
    for idx in indices:
        pre_seq = dinuc_shuffle(peaks[idx])
        post_seq = fill_into_center(pre_seq, curr_seq)
        pre_seqs.append(pre_seq)
        post_seqs.append(post_seq)
    if "exo" in target:  # no ctl for the ChIP-exo GR datasets
        pre = model.predict(getOneHot(pre_seqs))
        post = model.predict(getOneHot(post_seqs))
    else:
        # All-zero control inputs, shaped to match the batch.
        ctl_counts = np.zeros((num_backgrounds,))
        ctl_profile = np.zeros((num_backgrounds,out_pred_len,2))
        pre = model.predict([getOneHot(pre_seqs), ctl_counts, ctl_profile])
        post = model.predict([getOneHot(post_seqs), ctl_counts, ctl_profile])
    # pre[0]/post[0]: first model output -- presumably the log-counts head; confirm against the model.
    mean_delta = np.mean(post[0]-pre[0])
    seqToDeltaLogCounts[curr_seq] = mean_delta
    pred_dict[curr_seq] = (pre[0].tolist(), post[0].tolist(), str(mean_delta))
pred_dir = "comparison_figs/gr_library_analysis/"
# Context manager so the JSON file is flushed and closed deterministically.
with codecs.open(pred_dir+'seq_to_deltalogcounts.json', 'w', encoding='utf-8') as json_out:
    json.dump(pred_dict, json_out,
              separators=(',', ':'), sort_keys=True, indent=4)







Instructions for updating:
Use `tf.cast` instead.


In [8]:
# Watson-Crick partner of each upper-case base.
complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
def getRevComp(seq):  # reverse complement function
    """Return the upper-case reverse complement of `seq`.

    The input is upper-cased first; any base other than A/C/G/T raises KeyError.
    """
    return "".join(complement[base] for base in reversed(seq.upper()))

def generate_matrix(seq):
    """Return a (len(seq), 4) float one-hot matrix with columns A, C, G, T.

    Only upper-case A/C/G/T set a 1; every other character leaves its row
    all zeros.
    """
    column_of = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    seq_matrix = np.zeros((len(seq), 4))
    for row, base in enumerate(seq):
        col = column_of.get(base)
        if col is not None:
            seq_matrix[row, col] = 1
    return seq_matrix

def get_PWM_max_score(sequence, score_matrix):
    """Best match score of `score_matrix` over both strands of `sequence`.

    Every window of `score_matrix.shape[0]` bases on the sequence and on its
    reverse complement is one-hot encoded, multiplied element-wise with the
    matrix and summed; the largest of those sums is returned. A sequence
    shorter than the matrix has no windows, so `max` raises ValueError.
    """
    width = score_matrix.shape[0]
    window_scores = [
        np.sum(score_matrix * generate_matrix(strand[offset:offset + width]))
        for strand in (sequence, getRevComp(sequence))
        for offset in range(len(strand) - width + 1)
    ]
    return max(window_scores)

In [9]:
# Open the modisco results file for `target` read-only; `f` stays open for later cells.
filename = "/oak/stanford/groups/akundaje/amr1/pho4_final/models/modisco-lite/"+target+"/modisco_counts_results.h5"
f = h5py.File(filename, 'r')
# NOTE: despite the name, this is the number of entries under 'pos_patterns', not a list.
pattern_list = len(f['pos_patterns'])

def trim_motif(cwm_fwd, max_length=None, trim_threshold=0.3, flank=4):
    """Trim a contribution-weight matrix to its high-scoring core plus flanks.

    Args:
        cwm_fwd: 2-D array-like (positions x channels) of contribution scores.
        max_length: optional cap on the number of returned positions. When the
            trimmed window is longer, a window of exactly `max_length`
            positions around its midpoint is returned.
        trim_threshold: fraction of the largest per-position score (sum of
            absolute values across channels) a position must reach to count
            as part of the core.
        flank: number of positions kept on each side of the core.

    Returns:
        The slice `cwm_fwd[start:end]` covering the selected window.
    """
    score_fwd = np.sum(np.abs(cwm_fwd), axis=1)
    trim_thresh_fwd = np.max(score_fwd) * trim_threshold
    pass_inds_fwd = np.where(score_fwd >= trim_thresh_fwd)[0]
    start_fwd = max(np.min(pass_inds_fwd) - flank, 0)
    # Clamp to the real length: the former len+1 bound overstated the window
    # size by one and could trigger needless re-centering.
    end_fwd = min(np.max(pass_inds_fwd) + flank + 1, len(score_fwd))
    # max length restricted to seq length which is 22bp
    if max_length is not None and (end_fwd - start_fwd) > max_length:
        center = int((start_fwd+end_fwd)/2)
        start_fwd = center - max_length // 2
        # start + max_length (not center + max_length//2) so an odd cap is honoured exactly.
        end_fwd = start_fwd + max_length
    trimmed_cwm_fwd = cwm_fwd[start_fwd:end_fwd]
    return trimmed_cwm_fwd

# Trim the contribution scores of (at most) the first ten positive patterns.
# max length restricted to seq length which is 22bp
CWMs = [
    trim_motif(f['pos_patterns']['pattern_'+str(idx)]['contrib_scores'], 22)
    for idx in range(min(10, pattern_list))
]

# gr_2hrs:  modisco_max_0_metadata.json
gr_cwm = CWMs[0]

# Best two-strand CWM match score for every library sequence.
seqToCWMScore = {seq: get_PWM_max_score(seq, gr_cwm) for seq in seqs}

In [10]:
# Size of the calibration subset: 10% of the library, capped at 1000 sequences.
num_samples = min(1000, ceil(0.1*len(seqs)))
print(num_samples, len(seqs))

# calibration_samples = np.random.choice(seqs, num_samples, replace=False)
# np.save('comparison_figs/gr_library_analysis/calibration_samples.npy', calibration_samples)

## for reproducibility
calibration_samples = np.load('comparison_figs/gr_library_analysis/calibration_samples.npy')

# Scores and labels of the calibration sequences, as parallel arrays.
sample_distill = np.array([seqToDeltaLogCounts[seq] for seq in calibration_samples])
sample_cwm_scores = np.array([seqToCWMScore[seq] for seq in calibration_samples])
sample_labels = np.array([seqToLabel[seq] for seq in calibration_samples])

# One linear calibrator per scoring method, both fitted against the same labels.
lr1 = LinearRegression()
lr2 = LinearRegression()
calibration_func1 = lr1(sample_distill, sample_labels)
calibration_func2 = lr2(sample_cwm_scores, sample_labels)

# Apply the calibrators to every library sequence, one value at a time.
seqToDistillPred = {
    seq: calibration_func1(np.array([seqToDeltaLogCounts[seq]]))[0] for seq in seqs
}
seqToModiscoPred = {
    seq: calibration_func2(np.array([seqToCWMScore[seq]]))[0] for seq in seqs
}

19 188


In [11]:
# Observed vs. calibrated Distill prediction for every library sequence.
xvals = [seqToLabel[seq] for seq in seqs]
yvals = [seqToDistillPred[seq] for seq in seqs]
xy = np.vstack([xvals,yvals])
z = gaussian_kde(xy)(xy)  # point density, used only to colour the scatter
plt.figure()
matplotlib.rc('font', **font)
min_lim = min(np.min(xvals), np.min(yvals))
max_lim = max(np.max(xvals), np.max(yvals))
plt.xlim(min_lim-0.5, max_lim+0.5)
plt.ylim(min_lim-0.5, max_lim+0.5)
plt.gca().set_aspect('equal', adjustable='box')
# 'none' is the documented way to draw markers without an edge; '' is not a valid colour.
plt.scatter(xvals, yvals,  c=z, edgecolor='none', alpha=0.9)
plt.plot([min_lim-0.5, max_lim+0.5], [min_lim-0.5, max_lim+0.5], color="black")  # y = x reference
plt.title("Distill -- spearman:"+str(spearmanr(xvals, yvals)[0])+", pearson:"+str(pearsonr(xvals, yvals)[0]))
plt.xlabel("Obs. ddG")
plt.ylabel("Pred. ddG")
plt.savefig('comparison_figs/gr_library_analysis/all_distill.png', dpi=300, format='png')
# Close instead of clf(): clf() leaves an empty figure open that is then rendered inline.
plt.close()

<Figure size 432x288 with 0 Axes>

In [12]:
# Observed vs. calibrated MoDISco (CWM-score) prediction for every library sequence.
xvals = [seqToLabel[seq] for seq in seqs]
yvals = [seqToModiscoPred[seq] for seq in seqs]
xy = np.vstack([xvals,yvals])
z = gaussian_kde(xy)(xy)  # point density, used only to colour the scatter
plt.figure()
matplotlib.rc('font', **font)
min_lim = min(np.min(xvals), np.min(yvals))
max_lim = max(np.max(xvals), np.max(yvals))
plt.xlim(min_lim-0.5, max_lim+0.5)
plt.ylim(min_lim-0.5, max_lim+0.5)
plt.gca().set_aspect('equal', adjustable='box')
# 'none' is the documented way to draw markers without an edge; '' is not a valid colour.
plt.scatter(xvals, yvals,  c=z, edgecolor='none', alpha=0.9)
plt.plot([min_lim-0.5, max_lim+0.5], [min_lim-0.5, max_lim+0.5], color="black")  # y = x reference
plt.title("MoDISco -- spearman:"+str(spearmanr(xvals, yvals)[0])+", pearson:"+str(pearsonr(xvals, yvals)[0]))
plt.xlabel("Obs. ddG")
plt.ylabel("Pred. ddG")
plt.savefig('comparison_figs/gr_library_analysis/all_modisco.png', dpi=300, format='png')
# Close instead of clf(): clf() leaves an empty figure open that is then rendered inline.
plt.close()

<Figure size 432x288 with 0 Axes>

# Library analysis

In [13]:
# Workbook holding the library annotations; every sheet is parsed into a DataFrame.
xl_file = pd.ExcelFile("/oak/stanford/groups/akundaje/amr1/pho4_final/lite_data/in-vitro/GR/DataFromBothExperiments.xlsx")

# dfs maps sheet name -> DataFrame.
dfs = {sheet_name: xl_file.parse(sheet_name)
       for sheet_name in xl_file.sheet_names}
# NOTE: not the last expression of the cell, so this value is not displayed.
dfs.keys()

# Per-sublibrary RMSE of each method, filled in by the analysis cells below.
distill_rmses = {}
modisco_rmses = {}

In [14]:
# Display the 'MITOMIFullSite' sheet.
dfs['MITOMIFullSite']

Unnamed: 0,Oligo #,Well,Oligo Name,Sequence,rMax_1,rMaxE_1,KaN_1,KaNE_1,rMax_2,rMaxE_2,KaN_2,KaNE_2,Unnamed: 12,KaN_AVG,KaNE_AVG,SEM
0,80,G8,GR_MITOMI,CGCAATTGCGAGTCCGGGACATGATGTCCCTCGACCTTCCTCTCCG...,0.701829,0.011613,1.0,0.111221,1.06558,0.016042,1.0,0.094789,,1.0,0.0,0.0
1,81,G9,GR_MITOMI_0A,CGCAATTGCGAGTCCAGGACATGATGTCCCTCGACCTTCCTCTCCG...,0.701829,0.011613,1.65023,0.163067,1.06558,0.016042,1.36839,0.122255,,1.50931,0.199291,0.14092
2,82,G10,GR_MITOMI_0C,CGCAATTGCGAGTCCCGGACATGATGTCCCTCGACCTTCCTCTCCG...,0.701829,0.011613,1.24497,0.135388,1.06558,0.016042,1.3388,0.125794,,1.291885,0.066348,0.046915
3,83,G11,GR_MITOMI_0T,CGCAATTGCGAGTCCTGGACATGATGTCCCTCGACCTTCCTCTCCG...,0.701829,0.011613,1.11469,0.105944,1.06558,0.016042,0.777305,0.075385,,0.945998,0.238567,0.168692
4,84,G12,GR_MITOMI_1A,CGCAATTGCGAGTCCGAGACATGATGTCCCTCGACCTTCCTCTCCG...,0.701829,0.011613,0.527281,0.061503,1.06558,0.016042,0.451175,0.045599,,0.489228,0.053815,0.038053
5,85,H1,GR_MITOMI_1C,CGCAATTGCGAGTCCGCGACATGATGTCCCTCGACCTTCCTCTCCG...,0.701829,0.011613,0.207691,0.02572,1.06558,0.016042,0.139475,0.018032,,0.173583,0.048236,0.034108
6,86,H2,GR_MITOMI_1T,CGCAATTGCGAGTCCGTGACATGATGTCCCTCGACCTTCCTCTCCG...,0.701829,0.011613,0.304923,0.041356,1.06558,0.016042,0.236299,0.026319,,0.270611,0.048524,0.034312
7,87,H3,GR_MITOMI_2A,CGCAATTGCGAGTCCGGAACATGATGTCCCTCGACCTTCCTCTCCG...,0.701829,0.011613,1.19479,0.115215,1.06558,0.016042,1.11456,0.099639,,1.154675,0.056731,0.040115
8,88,H4,GR_MITOMI_2C,CGCAATTGCGAGTCCGGCACATGATGTCCCTCGACCTTCCTCTCCG...,0.701829,0.011613,1.0048,0.114008,1.06558,0.016042,0.980285,0.094863,,0.992542,0.017335,0.012257
9,89,H5,GR_MITOMI_2T,CGCAATTGCGAGTCCGGTACATGATGTCCCTCGACCTTCCTCTCCG...,0.701829,0.011613,1.66579,0.161306,1.06558,0.016042,1.32385,0.118172,,1.49482,0.241788,0.17097


In [15]:
# Unique 22bp variable regions of the oligos listed on the 'MITOMIFullSite' sheet.
MITOMIFullSite = []
for Sequence in list(dfs['MITOMIFullSite']['Sequence']):
    seq = Sequence.upper()[11:33]
    pre = Sequence.upper()[:11]
    post = Sequence.upper()[33:]
    # Report oligos whose constant flanks differ from the expected ones.
    if pre != "CGCAATTGCGA":
        print(pre)
        print("CGCAATTGCGA")
    if post != "ACCTTCCTCTCCGGCGGTATGAC":
        print(post)
        print("ACCTTCCTCTCCGGCGGTATGAC")
    MITOMIFullSite.append(seq)
MITOMIFullSite = list(set(MITOMIFullSite))

# Same figure for both methods: observed vs. predicted, RMSE, and a fitted line.
for method, seqToPred, rmses in (("Distill", seqToDistillPred, distill_rmses),
                                 ("MoDISco", seqToModiscoPred, modisco_rmses)):
    xvals = [seqToLabel[seq] for seq in MITOMIFullSite]
    yvals = [seqToPred[seq] for seq in MITOMIFullSite]
    rmses['MITOMIFullSite'] = math.sqrt(mean_squared_error(xvals, yvals))
    xy = np.vstack([xvals,yvals])
    z = gaussian_kde(xy)(xy)  # point density, used only to colour the scatter
    plt.figure()
    matplotlib.rc('font', **font)
    min_lim = min(np.min(xvals), np.min(yvals))
    max_lim = max(np.max(xvals), np.max(yvals))
    plt.xlim(min_lim-0.5, max_lim+0.5)
    plt.ylim(min_lim-0.5, max_lim+0.5)
    plt.gca().set_aspect('equal', adjustable='box')
    # 'none' is the documented way to draw markers without an edge; '' is not a valid colour.
    plt.scatter(xvals, yvals,  c=z, edgecolor='none', alpha=0.9)
    x = np.linspace(min_lim-0.5, max_lim+0.5, num=len(yvals))
    b, m = polyfit(xvals, yvals, 1)  # numpy.polynomial polyfit: lowest-order coefficient first
    plt.plot(x, b + m * x, color="black")
    plt.title(method+" -- spearman:"+str(spearmanr(xvals, yvals)[0])+", pearson:"+str(pearsonr(xvals, yvals)[0]))
    plt.xlabel("Obs. ddG")
    plt.ylabel("Pred. ddG")
    plt.savefig('comparison_figs/gr_library_analysis/MITOMIFullSite_'+method.lower()+'.png', dpi=300, format='png')
    # Close instead of clf(): clf() leaves an empty figure open that is then rendered inline.
    plt.close()

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

In [16]:
# Display the 'ChIPFullSite' sheet.
dfs['ChIPFullSite']

Unnamed: 0,Oligo #,Well,Oligo Name,Sequence,rMax_1,rMaxE_1,KaN_1,KaNE_1,rMax_2,rMaxE_2,KaN_2,KaNE_2,Unnamed: 12,KaN_AVG,KaNE_AVG,SEM
0,1,A1,GR_ChIP,CGCAATTGCGAGTCCAGAACATTCTGTTCCTCGACCTTCCTCTCCG...,0.680839,0.017837,0.999999,0.23508,1.08107,0.021466,1.0,0.189837,,0.999999,7.071068e-07,5e-07
1,2,A2,GR_ChIP_Mut-1A,CGCAATTGCGAGTCAAGAACATTCTGTTCCTCGACCTTCCTCTCCG...,0.680839,0.017837,1.35078,0.330337,1.08107,0.021466,0.853995,0.146537,,1.102388,0.35128,0.2483925
2,3,A3,GR_ChIP_Mut-1G,CGCAATTGCGAGTCGAGAACATTCTGTTCCTCGACCTTCCTCTCCG...,0.680839,0.017837,0.903794,0.223507,1.08107,0.021466,0.742747,0.139555,,0.823271,0.1138774,0.0805235
3,4,A4,GR_ChIP_Mut-1T,CGCAATTGCGAGTCTAGAACATTCTGTTCCTCGACCTTCCTCTCCG...,0.680839,0.017837,1.82521,0.465242,1.08107,0.021466,0.949008,0.16065,,1.387109,0.6195684,0.438101
4,5,A5,GR_ChIP_Mut0C,CGCAATTGCGAGTCCCGAACATTCTGTTCCTCGACCTTCCTCTCCG...,0.680839,0.017837,2.25204,0.554935,1.08107,0.021466,0.916469,0.160065,,1.584255,0.9443913,0.6677855
5,6,A6,GR_ChIP_Mut0G,CGCAATTGCGAGTCCGGAACATTCTGTTCCTCGACCTTCCTCTCCG...,0.680839,0.017837,1.97886,0.461583,1.08107,0.021466,1.12249,0.194086,,1.550675,0.605545,0.428185
6,7,A7,GR_ChIP_Mut0T,CGCAATTGCGAGTCCTGAACATTCTGTTCCTCGACCTTCCTCTCCG...,0.680839,0.017837,1.81896,0.410615,1.08107,0.021466,0.639614,0.113998,,1.229287,0.8339236,0.589673
7,8,A8,GR_ChIP_Mut1A,CGCAATTGCGAGTCCAAAACATTCTGTTCCTCGACCTTCCTCTCCG...,0.680839,0.017837,0.696781,0.189904,1.08107,0.021466,0.314074,0.06286,,0.505428,0.2706147,0.1913535
8,9,A9,GR_ChIP_Mut1C,CGCAATTGCGAGTCCACAACATTCTGTTCCTCGACCTTCCTCTCCG...,0.680839,0.017837,0.604976,0.147247,1.08107,0.021466,0.195915,0.041132,,0.400446,0.2892498,0.2045305
9,10,A10,GR_ChIP_Mut1T,CGCAATTGCGAGTCCATAACATTCTGTTCCTCGACCTTCCTCTCCG...,0.680839,0.017837,0.704501,0.19063,1.08107,0.021466,0.34491,0.062003,,0.524706,0.2542692,0.1797955


In [17]:
# Unique 22bp variable regions of the oligos listed on the 'ChIPFullSite' sheet.
ChIPFullSite = []
for Sequence in list(dfs['ChIPFullSite']['Sequence']):
    seq = Sequence.upper()[11:33]
    pre = Sequence.upper()[:11]
    post = Sequence.upper()[33:]
    # Report oligos whose constant flanks differ from the expected ones.
    if pre != "CGCAATTGCGA":
        print(pre)
        print("CGCAATTGCGA")
    if post != "ACCTTCCTCTCCGGCGGTATGAC":
        print(post)
        print("ACCTTCCTCTCCGGCGGTATGAC")
    ChIPFullSite.append(seq)
ChIPFullSite = list(set(ChIPFullSite))

# Same figure for both methods: observed vs. predicted, RMSE, and a fitted line.
for method, seqToPred, rmses in (("Distill", seqToDistillPred, distill_rmses),
                                 ("MoDISco", seqToModiscoPred, modisco_rmses)):
    xvals = [seqToLabel[seq] for seq in ChIPFullSite]
    yvals = [seqToPred[seq] for seq in ChIPFullSite]
    rmses['ChIPFullSite'] = math.sqrt(mean_squared_error(xvals, yvals))
    xy = np.vstack([xvals,yvals])
    z = gaussian_kde(xy)(xy)  # point density, used only to colour the scatter
    plt.figure()
    matplotlib.rc('font', **font)
    min_lim = min(np.min(xvals), np.min(yvals))
    max_lim = max(np.max(xvals), np.max(yvals))
    plt.xlim(min_lim-0.5, max_lim+0.5)
    plt.ylim(min_lim-0.5, max_lim+0.5)
    plt.gca().set_aspect('equal', adjustable='box')
    # 'none' is the documented way to draw markers without an edge; '' is not a valid colour.
    plt.scatter(xvals, yvals,  c=z, edgecolor='none', alpha=0.9)
    x = np.linspace(min_lim-0.5, max_lim+0.5, num=len(yvals))
    b, m = polyfit(xvals, yvals, 1)  # numpy.polynomial polyfit: lowest-order coefficient first
    plt.plot(x, b + m * x, color="black")
    plt.title(method+" -- spearman:"+str(spearmanr(xvals, yvals)[0])+", pearson:"+str(pearsonr(xvals, yvals)[0]))
    plt.xlabel("Obs. ddG")
    plt.ylabel("Pred. ddG")
    plt.savefig('comparison_figs/gr_library_analysis/ChIPFullSite_'+method.lower()+'.png', dpi=300, format='png')
    # Close instead of clf(): clf() leaves an empty figure open that is then rendered inline.
    plt.close()

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

In [18]:
# all consensus site mutations
xvals = [seqToLabel[seq] for seq in ChIPFullSite+MITOMIFullSite]
yvals = [seqToDistillPred[seq] for seq in ChIPFullSite+MITOMIFullSite]
xy = np.vstack([xvals,yvals])
z = gaussian_kde(xy)(xy)
plt.figure()
matplotlib.rc('font', **font)
min_lim = min(np.min(xvals), np.min(yvals))
max_lim = max(np.max(xvals), np.max(yvals))
plt.xlim(min_lim-0.5, max_lim+0.5)
plt.ylim(min_lim-0.5, max_lim+0.5)
plt.gca().set_aspect('equal', adjustable='box')
plt.scatter(xvals, yvals,  c=z, edgecolor='', alpha=0.9)
x = np.linspace(min_lim-0.5, max_lim+0.5, num=len(yvals))
b, m = polyfit(xvals, yvals, 1)
plt.plot(x, b + m * x, color="black")
plt.title("Distill -- spearman:"+str(spearmanr(xvals, yvals)[0])+", pearson:"+str(pearsonr(xvals, yvals)[0]))
plt.xlabel("Obs. ddG")
plt.ylabel("Pred. ddG")
plt.savefig('comparison_figs/gr_library_analysis/ALLFullSite_distill.png', dpi=300, format='png')
plt.clf()

xvals = [seqToLabel[seq] for seq in ChIPFullSite+MITOMIFullSite]
yvals = [seqToModiscoPred[seq] for seq in ChIPFullSite+MITOMIFullSite]
xy = np.vstack([xvals,yvals])
z = gaussian_kde(xy)(xy)
plt.figure()
matplotlib.rc('font', **font)
min_lim = min(np.min(xvals), np.min(yvals))
max_lim = max(np.max(xvals), np.max(yvals))
plt.xlim(min_lim-0.5, max_lim+0.5)
plt.ylim(min_lim-0.5, max_lim+0.5)
plt.gca().set_aspect('equal', adjustable='box')
plt.scatter(xvals, yvals,  c=z, edgecolor='', alpha=0.9)
x = np.linspace(min_lim-0.5, max_lim+0.5, num=len(yvals))
b, m = polyfit(xvals, yvals, 1)
plt.plot(x, b + m * x, color="black")
plt.title("MoDISco -- spearman:"+str(spearmanr(xvals, yvals)[0])+", pearson:"+str(pearsonr(xvals, yvals)[0]))
plt.xlabel("Obs. ddG")
plt.ylabel("Pred. ddG")
plt.savefig('comparison_figs/gr_library_analysis/ALLFullSite_modisco.png', dpi=300, format='png')
plt.clf()

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

In [19]:
# Display the 'MITOMISpacing' sheet.
dfs['MITOMISpacing']

Unnamed: 0,Oligo #,Well,Oligo Name,Sequence,rMax_1,rMaxE_1,KaN_1,KaNE_1,rMax_2,rMaxE_2,KaN_2,KaNE_2,Unnamed: 12,KaN_AVG,KaNE_AVG,SEM
0,80,G8,GR_MITOMI,CGCAATTGCGAGTCCGGGACATGATGTCCCTCGACCTTCCTCTCCG...,0.701829,0.011613,1.0,0.111221,1.06558,0.016042,1.0,0.094789,,1.0,0.0,0.0
1,126,C6,GR_MITOMI_AltSpacer1,CGCAATTGCGAGTCCGGGACAGGGTGTCCCTCGACCTTCCTCTCCG...,0.701829,0.011613,1.67227,0.148165,1.06558,0.016042,1.35639,0.124432,,1.51433,0.223361,0.15794
2,127,C7,GR_MITOMI_AltSpacer2,CGCAATTGCGAGTCCGGGACATTCTGTCCCTCGACCTTCCTCTCCG...,0.701829,0.011613,0.774616,0.092691,1.06558,0.016042,0.537533,0.058493,,0.656075,0.167643,0.118541
3,128,C8,GR_MITOMI_AltSpacer3,CGCAATTGCGAGTCCGGGACATTTTGTCCCTCGACCTTCCTCTCCG...,0.701829,0.011613,3.11579,0.293395,1.06558,0.016042,2.24052,0.204587,,2.678155,0.618909,0.437635
4,155,E11,RANDOM,CGCAATTGCGAGTCCGCGGTAGCTGCGCATTCGACCTTCCTCTCCG...,0.701829,0.011613,0.002592,0.001045,1.06558,0.016042,0.018465,0.014473,,0.010528,0.011224,0.007937


In [20]:
# Unique 22bp variable regions from the 'MITOMISpacing' sheet; the [1:-1]
# slice leaves out the first and the last row of the sheet.
MITOMISpacing = []
for Sequence in list(dfs['MITOMISpacing']['Sequence'])[1:-1]:
    seq = Sequence.upper()[11:33]
    pre = Sequence.upper()[:11]
    post = Sequence.upper()[33:]
    # Report oligos whose constant flanks differ from the expected ones.
    if pre != "CGCAATTGCGA":
        print(pre)
        print("CGCAATTGCGA")
    if post != "ACCTTCCTCTCCGGCGGTATGAC":
        print(post)
        print("ACCTTCCTCTCCGGCGGTATGAC")
    MITOMISpacing.append(seq)
MITOMISpacing = list(set(MITOMISpacing))

# Same figure for both methods: observed vs. predicted, RMSE, and a fitted line.
for method, seqToPred, rmses in (("Distill", seqToDistillPred, distill_rmses),
                                 ("MoDISco", seqToModiscoPred, modisco_rmses)):
    xvals = [seqToLabel[seq] for seq in MITOMISpacing]
    yvals = [seqToPred[seq] for seq in MITOMISpacing]
    rmses['MITOMISpacing'] = math.sqrt(mean_squared_error(xvals, yvals))
    xy = np.vstack([xvals,yvals])
    z = gaussian_kde(xy)(xy)  # point density, used only to colour the scatter
    plt.figure()
    matplotlib.rc('font', **font)
    min_lim = min(np.min(xvals), np.min(yvals))
    max_lim = max(np.max(xvals), np.max(yvals))
    plt.xlim(min_lim-0.5, max_lim+0.5)
    plt.ylim(min_lim-0.5, max_lim+0.5)
    plt.gca().set_aspect('equal', adjustable='box')
    # 'none' is the documented way to draw markers without an edge; '' is not a valid colour.
    plt.scatter(xvals, yvals,  c=z, edgecolor='none', alpha=0.9)
    x = np.linspace(min_lim-0.5, max_lim+0.5, num=len(yvals))
    b, m = polyfit(xvals, yvals, 1)  # numpy.polynomial polyfit: lowest-order coefficient first
    plt.plot(x, b + m * x, color="black")
    plt.title(method+" -- spearman:"+str(spearmanr(xvals, yvals)[0])+", pearson:"+str(pearsonr(xvals, yvals)[0]))
    plt.xlabel("Obs. ddG")
    plt.ylabel("Pred. ddG")
    plt.savefig('comparison_figs/gr_library_analysis/MITOMISpacing_'+method.lower()+'.png', dpi=300, format='png')
    # Close instead of clf(): clf() leaves an empty figure open that is then rendered inline.
    plt.close()

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

In [21]:
# Display the 'ChIPSpacing' sheet.
dfs['ChIPSpacing']

In [22]:
# Display the 'MITOMIHalfSites' sheet.
dfs['MITOMIHalfSites']

Unnamed: 0,Oligo #,Well,Oligo Name,Sequence,rMax_1,rMaxE_1,KaN_1,KaNE_1,rMax_2,rMaxE_2,KaN_2,KaNE_2,Unnamed: 12,KaN_AVG,KaNE_AVG,SEM
0,80,G8,GR_MITOMI,CGCAATTGCGAGTCCGGGACATGATGTCCCTCGACCTTCCTCTCCG...,0.701829,0.0,1.0,0.029001,1.06558,0.016042,1.0,0.094789,,1.0,0.0,0.0
1,129,C9,GR_MITOMI_HalfSite1,CGCAATTGCGAGTCCGGGACATGGCTACCTTCGACCTTCCTCTCCG...,0.701829,0.0,0.022981,0.005068,1.06558,0.016042,0.008651,0.004992,,0.015816,0.010133,0.007165
2,130,C10,GR_MITOMI_HalfSite2,CGCAATTGCGAGTCCGGAATACGATGTCCCTCGACCTTCCTCTCCG...,0.701829,0.0,0.169209,0.009341,1.06558,0.016042,0.075994,0.010993,,0.122601,0.065913,0.046608
3,131,C11,GR_MITOMI_HalfSite1_Mut0A,CGCAATTGCGAGTCCAGGACATGGCTACTGTCGACCTTCCTCTCCG...,0.701829,0.0,0.015941,0.005678,1.06558,0.016042,0.003767,0.004064,,0.009854,0.008608,0.006087
4,132,C12,GR_MITOMI_HalfSite1_Mut0C,CGCAATTGCGAGTCCCGGACATGGCTACCTTCGACCTTCCTCTCCG...,0.701829,0.0,0.023054,0.006196,1.06558,0.016042,0.002609,0.002721,,0.012832,0.014457,0.010222
5,133,D1,GR_MITOMI_HalfSite1_Mut0T,CGCAATTGCGAGTCCTGGACATGGCTACCTTCGACCTTCCTCTCCG...,0.701829,0.0,0.012769,0.004401,1.06558,0.016042,0.002227,0.002921,,0.007498,0.007454,0.005271
6,134,D2,GR_MITOMI_HalfSite1_Mut1A,CGCAATTGCGAGTCCGAGACATGGCTACCTTCGACCTTCCTCTCCG...,0.701829,0.0,0.007875,0.003738,1.06558,0.016042,0.002596,0.002631,,0.005236,0.003733,0.00264
7,135,D3,GR_MITOMI_HalfSite1_Mut1C,CGCAATTGCGAGTCCGCGACATGGCTACCTTCGACCTTCCTCTCCG...,0.701829,0.0,0.007615,0.004642,1.06558,0.016042,0.001817,0.003313,,0.004716,0.004099,0.002899
8,136,D4,GR_MITOMI_HalfSite1_Mut1T,CGCAATTGCGAGTCCGTGACATGGCTACCTTCGACCTTCCTCTCCG...,0.701829,0.0,0.009215,0.004556,1.06558,0.016042,0.002519,0.003513,,0.005867,0.004735,0.003348
9,137,D5,GR_MITOMI_HalfSite1_Mut2A,CGCAATTGCGAGTCCGGAACATGGCTACCTTCGACCTTCCTCTCCG...,0.701829,0.0,0.014822,0.004758,1.06558,0.016042,0.002722,0.003873,,0.008772,0.008556,0.00605


In [23]:
# Unique 22bp variable regions from the 'MITOMIHalfSites' sheet; the [1:-5]
# slice leaves out the first row and the last five rows of the sheet.
MITOMIHalfSites = []
for Sequence in list(dfs['MITOMIHalfSites']['Sequence'])[1:-5]:
    seq = Sequence.upper()[11:33]
    pre = Sequence.upper()[:11]
    post = Sequence.upper()[33:]
    # Report oligos whose constant flanks differ from the expected ones.
    if pre != "CGCAATTGCGA":
        print(pre)
        print("CGCAATTGCGA")
    if post != "ACCTTCCTCTCCGGCGGTATGAC":
        print(post)
        print("ACCTTCCTCTCCGGCGGTATGAC")
    MITOMIHalfSites.append(seq)
MITOMIHalfSites = list(set(MITOMIHalfSites))

# Same figure for both methods: observed vs. predicted, RMSE, and a fitted line.
for method, seqToPred, rmses in (("Distill", seqToDistillPred, distill_rmses),
                                 ("MoDISco", seqToModiscoPred, modisco_rmses)):
    xvals = [seqToLabel[seq] for seq in MITOMIHalfSites]
    yvals = [seqToPred[seq] for seq in MITOMIHalfSites]
    rmses['MITOMIHalfSites'] = math.sqrt(mean_squared_error(xvals, yvals))
    xy = np.vstack([xvals,yvals])
    z = gaussian_kde(xy)(xy)  # point density, used only to colour the scatter
    plt.figure()
    matplotlib.rc('font', **font)
    min_lim = min(np.min(xvals), np.min(yvals))
    max_lim = max(np.max(xvals), np.max(yvals))
    plt.xlim(min_lim-0.5, max_lim+0.5)
    plt.ylim(min_lim-0.5, max_lim+0.5)
    plt.gca().set_aspect('equal', adjustable='box')
    # 'none' is the documented way to draw markers without an edge; '' is not a valid colour.
    plt.scatter(xvals, yvals,  c=z, edgecolor='none', alpha=0.9)
    x = np.linspace(min_lim-0.5, max_lim+0.5, num=len(yvals))
    b, m = polyfit(xvals, yvals, 1)  # numpy.polynomial polyfit: lowest-order coefficient first
    plt.plot(x, b + m * x, color="black")
    plt.title(method+" -- spearman:"+str(spearmanr(xvals, yvals)[0])+", pearson:"+str(pearsonr(xvals, yvals)[0]))
    plt.xlabel("Obs. ddG")
    plt.ylabel("Pred. ddG")
    plt.savefig('comparison_figs/gr_library_analysis/MITOMIHalfSites_'+method.lower()+'.png', dpi=300, format='png')
    # Close instead of clf(): clf() leaves an empty figure open that is then rendered inline.
    plt.close()

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

In [24]:
# Inspect the 'ChIPHalfSites' table held in dfs before parsing its oligos.
dfs['ChIPHalfSites']

Unnamed: 0,Oligo #,Well,Oligo Name,Sequence,rMax_1,rMaxE_1,KaN_1,KaNE_1,rMax_2,rMaxE_2,KaN_2,KaNE_2,Unnamed: 12,KaN_AVG,KaNE_AVG,SEM
0,1,A1,GR_ChIP,CGCAATTGCGAGTCCAGAACATTCTGTTCCTCGACCTTCCTCTCCG...,0.511898,0.051173,1.0,0.30544,1.08642,0.010498,1.0,0.12302,,1.0,0.0,0.0
1,51,E3,GR_ChIP_HalfSite1,CGCAATTGCGAGTCCAGAACATTGCTCCATTCGACCTTCCTCTCCG...,0.511898,0.051173,0.020706,0.007473,1.08642,0.010498,0.101955,0.016196,,0.061331,0.057452,0.040624
2,52,E4,GR_ChIP_HalfSite2,CGCAATTGCGAGTCCGCTCCTATCTGTTCCTCGACCTTCCTCTCCG...,0.511898,0.051173,0.003799,0.004348,1.08642,0.010498,0.006716,0.006419,,0.005258,0.002062,0.001458
3,53,E5,GR_ChIP_HalfSite1_Mut-1A,CGCAATTGCGAGTCAAGAACATTGCTCCATTCGACCTTCCTCTCCG...,0.511898,0.051173,0.009158,0.006125,1.08642,0.010498,0.012287,0.008774,,0.010723,0.002212,0.001564
4,54,E6,GR_ChIP_HalfSite1_Mut-1G,CGCAATTGCGAGTCGAGAACATTGCTCCATTCGACCTTCCTCTCCG...,0.511898,0.051173,0.00377,0.003933,1.08642,0.010498,0.005797,0.006066,,0.004783,0.001433,0.001013
5,55,E7,GR_ChIP_HalfSite1_Mut-1T,CGCAATTGCGAGTCTAGAACATTGCTCCATTCGACCTTCCTCTCCG...,0.511898,0.051173,0.008828,0.00579,1.08642,0.010498,0.01777,0.01178,,0.013299,0.006323,0.004471
6,56,E8,GR_ChIP_HalfSite1_Mut0C,CGCAATTGCGAGTCCCGAACATTGCTCCTGTCGACCTTCCTCTCCG...,0.511898,0.051173,0.005072,0.003801,1.08642,0.010498,0.007107,0.00533,,0.006089,0.00144,0.001018
7,57,E9,GR_ChIP_HalfSite1_Mut0G,CGCAATTGCGAGTCCGGAACATTGCTCCTGTCGACCTTCCTCTCCG...,0.511898,0.051173,0.007331,0.005677,1.08642,0.010498,0.011815,0.01082,,0.009573,0.003171,0.002242
8,58,E10,GR_ChIP_HalfSite1_Mut0T,CGCAATTGCGAGTCCTGAACATTGCTCCTGTCGACCTTCCTCTCCG...,0.511898,0.051173,0.003605,0.00367,1.08642,0.010498,0.005847,0.00511,,0.004726,0.001585,0.001121
9,59,E11,GR_ChIP_HalfSite1_Mut1A,CGCAATTGCGAGTCCAAAACATTGCTCCTGTCGACCTTCCTCTCCG...,0.511898,0.051173,0.012188,0.006068,1.08642,0.010498,0.016926,0.01047,,0.014557,0.00335,0.002369


In [25]:
# Collect the unique 22-nt variable regions of the ChIP half-site oligos.
# Each oligo is expected to look like <11-nt 5' flank><22-nt region><3' flank>;
# oligos with unexpected flanks are reported (found vs. expected) but still kept.
ChIPHalfSites = []
# [1:] skips the first row of the sheet (the GR_ChIP reference oligo in the table above).
for Sequence in list(dfs['ChIPHalfSites']['Sequence'])[1:]:
    # Rows without a sequence are skipped.
    if pd.isnull(Sequence):
        continue
    seq = Sequence.upper()[11:33]
    pre = Sequence.upper()[:11]
    post = Sequence.upper()[33:]
    if pre != "CGCAATTGCGA":
        print(pre)
        print("CGCAATTGCGA")
    if post != "ACCTTCCTCTCCGGCGGTATGAC":
        print(post)
        print("ACCTTCCTCTCCGGCGGTATGAC")
    ChIPHalfSites.append(seq)
# Deduplicate; sorting makes the order (and hence every downstream list) identical
# across kernels instead of depending on set iteration order.
ChIPHalfSites = sorted(set(ChIPHalfSites))

# One observed-vs-predicted scatter per model, Distill first and MoDISco second.
for method, predictions, rmses, tag in (
        ("Distill", seqToDistillPred, distill_rmses, "distill"),
        ("MoDISco", seqToModiscoPred, modisco_rmses, "modisco")):
    xvals = [seqToLabel[seq] for seq in ChIPHalfSites]
    yvals = [predictions[seq] for seq in ChIPHalfSites]
    rmses['ChIPHalfSites'] = math.sqrt(mean_squared_error(xvals, yvals))

    # Colour each point by a Gaussian KDE evaluated at the points themselves.
    xy = np.vstack([xvals, yvals])
    z = gaussian_kde(xy)(xy)

    matplotlib.rc('font', **font)
    fig, ax = plt.subplots()
    # Identical limits (padded by 0.5) on both axes so the panel is square.
    min_lim = min(np.min(xvals), np.min(yvals))
    max_lim = max(np.max(xvals), np.max(yvals))
    ax.set_xlim(min_lim-0.5, max_lim+0.5)
    ax.set_ylim(min_lim-0.5, max_lim+0.5)
    ax.set_aspect('equal', adjustable='box')
    # 'none' is the documented value for "no marker outline"; an empty string is
    # rejected by newer Matplotlib releases.
    ax.scatter(xvals, yvals, c=z, edgecolor='none', alpha=0.9)
    # Least-squares line; this polyfit returns coefficients lowest degree first.
    x = np.linspace(min_lim-0.5, max_lim+0.5, num=len(yvals))
    b, m = polyfit(xvals, yvals, 1)
    ax.plot(x, b + m * x, color="black")
    ax.set_title(method+" -- spearman:"+str(spearmanr(xvals, yvals)[0])+", pearson:"+str(pearsonr(xvals, yvals)[0]))
    ax.set_xlabel("Obs. ddG")
    ax.set_ylabel("Pred. ddG")
    fig.savefig('comparison_figs/gr_library_analysis/ChIPHalfSites_'+tag+'.png', dpi=300, format='png')
    # The figure is only written to disk; close it so no empty figure is left behind.
    plt.close(fig)

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

In [26]:
# All half-site mutations: ChIP and MITOMI half-site sequences pooled into one
# observed-vs-predicted scatter per model (Distill first, MoDISco second).
for method, predictions, tag in (
        ("Distill", seqToDistillPred, "distill"),
        ("MoDISco", seqToModiscoPred, "modisco")):
    xvals = [seqToLabel[seq] for seq in ChIPHalfSites+MITOMIHalfSites]
    yvals = [predictions[seq] for seq in ChIPHalfSites+MITOMIHalfSites]

    # Colour each point by a Gaussian KDE evaluated at the points themselves.
    xy = np.vstack([xvals, yvals])
    z = gaussian_kde(xy)(xy)

    matplotlib.rc('font', **font)
    fig, ax = plt.subplots()
    # Identical limits (padded by 0.5) on both axes so the panel is square.
    min_lim = min(np.min(xvals), np.min(yvals))
    max_lim = max(np.max(xvals), np.max(yvals))
    ax.set_xlim(min_lim-0.5, max_lim+0.5)
    ax.set_ylim(min_lim-0.5, max_lim+0.5)
    ax.set_aspect('equal', adjustable='box')
    # 'none' is the documented value for "no marker outline"; an empty string is
    # rejected by newer Matplotlib releases.
    ax.scatter(xvals, yvals, c=z, edgecolor='none', alpha=0.9)
    # Least-squares line; this polyfit returns coefficients lowest degree first.
    x = np.linspace(min_lim-0.5, max_lim+0.5, num=len(yvals))
    b, m = polyfit(xvals, yvals, 1)
    ax.plot(x, b + m * x, color="black")
    ax.set_title(method+" -- spearman:"+str(spearmanr(xvals, yvals)[0])+", pearson:"+str(pearsonr(xvals, yvals)[0]))
    ax.set_xlabel("Obs. ddG")
    ax.set_ylabel("Pred. ddG")
    fig.savefig('comparison_figs/gr_library_analysis/ALLHalfSite_'+tag+'.png', dpi=300, format='png')
    # The figure is only written to disk; close it so no empty figure is left behind.
    plt.close(fig)

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

In [27]:
# Inspect the 'NaturalVariants1' table held in dfs before parsing its oligos.
dfs['NaturalVariants1']

Unnamed: 0,Oligo #,Well,Oligo Name,Sequence,rMax_1,rMaxE_1,KaN_1,KaNE_1,rMax_2,rMaxE_2,KaN_2,KaNE_2,Unnamed: 12,KaN_AVG,KaNE_AVG,SEM
0,80,G8,GR_MITOMI,CGCAATTGCGAGTCCGGGACATGATGTCCCTCGACCTTCCTCTCCG...,0.774111,0.015437,1.0,0.112768,1.08642,0.010498,1.0,0.111758,,1.0,0.0,0.0
1,155,E11,RANDOM,CGCAATTGCGAGTCCGCGGTAGCTGCGCATTCGACCTTCCTCTCCG...,0.774111,0.015437,0.009162,0.016475,1.08642,0.010498,0.018465,0.014473,,0.013813,0.006578,0.004651
2,156,E12,GILZ,CGCAATTGCGAGTCCAGAACATTGGGTTCCTCGACCTTCCTCTCCG...,0.774111,0.015437,0.383991,0.046143,1.08642,0.010498,0.352165,0.04222,,0.368078,0.022504,0.015913
3,157,F1,PAL,CGCAATTGCGAGTCCAGAACAAAATGTTCTTCGACCTTCCTCTCCG...,0.774111,0.015437,2.74759,0.2762,1.08642,0.010498,2.89015,0.324333,,2.81887,0.100805,0.07128
4,158,F2,SGK,CGCAATTGCGAGTCCAGAACATTTTGTCCGTCGACCTTCCTCTCCG...,0.774111,0.015437,1.36933,0.146675,1.08642,0.010498,1.95558,0.214769,,1.662455,0.414541,0.293125
5,159,F3,CGT,CGCAATTGCGAGTCCAGAACATTTTGTACGTCGACCTTCCTCTCCG...,0.774111,0.015437,2.20664,0.21083,1.08642,0.010498,1.80404,0.186602,,2.00534,0.284681,0.2013
6,160,F4,CONS,CGCAATTGCGAGTCCAGAACAAAATGTACCTCGACCTTCCTCTCCG...,0.774111,0.015437,1.75319,0.168015,1.08642,0.010498,1.98864,0.209775,,1.870915,0.166488,0.117725
7,161,F5,FKBP5,CGCAATTGCGAGTCCAGAACAGGGTGTTCTTCGACCTTCCTCTCCG...,0.774111,0.015437,0.994619,0.096838,1.08642,0.010498,0.728684,0.07783,,0.861652,0.188044,0.132967
8,162,F6,TAT3_REV,CGCAATTGCGAGTCCTGTACAGGATGTTCTTCGACCTTCCTCTCCG...,0.774111,0.015437,1.2342,0.120089,1.08642,0.010498,1.31221,0.136022,,1.273205,0.055161,0.039005
9,163,F7,MMTV,CGCAATTGCGAGTCCAGAACAGTTTGTAACTCGACCTTCCTCTCCG...,0.774111,0.015437,0.446207,0.049477,1.08642,0.010498,0.280538,0.038196,,0.363372,0.117146,0.082835


In [28]:
# Collect the unique 22-nt variable regions of the natural-variant oligos.
# Each oligo is expected to look like <11-nt 5' flank><22-nt region><3' flank>;
# oligos with unexpected flanks are reported (found vs. expected) but still kept.
NaturalVariants1 = []
# [2:] skips the first two rows of the sheet (GR_MITOMI and RANDOM in the table above).
for Sequence in list(dfs['NaturalVariants1']['Sequence'])[2:]:
    # Rows without a sequence are skipped.
    if pd.isnull(Sequence):
        continue
    seq = Sequence.upper()[11:33]
    pre = Sequence.upper()[:11]
    post = Sequence.upper()[33:]
    if pre != "CGCAATTGCGA":
        print(pre)
        print("CGCAATTGCGA")
    if post != "ACCTTCCTCTCCGGCGGTATGAC":
        print(post)
        print("ACCTTCCTCTCCGGCGGTATGAC")
    NaturalVariants1.append(seq)
# Deduplicate; sorting makes the order (and hence every downstream list) identical
# across kernels instead of depending on set iteration order.
NaturalVariants1 = sorted(set(NaturalVariants1))

# One observed-vs-predicted scatter per model, Distill first and MoDISco second.
for method, predictions, rmses, tag in (
        ("Distill", seqToDistillPred, distill_rmses, "distill"),
        ("MoDISco", seqToModiscoPred, modisco_rmses, "modisco")):
    xvals = [seqToLabel[seq] for seq in NaturalVariants1]
    yvals = [predictions[seq] for seq in NaturalVariants1]
    rmses['NaturalVariants1'] = math.sqrt(mean_squared_error(xvals, yvals))

    # Colour each point by a Gaussian KDE evaluated at the points themselves.
    xy = np.vstack([xvals, yvals])
    z = gaussian_kde(xy)(xy)

    matplotlib.rc('font', **font)
    fig, ax = plt.subplots()
    # Identical limits (padded by 0.5) on both axes so the panel is square.
    min_lim = min(np.min(xvals), np.min(yvals))
    max_lim = max(np.max(xvals), np.max(yvals))
    ax.set_xlim(min_lim-0.5, max_lim+0.5)
    ax.set_ylim(min_lim-0.5, max_lim+0.5)
    ax.set_aspect('equal', adjustable='box')
    # 'none' is the documented value for "no marker outline"; an empty string is
    # rejected by newer Matplotlib releases.
    ax.scatter(xvals, yvals, c=z, edgecolor='none', alpha=0.9)
    # Least-squares line; this polyfit returns coefficients lowest degree first.
    x = np.linspace(min_lim-0.5, max_lim+0.5, num=len(yvals))
    b, m = polyfit(xvals, yvals, 1)
    ax.plot(x, b + m * x, color="black")
    ax.set_title(method+" -- spearman:"+str(spearmanr(xvals, yvals)[0])+", pearson:"+str(pearsonr(xvals, yvals)[0]))
    ax.set_xlabel("Obs. ddG")
    ax.set_ylabel("Pred. ddG")
    fig.savefig('comparison_figs/gr_library_analysis/NaturalVariants1_'+tag+'.png', dpi=300, format='png')
    # The figure is only written to disk; close it so no empty figure is left behind.
    plt.close(fig)

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

In [29]:
# Inspect the 'NaturalVariants2' table held in dfs (not used by the code in this section).
dfs['NaturalVariants2']

Unnamed: 0,Na


In [30]:
# Inspect the 'nGREs' table held in dfs (not used by the code in this section).
dfs['nGREs']

In [31]:
# Inspect the 'NaturalVariants3' table held in dfs (not used by the code in this section).
dfs['NaturalVariants3']

In [32]:
# Short tick labels for the per-library RMSE bar chart, keyed by library name.
shorthand = dict(
    MITOMIFullSite='M-full',
    ChIPFullSite='C-full',
    MITOMISpacing='spacing',
    MITOMIHalfSites='M-half',
    ChIPHalfSites='C-half',
    NaturalVariants1='nat-vars',
)

In [33]:
# Order the libraries by MoDISco RMSE, smallest first.
# NOTE: despite its name, `diffs` holds the MoDISco RMSE itself; subtracting the
# Distill RMSE (modisco_rmses[key]-distill_rmses[key]) is intentionally disabled.
diffs = {key: modisco_rmses[key] for key in modisco_rmses.keys()}
sorted_diffs = dict(sorted(diffs.items(), key=lambda pair: pair[1]))

In [34]:
# Show the per-library values in ascending order.
sorted_diffs

{'ChIPHalfSites': 0.355199199874834,
 'MITOMIFullSite': 0.5834770983791302,
 'ChIPFullSite': 0.6665302991608953,
 'MITOMISpacing': 0.6812595472950441,
 'NaturalVariants1': 0.6894450220299819,
 'MITOMIHalfSites': 0.9032239798714932}

In [35]:
# RMSE of each model per library, listed in the order of sorted_diffs.
modisco_vals = [modisco_rmses[key] for key in sorted_diffs]
distill_vals = [distill_rmses[key] for key in sorted_diffs]

In [36]:
# Grouped bar chart of per-library RMSE: MoDISco (red) next to Distill (blue),
# with the libraries in the order of sorted_diffs.
X = np.arange(len(sorted_diffs))
# Use a dedicated figure rather than whatever figure happens to be current.
fig, ax = plt.subplots()
barwidth = 0.3
# The two bars of a library sit half a bar-width either side of its tick.
ax.bar(X+(barwidth/2), modisco_vals, width=barwidth, color='r', align='center')
ax.bar(X-(barwidth/2), distill_vals, width=barwidth, color='b', align='center')
# Legend labels follow the order in which the bar groups were drawn.
ax.legend(('modisco','distill'))
ax.set_xticks(X)
ax.set_xticklabels([shorthand[key] for key in sorted_diffs.keys()])
ax.set_ylabel('RMSE')
# NOTE: the y-axis starts at 0.25, not 0, so bar heights are not proportional to RMSE.
ax.set_ylim((0.25,1.0))
fig.savefig('comparison_figs/gr_library_analysis/rmse_breakdown.png', dpi=300, format='png')
# The figure is only written to disk; close it so no empty figure is left behind.
plt.close(fig)

<Figure size 432x288 with 0 Axes>

# Additivity analysis

## Where is the non-additivity in the data?

In [37]:
# Name and full oligo sequence of the first row of the MITOMI full-site sheet.
tuple(dfs['MITOMIFullSite'][column].iloc[0] for column in ('Oligo Name', 'Sequence'))

('GR_MITOMI', 'CGCAATTGCGAGTCCGGGACATGATGTCCCTCGACCTTCCTCTCCGGCGGTATGAC')

In [38]:
# Map the core of every MITOMI full-site sequence (4-nt and 3-nt flanks removed)
# to its seqToLabel value. Sequences whose flanks are not GTCC / TCG are printed
# but still stored.
MITOMI_building_blocks = {}
for Sequence in MITOMIFullSite:
    pre, seq, post = Sequence[:4], Sequence[4:-3], Sequence[-3:]
    if pre != "GTCC":
        print(pre)
    if post != "TCG":
        print(post)
    MITOMI_building_blocks[seq] = seqToLabel[Sequence]

In [39]:
# Sanity check: (length of one core sequence, number of distinct core sequences).
first_mitomi_block = list(MITOMI_building_blocks)[0]
len(first_mitomi_block), len(MITOMI_building_blocks)

(15, 46)

In [40]:
# Core of the reference (first-row) MITOMI oligo. In full-oligo coordinates the
# 22-nt variable region is [11:33]; dropping its 4-nt and 3-nt flanks gives [15:30].
mitomi_reference_oligo = dfs['MITOMIFullSite']['Sequence'].iloc[0]
MITOMI_consensus = mitomi_reference_oligo[15:30]
MITOMI_building_blocks[MITOMI_consensus]

0.5997021784118834

In [41]:
#  MITOMISpacing

In [42]:
# Additive expectation for each MITOMI spacing sequence: start from the consensus
# value and, for every position that differs from the consensus, add the effect of
# that single substitution as recorded in MITOMI_building_blocks.
additive_observed, additive_seq_type = {}, {}
for Sequence in MITOMISpacing:
    pre, seq, post = Sequence[:4], Sequence[4:-3], Sequence[-3:]
    # Unexpected flanks are reported but the sequence is still processed.
    if pre != "GTCC":
        print(pre)
    if post != "TCG":
        print(post)
    additive_seq_type[Sequence] = "MITOMISpacing"
    additive_observed[Sequence] = MITOMI_building_blocks[MITOMI_consensus]
    for idx, consensus_base in enumerate(MITOMI_consensus):
        if seq[idx] == consensus_base:
            continue
        # Consensus carrying only this one substitution.
        new_seq = list(MITOMI_consensus)
        new_seq[idx] = seq[idx]
        to_score = "".join(new_seq)
        additive_observed[Sequence] += (
            MITOMI_building_blocks[to_score] - MITOMI_building_blocks[MITOMI_consensus]
        )

In [43]:
# MITOMIHalfSites

In [44]:
# Additive expectation for each MITOMI half-site sequence: consensus value plus the
# single-substitution effect (from MITOMI_building_blocks) of every mismatching position.
for Sequence in MITOMIHalfSites:
    pre, seq, post = Sequence[:4], Sequence[4:-3], Sequence[-3:]
    # Unexpected flanks are reported but the sequence is still processed.
    if pre != "GTCC":
        print(pre)
    if post != "TCG":
        print(post)
    additive_seq_type[Sequence] = "MITOMIHalfSites"
    additive_observed[Sequence] = MITOMI_building_blocks[MITOMI_consensus]
    for idx, consensus_base in enumerate(MITOMI_consensus):
        if seq[idx] == consensus_base:
            continue
        # Consensus carrying only this one substitution.
        new_seq = list(MITOMI_consensus)
        new_seq[idx] = seq[idx]
        to_score = "".join(new_seq)
        additive_observed[Sequence] += (
            MITOMI_building_blocks[to_score] - MITOMI_building_blocks[MITOMI_consensus]
        )

In [45]:
# ChIPFullSite consensus

In [46]:
# Name and full oligo sequence of the first row of the ChIP full-site sheet.
tuple(dfs['ChIPFullSite'][column].iloc[0] for column in ('Oligo Name', 'Sequence'))

('GR_ChIP', 'CGCAATTGCGAGTCCAGAACATTCTGTTCCTCGACCTTCCTCTCCGGCGGTATGAC')

In [47]:
# Map the core of every ChIP full-site sequence (3-nt flanks removed on both sides)
# to its seqToLabel value. Sequences whose flanks are not GTC / TCG are printed
# but still stored.
ChIP_building_blocks = {}
for Sequence in ChIPFullSite:
    pre, seq, post = Sequence[:3], Sequence[3:-3], Sequence[-3:]
    if pre != "GTC":
        print(pre)
    if post != "TCG":
        print(post)
    ChIP_building_blocks[seq] = seqToLabel[Sequence]

In [48]:
# Sanity check: (length of one core sequence, number of distinct core sequences).
first_chip_block = list(ChIP_building_blocks)[0]
len(first_chip_block), len(ChIP_building_blocks)

(16, 49)

In [49]:
# Core of the reference (first-row) ChIP oligo. In full-oligo coordinates the
# 22-nt variable region is [11:33]; dropping 3 nt from each end gives [14:30].
chip_reference_oligo = dfs['ChIPFullSite']['Sequence'].iloc[0]
ChIP_consensus = chip_reference_oligo[14:30]
ChIP_building_blocks[ChIP_consensus]

0.0

In [50]:
#'ChIPHalfSites'

In [51]:
# Additive expectation for each ChIP half-site sequence: consensus value plus the
# single-substitution effect (from ChIP_building_blocks) of every mismatching position.
for Sequence in ChIPHalfSites:
    pre, seq, post = Sequence[:3], Sequence[3:-3], Sequence[-3:]
    # Unexpected flanks are reported but the sequence is still processed.
    if pre != "GTC":
        print(pre)
    if post != "TCG":
        print(post)
    additive_seq_type[Sequence] = "ChIPHalfSites"
    additive_observed[Sequence] = ChIP_building_blocks[ChIP_consensus]
    for idx, consensus_base in enumerate(ChIP_consensus):
        if seq[idx] == consensus_base:
            continue
        # Consensus carrying only this one substitution.
        new_seq = list(ChIP_consensus)
        new_seq[idx] = seq[idx]
        to_score = "".join(new_seq)
        additive_observed[Sequence] += (
            ChIP_building_blocks[to_score] - ChIP_building_blocks[ChIP_consensus]
        )

In [52]:
# seqToLabel value vs. additive expectation for every sequence in additive_observed;
# points off the diagonal are sequences whose value is not additive.
xvals = [seqToLabel[seq] for seq in additive_observed]
yvals = [additive_observed[seq] for seq in additive_observed]

# Colour each point by a Gaussian KDE evaluated at the points themselves.
xy = np.vstack([xvals,yvals])
z = gaussian_kde(xy)(xy)
matplotlib.rc('font', **font)
fig, ax = plt.subplots()
# Identical limits (padded by 0.5) on both axes so the panel is square.
min_lim = min(np.min(xvals), np.min(yvals))
max_lim = max(np.max(xvals), np.max(yvals))
ax.set_xlim(min_lim-0.5, max_lim+0.5)
ax.set_ylim(min_lim-0.5, max_lim+0.5)
ax.set_aspect('equal', adjustable='box')
# 'none' is the documented value for "no marker outline"; an empty string is
# rejected by newer Matplotlib releases.
ax.scatter(xvals, yvals, c=z, edgecolor='none', alpha=0.9)
# Identity line (y = x) as the reference for perfect additivity.
ax.plot([min_lim-0.5, max_lim+0.5], [min_lim-0.5, max_lim+0.5], color="black")
ax.set_title("Observed -- spearman:"+str(spearmanr(xvals, yvals)[0])+", pearson:"+str(pearsonr(xvals, yvals)[0]))
ax.set_xlabel("Obs. ddG")
ax.set_ylabel("Add. ddG")
fig.savefig('comparison_figs/gr_library_analysis/nonadditive_seqs_obs.png', dpi=300, format='png')
# The figure is only written to disk; close it so no empty figure is left behind.
plt.close(fig)

<Figure size 432x288 with 0 Axes>

## How do the different methods perform on the additivity sequences?

In [53]:
# seqToLabel value vs. MoDISco prediction for every sequence in additive_observed.
xvals = [seqToLabel[seq] for seq in additive_observed]
yvals = [seqToModiscoPred[seq] for seq in additive_observed]

# Colour each point by a Gaussian KDE evaluated at the points themselves.
xy = np.vstack([xvals,yvals])
z = gaussian_kde(xy)(xy)
matplotlib.rc('font', **font)
fig, ax = plt.subplots()
# Identical limits (padded by 0.5) on both axes so the panel is square.
min_lim = min(np.min(xvals), np.min(yvals))
max_lim = max(np.max(xvals), np.max(yvals))
ax.set_xlim(min_lim-0.5, max_lim+0.5)
ax.set_ylim(min_lim-0.5, max_lim+0.5)
ax.set_aspect('equal', adjustable='box')
# 'none' is the documented value for "no marker outline"; an empty string is
# rejected by newer Matplotlib releases.
ax.scatter(xvals, yvals, c=z, edgecolor='none', alpha=0.9)
# Least-squares line; this polyfit returns coefficients lowest degree first.
x = np.linspace(min_lim-0.5, max_lim+0.5, num=len(yvals))
b, m = polyfit(xvals, yvals, 1)
ax.plot(x, b + m * x, color="black")
ax.set_title("MoDISco -- spearman:"+str(spearmanr(xvals, yvals)[0])+", pearson:"+str(pearsonr(xvals, yvals)[0]))
ax.set_xlabel("Obs. ddG")
ax.set_ylabel("Pred. ddG")
fig.savefig('comparison_figs/gr_library_analysis/nonadditive_seqs_modisco.png', dpi=300, format='png')
# The figure is only written to disk; close it so no empty figure is left behind.
plt.close(fig)

<Figure size 432x288 with 0 Axes>

In [54]:
# seqToLabel value vs. Distill prediction for every sequence in additive_observed.
xvals = [seqToLabel[seq] for seq in additive_observed]
yvals = [seqToDistillPred[seq] for seq in additive_observed]

# Colour each point by a Gaussian KDE evaluated at the points themselves.
xy = np.vstack([xvals,yvals])
z = gaussian_kde(xy)(xy)
matplotlib.rc('font', **font)
fig, ax = plt.subplots()
# Identical limits (padded by 0.5) on both axes so the panel is square.
min_lim = min(np.min(xvals), np.min(yvals))
max_lim = max(np.max(xvals), np.max(yvals))
ax.set_xlim(min_lim-0.5, max_lim+0.5)
ax.set_ylim(min_lim-0.5, max_lim+0.5)
ax.set_aspect('equal', adjustable='box')
# 'none' is the documented value for "no marker outline"; an empty string is
# rejected by newer Matplotlib releases.
ax.scatter(xvals, yvals, c=z, edgecolor='none', alpha=0.9)
# Least-squares line; this polyfit returns coefficients lowest degree first.
x = np.linspace(min_lim-0.5, max_lim+0.5, num=len(yvals))
b, m = polyfit(xvals, yvals, 1)
ax.plot(x, b + m * x, color="black")
ax.set_title("Distill -- spearman:"+str(spearmanr(xvals, yvals)[0])+", pearson:"+str(pearsonr(xvals, yvals)[0]))
ax.set_xlabel("Obs. ddG")
ax.set_ylabel("Pred. ddG")
fig.savefig('comparison_figs/gr_library_analysis/nonadditive_seqs_distill.png', dpi=300, format='png')
# The figure is only written to disk; close it so no empty figure is left behind.
plt.close(fig)

<Figure size 432x288 with 0 Axes>

## Are the CWM (MoDISco) predictions additive?

In [55]:
# Rebuild the MITOMI single-site table from MoDISco predictions.
# NOTE: this rebinds MITOMI_building_blocks (earlier filled from seqToLabel), so the
# cells that read it give different results depending on which version ran last.
MITOMI_building_blocks = {}
for Sequence in MITOMIFullSite:
    pre, seq, post = Sequence[:4], Sequence[4:-3], Sequence[-3:]
    if pre != "GTCC":
        print(pre)
    if post != "TCG":
        print(post)
    MITOMI_building_blocks[seq] = seqToModiscoPred[Sequence]

In [56]:
#  MITOMISpacing

In [57]:
# Additive expectation for each MITOMI spacing sequence, stored in additive_modisco:
# consensus value plus the single-substitution effect (from the current
# MITOMI_building_blocks) of every position that differs from the consensus.
additive_modisco = {}
for Sequence in MITOMISpacing:
    pre, seq, post = Sequence[:4], Sequence[4:-3], Sequence[-3:]
    # Unexpected flanks are reported but the sequence is still processed.
    if pre != "GTCC":
        print(pre)
    if post != "TCG":
        print(post)
    additive_modisco[Sequence] = MITOMI_building_blocks[MITOMI_consensus]
    for idx, consensus_base in enumerate(MITOMI_consensus):
        if seq[idx] == consensus_base:
            continue
        # Consensus carrying only this one substitution.
        new_seq = list(MITOMI_consensus)
        new_seq[idx] = seq[idx]
        to_score = "".join(new_seq)
        additive_modisco[Sequence] += (
            MITOMI_building_blocks[to_score] - MITOMI_building_blocks[MITOMI_consensus]
        )

In [58]:
# MITOMIHalfSites

In [59]:
# Additive expectation for each MITOMI half-site sequence, stored in additive_modisco:
# consensus value plus the single-substitution effect (from the current
# MITOMI_building_blocks) of every position that differs from the consensus.
for Sequence in MITOMIHalfSites:
    pre, seq, post = Sequence[:4], Sequence[4:-3], Sequence[-3:]
    # Unexpected flanks are reported but the sequence is still processed.
    if pre != "GTCC":
        print(pre)
    if post != "TCG":
        print(post)
    additive_modisco[Sequence] = MITOMI_building_blocks[MITOMI_consensus]
    for idx, consensus_base in enumerate(MITOMI_consensus):
        if seq[idx] == consensus_base:
            continue
        # Consensus carrying only this one substitution.
        new_seq = list(MITOMI_consensus)
        new_seq[idx] = seq[idx]
        to_score = "".join(new_seq)
        additive_modisco[Sequence] += (
            MITOMI_building_blocks[to_score] - MITOMI_building_blocks[MITOMI_consensus]
        )

In [60]:
# Rebuild the ChIP single-site table from MoDISco predictions.
# NOTE: this rebinds ChIP_building_blocks (earlier filled from seqToLabel), so the
# cells that read it give different results depending on which version ran last.
ChIP_building_blocks = {}
for Sequence in ChIPFullSite:
    pre, seq, post = Sequence[:3], Sequence[3:-3], Sequence[-3:]
    if pre != "GTC":
        print(pre)
    if post != "TCG":
        print(post)
    ChIP_building_blocks[seq] = seqToModiscoPred[Sequence]

In [61]:
#'ChIPHalfSites'

In [62]:
# Additive expectation for each ChIP half-site sequence, stored in additive_modisco:
# consensus value plus the single-substitution effect (from the current
# ChIP_building_blocks) of every position that differs from the consensus.
for Sequence in ChIPHalfSites:
    pre, seq, post = Sequence[:3], Sequence[3:-3], Sequence[-3:]
    # Unexpected flanks are reported but the sequence is still processed.
    if pre != "GTC":
        print(pre)
    if post != "TCG":
        print(post)
    additive_modisco[Sequence] = ChIP_building_blocks[ChIP_consensus]
    for idx, consensus_base in enumerate(ChIP_consensus):
        if seq[idx] == consensus_base:
            continue
        # Consensus carrying only this one substitution.
        new_seq = list(ChIP_consensus)
        new_seq[idx] = seq[idx]
        to_score = "".join(new_seq)
        additive_modisco[Sequence] += (
            ChIP_building_blocks[to_score] - ChIP_building_blocks[ChIP_consensus]
        )

In [63]:
# MoDISco prediction vs. its own additive expectation, over the sequences in
# additive_observed; points off the diagonal are non-additive predictions.
xvals = [seqToModiscoPred[seq] for seq in additive_observed]
yvals = [additive_modisco[seq] for seq in additive_observed]

# Colour each point by a Gaussian KDE evaluated at the points themselves.
xy = np.vstack([xvals,yvals])
z = gaussian_kde(xy)(xy)
matplotlib.rc('font', **font)
fig, ax = plt.subplots()
# Identical limits (padded by 0.5) on both axes so the panel is square.
min_lim = min(np.min(xvals), np.min(yvals))
max_lim = max(np.max(xvals), np.max(yvals))
ax.set_xlim(min_lim-0.5, max_lim+0.5)
ax.set_ylim(min_lim-0.5, max_lim+0.5)
ax.set_aspect('equal', adjustable='box')
# 'none' is the documented value for "no marker outline"; an empty string is
# rejected by newer Matplotlib releases.
ax.scatter(xvals, yvals, c=z, edgecolor='none', alpha=0.9)
# Identity line (y = x) as the reference for perfect additivity.
ax.plot([min_lim-0.5, max_lim+0.5], [min_lim-0.5, max_lim+0.5], color="black")
ax.set_title("MoDISco -- spearman:"+str(spearmanr(xvals, yvals)[0])+", pearson:"+str(pearsonr(xvals, yvals)[0]))
ax.set_xlabel("Pred. ddG")
ax.set_ylabel("Add. Pred. ddG")
fig.savefig('comparison_figs/gr_library_analysis/nonadditive_mistakes_modisco.png', dpi=300, format='png')
# The figure is only written to disk; close it so no empty figure is left behind.
plt.close(fig)

<Figure size 432x288 with 0 Axes>

In [64]:
# seqToLabel value vs. the additive expectation built from MoDISco predictions,
# over the sequences in additive_observed.
xvals = [seqToLabel[seq] for seq in additive_observed]
yvals = [additive_modisco[seq] for seq in additive_observed]

# Colour each point by a Gaussian KDE evaluated at the points themselves.
xy = np.vstack([xvals,yvals])
z = gaussian_kde(xy)(xy)
matplotlib.rc('font', **font)
fig, ax = plt.subplots()
# Identical limits (padded by 0.5) on both axes so the panel is square.
min_lim = min(np.min(xvals), np.min(yvals))
max_lim = max(np.max(xvals), np.max(yvals))
ax.set_xlim(min_lim-0.5, max_lim+0.5)
ax.set_ylim(min_lim-0.5, max_lim+0.5)
ax.set_aspect('equal', adjustable='box')
# 'none' is the documented value for "no marker outline"; an empty string is
# rejected by newer Matplotlib releases.
ax.scatter(xvals, yvals, c=z, edgecolor='none', alpha=0.9)
# Least-squares line; this polyfit returns coefficients lowest degree first.
x = np.linspace(min_lim-0.5, max_lim+0.5, num=len(yvals))
b, m = polyfit(xvals, yvals, 1)
ax.plot(x, b + m * x, color="black")
ax.set_title("MoDISco -- spearman:"+str(spearmanr(xvals, yvals)[0])+", pearson:"+str(pearsonr(xvals, yvals)[0]))
ax.set_xlabel("Obs. ddG")
ax.set_ylabel("Add. Pred. ddG")
fig.savefig('comparison_figs/gr_library_analysis/additive_modisco_perf.png', dpi=300, format='png')
# The figure is only written to disk; close it so no empty figure is left behind.
plt.close(fig)

<Figure size 432x288 with 0 Axes>

## Are Distill's predictions additive?

In [65]:
# Rebuild the MITOMI single-site table from Distill predictions.
# NOTE: this rebinds MITOMI_building_blocks yet again, so the cells that read it
# give different results depending on which version ran last.
MITOMI_building_blocks = {}
for Sequence in MITOMIFullSite:
    pre, seq, post = Sequence[:4], Sequence[4:-3], Sequence[-3:]
    if pre != "GTCC":
        print(pre)
    if post != "TCG":
        print(post)
    MITOMI_building_blocks[seq] = seqToDistillPred[Sequence]

In [66]:
#  MITOMISpacing

In [67]:
# Additive expectation for each MITOMI spacing sequence, stored in additive_distill:
# consensus value plus the single-substitution effect (from the current
# MITOMI_building_blocks) of every position that differs from the consensus.
additive_distill = {}
for Sequence in MITOMISpacing:
    pre, seq, post = Sequence[:4], Sequence[4:-3], Sequence[-3:]
    # Unexpected flanks are reported but the sequence is still processed.
    if pre != "GTCC":
        print(pre)
    if post != "TCG":
        print(post)
    additive_distill[Sequence] = MITOMI_building_blocks[MITOMI_consensus]
    for idx, consensus_base in enumerate(MITOMI_consensus):
        if seq[idx] == consensus_base:
            continue
        # Consensus carrying only this one substitution.
        new_seq = list(MITOMI_consensus)
        new_seq[idx] = seq[idx]
        to_score = "".join(new_seq)
        additive_distill[Sequence] += (
            MITOMI_building_blocks[to_score] - MITOMI_building_blocks[MITOMI_consensus]
        )

In [68]:
# MITOMIHalfSites

In [69]:
# Additive expectation for each MITOMI half-site sequence, stored in additive_distill:
# consensus value plus the single-substitution effect (from the current
# MITOMI_building_blocks) of every position that differs from the consensus.
for Sequence in MITOMIHalfSites:
    pre, seq, post = Sequence[:4], Sequence[4:-3], Sequence[-3:]
    # Unexpected flanks are reported but the sequence is still processed.
    if pre != "GTCC":
        print(pre)
    if post != "TCG":
        print(post)
    additive_distill[Sequence] = MITOMI_building_blocks[MITOMI_consensus]
    for idx, consensus_base in enumerate(MITOMI_consensus):
        if seq[idx] == consensus_base:
            continue
        # Consensus carrying only this one substitution.
        new_seq = list(MITOMI_consensus)
        new_seq[idx] = seq[idx]
        to_score = "".join(new_seq)
        additive_distill[Sequence] += (
            MITOMI_building_blocks[to_score] - MITOMI_building_blocks[MITOMI_consensus]
        )

In [70]:
# Rebuild the ChIP single-site table from Distill predictions.
# NOTE: this rebinds ChIP_building_blocks yet again, so the cells that read it
# give different results depending on which version ran last.
ChIP_building_blocks = {}
for Sequence in ChIPFullSite:
    pre, seq, post = Sequence[:3], Sequence[3:-3], Sequence[-3:]
    if pre != "GTC":
        print(pre)
    if post != "TCG":
        print(post)
    ChIP_building_blocks[seq] = seqToDistillPred[Sequence]

In [71]:
#'ChIPHalfSites'

In [72]:
# Additive expectation for each ChIP half-site sequence, stored in additive_distill:
# consensus value plus the single-substitution effect (from the current
# ChIP_building_blocks) of every position that differs from the consensus.
for Sequence in ChIPHalfSites:
    pre, seq, post = Sequence[:3], Sequence[3:-3], Sequence[-3:]
    # Unexpected flanks are reported but the sequence is still processed.
    if pre != "GTC":
        print(pre)
    if post != "TCG":
        print(post)
    additive_distill[Sequence] = ChIP_building_blocks[ChIP_consensus]
    for idx, consensus_base in enumerate(ChIP_consensus):
        if seq[idx] == consensus_base:
            continue
        # Consensus carrying only this one substitution.
        new_seq = list(ChIP_consensus)
        new_seq[idx] = seq[idx]
        to_score = "".join(new_seq)
        additive_distill[Sequence] += (
            ChIP_building_blocks[to_score] - ChIP_building_blocks[ChIP_consensus]
        )

In [73]:
# Distill prediction vs. its own additive expectation, over the sequences in
# additive_observed; points off the diagonal are non-additive predictions.
xvals = [seqToDistillPred[seq] for seq in additive_observed]
yvals = [additive_distill[seq] for seq in additive_observed]

# Colour each point by a Gaussian KDE evaluated at the points themselves.
xy = np.vstack([xvals,yvals])
z = gaussian_kde(xy)(xy)
matplotlib.rc('font', **font)
fig, ax = plt.subplots()
# Identical limits (padded by 0.5) on both axes so the panel is square.
min_lim = min(np.min(xvals), np.min(yvals))
max_lim = max(np.max(xvals), np.max(yvals))
ax.set_xlim(min_lim-0.5, max_lim+0.5)
ax.set_ylim(min_lim-0.5, max_lim+0.5)
ax.set_aspect('equal', adjustable='box')
# 'none' is the documented value for "no marker outline"; an empty string is
# rejected by newer Matplotlib releases.
ax.scatter(xvals, yvals, c=z, edgecolor='none', alpha=0.9)
# Identity line (y = x) as the reference for perfect additivity.
ax.plot([min_lim-0.5, max_lim+0.5], [min_lim-0.5, max_lim+0.5], color="black")
ax.set_title("Distill -- spearman:"+str(spearmanr(xvals, yvals)[0])+", pearson:"+str(pearsonr(xvals, yvals)[0]))
ax.set_xlabel("Pred. ddG")
ax.set_ylabel("Add. Pred. ddG")
fig.savefig('comparison_figs/gr_library_analysis/nonadditive_mistakes_distill.png', dpi=300, format='png')
# The figure is only written to disk; close it so no empty figure is left behind.
plt.close(fig)

<Figure size 432x288 with 0 Axes>

In [74]:
# seqToLabel value vs. the additive expectation built from Distill predictions,
# over the sequences in additive_observed.
xvals = [seqToLabel[seq] for seq in additive_observed]
yvals = [additive_distill[seq] for seq in additive_observed]

# Colour each point by a Gaussian KDE evaluated at the points themselves.
xy = np.vstack([xvals,yvals])
z = gaussian_kde(xy)(xy)
matplotlib.rc('font', **font)
fig, ax = plt.subplots()
# Identical limits (padded by 0.5) on both axes so the panel is square.
min_lim = min(np.min(xvals), np.min(yvals))
max_lim = max(np.max(xvals), np.max(yvals))
ax.set_xlim(min_lim-0.5, max_lim+0.5)
ax.set_ylim(min_lim-0.5, max_lim+0.5)
ax.set_aspect('equal', adjustable='box')
# 'none' is the documented value for "no marker outline"; an empty string is
# rejected by newer Matplotlib releases.
ax.scatter(xvals, yvals, c=z, edgecolor='none', alpha=0.9)
# Least-squares line; this polyfit returns coefficients lowest degree first.
x = np.linspace(min_lim-0.5, max_lim+0.5, num=len(yvals))
b, m = polyfit(xvals, yvals, 1)
ax.plot(x, b + m * x, color="black")
ax.set_title("Distill -- spearman:"+str(spearmanr(xvals, yvals)[0])+", pearson:"+str(pearsonr(xvals, yvals)[0]))
ax.set_xlabel("Obs. ddG")
ax.set_ylabel("Add. Pred. ddG")
fig.savefig('comparison_figs/gr_library_analysis/additive_distill_perf.png', dpi=300, format='png')
# The figure is only written to disk; close it so no empty figure is left behind.
plt.close(fig)

<Figure size 432x288 with 0 Axes>