In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd

## Read data

Load eCLIP-seq sequences

In [3]:
eclip = pd.read_csv("TRA2A/TRA2A_eclip.txt", sep='\t', header=None)

In [4]:
eclip.head()

Unnamed: 0,0,1
0,chr7:94285693-94285822,ACTTCGGTTGGTGTGTGTCGAAGAAACCTGACTGCGCCCTGAGGAG...
1,chr7:94292667-94292751,AAAAAAAGAAGGAGTGGAAGAGGGGGCCAGGATCCAGGCCTCCATC...
2,chr7:100780742-100780921,CATCTGGGACAAAACTGGAGATGCATCGGGAAAGAAGAAACTCCGA...
3,chr7:11022589-11022767,ACAATGAGGATGATGAAGATGAGGGAAGCGGGAGTGATGAAGACGA...
4,chr7:155493474-155493644,GGAAACATTGGAACTTCAAAAGGACATCAAAGAAGAATCAGATGAA...


Load RBNS binding affinities

In [5]:
four_mers = pd.read_csv("TRA2A/TRA2A_4.tsv", sep='\t')
five_mers = pd.read_csv("TRA2A/TRA2A_5.tsv", sep='\t',)
six_mers = pd.read_csv("TRA2A/TRA2A_6.tsv", sep='\t',)
four_mers

Unnamed: 0,[TRA2A],0 nM,5 nM,20 nM,80 nM,320 nM,1300 nM
0,AGAA,1.0179,1.9157,2.4378,2.8430,2.7831,2.5174
1,GAAG,0.9857,2.0081,2.6597,2.8270,2.6370,2.2785
2,AAGA,1.0130,1.8014,2.2892,2.5155,2.3421,2.0420
3,GAAA,0.9935,1.5545,1.8787,2.1811,2.2963,2.1849
4,TGAA,1.0309,1.5297,1.8455,2.0781,2.1111,1.9834
...,...,...,...,...,...,...,...
251,CGGG,0.9202,0.8796,0.8297,0.6913,0.6943,0.6643
252,CGTT,1.0362,0.8679,0.7962,0.6736,0.6339,0.6104
253,GGGT,0.9313,0.8662,0.7994,0.6708,0.6366,0.6112
254,GGTC,0.9870,0.8717,0.8178,0.6655,0.6135,0.5582


Load knockdown differential rna-seq data

In [6]:
# get other diff expr for subsetting to shared genes later
diff_exp_HNRNPC = pd.read_csv("HNRNPC/HNRNPC.tsv", sep='\t',)
diff_exp_SRSF9 = pd.read_csv("SRSF9/SRSF9.tsv", sep='\t',)
diff_exp_PCBP1 = pd.read_csv("PCBP1/PCBP1.tsv", sep='\t',)
diff_exp_TIA1 = pd.read_csv("TIA1/TIA1.tsv", sep='\t',)
diff_exp_TRA2A = pd.read_csv("TRA2A/TRA2A.tsv", sep='\t',)

## Generate positive pairs

EClip positive binding sites

In [7]:
in_vivo_pos_seqs = eclip[1].values

In [8]:
# filter for length and unique values
in_vivo_pos_seqs = [s for s in in_vivo_pos_seqs if len(s) >= 10]
in_vivo_pos_seqs = list(set(in_vivo_pos_seqs))

RBNS significant binding affinity

In [9]:
def get_signficant_sequences(df):
    num_cols = df.columns[1:]
    filtered = df[df[num_cols].gt(2).any(axis=1)]
    pairs = [
        (row[df.columns[0]], tuple(row[num_cols]))
        for _, row in filtered.iterrows()
    ]
    return pairs

In [10]:
pos_four_mers = get_signficant_sequences(four_mers)
pos_five_mers = get_signficant_sequences(five_mers)
pos_six_mers = get_signficant_sequences(six_mers)

In [11]:
len(pos_four_mers), len(pos_five_mers), len(pos_six_mers), 

(6, 25, 129)

In [12]:
in_vitro_pos_seqs = np.concatenate((pos_four_mers, pos_five_mers,pos_six_mers))
in_vitro_pos_seqs.shape



(160, 2)

In [13]:
unique_rbns = {}
for s, arr in in_vitro_pos_seqs:
    if s not in unique_rbns:
        unique_rbns[s] = arr

in_vitro_pos_seqs = list(unique_rbns.items())
len(in_vitro_pos_seqs)

160

Filter diff expr

In [14]:
# remove when gene expression is always 0 or if logfold change is inf/-inf
diff_exp_filtered_HNRNPC = diff_exp_HNRNPC[
    ~(
        ((diff_exp_HNRNPC["value_1"] == 0) & (diff_exp_HNRNPC["value_2"] == 0)) |
        (diff_exp_HNRNPC["log2(fold_change)"].isin([np.inf, -np.inf]))
    )
]

diff_exp_filtered_SRSF9 = diff_exp_SRSF9[
    ~(
        ((diff_exp_SRSF9["value_1"] == 0) & (diff_exp_SRSF9["value_2"] == 0)) |
        (diff_exp_SRSF9["log2(fold_change)"].isin([np.inf, -np.inf]))
    )
]

diff_exp_filtered_PCBP1 = diff_exp_PCBP1[
    ~(
        ((diff_exp_PCBP1["value_1"] == 0) & (diff_exp_PCBP1["value_2"] == 0)) |
        (diff_exp_PCBP1["log2(fold_change)"].isin([np.inf, -np.inf]))
    )
]

diff_exp_filtered_TIA1 = diff_exp_TIA1[
    ~(
        ((diff_exp_TIA1["value_1"] == 0) & (diff_exp_TIA1["value_2"] == 0)) |
        (diff_exp_TIA1["log2(fold_change)"].isin([np.inf, -np.inf]))
    )
]

diff_exp_filtered_TRA2A = diff_exp_TRA2A[
    ~(
        ((diff_exp_TRA2A["value_1"] == 0) & (diff_exp_TRA2A["value_2"] == 0)) |
        (diff_exp_TRA2A["log2(fold_change)"].isin([np.inf, -np.inf]))
    )
]

In [15]:
dfs = [diff_exp_filtered_HNRNPC, diff_exp_filtered_SRSF9, diff_exp_filtered_PCBP1, diff_exp_filtered_TIA1, diff_exp_filtered_TRA2A]
col = "gene_id"
shared_keys = set.intersection(*(set(df[col]) for df in dfs))

# 2. Subset the dataframe you want (example: df1)
diff_exp_filtered_TRA2A = diff_exp_filtered_TRA2A[diff_exp_filtered_TRA2A[col].isin(shared_keys)]

In [16]:
diff_exp_filtered_TRA2A.head()

Unnamed: 0,test_id,gene_id,gene,locus,sample_1,sample_2,status,value_1,value_2,log2(fold_change),test_stat,p_value,q_value,significant
7,XLOC_000008,XLOC_000008,OR4F29,chr1:317719-461954,TRA2A-BGHLV14-HepG2,Control,OK,0.65528,0.501422,-0.386086,-0.177847,0.5707,0.999131,no
11,XLOC_000012,XLOC_000012,MTND1P23,chr1:536815-660283,TRA2A-BGHLV14-HepG2,Control,OK,136.295,115.024,-0.244798,-0.100524,0.69615,0.999131,no
12,XLOC_000013,XLOC_000013,MTND2P28,chr1:536815-660283,TRA2A-BGHLV14-HepG2,Control,OK,711.276,543.353,-0.38852,-0.765375,0.27055,0.906203,no
13,XLOC_000014,XLOC_000014,hsa-mir-6723,chr1:536815-660283,TRA2A-BGHLV14-HepG2,Control,OK,642.946,732.844,0.188808,0.504172,0.4706,0.993477,no
14,XLOC_000015,XLOC_000015,RP5-857K21.7,chr1:536815-660283,TRA2A-BGHLV14-HepG2,Control,OK,348.262,359.108,0.044242,0.050949,0.94195,0.999131,no


In [17]:
diff_exp_filtered_TRA2A.shape

(17985, 14)

In [18]:
logfold_changes = diff_exp_filtered_TRA2A['log2(fold_change)'].values
logfold_changes

array([-0.386086 , -0.244798 , -0.38852  , ..., -0.434982 , -0.0532165,
        1.4967   ])

## Generate negative pairs

Shuffle eCLIP sequences

In [19]:
import random
def shuffle_by_pairs(seq):
    pairs = [seq[i:i+2] for i in range(0, len(seq), 2)]
    random.shuffle(pairs)
    return "".join(pairs)
in_vivo_neg_seqs = [shuffle_by_pairs(seq) for seq in eclip[1].values]
len(in_vivo_neg_seqs)

59110

In [20]:
# filter for length and unique values
in_vivo_neg_seqs = [s for s in in_vivo_neg_seqs if len(s) >= 10]
in_vivo_neg_seqs = list(set(in_vivo_neg_seqs))
in_vivo_neg_seqs = list(set(in_vivo_neg_seqs) - set(in_vivo_pos_seqs))
len(in_vivo_neg_seqs)

56372

Retrieve RBNS sequences that are not enriched

In [21]:
def get_insignficant_sequences(df):
    num_cols = df.columns[1:]
    filtered = df[df[num_cols].lt(2).any(axis=1)]
    pairs = [
        (row[df.columns[0]], tuple(row[num_cols]))
        for _, row in filtered.iterrows()
    ]
    return pairs

In [22]:
neg_four_mers = get_insignficant_sequences(four_mers)
neg_five_mers = get_insignficant_sequences(five_mers)
neg_six_mers = get_insignficant_sequences(six_mers)

In [23]:
neg_four_mers

[('AGAA', (1.0179, 1.9157, 2.4378, 2.843, 2.7831, 2.5174)),
 ('GAAG', (0.9857, 2.0081, 2.6597, 2.827, 2.637, 2.2785)),
 ('AAGA', (1.013, 1.8014, 2.2892, 2.5155, 2.3421, 2.042)),
 ('GAAA', (0.9935, 1.5545, 1.8787, 2.1811, 2.2963, 2.1849)),
 ('TGAA', (1.0309, 1.5297, 1.8455, 2.0781, 2.1111, 1.9834)),
 ('CGAA', (1.0152, 1.4349, 1.7138, 1.9392, 2.0369, 1.9997)),
 ('GAAT', (1.0332, 1.3938, 1.5998, 1.8862, 1.9037, 1.7997)),
 ('AAAA', (0.9796, 1.2833, 1.4431, 1.562, 1.683, 1.6166)),
 ('GAAC', (1.0107, 1.2331, 1.3571, 1.5423, 1.6013, 1.6389)),
 ('AATA', (0.9756, 1.1941, 1.2813, 1.476, 1.4739, 1.4009)),
 ('AACA', (0.9932, 1.1455, 1.2187, 1.4302, 1.4291, 1.4719)),
 ('AAGC', (0.9808, 1.1753, 1.2622, 1.3948, 1.4063, 1.4377)),
 ('ATGA', (1.0263, 1.1857, 1.2618, 1.3941, 1.3691, 1.3013)),
 ('AAAG', (0.9915, 1.1844, 1.2719, 1.3921, 1.3641, 1.3028)),
 ('CCGA', (1.0218, 1.1603, 1.2665, 1.3396, 1.4863, 1.5305)),
 ('AAAC', (0.9899, 1.1157, 1.1794, 1.3124, 1.3736, 1.4098)),
 ('CAGA', (0.999, 1.1357, 1.2058

In [24]:
in_vitro_neg_seqs_all = np.concatenate((neg_four_mers,neg_five_mers,neg_six_mers))
pos_rbns_seqs = {s for s, _ in in_vitro_pos_seqs}
in_vitro_neg_seqs = np.array([(s, v) for s, v in in_vitro_neg_seqs_all if s not in pos_rbns_seqs])
in_vitro_neg_seqs.shape

  in_vitro_neg_seqs = np.array([(s, v) for s, v in in_vitro_neg_seqs_all if s not in pos_rbns_seqs])


(5216, 2)

In [25]:
unique_rbns = {}
for s, arr in in_vitro_neg_seqs:
    if s not in unique_rbns:
        unique_rbns[s] = arr

in_vitro_neg_seqs = list(unique_rbns.items())
len(in_vitro_neg_seqs)

5216

Construct placeholder expression change vector

In [26]:
placeholder_logfold_changes = np.zeros(shape=logfold_changes.shape)

## Split into train/val/test

In [27]:
# number of total samples per class
combinations_size = 500000

Positive samples

In [28]:
from itertools import product

random.shuffle(in_vivo_pos_seqs)
random.shuffle(in_vitro_pos_seqs)

n_eclip = len(in_vivo_pos_seqs)
n_rbns  = len(in_vitro_pos_seqs)

# Split indices
train_eclip = in_vivo_pos_seqs[:int(0.7*n_eclip)]
val_eclip   = in_vivo_pos_seqs[int(0.7*n_eclip):int(0.85*n_eclip)]
test_eclip  = in_vivo_pos_seqs[int(0.85*n_eclip):]

train_rbns = in_vitro_pos_seqs[:int(0.7*n_rbns)]
val_rbns   = in_vitro_pos_seqs[int(0.7*n_rbns):int(0.85*n_rbns)]
test_rbns  = in_vitro_pos_seqs[int(0.85*n_rbns):]

pos_train_samples = [
    (random.choice(train_eclip), random.choice(train_rbns))
    for _ in range(combinations_size)
]
pos_val_samples = [
    (random.choice(val_eclip), random.choice(val_rbns))
    for _ in range(combinations_size)
]
pos_test_samples = [
    (random.choice(test_eclip), random.choice(test_rbns))
    for _ in range(combinations_size)
]

In [29]:
pos_train_eclips = set(eclip for eclip, _ in pos_train_samples)
pos_val_eclips   = set(eclip for eclip, _ in pos_val_samples)
pos_test_eclips  = set(eclip for eclip, _ in pos_test_samples)

assert pos_train_eclips.isdisjoint(pos_val_eclips), "Overlap between train and val eCLIP sequences!"
assert pos_train_eclips.isdisjoint(pos_test_eclips), "Overlap between train and test eCLIP sequences!"
assert pos_val_eclips.isdisjoint(pos_test_eclips), "Overlap between val and test eCLIP sequences!"
print("eCLIP sequences are disjoint across splits")

eCLIP sequences are disjoint across splits


In [30]:
pos_train_rbns = set(rbns for _, (rbns, _) in pos_train_samples)
pos_val_rbns   = set(rbns for _, (rbns, _) in pos_val_samples)
pos_test_rbns  = set(rbns for _, (rbns, _) in pos_test_samples)

assert pos_train_rbns.isdisjoint(val_rbns), "Overlap between train and val RBNS sequences!"
assert pos_val_rbns.isdisjoint(test_rbns), "Overlap between train and test RBNS sequences!"
assert pos_test_rbns.isdisjoint(test_rbns), "Overlap between val and test RBNS sequences!"
print("RBNS sequences are disjoint across splits")

RBNS sequences are disjoint across splits


In [31]:
pos_train_label = np.ones(len(pos_train_samples))
pos_val_label = np.ones(len(pos_val_samples))
pos_test_label = np.ones(len(pos_test_samples))

Negative samples

In [32]:
random.shuffle(in_vivo_neg_seqs)
random.shuffle(in_vitro_neg_seqs)

n_eclip = len(in_vivo_neg_seqs)
n_rbns  = len(in_vitro_neg_seqs)

# Split indices
train_eclip = in_vivo_neg_seqs[:int(0.7*n_eclip)]
val_eclip   = in_vivo_neg_seqs[int(0.7*n_eclip):int(0.85*n_eclip)]
test_eclip  = in_vivo_neg_seqs[int(0.85*n_eclip):]

train_rbns = in_vitro_neg_seqs[:int(0.7*n_rbns)]
val_rbns   = in_vitro_neg_seqs[int(0.7*n_rbns):int(0.85*n_rbns)]
test_rbns  = in_vitro_neg_seqs[int(0.85*n_rbns):]

neg_train_samples = [
    (random.choice(train_eclip), random.choice(train_rbns))
    for _ in range(combinations_size)
]
neg_val_samples = [
    (random.choice(val_eclip), random.choice(val_rbns))
    for _ in range(combinations_size)
]
neg_test_samples = [
    (random.choice(test_eclip), random.choice(test_rbns))
    for _ in range(combinations_size)
]

In [33]:
neg_train_label = np.zeros(len(neg_train_samples))
neg_val_label = np.zeros(len(neg_val_samples))
neg_test_label = np.zeros(len(neg_test_samples))

In [34]:
train_eclips = set(eclip for eclip, _ in neg_train_samples)
val_eclips   = set(eclip for eclip, _ in neg_val_samples)
test_eclips  = set(eclip for eclip, _ in neg_test_samples)

assert train_eclips.isdisjoint(val_eclips), "Overlap between train and val eCLIP sequences!"
assert train_eclips.isdisjoint(test_eclips), "Overlap between train and test eCLIP sequences!"
assert val_eclips.isdisjoint(test_eclips), "Overlap between val and test eCLIP sequences!"
print("eCLIP sequences are disjoint across splits")

eCLIP sequences are disjoint across splits


In [35]:
train_rbns = set(rbns for _, (rbns, _) in neg_train_samples)
val_rbns   = set(rbns for _, (rbns, _) in neg_val_samples)
test_rbns  = set(rbns for _, (rbns, _) in neg_test_samples)

assert train_rbns.isdisjoint(val_rbns), "Overlap between train and val RBNS sequences!"
assert train_rbns.isdisjoint(test_rbns), "Overlap between train and test RBNS sequences!"
assert val_rbns.isdisjoint(test_rbns), "Overlap between val and test RBNS sequences!"
print("RBNS sequences are disjoint across splits")

RBNS sequences are disjoint across splits


In [36]:
# check across pos and neg samples
assert train_eclips.isdisjoint(pos_train_eclips), "Overlap between train and val eCLIP sequences!"
assert val_eclips.isdisjoint(pos_val_eclips), "Overlap between train and test eCLIP sequences!"
assert test_eclips.isdisjoint(pos_test_eclips), "Overlap between val and test eCLIP sequences!"

assert train_rbns.isdisjoint(pos_train_rbns), "Overlap between train and val RBNS sequences!"
assert val_rbns.isdisjoint(pos_val_rbns), "Overlap between train and test RBNS sequences!"
assert test_rbns.isdisjoint(pos_test_rbns), "Overlap between val and test RBNS sequences!"

Combine for full splits

In [37]:
train_data = np.concatenate((pos_train_samples, neg_train_samples))
val_data = np.concatenate((pos_val_samples, neg_val_samples))
test_data = np.concatenate((pos_test_samples, neg_test_samples))

train_labels = np.concatenate((pos_train_label, neg_train_label))
val_labels = np.concatenate((pos_val_label, neg_val_label))
test_labels = np.concatenate((pos_test_label, neg_test_label))

In [38]:
# get positive/negative label values
positive_value = ("TRA2A", logfold_changes)
negative_value = ("Not TRA2A", placeholder_logfold_changes)

## Save as dictionaries

Save as pickle files since there are nested data structures

In [39]:
import pickle

In [40]:
'''
Saved data format for each split:
- Input
    - List containing:
        - eCLIP sequence: text
        - Tuple of RBNS sequence and binding affinity: (text, vector)
- Output
    - List containing:
        - RBP label: text
        - Logfold gene expression change: vector
        
Labels save separately to save storage:
- List containing:
    - RBP label: text
    - Logfold gene expression change: vector
'''

'\nSaved data format for each split:\n- Input\n    - List containing:\n        - eCLIP sequence: text\n        - Tuple of RBNS sequence and binding affinity: (text, vector)\n- Output\n    - List containing:\n        - RBP label: text\n        - Logfold gene expression change: vector\n        \nLabels save separately to save storage:\n- List containing:\n    - RBP label: text\n    - Logfold gene expression change: vector\n'

In [41]:
# Save train
with open("TRA2A/train_split.pkl", "wb") as f:
    pickle.dump((train_data, train_labels), f)

In [42]:
# Save val
with open("TRA2A/val_split.pkl", "wb") as f:
    pickle.dump((val_data, val_labels), f)

In [43]:
# Save test
with open("TRA2A/test_split.pkl", "wb") as f:
    pickle.dump((test_data, test_labels), f)

In [44]:
# Save positive label
with open("TRA2A/TRA2A_positive_label.pkl", "wb") as f:
    pickle.dump(positive_value, f)

In [45]:
# Save negative label
with open("TRA2A/TRA2A_negative_label.pkl", "wb") as f:
    pickle.dump(negative_value, f)