In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd

## Read data

Load eCLIP-seq sequences

In [3]:
eclip = pd.read_csv("TIA1/TIA1_eclip.txt", sep='\t', header=None)

In [4]:
eclip.head()

Unnamed: 0,0,1
0,chr7:45932874-45932893,CAGTAAAAACTTGTACTAT
1,chr7:45932893-45932940,GTTAATAACTTGTCCTATGTCAATTTGTATATCATGAAACACTTCTC
2,chr7:94296528-94296614,TAATTCTTTGCTACTGCCATTGACCCTGCTGCAGGATTTGTGTCAT...
3,chr7:45932862-45932874,ATTTTTTCTACA
4,chr7:127231645-127231726,CTGTTTTGGGTGTACTCTAGGGGCCAGGTTGGGAGGGGGAAGGTGA...


Load RBNS binding affinities

In [5]:
four_mers = pd.read_csv("TIA1/TIA1_4.tsv", sep='\t')
five_mers = pd.read_csv("TIA1/TIA1_5.tsv", sep='\t',)
six_mers = pd.read_csv("TIA1/TIA1_6.tsv", sep='\t',)
four_mers

Unnamed: 0,[TIA1],5 nM,20 nM,80 nM,320 nM,1300 nM
0,TTTT,1.1589,1.2097,1.0931,1.0908,1.5628
1,TTTA,1.1272,1.0755,1.0910,1.0877,1.2518
2,ATTT,1.1269,1.1175,1.0903,1.0880,1.2115
3,TTAT,1.1057,1.0367,1.0797,1.0719,1.2024
4,TATT,1.1153,1.0524,1.0917,1.0839,1.2018
...,...,...,...,...,...,...
251,TCGT,0.9579,0.9496,0.9480,0.9331,0.8969
252,TCGG,0.9434,0.9667,0.9405,0.9331,0.8898
253,CGAG,0.9099,0.9385,0.9129,0.8956,0.8841
254,CGTC,0.9235,0.9012,0.9271,0.9114,0.8757


Load knockdown differential rna-seq data

In [6]:
# get other diff expr for subsetting to shared genes later
diff_exp_HNRNPC = pd.read_csv("HNRNPC/HNRNPC.tsv", sep='\t',)
diff_exp_SRSF9 = pd.read_csv("SRSF9/SRSF9.tsv", sep='\t',)
diff_exp_PCBP1 = pd.read_csv("PCBP1/PCBP1.tsv", sep='\t',)
diff_exp_TIA1 = pd.read_csv("TIA1/TIA1.tsv", sep='\t',)
diff_exp_TRA2A = pd.read_csv("TRA2A/TRA2A.tsv", sep='\t',)

## Generate positive pairs

EClip positive binding sites

In [7]:
in_vivo_pos_seqs = eclip[1].values

In [8]:
# filter for length and unique values
in_vivo_pos_seqs = [s for s in in_vivo_pos_seqs if len(s) >= 10]
in_vivo_pos_seqs = list(set(in_vivo_pos_seqs))

RBNS significant binding affinity

In [9]:
def get_signficant_sequences(df):
    num_cols = df.columns[1:]
    filtered = df[df[num_cols].gt(2).any(axis=1)]
    pairs = [
        (row[df.columns[0]], tuple(row[num_cols]))
        for _, row in filtered.iterrows()
    ]
    return pairs

In [10]:
pos_four_mers = get_signficant_sequences(four_mers)
pos_five_mers = get_signficant_sequences(five_mers)
pos_six_mers = get_signficant_sequences(six_mers)

In [11]:
len(pos_four_mers), len(pos_five_mers), len(pos_six_mers), 

(0, 1, 7)

In [12]:
in_vitro_pos_seqs = np.concatenate((pos_five_mers,pos_six_mers))
in_vitro_pos_seqs.shape



(8, 2)

In [13]:
unique_rbns = {}
for s, arr in in_vitro_pos_seqs:
    if s not in unique_rbns:
        unique_rbns[s] = arr

in_vitro_pos_seqs = list(unique_rbns.items())
len(in_vitro_pos_seqs)

8

Filter diff expr

In [14]:
# remove when gene expression is always 0 or if logfold change is inf/-inf
diff_exp_filtered_HNRNPC = diff_exp_HNRNPC[
    ~(
        ((diff_exp_HNRNPC["value_1"] == 0) & (diff_exp_HNRNPC["value_2"] == 0)) |
        (diff_exp_HNRNPC["log2(fold_change)"].isin([np.inf, -np.inf]))
    )
]

diff_exp_filtered_SRSF9 = diff_exp_SRSF9[
    ~(
        ((diff_exp_SRSF9["value_1"] == 0) & (diff_exp_SRSF9["value_2"] == 0)) |
        (diff_exp_SRSF9["log2(fold_change)"].isin([np.inf, -np.inf]))
    )
]

diff_exp_filtered_PCBP1 = diff_exp_PCBP1[
    ~(
        ((diff_exp_PCBP1["value_1"] == 0) & (diff_exp_PCBP1["value_2"] == 0)) |
        (diff_exp_PCBP1["log2(fold_change)"].isin([np.inf, -np.inf]))
    )
]

diff_exp_filtered_TIA1 = diff_exp_TIA1[
    ~(
        ((diff_exp_TIA1["value_1"] == 0) & (diff_exp_TIA1["value_2"] == 0)) |
        (diff_exp_TIA1["log2(fold_change)"].isin([np.inf, -np.inf]))
    )
]

diff_exp_filtered_TRA2A = diff_exp_TRA2A[
    ~(
        ((diff_exp_TRA2A["value_1"] == 0) & (diff_exp_TRA2A["value_2"] == 0)) |
        (diff_exp_TRA2A["log2(fold_change)"].isin([np.inf, -np.inf]))
    )
]

In [15]:
dfs = [diff_exp_filtered_HNRNPC, diff_exp_filtered_SRSF9, diff_exp_filtered_PCBP1, diff_exp_filtered_TIA1, diff_exp_filtered_TRA2A]
col = "gene_id"
shared_keys = set.intersection(*(set(df[col]) for df in dfs))

# 2. Subset the dataframe you want (example: df1)
diff_exp_filtered_TIA1 = diff_exp_filtered_TIA1[diff_exp_filtered_TIA1[col].isin(shared_keys)]

In [16]:
diff_exp_filtered_TIA1.head()

Unnamed: 0,test_id,gene_id,gene,locus,sample_1,sample_2,status,value_1,value_2,log2(fold_change),test_stat,p_value,q_value,significant
7,XLOC_000008,XLOC_000008,OR4F29,chr1:317719-461954,TIA1-BGHLV12-HepG2,Control,OK,0.627869,0.250779,-1.32404,-0.52724,0.02875,0.127831,no
11,XLOC_000012,XLOC_000012,MTND1P23,chr1:536815-660283,TIA1-BGHLV12-HepG2,Control,OK,246.719,191.17,-0.368011,-0.180525,0.52255,0.761433,no
12,XLOC_000013,XLOC_000013,MTND2P28,chr1:536815-660283,TIA1-BGHLV12-HepG2,Control,OK,610.343,659.356,0.111438,0.20462,0.77205,0.905931,no
13,XLOC_000014,XLOC_000014,hsa-mir-6723,chr1:536815-660283,TIA1-BGHLV12-HepG2,Control,OK,1268.43,1243.25,-0.028933,-0.095724,0.8912,0.96114,no
14,XLOC_000015,XLOC_000015,RP5-857K21.7,chr1:536815-660283,TIA1-BGHLV12-HepG2,Control,OK,647.872,642.983,-0.010927,-0.015454,0.9818,0.99245,no


In [17]:
diff_exp_filtered_TIA1.shape

(17985, 14)

In [18]:
logfold_changes = diff_exp_filtered_TIA1['log2(fold_change)'].values
logfold_changes

array([-1.32404 , -0.368011,  0.111438, ..., -0.601615, -0.659382,
       -1.69771 ])

## Generate negative pairs

Shuffle eCLIP sequences

In [19]:
import random
def shuffle_by_pairs(seq):
    pairs = [seq[i:i+2] for i in range(0, len(seq), 2)]
    random.shuffle(pairs)
    return "".join(pairs)
in_vivo_neg_seqs = [shuffle_by_pairs(seq) for seq in eclip[1].values]
len(in_vivo_neg_seqs)

239224

In [20]:
# filter for length and unique values
in_vivo_neg_seqs = [s for s in in_vivo_neg_seqs if len(s) >= 10]
in_vivo_neg_seqs = list(set(in_vivo_neg_seqs))
len(in_vivo_neg_seqs)

233638

Retrieve RBNS sequences that are not enriched

In [21]:
def get_insignficant_sequences(df):
    num_cols = df.columns[1:]
    filtered = df[df[num_cols].lt(2).any(axis=1)]
    pairs = [
        (row[df.columns[0]], tuple(row[num_cols]))
        for _, row in filtered.iterrows()
    ]
    return pairs

In [22]:
neg_four_mers = get_insignficant_sequences(four_mers)
neg_five_mers = get_insignficant_sequences(five_mers)
neg_six_mers = get_insignficant_sequences(six_mers)

In [23]:
neg_four_mers

[('TTTT', (1.1589, 1.2097, 1.0931, 1.0908, 1.5628)),
 ('TTTA', (1.1272, 1.0755, 1.091, 1.0877, 1.2518)),
 ('ATTT', (1.1269, 1.1175, 1.0903, 1.088, 1.2115)),
 ('TTAT', (1.1057, 1.0367, 1.0797, 1.0719, 1.2024)),
 ('TATT', (1.1153, 1.0524, 1.0917, 1.0839, 1.2018)),
 ('CTTT', (1.0623, 1.076, 1.0189, 1.0049, 1.154)),
 ('TCTT', (1.0491, 1.0683, 1.0185, 1.0039, 1.1503)),
 ('AAAA', (1.1711, 1.2103, 1.1931, 1.2454, 1.1352)),
 ('TTAA', (1.1236, 1.0608, 1.11, 1.1121, 1.1277)),
 ('TATA', (1.1141, 0.9948, 1.1193, 1.1193, 1.1071)),
 ('TTCT', (1.0577, 1.0894, 1.0281, 1.0135, 1.1071)),
 ('TAAA', (1.1406, 1.0761, 1.15, 1.1714, 1.1029)),
 ('ATTA', (1.1016, 1.0233, 1.0959, 1.0949, 1.1009)),
 ('TAAT', (1.1089, 1.0295, 1.106, 1.1092, 1.0952)),
 ('TTTC', (1.0598, 1.0933, 1.0245, 1.0076, 1.092)),
 ('AATT', (1.1171, 1.093, 1.1048, 1.1057, 1.0919)),
 ('CTTA', (1.0403, 0.9887, 1.0234, 1.0169, 1.0853)),
 ('AATA', (1.1129, 1.0157, 1.1333, 1.1501, 1.0822)),
 ('ATAA', (1.1107, 1.0169, 1.1303, 1.145, 1.08)),
 ('CTCT

In [24]:
in_vitro_neg_seqs = np.concatenate((neg_four_mers,neg_five_mers,neg_six_mers))
in_vitro_neg_seqs.shape



(5376, 2)

In [25]:
unique_rbns = {}
for s, arr in in_vitro_neg_seqs:
    if s not in unique_rbns:
        unique_rbns[s] = arr

in_vitro_neg_seqs = list(unique_rbns.items())
len(in_vitro_neg_seqs)

5376

Construct placeholder expression change vector

In [26]:
placeholder_logfold_changes = np.zeros(shape=logfold_changes.shape)

## Split into train/val/test

In [27]:
# number of total samples per class
combinations_size = 100000

Positive samples

In [28]:
from itertools import product

random.shuffle(in_vivo_pos_seqs)
random.shuffle(in_vitro_pos_seqs)

n_eclip = len(in_vivo_pos_seqs)
n_rbns  = len(in_vitro_pos_seqs)

# Split indices
train_eclip = in_vivo_pos_seqs[:int(0.7*n_eclip)]
val_eclip   = in_vivo_pos_seqs[int(0.7*n_eclip):int(0.85*n_eclip)]
test_eclip  = in_vivo_pos_seqs[int(0.85*n_eclip):]

train_rbns = in_vitro_pos_seqs[:int(0.7*n_rbns)]
val_rbns   = in_vitro_pos_seqs[int(0.7*n_rbns):int(0.85*n_rbns)]
test_rbns  = in_vitro_pos_seqs[int(0.85*n_rbns):]

pos_train_samples = [
    (random.choice(train_eclip), random.choice(train_rbns))
    for _ in range(combinations_size)
]
pos_val_samples = [
    (random.choice(val_eclip), random.choice(val_rbns))
    for _ in range(combinations_size)
]
pos_test_samples = [
    (random.choice(test_eclip), random.choice(test_rbns))
    for _ in range(combinations_size)
]

In [29]:
train_eclips = set(eclip for eclip, _ in pos_train_samples)
val_eclips   = set(eclip for eclip, _ in pos_val_samples)
test_eclips  = set(eclip for eclip, _ in pos_test_samples)

assert train_eclips.isdisjoint(val_eclips), "Overlap between train and val eCLIP sequences!"
assert train_eclips.isdisjoint(test_eclips), "Overlap between train and test eCLIP sequences!"
assert val_eclips.isdisjoint(test_eclips), "Overlap between val and test eCLIP sequences!"
print("eCLIP sequences are disjoint across splits")

eCLIP sequences are disjoint across splits


In [30]:
train_rbns = set(rbns for _, (rbns, _) in pos_train_samples)
val_rbns   = set(rbns for _, (rbns, _) in pos_val_samples)
test_rbns  = set(rbns for _, (rbns, _) in pos_test_samples)

assert train_rbns.isdisjoint(val_rbns), "Overlap between train and val RBNS sequences!"
assert train_rbns.isdisjoint(test_rbns), "Overlap between train and test RBNS sequences!"
assert val_rbns.isdisjoint(test_rbns), "Overlap between val and test RBNS sequences!"
print("RBNS sequences are disjoint across splits")

RBNS sequences are disjoint across splits


In [31]:
pos_train_label = np.ones(len(pos_train_samples))
pos_val_label = np.ones(len(pos_val_samples))
pos_test_label = np.ones(len(pos_test_samples))

Negative samples

In [32]:
random.shuffle(in_vivo_neg_seqs)
random.shuffle(in_vitro_neg_seqs)

n_eclip = len(in_vivo_neg_seqs)
n_rbns  = len(in_vitro_neg_seqs)

# Split indices
train_eclip = in_vivo_neg_seqs[:int(0.7*n_eclip)]
val_eclip   = in_vivo_neg_seqs[int(0.7*n_eclip):int(0.85*n_eclip)]
test_eclip  = in_vivo_neg_seqs[int(0.85*n_eclip):]

train_rbns = in_vitro_neg_seqs[:int(0.7*n_rbns)]
val_rbns   = in_vitro_neg_seqs[int(0.7*n_rbns):int(0.85*n_rbns)]
test_rbns  = in_vitro_neg_seqs[int(0.85*n_rbns):]

neg_train_samples = [
    (random.choice(train_eclip), random.choice(train_rbns))
    for _ in range(combinations_size)
]
neg_val_samples = [
    (random.choice(val_eclip), random.choice(val_rbns))
    for _ in range(combinations_size)
]
neg_test_samples = [
    (random.choice(test_eclip), random.choice(test_rbns))
    for _ in range(combinations_size)
]

In [33]:
neg_train_label = np.zeros(len(neg_train_samples))
neg_val_label = np.zeros(len(neg_val_samples))
neg_test_label = np.zeros(len(neg_test_samples))

Combine for full splits

In [34]:
train_data = np.concatenate((pos_train_samples, neg_train_samples))
val_data = np.concatenate((pos_val_samples, neg_val_samples))
test_data = np.concatenate((pos_test_samples, neg_test_samples))

train_labels = np.concatenate((pos_train_label, neg_train_label))
val_labels = np.concatenate((pos_val_label, neg_val_label))
test_labels = np.concatenate((pos_test_label, neg_test_label))

In [35]:
# get positive/negative label values
positive_value = ("TIA1", logfold_changes)
negative_value = ("Not TIA1", placeholder_logfold_changes)

## Save as dictionaries

Save as pickle files since there are nested data structures

In [36]:
import pickle

In [37]:
'''
Saved data format for each split:
- Input
    - List containing:
        - eCLIP sequence: text
        - Tuple of RBNS sequence and binding affinity: (text, vector)
- Output
    - List containing:
        - RBP label: text
        - Logfold gene expression change: vector
        
Labels save separately to save storage:
- List containing:
    - RBP label: text
    - Logfold gene expression change: vector
'''

'\nSaved data format for each split:\n- Input\n    - List containing:\n        - eCLIP sequence: text\n        - Tuple of RBNS sequence and binding affinity: (text, vector)\n- Output\n    - List containing:\n        - RBP label: text\n        - Logfold gene expression change: vector\n        \nLabels save separately to save storage:\n- List containing:\n    - RBP label: text\n    - Logfold gene expression change: vector\n'

In [38]:
# Save train
with open("TIA1/train_split.pkl", "wb") as f:
    pickle.dump((train_data, train_labels), f)

In [39]:
# Save val
with open("TIA1/val_split.pkl", "wb") as f:
    pickle.dump((val_data, val_labels), f)

In [40]:
# Save test
with open("TIA1/test_split.pkl", "wb") as f:
    pickle.dump((test_data, test_labels), f)

In [41]:
# Save positive label
with open("TIA1/TIA1_positive_label.pkl", "wb") as f:
    pickle.dump(positive_value, f)

In [42]:
# Save negative label
with open("TIA1/TIA1_negative_label.pkl", "wb") as f:
    pickle.dump(negative_value, f)