In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd

## Read data

Load eCLIP-seq sequences

In [3]:
eclip = pd.read_csv("PCBP1/PCBP1_eclip.txt", sep='\t', header=None)

In [4]:
eclip.head()

Unnamed: 0,0,1
0,chr7:45928349-45928392,CTCCGCCGAGAAGCTCGCGCTCTGCCCGCCGGTGTCCGCCTCG
1,chr7:45928417-45928492,GCTGCGGCTGTTGCCCGATGTGCGCCCTGCCTCTGGGCGCCGCGTG...
2,chr7:45928229-45928349,TGTCTGCTGCTCGCGCCTGGAGATGTCAGAGGTCCCCGTTGCTCGC...
3,chr7:45928392-45928417,TGCTCGGAGGTCACCCGGTCCGCCG
4,chr7:45928531-45928577,TGCACGCCCTCACCCGCGGCCAAGGCGCCTGCGTGCAGGAGTCTGA


Load RBNS binding affinities

In [5]:
four_mers = pd.read_csv("PCBP1/PCBP1_4.tsv", sep='\t')
five_mers = pd.read_csv("PCBP1/PCBP1_5.tsv", sep='\t',)
six_mers = pd.read_csv("PCBP1/PCBP1_6.tsv", sep='\t',)
four_mers

Unnamed: 0,[PCBP1],5 nM,20 nM,80 nM,320 nM,1300 nM
0,CCCC,1.1704,1.2545,2.2991,3.6549,3.5584
1,GCCC,1.5532,1.5971,2.4116,3.4126,3.5377
2,CCCA,1.0617,1.1591,1.8562,2.8203,2.9079
3,CCCG,1.5667,1.5468,2.0563,2.6961,2.7707
4,ACCC,1.0986,1.1723,1.7423,2.5136,2.6790
...,...,...,...,...,...,...
251,GAAA,0.9294,0.9057,0.8140,0.6666,0.5976
252,TAGA,0.8989,0.8808,0.8041,0.6606,0.5888
253,TGAA,0.9222,0.9076,0.8075,0.6521,0.5876
254,GAAT,0.9098,0.8883,0.7917,0.6393,0.5715


Load knockdown differential rna-seq data

In [6]:
# get other diff expr for subsetting to shared genes later
diff_exp_HNRNPC = pd.read_csv("HNRNPC/HNRNPC.tsv", sep='\t',)
diff_exp_SRSF9 = pd.read_csv("SRSF9/SRSF9.tsv", sep='\t',)
diff_exp_PCBP1 = pd.read_csv("PCBP1/PCBP1.tsv", sep='\t',)
diff_exp_TIA1 = pd.read_csv("TIA1/TIA1.tsv", sep='\t',)
diff_exp_TRA2A = pd.read_csv("TRA2A/TRA2A.tsv", sep='\t',)

## Generate positive pairs

EClip positive binding sites

In [7]:
in_vivo_pos_seqs = eclip[1].values

In [8]:
# filter for length and unique values
in_vivo_pos_seqs = [s for s in in_vivo_pos_seqs if len(s) >= 10]
in_vivo_pos_seqs = list(set(in_vivo_pos_seqs))

RBNS significant binding affinity

In [9]:
def get_signficant_sequences(df):
    num_cols = df.columns[1:]
    filtered = df[df[num_cols].gt(2).any(axis=1)]
    pairs = [
        (row[df.columns[0]], tuple(row[num_cols]))
        for _, row in filtered.iterrows()
    ]
    return pairs

In [10]:
pos_four_mers = get_signficant_sequences(four_mers)
pos_five_mers = get_signficant_sequences(five_mers)
pos_six_mers = get_signficant_sequences(six_mers)

In [11]:
len(pos_four_mers), len(pos_five_mers), len(pos_six_mers), 

(13, 70, 340)

In [12]:
in_vitro_pos_seqs = np.concatenate((pos_four_mers, pos_five_mers,pos_six_mers))
in_vitro_pos_seqs.shape



(423, 2)

In [13]:
unique_rbns = {}
for s, arr in in_vitro_pos_seqs:
    if s not in unique_rbns:
        unique_rbns[s] = arr

in_vitro_pos_seqs = list(unique_rbns.items())
len(in_vitro_pos_seqs)

423

Filter diff expr

In [14]:
# remove when gene expression is always 0 or if logfold change is inf/-inf
diff_exp_filtered_HNRNPC = diff_exp_HNRNPC[
    ~(
        ((diff_exp_HNRNPC["value_1"] == 0) & (diff_exp_HNRNPC["value_2"] == 0)) |
        (diff_exp_HNRNPC["log2(fold_change)"].isin([np.inf, -np.inf]))
    )
]

diff_exp_filtered_SRSF9 = diff_exp_SRSF9[
    ~(
        ((diff_exp_SRSF9["value_1"] == 0) & (diff_exp_SRSF9["value_2"] == 0)) |
        (diff_exp_SRSF9["log2(fold_change)"].isin([np.inf, -np.inf]))
    )
]

diff_exp_filtered_PCBP1 = diff_exp_PCBP1[
    ~(
        ((diff_exp_PCBP1["value_1"] == 0) & (diff_exp_PCBP1["value_2"] == 0)) |
        (diff_exp_PCBP1["log2(fold_change)"].isin([np.inf, -np.inf]))
    )
]

diff_exp_filtered_TIA1 = diff_exp_TIA1[
    ~(
        ((diff_exp_TIA1["value_1"] == 0) & (diff_exp_TIA1["value_2"] == 0)) |
        (diff_exp_TIA1["log2(fold_change)"].isin([np.inf, -np.inf]))
    )
]

diff_exp_filtered_TRA2A = diff_exp_TRA2A[
    ~(
        ((diff_exp_TRA2A["value_1"] == 0) & (diff_exp_TRA2A["value_2"] == 0)) |
        (diff_exp_TRA2A["log2(fold_change)"].isin([np.inf, -np.inf]))
    )
]

In [15]:
dfs = [diff_exp_filtered_HNRNPC, diff_exp_filtered_SRSF9, diff_exp_filtered_PCBP1, diff_exp_filtered_TIA1, diff_exp_filtered_TRA2A]
col = "gene_id"
shared_keys = set.intersection(*(set(df[col]) for df in dfs))

# 2. Subset the dataframe you want (example: df1)
diff_exp_filtered_PCBP1 = diff_exp_filtered_PCBP1[diff_exp_filtered_PCBP1[col].isin(shared_keys)]

In [16]:
diff_exp_filtered_PCBP1.head()

Unnamed: 0,test_id,gene_id,gene,locus,sample_1,sample_2,status,value_1,value_2,log2(fold_change),test_stat,p_value,q_value,significant
7,XLOC_000008,XLOC_000008,OR4F29,chr1:317719-461954,PCBP1-BGHLV12-HepG2,Control,OK,0.644145,0.256604,-1.32784,-0.570958,0.0281,0.088555,no
11,XLOC_000012,XLOC_000012,MTND1P23,chr1:536815-660283,PCBP1-BGHLV12-HepG2,Control,OK,260.081,195.625,-0.410873,-0.205877,0.70775,0.836839,no
12,XLOC_000013,XLOC_000013,MTND2P28,chr1:536815-660283,PCBP1-BGHLV12-HepG2,Control,OK,763.174,674.709,-0.177747,-0.351151,0.61965,0.776942,no
13,XLOC_000014,XLOC_000014,hsa-mir-6723,chr1:536815-660283,PCBP1-BGHLV12-HepG2,Control,OK,1402.76,1272.22,-0.140916,-0.478016,0.49485,0.68227,no
14,XLOC_000015,XLOC_000015,RP5-857K21.7,chr1:536815-660283,PCBP1-BGHLV12-HepG2,Control,OK,829.868,657.946,-0.334911,-0.503453,0.46355,0.65665,no


In [17]:
diff_exp_filtered_PCBP1.shape

(17985, 14)

In [18]:
logfold_changes = diff_exp_filtered_PCBP1['log2(fold_change)'].values
logfold_changes

array([-1.32784   , -0.410873  , -0.177747  , ..., -0.00184541,
       -0.781436  ,  1.14329   ])

## Generate negative pairs

Shuffle eCLIP sequences

In [19]:
import random
def shuffle_by_pairs(seq):
    pairs = [seq[i:i+2] for i in range(0, len(seq), 2)]
    random.shuffle(pairs)
    return "".join(pairs)
in_vivo_neg_seqs = [shuffle_by_pairs(seq) for seq in eclip[1].values]
len(in_vivo_neg_seqs)

32074

In [20]:
# filter for length and unique values
in_vivo_neg_seqs = [s for s in in_vivo_neg_seqs if len(s) >= 10]
in_vivo_neg_seqs = list(set(in_vivo_neg_seqs))
in_vivo_neg_seqs = list(set(in_vivo_neg_seqs) - set(in_vivo_pos_seqs))
len(in_vivo_neg_seqs)

30452

Retrieve RBNS sequences that are not enriched

In [21]:
def get_insignficant_sequences(df):
    num_cols = df.columns[1:]
    filtered = df[df[num_cols].lt(2).any(axis=1)]
    pairs = [
        (row[df.columns[0]], tuple(row[num_cols]))
        for _, row in filtered.iterrows()
    ]
    return pairs

In [22]:
neg_four_mers = get_insignficant_sequences(four_mers)
neg_five_mers = get_insignficant_sequences(five_mers)
neg_six_mers = get_insignficant_sequences(six_mers)

In [23]:
neg_four_mers

[('CCCC', (1.1704, 1.2545, 2.2991, 3.6549, 3.5584)),
 ('GCCC', (1.5532, 1.5971, 2.4116, 3.4126, 3.5377)),
 ('CCCA', (1.0617, 1.1591, 1.8562, 2.8203, 2.9079)),
 ('CCCG', (1.5667, 1.5468, 2.0563, 2.6961, 2.7707)),
 ('ACCC', (1.0986, 1.1723, 1.7423, 2.5136, 2.679)),
 ('CCCT', (1.0137, 1.077, 1.6233, 2.408, 2.5261)),
 ('TCCC', (0.9922, 1.0517, 1.5783, 2.3736, 2.4432)),
 ('CGCC', (1.4975, 1.4734, 1.7883, 2.2296, 2.4665)),
 ('CCGC', (1.4534, 1.4531, 1.6498, 1.9877, 2.1969)),
 ('AGCC', (1.2636, 1.2923, 1.5775, 1.9674, 2.1408)),
 ('TGCC', (1.2428, 1.2825, 1.5485, 1.9047, 2.107)),
 ('CCAA', (0.952, 1.0202, 1.3377, 1.837, 1.9983)),
 ('CCTC', (0.9722, 1.0262, 1.3282, 1.8297, 2.0515)),
 ('CCTG', (1.2009, 1.2293, 1.4568, 1.7762, 1.8481)),
 ('CCTA', (0.92, 0.9846, 1.2776, 1.7716, 1.9853)),
 ('ATCC', (0.9284, 0.9896, 1.2851, 1.7657, 1.8976)),
 ('GTCC', (1.267, 1.3153, 1.4899, 1.7634, 1.8479)),
 ('CCAC', (1.0477, 1.1006, 1.3564, 1.7629, 1.982)),
 ('TACC', (0.9621, 1.0339, 1.2965, 1.7611, 2.0112)),
 ('

In [24]:
in_vitro_neg_seqs_all = np.concatenate((neg_four_mers,neg_five_mers,neg_six_mers))
pos_rbns_seqs = {s for s, _ in in_vitro_pos_seqs}
in_vitro_neg_seqs = np.array([(s, v) for s, v in in_vitro_neg_seqs_all if s not in pos_rbns_seqs])
in_vitro_neg_seqs.shape

  in_vitro_neg_seqs = np.array([(s, v) for s, v in in_vitro_neg_seqs_all if s not in pos_rbns_seqs])


(4953, 2)

In [25]:
unique_rbns = {}
for s, arr in in_vitro_neg_seqs:
    if s not in unique_rbns:
        unique_rbns[s] = arr

in_vitro_neg_seqs = list(unique_rbns.items())
len(in_vitro_neg_seqs)

4953

Construct placeholder expression change vector

In [26]:
placeholder_logfold_changes = np.zeros(shape=logfold_changes.shape)

## Split into train/val/test

In [27]:
# number of total samples per class
combinations_size = 500000

Positive samples

In [28]:
from itertools import product

random.shuffle(in_vivo_pos_seqs)
random.shuffle(in_vitro_pos_seqs)

n_eclip = len(in_vivo_pos_seqs)
n_rbns  = len(in_vitro_pos_seqs)

# Split indices
train_eclip = in_vivo_pos_seqs[:int(0.7*n_eclip)]
val_eclip   = in_vivo_pos_seqs[int(0.7*n_eclip):int(0.85*n_eclip)]
test_eclip  = in_vivo_pos_seqs[int(0.85*n_eclip):]

train_rbns = in_vitro_pos_seqs[:int(0.7*n_rbns)]
val_rbns   = in_vitro_pos_seqs[int(0.7*n_rbns):int(0.85*n_rbns)]
test_rbns  = in_vitro_pos_seqs[int(0.85*n_rbns):]

pos_train_samples = [
    (random.choice(train_eclip), random.choice(train_rbns))
    for _ in range(combinations_size)
]
pos_val_samples = [
    (random.choice(val_eclip), random.choice(val_rbns))
    for _ in range(combinations_size)
]
pos_test_samples = [
    (random.choice(test_eclip), random.choice(test_rbns))
    for _ in range(combinations_size)
]

In [34]:
pos_train_eclips = set(eclip for eclip, _ in pos_train_samples)
pos_val_eclips   = set(eclip for eclip, _ in pos_val_samples)
pos_test_eclips  = set(eclip for eclip, _ in pos_test_samples)

assert pos_train_eclips.isdisjoint(pos_val_eclips), "Overlap between train and val eCLIP sequences!"
assert pos_train_eclips.isdisjoint(pos_test_eclips), "Overlap between train and test eCLIP sequences!"
assert pos_val_eclips.isdisjoint(pos_test_eclips), "Overlap between val and test eCLIP sequences!"
print("eCLIP sequences are disjoint across splits")

eCLIP sequences are disjoint across splits


In [35]:
pos_train_rbns = set(rbns for _, (rbns, _) in pos_train_samples)
pos_val_rbns   = set(rbns for _, (rbns, _) in pos_val_samples)
pos_test_rbns  = set(rbns for _, (rbns, _) in pos_test_samples)

assert pos_train_rbns.isdisjoint(val_rbns), "Overlap between train and val RBNS sequences!"
assert pos_val_rbns.isdisjoint(test_rbns), "Overlap between train and test RBNS sequences!"
assert pos_test_rbns.isdisjoint(test_rbns), "Overlap between val and test RBNS sequences!"
print("RBNS sequences are disjoint across splits")

RBNS sequences are disjoint across splits


In [36]:
pos_train_label = np.ones(len(pos_train_samples))
pos_val_label = np.ones(len(pos_val_samples))
pos_test_label = np.ones(len(pos_test_samples))

Negative samples

In [37]:
random.shuffle(in_vivo_neg_seqs)
random.shuffle(in_vitro_neg_seqs)

n_eclip = len(in_vivo_neg_seqs)
n_rbns  = len(in_vitro_neg_seqs)

# Split indices
train_eclip = in_vivo_neg_seqs[:int(0.7*n_eclip)]
val_eclip   = in_vivo_neg_seqs[int(0.7*n_eclip):int(0.85*n_eclip)]
test_eclip  = in_vivo_neg_seqs[int(0.85*n_eclip):]

train_rbns = in_vitro_neg_seqs[:int(0.7*n_rbns)]
val_rbns   = in_vitro_neg_seqs[int(0.7*n_rbns):int(0.85*n_rbns)]
test_rbns  = in_vitro_neg_seqs[int(0.85*n_rbns):]

neg_train_samples = [
    (random.choice(train_eclip), random.choice(train_rbns))
    for _ in range(combinations_size)
]
neg_val_samples = [
    (random.choice(val_eclip), random.choice(val_rbns))
    for _ in range(combinations_size)
]
neg_test_samples = [
    (random.choice(test_eclip), random.choice(test_rbns))
    for _ in range(combinations_size)
]

In [38]:
neg_train_label = np.zeros(len(neg_train_samples))
neg_val_label = np.zeros(len(neg_val_samples))
neg_test_label = np.zeros(len(neg_test_samples))

In [39]:
train_eclips = set(eclip for eclip, _ in neg_train_samples)
val_eclips   = set(eclip for eclip, _ in neg_val_samples)
test_eclips  = set(eclip for eclip, _ in neg_test_samples)

assert train_eclips.isdisjoint(val_eclips), "Overlap between train and val eCLIP sequences!"
assert train_eclips.isdisjoint(test_eclips), "Overlap between train and test eCLIP sequences!"
assert val_eclips.isdisjoint(test_eclips), "Overlap between val and test eCLIP sequences!"
print("eCLIP sequences are disjoint across splits")

eCLIP sequences are disjoint across splits


In [40]:
train_rbns = set(rbns for _, (rbns, _) in neg_train_samples)
val_rbns   = set(rbns for _, (rbns, _) in neg_val_samples)
test_rbns  = set(rbns for _, (rbns, _) in neg_test_samples)

assert train_rbns.isdisjoint(val_rbns), "Overlap between train and val RBNS sequences!"
assert train_rbns.isdisjoint(test_rbns), "Overlap between train and test RBNS sequences!"
assert val_rbns.isdisjoint(test_rbns), "Overlap between val and test RBNS sequences!"
print("RBNS sequences are disjoint across splits")

RBNS sequences are disjoint across splits


In [41]:
# check across pos and neg samples
assert train_eclips.isdisjoint(pos_train_eclips), "Overlap between train and val eCLIP sequences!"
assert val_eclips.isdisjoint(pos_val_eclips), "Overlap between train and test eCLIP sequences!"
assert test_eclips.isdisjoint(pos_test_eclips), "Overlap between val and test eCLIP sequences!"

In [42]:
assert train_rbns.isdisjoint(pos_train_rbns), "Overlap between train and val RBNS sequences!"
assert val_rbns.isdisjoint(pos_val_rbns), "Overlap between train and test RBNS sequences!"
assert test_rbns.isdisjoint(pos_test_rbns), "Overlap between val and test RBNS sequences!"

Combine for full splits

In [43]:
train_data = np.concatenate((pos_train_samples, neg_train_samples))
val_data = np.concatenate((pos_val_samples, neg_val_samples))
test_data = np.concatenate((pos_test_samples, neg_test_samples))

train_labels = np.concatenate((pos_train_label, neg_train_label))
val_labels = np.concatenate((pos_val_label, neg_val_label))
test_labels = np.concatenate((pos_test_label, neg_test_label))

In [44]:
# get positive/negative label values
positive_value = ("PCBP1", logfold_changes)
negative_value = ("Not PCBP1", placeholder_logfold_changes)

## Save as dictionaries

Save as pickle files since there are nested data structures

In [45]:
import pickle

In [46]:
'''
Saved data format for each split:
- Input
    - List containing:
        - eCLIP sequence: text
        - Tuple of RBNS sequence and binding affinity: (text, vector)
        
Labels save separately to save storage:
- List containing:
    - RBP label: text
    - Logfold gene expression change: vector
'''

'\nSaved data format for each split:\n- Input\n    - List containing:\n        - eCLIP sequence: text\n        - Tuple of RBNS sequence and binding affinity: (text, vector)\n- Output\n    - List containing:\n        - RBP label: text\n        - Logfold gene expression change: vector\n        \nLabels save separately to save storage:\n- List containing:\n    - RBP label: text\n    - Logfold gene expression change: vector\n'

In [47]:
# Save train
with open("PCBP1/train_split.pkl", "wb") as f:
    pickle.dump((train_data, train_labels), f)

In [48]:
# Save val
with open("PCBP1/val_split.pkl", "wb") as f:
    pickle.dump((val_data, val_labels), f)

In [49]:
# Save test
with open("PCBP1/test_split.pkl", "wb") as f:
    pickle.dump((test_data, test_labels), f)

In [50]:
# Save positive label
with open("PCBP1/PCBP1_positive_label.pkl", "wb") as f:
    pickle.dump(positive_value, f)

In [51]:
# Save negative label
with open("PCBP1/PCBP1_negative_label.pkl", "wb") as f:
    pickle.dump(negative_value, f)