In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd

## Read data

Load eCLIP-seq sequences

In [3]:
eclip = pd.read_csv("SRSF9/SRSF9_eclip.txt", sep='\t', header=None)

In [4]:
eclip.head()

Unnamed: 0,0,1
0,chr7:7222551-7222679,CAAACCCCGGGCGCGGCGGAACAGGTGTCTCGGGGTCAAAGCCGGG...
1,chr7:138145159-138145275,TCCCTCCCTCGCTGGCGCTGCCGCGAGTCCACCGAGCGGCCTCTGA...
2,chr7:1025772-1025833,TCACTACTGGCGATCGCGGACGCCCCAGGAAGGCGAGTGGCACGAG...
3,chr7:150767028-150767100,GCTAGGGGCTGTGCTCTTGGATCTGGACCAGCAGACCCTGCCCGGA...
4,chr7:65551730-65551807,CCACGCCGTGGCACTGACCCGAGACTCTGAGCGGCTGCTGGAGGTG...


Load RBNS binding affinities

In [5]:
four_mers = pd.read_csv("SRSF9/SRSF9_4.tsv", sep='\t')
five_mers = pd.read_csv("SRSF9/SRSF9_5.tsv", sep='\t',)
six_mers = pd.read_csv("SRSF9/SRSF9_6.tsv", sep='\t',)
four_mers

Unnamed: 0,[SRSF9],5 nM,20 nM,80 nM,320 nM,1300 nM
0,GGGG,1.0975,1.1679,1.2271,1.2953,1.7139
1,GGAG,1.0840,1.2955,1.4331,1.6534,1.6673
2,GGGA,1.0948,1.2341,1.3277,1.4853,1.5885
3,AGGG,1.0835,1.2028,1.2505,1.3731,1.5563
4,GAGG,1.0562,1.1583,1.2415,1.3768,1.5015
...,...,...,...,...,...,...
251,TTAA,0.9898,0.9762,0.9261,0.9024,0.6550
252,TATT,0.9665,0.9633,0.9140,0.8942,0.6303
253,TTTT,0.9425,0.8991,0.8394,0.7928,0.6199
254,TTTA,0.9502,0.9413,0.8936,0.8685,0.6146


Load knockdown differential rna-seq data

In [6]:
# get other diff expr for subsetting to shared genes later
diff_exp_HNRNPC = pd.read_csv("HNRNPC/HNRNPC.tsv", sep='\t',)
diff_exp_SRSF9 = pd.read_csv("SRSF9/SRSF9.tsv", sep='\t',)
diff_exp_PCBP1 = pd.read_csv("PCBP1/PCBP1.tsv", sep='\t',)
diff_exp_TIA1 = pd.read_csv("TIA1/TIA1.tsv", sep='\t',)
diff_exp_TRA2A = pd.read_csv("TRA2A/TRA2A.tsv", sep='\t',)

## Generate positive pairs

EClip positive binding sites

In [7]:
in_vivo_pos_seqs = eclip[1].values

In [8]:
# filter for length and unique values
in_vivo_pos_seqs = [s for s in in_vivo_pos_seqs if len(s) >= 10]
in_vivo_pos_seqs = list(set(in_vivo_pos_seqs))

RBNS significant binding affinity

In [9]:
def get_signficant_sequences(df):
    num_cols = df.columns[1:]
    filtered = df[df[num_cols].gt(2).any(axis=1)]
    pairs = [
        (row[df.columns[0]], tuple(row[num_cols]))
        for _, row in filtered.iterrows()
    ]
    return pairs

In [10]:
pos_four_mers = get_signficant_sequences(four_mers)
pos_five_mers = get_signficant_sequences(five_mers)
pos_six_mers = get_signficant_sequences(six_mers)

In [11]:
len(pos_four_mers), len(pos_five_mers), len(pos_six_mers), 

(0, 1, 120)

In [12]:
in_vitro_pos_seqs = np.concatenate((pos_five_mers,pos_six_mers))
in_vitro_pos_seqs.shape



(121, 2)

In [13]:
unique_rbns = {}
for s, arr in in_vitro_pos_seqs:
    if s not in unique_rbns:
        unique_rbns[s] = arr

in_vitro_pos_seqs = list(unique_rbns.items())
len(in_vitro_pos_seqs)

121

Filter diff expr

In [14]:
# remove when gene expression is always 0 or if logfold change is inf/-inf
diff_exp_filtered_HNRNPC = diff_exp_HNRNPC[
    ~(
        ((diff_exp_HNRNPC["value_1"] == 0) & (diff_exp_HNRNPC["value_2"] == 0)) |
        (diff_exp_HNRNPC["log2(fold_change)"].isin([np.inf, -np.inf]))
    )
]

diff_exp_filtered_SRSF9 = diff_exp_SRSF9[
    ~(
        ((diff_exp_SRSF9["value_1"] == 0) & (diff_exp_SRSF9["value_2"] == 0)) |
        (diff_exp_SRSF9["log2(fold_change)"].isin([np.inf, -np.inf]))
    )
]

diff_exp_filtered_PCBP1 = diff_exp_PCBP1[
    ~(
        ((diff_exp_PCBP1["value_1"] == 0) & (diff_exp_PCBP1["value_2"] == 0)) |
        (diff_exp_PCBP1["log2(fold_change)"].isin([np.inf, -np.inf]))
    )
]

diff_exp_filtered_TIA1 = diff_exp_TIA1[
    ~(
        ((diff_exp_TIA1["value_1"] == 0) & (diff_exp_TIA1["value_2"] == 0)) |
        (diff_exp_TIA1["log2(fold_change)"].isin([np.inf, -np.inf]))
    )
]

diff_exp_filtered_TRA2A = diff_exp_TRA2A[
    ~(
        ((diff_exp_TRA2A["value_1"] == 0) & (diff_exp_TRA2A["value_2"] == 0)) |
        (diff_exp_TRA2A["log2(fold_change)"].isin([np.inf, -np.inf]))
    )
]

In [15]:
dfs = [diff_exp_filtered_HNRNPC, diff_exp_filtered_SRSF9, diff_exp_filtered_PCBP1, diff_exp_filtered_TIA1, diff_exp_filtered_TRA2A]
col = "gene_id"
shared_keys = set.intersection(*(set(df[col]) for df in dfs))

# 2. Subset the dataframe you want (example: df1)
diff_exp_filtered_SRSF9 = diff_exp_filtered_SRSF9[diff_exp_filtered_SRSF9[col].isin(shared_keys)]

In [16]:
diff_exp_filtered_SRSF9.head()

Unnamed: 0,test_id,gene_id,gene,locus,sample_1,sample_2,status,value_1,value_2,log2(fold_change),test_stat,p_value,q_value,significant
7,XLOC_000008,XLOC_000008,OR4F29,chr1:317719-461954,SRSF9-BGHLV12-HepG2,Control,OK,0.626629,0.248264,-1.33573,-0.528539,0.0345,0.088801,no
11,XLOC_000012,XLOC_000012,MTND1P23,chr1:536815-660283,SRSF9-BGHLV12-HepG2,Control,OK,170.655,189.246,0.149173,0.07478,0.8906,0.932908,no
12,XLOC_000013,XLOC_000013,MTND2P28,chr1:536815-660283,SRSF9-BGHLV12-HepG2,Control,OK,513.752,652.723,0.345401,0.693741,0.3207,0.478673,no
13,XLOC_000014,XLOC_000014,hsa-mir-6723,chr1:536815-660283,SRSF9-BGHLV12-HepG2,Control,OK,1090.39,1230.73,0.174662,0.626937,0.36875,0.528346,no
14,XLOC_000015,XLOC_000015,RP5-857K21.7,chr1:536815-660283,SRSF9-BGHLV12-HepG2,Control,OK,538.127,636.52,0.24226,0.359909,0.60395,0.735486,no


In [17]:
diff_exp_filtered_SRSF9.shape

(17985, 14)

In [18]:
logfold_changes = diff_exp_filtered_SRSF9['log2(fold_change)'].values
logfold_changes

array([-1.33573 ,  0.149173,  0.345401, ..., -1.01329 , -0.139774,
       -1.51089 ])

## Generate negative pairs

Shuffle eCLIP sequences

In [19]:
import random
def shuffle_by_pairs(seq):
    pairs = [seq[i:i+2] for i in range(0, len(seq), 2)]
    random.shuffle(pairs)
    return "".join(pairs)
in_vivo_neg_seqs = [shuffle_by_pairs(seq) for seq in eclip[1].values]
len(in_vivo_neg_seqs)

128774

In [20]:
# filter for length and unique values
in_vivo_neg_seqs = [s for s in in_vivo_neg_seqs if len(s) >= 10]
in_vivo_neg_seqs = list(set(in_vivo_neg_seqs))
len(in_vivo_neg_seqs)

123719

Retrieve RBNS sequences that are not enriched

In [21]:
def get_insignficant_sequences(df):
    num_cols = df.columns[1:]
    filtered = df[df[num_cols].lt(2).any(axis=1)]
    pairs = [
        (row[df.columns[0]], tuple(row[num_cols]))
        for _, row in filtered.iterrows()
    ]
    return pairs

In [22]:
neg_four_mers = get_insignficant_sequences(four_mers)
neg_five_mers = get_insignficant_sequences(five_mers)
neg_six_mers = get_insignficant_sequences(six_mers)

In [23]:
neg_four_mers

[('GGGG', (1.0975, 1.1679, 1.2271, 1.2953, 1.7139)),
 ('GGAG', (1.084, 1.2955, 1.4331, 1.6534, 1.6673)),
 ('GGGA', (1.0948, 1.2341, 1.3277, 1.4853, 1.5885)),
 ('AGGG', (1.0835, 1.2028, 1.2505, 1.3731, 1.5563)),
 ('GAGG', (1.0562, 1.1583, 1.2415, 1.3768, 1.5015)),
 ('AGGA', (1.0795, 1.2664, 1.3685, 1.5765, 1.4914)),
 ('TGGG', (1.0713, 1.1198, 1.175, 1.2301, 1.4846)),
 ('GGGC', (1.0371, 1.0663, 1.1299, 1.123, 1.4796)),
 ('GGAA', (1.088, 1.2584, 1.3455, 1.554, 1.4778)),
 ('CAGG', (1.0332, 1.1257, 1.1791, 1.2627, 1.4751)),
 ('GGCC', (1.0099, 1.0264, 1.1019, 1.0934, 1.4428)),
 ('GGTG', (1.0333, 1.0965, 1.1215, 1.1694, 1.4261)),
 ('GGGT', (1.0623, 1.1022, 1.1354, 1.1614, 1.4242)),
 ('CGGG', (1.0429, 1.0561, 1.1317, 1.1167, 1.4201)),
 ('GTGG', (1.0395, 1.0887, 1.1268, 1.1691, 1.4162)),
 ('GCGC', (0.9708, 1.0122, 1.0704, 1.0824, 1.402)),
 ('GCGG', (1.0052, 1.0331, 1.0877, 1.0612, 1.3789)),
 ('GCAG', (1.0121, 1.0756, 1.1213, 1.1783, 1.3773)),
 ('GGCA', (1.0079, 1.0643, 1.1165, 1.1847, 1.3691)),

In [24]:
in_vitro_neg_seqs = np.concatenate((neg_four_mers,neg_five_mers,neg_six_mers))
in_vitro_neg_seqs.shape



(17664, 2)

In [25]:
unique_rbns = {}
for s, arr in in_vitro_neg_seqs:
    if s not in unique_rbns:
        unique_rbns[s] = arr

in_vitro_neg_seqs = list(unique_rbns.items())
len(in_vitro_neg_seqs)

17664

Construct placeholder expression change vector

In [26]:
placeholder_logfold_changes = np.zeros(shape=logfold_changes.shape)

## Split into train/val/test

In [27]:
# number of total samples per class
combinations_size = 100000

Positive samples

In [28]:
from itertools import product

random.shuffle(in_vivo_pos_seqs)
random.shuffle(in_vitro_pos_seqs)

n_eclip = len(in_vivo_pos_seqs)
n_rbns  = len(in_vitro_pos_seqs)

# Split indices
train_eclip = in_vivo_pos_seqs[:int(0.7*n_eclip)]
val_eclip   = in_vivo_pos_seqs[int(0.7*n_eclip):int(0.85*n_eclip)]
test_eclip  = in_vivo_pos_seqs[int(0.85*n_eclip):]

train_rbns = in_vitro_pos_seqs[:int(0.7*n_rbns)]
val_rbns   = in_vitro_pos_seqs[int(0.7*n_rbns):int(0.85*n_rbns)]
test_rbns  = in_vitro_pos_seqs[int(0.85*n_rbns):]

pos_train_samples = [
    (random.choice(train_eclip), random.choice(train_rbns))
    for _ in range(combinations_size)
]
pos_val_samples = [
    (random.choice(val_eclip), random.choice(val_rbns))
    for _ in range(combinations_size)
]
pos_test_samples = [
    (random.choice(test_eclip), random.choice(test_rbns))
    for _ in range(combinations_size)
]

In [29]:
train_eclips = set(eclip for eclip, _ in pos_train_samples)
val_eclips   = set(eclip for eclip, _ in pos_val_samples)
test_eclips  = set(eclip for eclip, _ in pos_test_samples)

assert train_eclips.isdisjoint(val_eclips), "Overlap between train and val eCLIP sequences!"
assert train_eclips.isdisjoint(test_eclips), "Overlap between train and test eCLIP sequences!"
assert val_eclips.isdisjoint(test_eclips), "Overlap between val and test eCLIP sequences!"
print("eCLIP sequences are disjoint across splits")

eCLIP sequences are disjoint across splits


In [30]:
train_rbns = set(rbns for _, (rbns, _) in pos_train_samples)
val_rbns   = set(rbns for _, (rbns, _) in pos_val_samples)
test_rbns  = set(rbns for _, (rbns, _) in pos_test_samples)

assert train_rbns.isdisjoint(val_rbns), "Overlap between train and val RBNS sequences!"
assert train_rbns.isdisjoint(test_rbns), "Overlap between train and test RBNS sequences!"
assert val_rbns.isdisjoint(test_rbns), "Overlap between val and test RBNS sequences!"
print("RBNS sequences are disjoint across splits")

RBNS sequences are disjoint across splits


In [31]:
pos_train_label = np.ones(len(pos_train_samples))
pos_val_label = np.ones(len(pos_val_samples))
pos_test_label = np.ones(len(pos_test_samples))

Negative samples

In [32]:
random.shuffle(in_vivo_neg_seqs)
random.shuffle(in_vitro_neg_seqs)

n_eclip = len(in_vivo_neg_seqs)
n_rbns  = len(in_vitro_neg_seqs)

# Split indices
train_eclip = in_vivo_neg_seqs[:int(0.7*n_eclip)]
val_eclip   = in_vivo_neg_seqs[int(0.7*n_eclip):int(0.85*n_eclip)]
test_eclip  = in_vivo_neg_seqs[int(0.85*n_eclip):]

train_rbns = in_vitro_neg_seqs[:int(0.7*n_rbns)]
val_rbns   = in_vitro_neg_seqs[int(0.7*n_rbns):int(0.85*n_rbns)]
test_rbns  = in_vitro_neg_seqs[int(0.85*n_rbns):]

neg_train_samples = [
    (random.choice(train_eclip), random.choice(train_rbns))
    for _ in range(combinations_size)
]
neg_val_samples = [
    (random.choice(val_eclip), random.choice(val_rbns))
    for _ in range(combinations_size)
]
neg_test_samples = [
    (random.choice(test_eclip), random.choice(test_rbns))
    for _ in range(combinations_size)
]

In [33]:
neg_train_label = np.zeros(len(neg_train_samples))
neg_val_label = np.zeros(len(neg_val_samples))
neg_test_label = np.zeros(len(neg_test_samples))

Combine for full splits

In [34]:
train_data = np.concatenate((pos_train_samples, neg_train_samples))
val_data = np.concatenate((pos_val_samples, neg_val_samples))
test_data = np.concatenate((pos_test_samples, neg_test_samples))

train_labels = np.concatenate((pos_train_label, neg_train_label))
val_labels = np.concatenate((pos_val_label, neg_val_label))
test_labels = np.concatenate((pos_test_label, neg_test_label))

In [35]:
# get positive/negative label values
positive_value = ("SRSF9", logfold_changes)
negative_value = ("Not SRSF9", placeholder_logfold_changes)

## Save as dictionaries

Save as pickle files since there are nested data structures

In [36]:
import pickle

In [37]:
'''
Saved data format for each split:
- Input
    - List containing:
        - eCLIP sequence: text
        - Tuple of RBNS sequence and binding affinity: (text, vector)
- Output
    - List containing:
        - RBP label: text
        - Logfold gene expression change: vector
        
Labels save separately to save storage:
- List containing:
    - RBP label: text
    - Logfold gene expression change: vector
'''

'\nSaved data format for each split:\n- Input\n    - List containing:\n        - eCLIP sequence: text\n        - Tuple of RBNS sequence and binding affinity: (text, vector)\n- Output\n    - List containing:\n        - RBP label: text\n        - Logfold gene expression change: vector\n        \nLabels save separately to save storage:\n- List containing:\n    - RBP label: text\n    - Logfold gene expression change: vector\n'

In [38]:
# Save train
with open("SRSF9/train_split.pkl", "wb") as f:
    pickle.dump((train_data, train_labels), f)

In [39]:
# Save val
with open("SRSF9/val_split.pkl", "wb") as f:
    pickle.dump((val_data, val_labels), f)

In [40]:
# Save test
with open("SRSF9/test_split.pkl", "wb") as f:
    pickle.dump((test_data, test_labels), f)

In [41]:
# Save positive label
with open("SRSF9/SRSF9_positive_label.pkl", "wb") as f:
    pickle.dump(positive_value, f)

In [42]:
# Save negative label
with open("SRSF9/SRSF9_negative_label.pkl", "wb") as f:
    pickle.dump(negative_value, f)