In [5]:
import pandas as pd
import numpy as np
from collections import defaultdict
from tqdm.notebook import tqdm

In [4]:
#sort and merge overlapping intervals
#see https://stackoverflow.com/questions/43600878/merging-overlapping-intervals
def merge_intervals(intervals_list):
    # Sort by the start coordinate
    intervals_list.sort(key=lambda interval: interval[0])
    # Create a stack with the first interval
    merged = [intervals_list[0]]
    # Check for overlapping interval
    for current in intervals_list:
        previous = merged[-1]
        if current[0] <= previous[1]:
            previous[1] = max(previous[1], current[1])  # If it’s overlapping, then merge them into one interval
        else: # otherwise, push it in the stack
            merged.append(current)
    return merged

def add_relative_coordinates(motifs_df):

    '''
    add relative coordinates for each eCLIP sequence w.r.t. 3'UTR sequence
    '''

    df = motifs_df.merge(utr_table, how='left')

    df.drop_duplicates(subset=['chrom','eclip_start','eclip_end'], inplace=True)

    df['start_rel'] = df.apply(lambda x: x.eclip_start-x.utr_start if x.strand=='+'
                                    else x.utr_end-x.eclip_end, axis=1) #relative to sequence start
    
    df['end_rel'] = df.apply(lambda x: x.eclip_end-x.utr_start if x.strand=='+'
                                    else x.utr_end-x.eclip_start, axis=1) #relative to sequence start

    return df

def read_fasta(fasta):

    seqs = defaultdict(str)
    
    with open(fasta, 'r') as f:
        for line in f:
            if line.startswith('>'):
                seq_name = line[1:].rstrip()
            else:
                seq = line.rstrip()
                seqs[seq_name] += seq.upper()
    return seqs

def get_overlap(a, b):
    '''
    get the overlap length between 2 intervals
    '''
    return max(0, min(a[1], b[1]) - max(a[0], b[0]))

def get_max_overlap(interval, intervals):
    '''
    get the overlap of maximal length between a given interval and a list of intervals
    '''
    return np.max([get_overlap(interval,x) for x in intervals])

In [8]:
data_dir = '/lustre/groups/epigenereg01/workspace/projects/vale/mlm/motif_analysis/'

In [4]:
# get coordinates of eCLIP peaks for each sequence

utr_table = pd.read_csv(data_dir + '../UTR_coords/GRCh38_3_prime_UTR_clean.bed', sep='\t',
                       usecols=[0,1,2,3,5], names=['chrom','utr_start','utr_end','seq_name','strand']) #absolute coordinates of 3'UTR sequences

utr_table.head()

Unnamed: 0,chrom,utr_start,utr_end,seq_name,strand
0,chr1,67092164,67093004,ENST00000684719.1_utr3_7_0_chr1_67092165_r,-
1,chr1,8352403,8355086,ENST00000400908.7_utr3_22_0_chr1_8352404_r,-
2,chr1,75202128,75203726,ENST00000370859.8_utr3_23_0_chr1_75202129_r,-
3,chr1,83865023,83869961,ENST00000260505.13_utr3_20_0_chr1_83865024_r,-
4,chr1,92246401,92246529,ENST00000370360.8_utr3_18_0_chr1_92246402_r,-


In [5]:
eclip_pos = pd.read_csv(data_dir + 'van_nostrand_2019/eCLIP/eCLIP.3utr.pos_IDR.bed', sep='\t',
                       usecols=[0,1,2,3,6], names=['chrom','eclip_start','eclip_end','seq_name','target']) #absolute coordinates of eCLIP peaks

eclip_pos.head()

Unnamed: 0,chrom,eclip_start,eclip_end,seq_name,target
0,chr1,944221,944317,ENST00000327044.7_utr3_18_0_chr1_944203_r,APOBEC3C_K562_IDR
1,chr1,944229,944297,ENST00000327044.7_utr3_18_0_chr1_944203_r,PABPC4_K562_IDR
2,chr1,944247,944312,ENST00000327044.7_utr3_18_0_chr1_944203_r,PABPC4_K562_IDR
3,chr1,944270,944337,ENST00000327044.7_utr3_18_0_chr1_944203_r,PABPC4_K562_IDR
4,chr1,944319,944511,ENST00000327044.7_utr3_18_0_chr1_944203_r,UPF1_HepG2_IDR


In [6]:
eclip_neg = pd.read_csv(data_dir + 'van_nostrand_2019/eCLIP/eCLIP.3utr.neg.bed', sep='\t',
                       usecols=[0,1,2,3], names=['chrom','eclip_start','eclip_end','seq_name']) #absolute coordinates of eCLIP peaks

eclip_neg.head()

Unnamed: 0,chrom,eclip_start,eclip_end,seq_name
0,chr1,70008,71585,ENST00000641515.2_utr3_2_0_chr1_70009_f
1,chr1,944153,944209,ENST00000616016.5_utr3_13_0_chr1_944154_f
2,chr1,944202,944209,ENST00000327044.7_utr3_18_0_chr1_944203_r
3,chr1,965688,965719,ENST00000338591.8_utr3_11_0_chr1_965192_f
4,chr1,998963,998981,ENST00000304952.11_utr3_3_0_chr1_998964_r


In [7]:
stepwiseR1_min = 0
motif_len = 5

rbns_motifs = pd.read_excel(data_dir + 'van_nostrand_2019/41586_2020_2077_MOESM8_ESM.xlsx')
    
rbns_motifs = rbns_motifs.set_index('RBP').iloc[:,4:].apply(list,axis=1)
rbns_motifs = rbns_motifs.explode().dropna()
    
rbns_motifs = rbns_motifs.apply(lambda x:pd.Series(x.split('_'),index=['motif','logonum','stepwise_r']))
rbns_motifs.stepwise_r = rbns_motifs.stepwise_r.astype(float)
rbns_motifs.logonum = rbns_motifs.logonum.astype(int)
rbns_motifs.motif = rbns_motifs.motif.str.replace('U','T')
rbns_motifs = rbns_motifs[rbns_motifs.stepwise_r>=stepwiseR1_min]

stepwise_r = rbns_motifs.reset_index().set_index(['RBP','motif']).stepwise_r
rbns_motifs = rbns_motifs.groupby(rbns_motifs.index).motif.apply(list)

In [8]:
human_fa = data_dir + '../fasta/Homo_sapiens_rna.fa'
utr_seqs = read_fasta(human_fa)

In [9]:
eclip_pos.target = eclip_pos.target.apply(lambda x:x.split('_')[0])

eclip_pos = eclip_pos[eclip_pos.target.isin(rbns_motifs.index)]
eclip_pos = add_relative_coordinates(eclip_pos)

eclip_pos = eclip_pos.groupby(['seq_name','target']).apply(lambda x: merge_intervals(x[['start_rel','end_rel']].values.tolist()),include_groups=False)

In [10]:
eclip_neg = add_relative_coordinates(eclip_neg)
eclip_neg = eclip_neg.groupby('seq_name').apply(lambda x: merge_intervals(x[['start_rel','end_rel']].values.tolist()),include_groups=False)

In [11]:
pos_df = []
neg_df = []

for seq_name, seq in tqdm(utr_seqs.items()):
    if seq_name in eclip_pos.index or seq_name in eclip_neg.index:
        for motif_start in range(0,len(seq)-motif_len):
            motif_pos = [motif_start,motif_start+motif_len]
            motif = seq[motif_start:motif_start+motif_len]
            if seq_name in eclip_pos.index:
                for target, intervals in eclip_pos.loc[seq_name].items():
                    if motif in rbns_motifs[target] and get_max_overlap(motif_pos, intervals) == motif_len:
                        stepwiseR = stepwise_r[(target,motif)]
                        pos_df.append([seq_name, motif_start, motif, target, stepwiseR])
            if seq_name in eclip_neg.index:
                if get_max_overlap(motif_pos, eclip_neg.loc[seq_name]) == motif_len: 
                    neg_df.append([seq_name, motif_start, motif])

  0%|          | 0/18134 [00:00<?, ?it/s]

In [12]:
pos_df = pd.DataFrame(pos_df,columns=['seq_name', 'motif_start', 'motif', 'RBP', 'stepwiseR_minus1'])
neg_df = pd.DataFrame(neg_df,columns=['seq_name', 'motif_start', 'motif'])

In [13]:
n_proteins = pos_df.RBP.nunique()
n_motifs = pos_df.motif.nunique()

print(n_proteins,n_motifs)

20 77


In [20]:
pos_df = pos_df.groupby(['seq_name', 'motif_start', 'motif']).agg({'RBP':lambda x:','.join(x),
                                                            'stepwiseR_minus1':lambda x:','.join(x.astype(str))}).reset_index()

In [21]:
len(pos_df)

57172

In [23]:
neg_df = neg_df[neg_df.motif.isin(pos_df.motif)] #use only the same k-mers as in positive set

In [24]:
len(neg_df)

1653964

In [25]:
#subsample the negative set s.t. it has the same k-mer distribution as in the positive set

pos_motif_count = pos_df.motif.value_counts()
neg_df = neg_df.groupby('motif').apply(lambda x:x.sample(n=pos_motif_count[x.name],replace=False, random_state=1),include_groups=False).reset_index(level=0)

neg_df = neg_df[['seq_name','motif_start','motif']]

In [28]:
pos_df['eCLIP_RBNS'] = True #functional
neg_df['eCLIP_RBNS'] = False #non-functional

motifs_df = pd.concat([pos_df,neg_df])

In [29]:
motifs_df = motifs_df.merge(utr_table) #add 3'UTR coordinates

In [79]:
#absolute coordinates of the k-mer on the chromosome
#0-based
#for the negative strand, motif_start is defined w.r.t. reverse complemented sequences,
#so different formula for conversion

motifs_df['pos'] = motifs_df.apply(lambda x: x.motif_start+x.utr_start if x.strand=='+'
                                    else x.utr_end-(x.motif_start+motif_len), axis=1)

In [31]:
motifs_df.eCLIP_RBNS.value_counts()

eCLIP_RBNS
True     57172
False    57172
Name: count, dtype: int64

In [80]:
motifs_df['motif_id'] = motifs_df.chrom + '_' + motifs_df.pos.astype(str) + '_' + motifs_df.motif + '_' + motifs_df.seq_name.str[-1]

In [81]:
motifs_df = motifs_df[['chrom','pos','seq_name','strand','motif','motif_id','motif_start','RBP','stepwiseR_minus1','eCLIP_RBNS']]

In [82]:
motifs_df.to_csv(data_dir + 'eclip.tsv',sep='\t',index=False)