In [1]:
import pandas as pd
import numpy as np

In [2]:
# Root folder of the eCLIP project. Every path in this notebook is built by plain
# string concatenation (data_dir + '...'), so the trailing slash must be kept.
data_dir: str = '/lustre/groups/epigenereg01/workspace/projects/vale/mlm/eCLIP/'

In [3]:
# Get the coordinates of eCLIP peaks for each sequence.

# Absolute coordinates of eCLIP peaks (file columns 0, 1, 2 and 4, read without a header row).
eclip_pos = pd.read_csv(data_dir + 'data/eCLIP.3utr.pos.PhyloP.bed', sep='\t',
                        usecols=[0,1,2,4], names=['chrom','eclip_start','eclip_end','seq_name'])

# Absolute coordinates of 3'UTR sequences (file columns 0, 1, 2, 3 and 5, read without a header row).
utr_table = pd.read_csv(data_dir + 'data/GRCh38.3utr_5Klimited.bed', sep='\t',
                        usecols=[0,1,2,3,5], names=['chrom','utr_start','utr_end','seq_name','strand'])

# Attach the UTR coordinates and strand to every peak. The join keys are spelled out;
# 'chrom' and 'seq_name' are the only columns the two tables share, so this is the
# same join pandas performed when no keys were given.
# NOTE(review): with how='left', peaks without a matching UTR row are kept and get NaN
# UTR coordinates / strand, which propagate into the relative coordinates below -
# confirm whether such rows can occur and should be dropped.
eclip_df = eclip_pos.merge(utr_table, how='left', on=['chrom','seq_name'])

#eclip_df = eclip_df[eclip_df.seq_name.apply(lambda x: x in test_df.index)] # only sequences from the test dataset

eclip_df = (
    eclip_df
    # NOTE(review): duplicates are detected on the peak coordinates alone, so a peak that
    # was joined to several sequences is kept only for the first one - confirm intended.
    .drop_duplicates(subset=['chrom','eclip_start','eclip_end'])
    # Peak coordinates relative to the sequence start. On the '+' strand they are offsets
    # from utr_start; on any other strand they are measured back from utr_end, so the
    # peak's absolute end becomes its relative start and vice versa.
    .assign(
        eclip_start_rel=lambda df: (df['eclip_start'] - df['utr_start']).where(
            df['strand'] == '+', df['utr_end'] - df['eclip_end']),
        eclip_end_rel=lambda df: (df['eclip_end'] - df['utr_start']).where(
            df['strand'] == '+', df['utr_end'] - df['eclip_start']),
    )
)

# Positive eCLIP intervals for each sequence: seq_name -> list of [start_rel, end_rel].
# Only the two coordinate columns are handed to apply, so the grouping column is not
# operated on.
eclip_pos_intervals = (
    eclip_df
    .groupby('seq_name')[['eclip_start_rel','eclip_end_rel']]
    .apply(lambda peaks: peaks.values.tolist())
)

# Sort and merge overlapping intervals.
# See https://stackoverflow.com/questions/43600878/merging-overlapping-intervals
def merge_intervals(intervals_list):
    """Merge overlapping or touching intervals into a sorted list.

    Parameters
    ----------
    intervals_list : list
        Intervals given as ``[start, end]`` sequences, in any order. Neither the
        list nor the intervals inside it are modified.

    Returns
    -------
    list
        Newly created ``[start, end]`` lists ordered by start. Two intervals are
        merged whenever the later one starts at or before the end of the earlier
        one (``start <= previous end``). An empty input gives an empty list.
    """
    merged = []
    # sorted() leaves the caller's list in its original order.
    for current in sorted(intervals_list, key=lambda interval: interval[0]):
        if merged and current[0] <= merged[-1][1]:
            # Overlapping (or touching): extend the last merged interval.
            merged[-1][1] = max(merged[-1][1], current[1])
        else:
            # Store a copy so that later extensions never write into the caller's data.
            merged.append(list(current))
    return merged

# Replace each sequence's list of peaks with the result of merge_intervals on that list.
eclip_pos_intervals = eclip_pos_intervals.apply(func=merge_intervals)

In [11]:
# Save the per-sequence intervals in the project root; the file is written in pickle
# format, whatever its '.df' suffix suggests.
eclip_pos_intervals.to_pickle(path=data_dir + 'eCLIP_intervals_3UTR.df')