In [40]:
import pandas as pd
import numpy as np
import re

In [9]:
intervals = pd.read_csv('hsa-all.bed', sep='\t', header=None, names=['chr', 'start', 'end', 'name', 'strand', 'orientation'])

# Get precursor RNA, mature miRNAs and their intervals

In [13]:
pres = intervals[np.logical_and(intervals['name'][-4:] == '_pre', ~intervals['name'].str.contains('pri'))]

In [18]:
def annotate_pre_pri(row):
    if row['name'][-4:] == '_pre':
        row['pre'] = True
    else:
        row['pre'] = False
    if 'pri' in row['name']:
        row['pri'] = True
    else:
        row['pri'] = False
    return row

In [19]:
intervals = intervals.apply(annotate_pre_pri, axis=1)

Get everything after the underscore to figure out what exactly is going on here.

In [121]:
set([re.search('_.*', name).group(0) for name in intervals['name']])

{'_3p',
 '_3p*',
 '_3p*_seed',
 '_3p_co',
 '_3p_co_seed',
 '_3p_seed',
 '_5p',
 '_5p*',
 '_5p*_seed',
 '_5p_seed',
 '_loop',
 '_motif_3p',
 '_motif_5p',
 '_motif_loop',
 '_pre',
 '_pri',
 '_pri_3p',
 '_pri_5p'}

In [83]:
pres = intervals[intervals['pre']].copy().drop(['strand', 'orientation', 'pre', 'pri'], axis=1)

In [86]:
pres['parent_id'] = [re.sub('_.*?$', '', name) for name in pres['name']]

In [97]:
df_5['parent_id'] = [re.sub('_.*?$', '', name) for name in df_5['name']]

In [88]:
df = intervals[~intervals['pri']].drop(['strand', 'orientation', 'pre', 'pri'], axis=1)

In [89]:
df_5 = df[np.logical_and([name[-3:] == '_5p' for name in df['name']], ~df['name'].str.contains('motif'))].copy()

In [90]:
df_3 = df[np.logical_and([name[-3:] == '_3p' for name in df['name']], ~df['name'].str.contains('motif'))].copy()

In [91]:
df_3['parent_id'] = [re.sub('_.*?$', '', name) for name in df_3['name']]

In [96]:
df_5

Unnamed: 0,chr,start,end,name
47,chr1,9151733,9151756,Hsa-Mir-34-P1_5p
56,chr1,34669649,34669671,Hsa-Mir-552_5p
65,chr1,37500971,37500993,Hsa-Mir-5581_5p
71,chr1,40754370,40754394,Hsa-Mir-30-P1d_5p
80,chr1,40757299,40757323,Hsa-Mir-30-P2d_5p
...,...,...,...,...
5626,chrX,151958627,151958651,Hsa-Mir-224_5p
5643,chrX,151959674,151959696,Hsa-Mir-452-v2_5p
5644,chrX,151959675,151959699,Hsa-Mir-452-v1_5p
5657,chrX,152392263,152392287,Hsa-Mir-105-P2_5p


In [106]:
pre_35 = pres.merge(df_5.drop('chr', axis=1), how='left', on='parent_id', suffixes=['_pre', '_5p']).merge(df_3.drop('chr', axis=1).rename(columns={'start':'start_3p', 'end':'end_3p', 'name':'name_3p'}),
                                                                                                                                          how='left', on='parent_id', suffixes = ['', '_3p'])

In [113]:
pre_35.to_csv('precursor_mature_annotated_miRNAs.csv')

In [122]:
pre_

Unnamed: 0,chr,start_pre,end_pre,name_pre,parent_id,start_5p,end_5p,name_5p,start_3p,end_3p,name_3p
0,chr1,1167123,1167182,Hsa-Mir-8-P1a_pre,Hsa-Mir-8-P1a,,,,1167159.0,1167182.0,Hsa-Mir-8-P1a_3p
1,chr1,1167877,1167938,Hsa-Mir-8-P2a_pre,Hsa-Mir-8-P2a,,,,1167915.0,1167938.0,Hsa-Mir-8-P2a_3p
2,chr1,1169019,1169076,Hsa-Mir-8-P3a_pre,Hsa-Mir-8-P3a,,,,1169054.0,1169076.0,Hsa-Mir-8-P3a_3p
3,chr1,3560709,3560769,Hsa-Mir-551-P1_pre,Hsa-Mir-551-P1,,,,3560709.0,3560730.0,Hsa-Mir-551-P1_3p
4,chr1,9151691,9151756,Hsa-Mir-34-P1_pre,Hsa-Mir-34-P1,9151733.0,9151756.0,Hsa-Mir-34-P1_5p,,,
...,...,...,...,...,...,...,...,...,...,...,...
562,chrX,151958582,151958651,Hsa-Mir-224_pre,Hsa-Mir-224,151958627.0,151958651.0,Hsa-Mir-224_5p,,,
563,chrX,151959638,151959699,Hsa-Mir-452-v1_pre,Hsa-Mir-452-v1,151959675.0,151959699.0,Hsa-Mir-452-v1_5p,,,
564,chrX,151959639,151959696,Hsa-Mir-452-v2_pre,Hsa-Mir-452-v2,151959674.0,151959696.0,Hsa-Mir-452-v2_5p,,,
565,chrX,152392228,152392287,Hsa-Mir-105-P2_pre,Hsa-Mir-105-P2,152392263.0,152392287.0,Hsa-Mir-105-P2_5p,,,
