In [1]:
import numpy
import pandas as pd

In [3]:

pd.options.mode.chained_assignment = None
def bf_apadb2(df, val=[   2,   50,  200,  200,  0.3,  50,  50],
                  ids=['utr',  'i', 'j', 'rds','rel','up','dn'],
                  flank=400):
    """Widen every site interval and attach per-gene usage statistics.

    Rows are grouped by "gene family", i.e. the part of the ``gene`` column
    before the first ``.`` (``FAM101B.2`` -> ``FAM101B``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``gene``, ``start``, ``end``, ``reads`` and
        ``mode``. The frame is NOT modified; a copy is returned.
    val : sequence of 7 numbers
        Threshold settings. They are only echoed in the printed banner; this
        function does not filter any rows with them.
    ids : sequence of str
        Labels for ``val``. Unused; kept for call compatibility.
    flank : int, default 400
        Number of bases subtracted from ``start`` and added to ``end``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with widened ``start``/``end`` plus three new columns:
        ``total_reads`` (sum of ``reads`` per gene family), ``rel_use``
        (``reads / total_reads`` rounded to 3 decimals) and ``interval``
        (``max(mode) - min(mode)`` per gene family).
    """
    # The default lists are never mutated, so sharing them between calls is harmless.
    print('3\'UTRs         =    ' + str(val[0]))
    print('interval (bp) >=   ' + str(val[1]) + ' to ' + str(val[2]) + ' bp')
    print('locus          =  -' + str(val[5]) + ' to +' + str(val[6]) + ' bp')
    print('read depth    >=  ' + str(val[3]))
    print('weakest site  >=    ' + str(val[4]))
    print('-----------------------------------')

    print('debug length:   ' + "{:>12}".format("{:,}".format(len(df))) + ' genes ')

    # Work on a copy: the original wrote helper columns and the widened
    # coordinates into the caller's frame, so re-running the cell shifted
    # start/end by another `flank` bases each time.
    sites = df.copy()

    # A single-character pattern is treated as a literal by pandas, so this
    # splits on the dot itself; `.str[0]` keeps the part before it.
    gene_family = sites['gene'].str.split('.', n=1).str[0].rename('genefam')
    n_gene_families = gene_family.nunique()

    sites['start'] = sites['start'] - flank
    sites['end'] = sites['end'] + flank

    # Grouping by the aligned Series avoids adding and then dropping a helper
    # column (and the positional-axis `drop(col, 1)` call pandas 2.x rejects).
    sites['total_reads'] = sites.groupby(gene_family)['reads'].transform('sum')
    sites['rel_use'] = (sites['reads'] / sites['total_reads']).round(3)
    mode_by_family = sites.groupby(gene_family)['mode']
    sites['interval'] = mode_by_family.transform('max') - mode_by_family.transform('min')

    # "input" counts gene families, "output" counts rows (sites).
    print('input :   ' + "{:>12}".format("{:,}".format(n_gene_families)) + ' genes ')
    print('output:   ' + "{:>12}".format("{:,}".format(len(sites))) + ' genes ')

    return sites

In [4]:
# Input files, relative to the notebook's working directory.
# (Earlier location of the BED file: '../../data/genome_hg19/features/hg19.apadb_v2_final.bed')
apadb_bed = 'unprocessed_data/apadb/hg19.apadb_v2_final.bed'
hg19_fai = 'unprocessed_data/hg19/hg19.fa.fai'
hg19_fa = 'unprocessed_data/hg19/hg19.fa'

# Zero-based BED column index -> dataframe column name (column 7 is skipped).
bed_fields = {
    0: 'chr',
    1: 'start',
    2: 'end',
    3: 'gene',
    4: 'reads',
    5: 'strand',
    6: 'feature',
    8: 'mode',
}
df_columns = list(bed_fields.values())

# Read the headerless, tab-separated APADB BED file into a dataframe.
df_bed = pd.read_csv(
    apadb_bed,
    sep='\t',
    header=None,
    names=df_columns,
    usecols=list(bed_fields.keys()),
)

# Annotate every site; the list only feeds the banner printed by bf_apadb2.
filtered = bf_apadb2(df_bed, [20, 0, 40000, 1, 0.00, 0, 0])
filtered.head()

3'UTRs         =    20
interval (bp) >=   0 to 40000 bp
locus          =  -0 to +0 bp
read depth    >=  1
weakest site  >=    0.0
-----------------------------------
debug length:         71,829 genes 
input :         16,786 genes 
output:         71,829 genes 


Unnamed: 0,chr,start,end,gene,reads,strand,feature,mode,total_reads,rel_use,interval
0,chr17,61860,62712,RPH3AL.1,122,-,UTR3,62297,130,0.938,136061
1,chr17,197956,198758,RPH3AL.2,8,-,Intron,198358,130,0.062,136061
2,chr17,289358,290185,FAM101B.1,24,-,UTR3,289766,650,0.037,3013
3,chr17,289535,290347,FAM101B.2,170,-,UTR3,289946,650,0.262,3013
4,chr17,289616,290421,FAM101B.3,12,-,UTR3,290017,650,0.018,3013


In [5]:
# Force the coordinate and count columns to plain integers, one column at a
# time, then show the last rows as a spot check.
for int_column in ['interval', 'mode', 'start', 'end', 'reads', 'total_reads']:
    filtered[int_column] = filtered[int_column].astype(int)

filtered.tail()

Unnamed: 0,chr,start,end,gene,reads,strand,feature,mode,total_reads,rel_use,interval
71824,chr18,77897015,77897830,ADNP2.3,29,+,UTR3,77897423,748,0.039,1604
71825,chr18,77897126,77897938,ADNP2.2,5,+,UTR3,77897528,748,0.007,1604
71826,chr18,77897805,77898649,ADNP2.1,702,+,UTR3,77898227,748,0.939,1604
71827,chr18,77906737,77907543,PARD6G-AS1.2,48,+,Intron,77907140,87,0.552,13154
71828,chr18,77919885,77920713,PARD6G-AS1.1,39,+,Intron,77920294,87,0.448,13154


In [6]:
# Base name shared by the BED and FASTA files written by this cell.
output_id = 'apadb_v2_seqs'

# bed
# Headerless, tab-separated export of `filtered`. `start`/`end` are the
# intervals already widened inside bf_apadb2. The column order puts `gene`
# in column 4 and `strand` in column 6, which is where the bedtools call
# below looks for the name and the strand.
output_bed = output_id + '.bed'
bed_columns = ['chr', 'start', 'end', 'gene', 'feature', 'strand', 'total_reads', 'rel_use', 'interval']
filtered.to_csv(output_bed, sep='\t', header=False, columns=bed_columns, index=False)

# fasta
# Extract one sequence per BED row from the hg19 FASTA. `-s` makes bedtools
# honour the strand column and `-name` uses the name column as FASTA header.
# NOTE(review): the next cell expects headers to be exactly the `gene` value
# (e.g. `>RPH3AL.1`, as in the recorded output) and one sequence line per
# record -- confirm the installed bedtools version still emits that.
output_fa = output_id + '.fa'
!bedtools getfasta -name -s -fi "$hg19_fa" -bed "$output_bed" -fo "$output_fa"

# file tops
# Quick visual check of the first lines of both output files.
!head -5 "$output_bed" | column -t ; echo
!head -10 "$output_fa" ; echo


chr17  61860   62712   RPH3AL.1   UTR3    -  130  0.938  136061
chr17  197956  198758  RPH3AL.2   Intron  -  130  0.062  136061
chr17  289358  290185  FAM101B.1  UTR3    -  650  0.037  3013
chr17  289535  290347  FAM101B.2  UTR3    -  650  0.262  3013
chr17  289616  290421  FAM101B.3  UTR3    -  650  0.018  3013

>RPH3AL.1
CTGTCTTCAGCTTGAGGAGCTGGGAAGCTCTGGTGGATGCTATGAACTCACTTGCTGAAGAGCAGCGTTCAGGTGCATCCCCAGCCAGGGCACGTGGCTCCCTCAGCCATGAATTCACTTCTCTTCAGGAGGTTTGGCTTGGCATGAAAATACTTCATTCAGAGTATGGGCAAATGCTTCTGGAAAACCCTTCCCTGAAGAGAGAGAACGTGTGTGTGTGTGTCGGTGATCACACCCTCCCATCCTTCCTGCCTCCTGCCCCAAACCCCGGGTTCCTGGGTCTGGAAGGGCCTTCTCTCCAAGCTGGGAGCTCCTGGGCCCCCACCATTCACTTTTTGTCCTTGCTGCTGGCAAACAGTAAAGAAACTCACTTTCCCTGTGGCACGTTATGCTTCAGAATTAAAACAATGAAGATTAAAATTTGCACCGAGCCAGTGTGTTGATCGAAGACCACGATTGCCTGTGTTTCTGAGATGCGTCCATGGAAAAATGGAAAAAACTGTGGTGCGTTGACTTGCTGGAACCCTTCCTAAGCCGCAGTGAAAGGAGGGGCTAGATCTGTGTGTGTTCATGCAGCTCACACACCTGATGCTGAACGGATAAGCAAGGCCATCTACATACAAAAGTACACACAAATAATCAGAATGAGTTCTGCAAACACATACGTATGTATGC

In [8]:

# ---------------------------------------------------------------------------
# seq_dict: site id -> upper-cased sequence.
# The FASTA is read as strictly alternating header / sequence lines;
# zip(f, f) pulls two consecutive lines per iteration. A trailing header
# without a sequence line is ignored.
# ---------------------------------------------------------------------------
seq_dict = {}
with open(output_fa, 'r') as fasta_file:
    for header_line, sequence_line in zip(fasta_file, fasta_file):
        seq_dict[header_line[1:].rstrip()] = sequence_line.rstrip().upper()

# ---------------------------------------------------------------------------
# df_dict: site id -> values computed in the `filtered` dataframe.
# ---------------------------------------------------------------------------
df_dict = {}
for index, row in filtered.iterrows():
    df_dict[row['gene']] = {
        'seq_start': row['start'],
        'seq_end': row['end'],
        'count': row['reads'],
        'total_count': row['total_reads'],
        'cut_mode': row['mode'],
        'total_ratio': row['rel_use'],
        'sort_order': index,
    }

# Spot check: number of sequences and one known site.
print(len(seq_dict))
print(seq_dict['FAM101B.2'])

# ---------------------------------------------------------------------------
# site_dict: site id -> raw BED fields merged with the df_dict values.
# gene_dict: gene id (site id up to the first '.') -> number of BED lines.
# n_mismatch: lines whose read count differs from the dataframe's.
#
# NOTE(review): the recorded run shows 71,829 dataframe rows but 71,689 dict
# entries, so some site ids apparently repeat in the BED file. A repeated id
# overwrites its earlier entry here while gene_dict still counts every line.
# ---------------------------------------------------------------------------
gene_dict = {}
site_dict = {}
n_mismatch = 0

# Re-read the same file that was loaded into df_bed; reuse `apadb_bed`
# instead of repeating the path literal.
with open(apadb_bed, 'r') as bed_file:
    for line in bed_file:
        lineparts = line.rstrip().split('\t')
        site_id = lineparts[3]
        from_dataframe = df_dict[site_id]
        bed_count = int(lineparts[4])

        if from_dataframe['count'] != bed_count:
            n_mismatch += 1

        site_dict[site_id] = {
            'chr': lineparts[0],
            'cut_start': int(lineparts[1]),
            'cut_end': int(lineparts[2]),
            'count': bed_count,
            'strand': lineparts[5],
            'type': lineparts[6],
            'mirna': lineparts[9],
            'seq_start': from_dataframe['seq_start'],
            'seq_end': from_dataframe['seq_end'],
            'total_count': from_dataframe['total_count'],
            'cut_mode': from_dataframe['cut_mode'],
            'total_ratio': from_dataframe['total_ratio'],
            'sort_order': from_dataframe['sort_order'],
        }

        gene_id = site_id.split('.')[0]
        gene_dict[gene_id] = gene_dict.get(gene_id, 0) + 1

print(len(site_dict))
print(site_dict['FAM101B.2'])

print(n_mismatch)


71689
GAAGTAAGTAGCAGTGAGCGATTGTGAATGTGTAATGTAAATGGAAAACCGGGTTTTACCGTGTTAAGTTATTCACTAGGGAGCCAGTCGTAGTTCTTTGTAATCCTCTTTCTTCCAAACCTGCTTTGCTGAAAGTTGCAGAAAAGGAAGTGTGTGGAGAGAAACAGAACCCTTCAGGGTGGGTCAGAGGACGCCATCCACAGTGGATTCGTGTTCGTTTGCAGGTGGAAGCAGTGATTTTTAGGACCCACTGATTAAAAACAAACATTCCCAAGTGTCTCTGAGAGATGCTGTTTATTTGTTAATTAAAAAGCTTTTTTCTCTGTCTTTTAAATTATGGCTTTCATGTAATAAGGATATTTTTAGTGAAAAATTGTTTTCCTTTCAAATTACAGACCTTTTAAAAAAACTTAATTTGAGCGAGTACCTTTTCATTTGACACTTTTCCTGTTTCTAACCTTAGGAAACCAGAATAGCGTTTGGCAGACACGACGTTTTCAGTTTACCTTTGACACCTGCCCCACTCCATTTTGCTTTGTGATGTCTTCATTTAACAATAAATTATCTGAAAAAACAAAACTTAGAGAGATGCTTCTGTTTTTAAAGTAGAATTATGTTTGTTTACGCAAAATGAGAAAAACAGCTCCTCATCTTGAGAAATTTTAAGACGTGATTATATTTAACAGTATTAATCTACAAGTACAAGATTTTCCGAGTGTGGCTGGGCATGGTGGCTCACACCTATAATCCCAGCGCTTCACAAGGCCAAGGCTGGAGGATCACTTGAGGCCAGGAGTTCGAGATCAGCCTTG
71689
{'chr': 'chr17', 'cut_start': 289935, 'cut_end': 289947, 'count': 170, 'strand': '-', 'type': 'UTR3', 'mirna': 'None', 'seq_start': 289535, 'seq_end': 290347, 'total_count': 6

In [9]:

# Valid PAS lookup hierarchy.
# valid_pas is a list of dicts (hexamer -> True), ordered from the most to the
# least preferred tier; align_seq walks the tiers in this order.

cano_pas1 = 'AATAAA'
cano_pas2 = 'ATTAAA'

valid_pas = [
    # tier 0: the first canonical hexamer
    {'AATAAA': True},
    # tier 1: the second canonical hexamer
    {'ATTAAA': True},
    # tier 2: four hand-picked variants
    {'AGTAAA': True, 'TATAAA': True, 'CATAAA': True, 'GATAAA': True},
    # tier 3: cano_pas1 with any base substituted at one position
    # (substituting the base already there reproduces cano_pas1 itself)
    {
        cano_pas1[:pos] + base + cano_pas1[pos+1:]: True
        for pos in range(6)
        for base in 'ACGT'
    },
    # tier 4: cano_pas1 with any bases substituted at two distinct positions
    {
        cano_pas1[:pos1] + base1 + cano_pas1[pos1+1:pos2] + base2 + cano_pas1[pos2+1:]: True
        for pos1 in range(6)
        for pos2 in range(pos1 + 1, 6)
        for base1 in 'ACGT'
        for base2 in 'ACGT'
    },
]


In [10]:

def align_seq(seq, cut_start, cut_end, before_cut = 30, after_cut = 5, pas_tiers = None) :
    """Anchor fixed-width windows of ``seq`` on the best hexamer near ``cut_start``.

    The tiers are tried in order. Within a tier, offsets
    ``cut_start - before_cut`` .. ``cut_start + after_cut - 1`` are scanned
    left to right; the first hit at or after ``cut_start`` wins, otherwise the
    last hit before ``cut_start`` is kept. The first tier with any hit ends
    the search. Without any hit the anchor defaults to ``cut_start - 25``.

    Parameters
    ----------
    seq : str
        Sequence to search and slice.
    cut_start : int
        Index in ``seq`` around which the hexamer is searched.
    cut_end : int
        Unused; kept so existing calls stay valid.
    before_cut, after_cut : int
        Extent of the search range before / after ``cut_start``.
    pas_tiers : list of containers of str, optional
        Hexamer tiers, most preferred first. Defaults to the module-level
        ``valid_pas``.

    Returns
    -------
    tuple
        ``(seq_186, seq_286, tier, shift, seq_256, seq_356)`` where ``tier``
        is the index of the matching tier (``-1`` if none matched), ``shift``
        is ``anchor - cut_start``, and the four strings start 50, 100, 125 and
        175 characters before the anchor and are at most 186, 286, 256 and 356
        characters long.

    Notes
    -----
    A warning is printed when either of the two extended windows is shorter
    than requested. Offsets are not clamped: a negative start index is
    interpreted by Python relative to the end of ``seq``.
    """
    if pas_tiers is None :
        # Backward-compatible default: the hierarchy built at module level.
        pas_tiers = valid_pas

    def window(upstream, length) :
        # `length` characters starting `upstream` characters before the anchor.
        return (seq[align_j - upstream:])[:length]

    align_j = cut_start - 25
    aligned = -1

    for tier_index, tier in enumerate(pas_tiers) :
        for j in range(cut_start - before_cut, cut_start + after_cut) :
            if seq[j:j+6] in tier :
                align_j = j
                aligned = tier_index

                # A hit at or past the cut position ends the scan of this tier;
                # earlier hits keep being overwritten by later ones.
                if j >= cut_start :
                    break
        if aligned != -1 :
            break

    aligned_seq = window(50, 186)
    aligned_seq_long = window(100, 186 + 50 + 50)

    if len(aligned_seq_long) != 186 + 50 + 50 :
        print('WARNING! Aligned seq not long enough. Length = ' + str(len(aligned_seq_long)))

    aligned_wide_seq = window(125, 256)
    aligned_wide_seq_long = window(175, 256 + 50 + 50)

    if len(aligned_wide_seq_long) != 256 + 50 + 50 :
        print('WARNING! Aligned seq not long enough. Length = ' + str(len(aligned_wide_seq_long)))

    return aligned_seq, aligned_seq_long, aligned, (align_j - cut_start), aligned_wide_seq, aligned_wide_seq_long


In [11]:
#Global APADB dataframe generation
# Builds one row per entry of site_dict (sequence windows from align_seq plus
# counts, coordinates and annotations), sorts the rows and writes them to
# 'apadb_processed_v2.csv'.

# Column accumulators: one list per output column, appended to in lockstep
# inside the loop below. They are module-level names.
gene = []

prox_id = []
prox_seq = []
prox_seq_ext = []
prox_wide_seq = []
prox_wide_seq_ext = []
prox_count = []
total_count = []
prox_ratio = []

prox_mirna = []

cut_start = []
cut_end = []
cut_mode = []

prox_type = []
prox_pas = []
prox_sitenum = []
num_sites = []

prox_chrom = []
prox_strand = []

pas_pos = []

for site_id in site_dict :
    
    # Site ids are split on '.': the first part is used as the gene id, the
    # second is parsed as the site number.
    gene_id = site_id.split('.')[0]
    site_num = int(site_id.split('.')[1])
    
    gene.append(gene_id)
    
    p_id = site_id
    
    # Raises KeyError if the FASTA held no sequence for this site id.
    p_seq = seq_dict[p_id]
    p_count = site_dict[p_id]['count']
    p_total_count = site_dict[p_id]['total_count']
    p_usage = site_dict[p_id]['total_ratio']
    p_mirna = site_dict[p_id]['mirna']
    
    p_start = site_dict[p_id]['cut_start']
    p_end = site_dict[p_id]['cut_end']
    p_mode = site_dict[p_id]['cut_mode']
    
    p_type = site_dict[p_id]['type']
    
    # cut_start = 400 equals the number of bases bf_apadb2 subtracts from
    # `start` and adds to `end` -- presumably index 400 is therefore the
    # original BED interval boundary inside the extracted sequence; confirm.
    # The third argument (cut_end) is not used by align_seq.
    # NOTE(review): before_cut is 35 here but 30 in the proximal/distal pairs
    # cell, so the same site may be anchored differently there -- confirm
    # whether that is intended.
    p_seq, p_seq_long, p_aligned, p_shift, p_wide_seq, p_wide_seq_long = align_seq(p_seq, 400, len(p_seq) - 800, before_cut = 35, after_cut = 5) #200, ..., 400
    
    # Convert the anchor offset into a coordinate: added to cut_start on the
    # '+' strand, subtracted from cut_end otherwise.
    if site_dict[p_id]['strand'] == '+' :
        pas_pos.append(p_shift + p_start)
    else :
        pas_pos.append(-p_shift + p_end)
    
    prox_id.append(p_id)
    prox_seq.append(p_seq)
    prox_seq_ext.append(p_seq_long)
    
    prox_wide_seq.append(p_wide_seq)
    prox_wide_seq_ext.append(p_wide_seq_long)
    
    prox_count.append(p_count)
    total_count.append(p_total_count)
    prox_ratio.append(p_usage)
    
    prox_mirna.append(p_mirna)
    
    cut_start.append(p_start)
    cut_end.append(p_end)
    cut_mode.append(p_mode)
    
    prox_type.append(p_type)
    
    # Index of the matching tier in valid_pas, or -1 when nothing matched.
    prox_pas.append(p_aligned)
    
    # Reverse the numbering: the highest site number of a gene becomes 1.
    # gene_dict holds the number of BED lines seen for the gene.
    prox_sitenum.append(gene_dict[gene_id] - site_num + 1)
    
    num_sites.append(gene_dict[gene_id])
    
    prox_chrom.append(site_dict[p_id]['chr'])
    prox_strand.append(site_dict[p_id]['strand'])


# Assemble the accumulators into a dataframe ...
df = pd.DataFrame({
        'gene' : gene,
        'gene_id'  : prox_id,
        'sitenum'  : prox_sitenum,
        'num_sites' : num_sites,
        'pas'  : prox_pas,
        'seq'  : prox_seq,
        'seq_ext'  : prox_seq_ext,
        'wide_seq'  : prox_wide_seq,
        'wide_seq_ext'  : prox_wide_seq_ext,
        'count'  : prox_count,
        'total_count'  : total_count,
        'ratio' : prox_ratio,
        'cut_start'  : cut_start,
        'cut_end'  : cut_end,
        'cut_mode'  : cut_mode,
        'chrom' : prox_chrom,
        'strand' : prox_strand,
        'mirna'  : prox_mirna,
        'site_type'  : prox_type,
        'pas_pos' : pas_pos
    })

# ... and fix the column order used in the CSV.
df = df[['gene',
        'gene_id',
        'sitenum',
        'num_sites',
        'pas',
        'seq',
        'seq_ext',
        'wide_seq',
        'wide_seq_ext',
        'count',
        'total_count',
        'ratio',
        'pas_pos',
        'cut_start',
        'cut_end',
        'cut_mode',
        'chrom',
        'strand',
        'mirna',
        'site_type']]

# 'chrom' is a string column, so chromosomes sort lexically.
df = df.sort_values(by=['chrom', 'gene', 'sitenum'])

print(df.head())
print(df.tail())

print('Total number of members: ' + str(len(df)))


# Written to the current working directory, with a header row and no index.
df.to_csv('apadb_processed_v2.csv', header=True, index=False, sep=',')



         gene   gene_id  sitenum  num_sites  pas  \
53932  ABCB10  ABCB10.5        1          5    0   
53931  ABCB10  ABCB10.4        2          5    0   
53930  ABCB10  ABCB10.3        3          5    2   
53929  ABCB10  ABCB10.2        4          5    1   
53928  ABCB10  ABCB10.1        5          5    1   

                                                     seq  \
53932  GTTAAAGATTGAAGCTATTGTCAAATGACAACTTTAAAAAGGCAAT...   
53931  CTATTTCATGAAAAGCATGGAATATTATATTTTATTGTTCATAATT...   
53930  CATAATTAATGAATAAAATTGATATGAATGAATATAGTGTTCTTTG...   
53929  TCTGATACATGATGTTCAATTTTATCTTTAGGTAATATTTTATATC...   
53928  TCAGGAATAAAGAAAAGACTAACATTACACATATCCAAAAACATGT...   

                                                 seq_ext  \
53932  TGGAATATTTTAATTAATATAGCATGGCACCTCATTTTCTTTTGCC...   
53931  CATGAACTAAGCATTTATTAGTTCCCTGATTAGACTGGAAGAAGAA...   
53930  AGAAGAAACCACTATTTCATGAAAAGCATGGAATATTATATTTTAT...   
53929  TGTGCCATAGAAGTATTTACGAAATTGCATTTCATTGTTATGTTTT...   
53928  TATAAAACTTCTCACTACA

In [31]:
# Proximal/distal pair dataframe generation.
# For every site `<gene>.<n>` that has a partner `<gene>.<n-1>` in site_dict,
# emits one row describing both sites, then writes the rows to
# 'apadb_processed_v2_pairs.csv'.
#
# Column accumulators, appended to in lockstep inside the loop below. Several
# names (gene, prox_id, prox_seq, prox_count, total_count, ...) are also used
# by the per-site cell and are reset here.
chrom = []
strand = []
gene = []

prox_id = []
dist_id = []

prox_seq = []
dist_seq = []

prox_count = []
dist_count = []
total_count = []

site_distance = []

prox_mirna = []
dist_mirna = []

prox_cutsite_length = []
dist_cutsite_length = []

prox_cut_start = []
prox_cut_mode = []
prox_cut_end = []
dist_cut_start = []
dist_cut_mode = []
dist_cut_end = []

prox_type = []
dist_type = []

prox_pas = []
dist_pas = []

prox_sitenum = []
dist_sitenum = []

num_sites = []

for site_id in site_dict :
    
    gene_id = site_id.split('.')[0]
    site_num = int(site_id.split('.')[1])
    
    # Only sites with a lower-numbered neighbour of the same gene form a pair.
    if gene_id + '.' + str(site_num-1) not in site_dict :
        continue
    
    # The current site is labelled proximal, its n-1 neighbour distal.
    p_id = site_id
    d_id = gene_id + '.' + str(site_num-1)
    
    gene.append(gene_id)
    
    # Raises KeyError if the FASTA held no sequence for either site id.
    p_seq = seq_dict[p_id]
    p_count = site_dict[p_id]['count']
    p_mirna = site_dict[p_id]['mirna']
    p_cutsite_length = numpy.abs(site_dict[p_id]['cut_end'] - site_dict[p_id]['cut_start'])
    p_cut_start = site_dict[p_id]['cut_start']
    p_cut_mode = site_dict[p_id]['cut_mode']
    p_cut_end = site_dict[p_id]['cut_end']
    p_type = site_dict[p_id]['type']
    
    d_seq = seq_dict[d_id]
    d_count = site_dict[d_id]['count']
    d_mirna = site_dict[d_id]['mirna']
    d_cutsite_length = numpy.abs(site_dict[d_id]['cut_end'] - site_dict[d_id]['cut_start'])
    d_cut_start = site_dict[d_id]['cut_start']
    d_cut_mode = site_dict[d_id]['cut_mode']
    d_cut_end = site_dict[d_id]['cut_end']
    d_type = site_dict[d_id]['type']
    
    # Absolute distance between the two sites' cut_end coordinates.
    site_dist = numpy.abs(site_dict[p_id]['cut_end'] - site_dict[d_id]['cut_end'])
    
    
    # Only the 186-character window and the matching tier index are kept.
    # The third argument (cut_end) is not used by align_seq.
    # NOTE(review): before_cut is 30 here but 35 in the per-site cell, so the
    # same site may be anchored differently in the two CSV files -- confirm
    # whether that is intended.
    p_seq, _, p_aligned, _, _, _ = align_seq(p_seq, 400, len(p_seq) - 800, before_cut = 30, after_cut = 5) #200, ..., 400
    d_seq, _, d_aligned, _, _, _ = align_seq(d_seq, 400, len(d_seq) - 800, before_cut = 30, after_cut = 5)
    
    # Chromosome and strand are taken from the proximal site only.
    chrom.append(site_dict[p_id]['chr'])
    strand.append(site_dict[p_id]['strand'])
    
    prox_id.append(p_id)
    dist_id.append(d_id)
    
    prox_seq.append(p_seq)
    dist_seq.append(d_seq)
    
    prox_count.append(p_count)
    dist_count.append(d_count)
    # Here total_count is the sum over the pair, not the per-gene total
    # stored in site_dict[...]['total_count'].
    total_count.append(p_count + d_count)
    
    site_distance.append(site_dist)
    
    prox_mirna.append(p_mirna)
    dist_mirna.append(d_mirna)
    
    prox_cutsite_length.append(p_cutsite_length)
    dist_cutsite_length.append(d_cutsite_length)
    
    prox_cut_start.append(p_cut_start)
    prox_cut_mode.append(p_cut_mode)
    prox_cut_end.append(p_cut_end)
    dist_cut_start.append(d_cut_start)
    dist_cut_mode.append(d_cut_mode)
    dist_cut_end.append(d_cut_end)
    
    prox_type.append(p_type)
    dist_type.append(d_type)
    
    prox_pas.append(p_aligned)
    dist_pas.append(d_aligned)
    
    # Reversed numbering: the highest site number of a gene becomes 1, and
    # the distal partner (site_num - 1) is one position further.
    prox_sitenum.append(gene_dict[gene_id] - site_num + 1)
    dist_sitenum.append(gene_dict[gene_id] - site_num + 2)
    
    num_sites.append(gene_dict[gene_id])


# Assemble the accumulators into a dataframe. This rebinds `df`, replacing
# the per-site dataframe built by the earlier cell.
df = pd.DataFrame({
        'gene' : gene,
        'chrom' : chrom,
        'strand' : strand,
        'proximal_id'  : prox_id,
        'distal_id'  : dist_id,
        'proximal_sitenum'  : prox_sitenum,
        'distal_sitenum'  : dist_sitenum,
        'proximal_pas'  : prox_pas,
        'distal_pas'  : dist_pas,
        'proximal_seq'  : prox_seq,
        'distal_seq'  : dist_seq,
        'proximal_count'  : prox_count,
        'distal_count'  : dist_count,
        'total_count'  : total_count,
        'site_distance'  : site_distance,
        'proximal_mirna'  : prox_mirna,
        'distal_mirna'  : dist_mirna,
        'proximal_cutsite_length'  : prox_cutsite_length,
        'distal_cutsite_length'  : dist_cutsite_length,
        'proximal_cut_start'  : prox_cut_start,
        'proximal_cut_mode'  : prox_cut_mode,
        'proximal_cut_end'  : prox_cut_end,
        'distal_cut_start'  : dist_cut_start,
        'distal_cut_mode'  : dist_cut_mode,
        'distal_cut_end'  : dist_cut_end,
        'proximal_type'  : prox_type,
        'distal_type'  : dist_type,
        'num_sites' : num_sites
    })

# Fix the column order used in the CSV.
df = df[['gene',
        'chrom',
        'strand',
        'proximal_id',
        'distal_id',
        'proximal_sitenum',
        'distal_sitenum',
        'proximal_pas',
        'distal_pas',
        'proximal_seq',
        'distal_seq',
        'proximal_count',
        'distal_count',
        'total_count',
        'site_distance',
        'proximal_mirna',
        'distal_mirna',
        'proximal_cutsite_length',
        'distal_cutsite_length',
        'proximal_cut_start',
        'proximal_cut_mode',
        'proximal_cut_end',
        'distal_cut_start',
        'distal_cut_mode',
        'distal_cut_end',
        'proximal_type',
        'distal_type',
        'num_sites']]

# 'chrom' is a string column, so chromosomes sort lexically.
df = df.sort_values(by=['chrom', 'gene', 'proximal_sitenum'])


print(df.head())
print(df.tail())

print('Total number of members: ' + str(len(df)))


# Written to the current working directory, with a header row and no index.
df.to_csv('apadb_processed_v2_pairs.csv', header=True, index=False, sep=',')



         gene chrom strand proximal_id distal_id  proximal_sitenum  \
41557  ABCB10  chr1      -    ABCB10.5  ABCB10.4                 1   
41556  ABCB10  chr1      -    ABCB10.4  ABCB10.3                 2   
41555  ABCB10  chr1      -    ABCB10.3  ABCB10.2                 3   
41554  ABCB10  chr1      -    ABCB10.2  ABCB10.1                 4   
21825   ABCD3  chr1      +     ABCD3.5   ABCD3.4                 1   

       distal_sitenum  proximal_pas  distal_pas  \
41557               2             0           0   
41556               3             0           2   
41555               4             2           1   
41554               5             1           1   
21825               2             0           0   

                                            proximal_seq    ...      \
41557  GTTAAAGATTGAAGCTATTGTCAAATGACAACTTTAAAAAGGCAAT...    ...       
41556  CTATTTCATGAAAAGCATGGAATATTATATTTTATTGTTCATAATT...    ...       
41555  CATAATTAATGAATAAAATTGATATGAATGAATATAGTGTTCTTTG...   