In [1]:
import numpy
import pandas as pd

In [2]:

polyadb_df = pd.read_csv('unprocessed_data/polyadb/human_v3.PAS.txt', delimiter='\t')
polyadb_df = polyadb_df.loc[~polyadb_df['Gene Symbol'].isnull()].copy().reset_index(drop=True)

print(polyadb_df.head())


          PAS_ID Chromosome  Position Strand   Mean RPM Intron/exon location  \
0  chr1:564599:+       chr1    564599      +   3.333397          Single exon   
1  chr1:564629:+       chr1    564629      +   1.201574          Single exon   
2  chr1:564664:+       chr1    564664      +  14.856402          Single exon   
3  chr1:564708:+       chr1    564708      +   1.872293          Single exon   
4  chr1:564952:+       chr1    564952      +   1.991760          Single exon   

       Ensemble ID RefSeq Gene ID Gene Symbol  \
0  ENSG00000225972             na    MTND1P23   
1  ENSG00000225972             na    MTND1P23   
2  ENSG00000225972             na    MTND1P23   
3  ENSG00000225972             na    MTND1P23   
4  ENSG00000225972             na    MTND1P23   

                                           Gene Name FAMTOM ID  \
0  mitochondrially encoded NADH:ubiquinone oxidor...        na   
1  mitochondrially encoded NADH:ubiquinone oxidor...        na   
2  mitochondrially encoded

In [3]:


chr_list = []
start_list = []
end_list = []
gene_list = []
strand_list = []
feature_list = []
pas_list = []
mode_list = []
count_list = []

gene_next_id = {}

visited_dict = {}

for index, row in polyadb_df.iterrows() :
    
    if row['PAS_ID'] in visited_dict :
        continue
    visited_dict[row['PAS_ID']] = True
    
    chr_list.append(row['Chromosome'])
    start_list.append(int(row['Position']) - 400) #250
    end_list.append(int(row['Position']) + 400)
    
    if row['Gene Symbol'] not in gene_next_id :
        gene_next_id[row['Gene Symbol']] = 1
    
    gene_list.append(row['Gene Symbol'] + '.' + str(gene_next_id[row['Gene Symbol']]))
    strand_list.append(row['Strand'])
    feature_list.append(row['Intron/exon location'].replace(' ', '_').replace('\'', ''))
    pas_list.append(row['PAS Signal'])
    mode_list.append(int(row['Position']))
    count_list.append(float(row['Mean RPM']))
    
    gene_next_id[row['Gene Symbol']] += 1


filtered = pd.DataFrame({'chr'  : chr_list,
                    'start'  : start_list,
                    'end'  : end_list,
                    'gene'  : gene_list,
                    'strand'  : strand_list,
                    'feature'  : feature_list,
                    'pas' : pas_list,
                    'mode' : mode_list,
                    'count' : count_list,
                })

filtered = filtered[['chr', 'start', 'end', 'gene', 'feature', 'strand', 'pas', 'mode', 'count']]

print(filtered.head())


    chr   start     end        gene      feature strand    pas    mode  \
0  chr1  564199  564999  MTND1P23.1  Single_exon      +  NoPAS  564599   
1  chr1  564229  565029  MTND1P23.2  Single_exon      +  NoPAS  564629   
2  chr1  564264  565064  MTND1P23.3  Single_exon      +  NoPAS  564664   
3  chr1  564308  565108  MTND1P23.4  Single_exon      +  NoPAS  564708   
4  chr1  564552  565352  MTND1P23.5  Single_exon      +  NoPAS  564952   

       count  
0   3.333397  
1   1.201574  
2  14.856402  
3   1.872293  
4   1.991760  


In [4]:
filtered

Unnamed: 0,chr,start,end,gene,feature,strand,pas,mode,count
0,chr1,564199,564999,MTND1P23.1,Single_exon,+,NoPAS,564599,3.333397
1,chr1,564229,565029,MTND1P23.2,Single_exon,+,NoPAS,564629,1.201574
2,chr1,564264,565064,MTND1P23.3,Single_exon,+,NoPAS,564664,14.856402
3,chr1,564308,565108,MTND1P23.4,Single_exon,+,NoPAS,564708,1.872293
4,chr1,564552,565352,MTND1P23.5,Single_exon,+,NoPAS,564952,1.991760
5,chr1,564606,565406,MTND1P23.6,Single_exon,+,NoPAS,565006,1.653459
6,chr1,564887,565687,MTND2P28.1,Single_exon,+,OtherPAS,565287,0.806888
7,chr1,564981,565781,MTND2P28.2,Single_exon,+,NoPAS,565381,1.422803
8,chr1,565006,565806,MTND2P28.3,Single_exon,+,NoPAS,565406,0.772380
9,chr1,565247,566047,MTND2P28.4,Single_exon,+,NoPAS,565647,2.232732


In [5]:
#Load gencode annotation for hg19

df_columns = ['chr', 'source', 'feature_type', 'start', 'end', 'score', 'strand', 'phase', 'info']
gencode_df = pd.read_csv("unprocessed_data/hg19/gencode.v19.annotation.gtf_withproteinids", skiprows=5, header=None, names=df_columns, usecols=[0,1,2,3,4,5,6,7,8], sep='\t')


In [6]:

pd.options.display.max_colwidth = 1000

gencode_df.loc[gencode_df['info'].str.contains("gene_name \"CTDP1\"")].query("feature_type == 'stop_codon'")


Unnamed: 0,chr,source,feature_type,start,end,score,strand,phase,info
2220779,chr18,HAVANA,stop_codon,77513788,77513790,.,+,0,"gene_id ""ENSG00000060069.12""; transcript_id ""ENST00000299543.7""; gene_type ""protein_coding""; gene_status ""KNOWN""; gene_name ""CTDP1""; transcript_type ""protein_coding""; transcript_status ""KNOWN""; transcript_name ""CTDP1-001""; exon_number 13; exon_id ""ENSE00001911230.1""; level 2; protein_id ""ENSP00000299543.7""; tag ""basic""; tag ""appris_principal""; tag ""CCDS""; ccdsid ""CCDS12017.1""; havana_gene ""OTTHUMG00000132920.2""; havana_transcript ""OTTHUMT00000256432.1"";"
2220808,chr18,HAVANA,stop_codon,77513669,77513671,.,+,0,"gene_id ""ENSG00000060069.12""; transcript_id ""ENST00000075430.7""; gene_type ""protein_coding""; gene_status ""KNOWN""; gene_name ""CTDP1""; transcript_type ""protein_coding""; transcript_status ""KNOWN""; transcript_name ""CTDP1-002""; exon_number 12; exon_id ""ENSE00003670433.1""; level 2; protein_id ""ENSP00000075430.7""; tag ""basic""; tag ""CCDS""; ccdsid ""CCDS12018.1""; havana_gene ""OTTHUMG00000132920.2""; havana_transcript ""OTTHUMT00000256433.1"";"
2220836,chr18,HAVANA,stop_codon,77513673,77513675,.,+,0,"gene_id ""ENSG00000060069.12""; transcript_id ""ENST00000591598.1""; gene_type ""protein_coding""; gene_status ""KNOWN""; gene_name ""CTDP1""; transcript_type ""protein_coding""; transcript_status ""NOVEL""; transcript_name ""CTDP1-003""; exon_number 12; exon_id ""ENSE00003642463.1""; level 1; protein_id ""ENSP00000465119.1""; tag ""mRNA_start_NF""; tag ""cds_start_NF""; tag ""exp_conf""; havana_gene ""OTTHUMG00000132920.2""; havana_transcript ""OTTHUMT00000450526.1"";"


In [7]:

gencode_df_stops = gencode_df.query("feature_type == 'stop_codon'").copy().reset_index(drop=True)
gencode_df_stops = gencode_df_stops.loc[gencode_df_stops["info"].str.contains("transcript_status \"KNOWN\"")].copy().reset_index(drop=True)


In [8]:

def _extract_field(s, f) :
    
    s_parts = [subs.strip() for subs in s.split(";")]
    
    s_part = None
    for s_part_cand in s_parts :
        if f in s_part_cand :
            s_part = s_part_cand
            break
    
    if s_part is None :
        return "N/A"
    
    return s_part.split(f)[1].replace("\"", "")

gencode_df_stops['gene'] = gencode_df_stops['info'].apply(lambda s: _extract_field(s, "gene_name "))

df_stops = gencode_df_stops[['chr', 'start', 'end', 'strand', 'gene']].rename(columns={"start" : "stop_codon_start", "end" : "stop_codon_end"})
df_stops['stop_codon_end_stranded'] = df_stops['stop_codon_end']
df_stops.loc[df_stops['strand'] == '-', 'stop_codon_end_stranded'] = df_stops['stop_codon_start']

df_stops = df_stops.drop_duplicates().copy().reset_index(drop=True)


In [9]:
df_stops

Unnamed: 0,chr,stop_codon_start,stop_codon_end,strand,gene,stop_codon_end_stranded
0,chr1,70006,70008,+,OR4F5,70008
1,chr1,138530,138532,-,AL627309.1,138530
2,chr1,368595,368597,+,OR4F29,368597
3,chr1,621096,621098,-,OR4F16,621096
4,chr1,879531,879533,+,SAMD11,879533
5,chr1,880074,880076,-,NOC2L,880074
6,chr1,900569,900571,+,KLHL17,900571
7,chr1,898764,898766,+,KLHL17,898766
8,chr1,909953,909955,+,PLEKHN1,909955
9,chr1,911552,911554,-,C1orf170,911552


In [10]:

filtered = filtered.query("feature == '3_most_exon'").copy().reset_index(drop=True)


In [11]:

filtered['genefam'] = filtered['gene'].apply(lambda x: x.split(".")[0])


In [12]:
filtered

Unnamed: 0,chr,start,end,gene,feature,strand,pas,mode,count,genefam
0,chr1,849773,850573,na.39,3_most_exon,+,OtherPAS,850173,1.361162,na
1,chr1,851844,852644,na.40,3_most_exon,+,NoPAS,852244,2.293956,na
2,chr1,879554,880354,SAMD11.4,3_most_exon,+,AAUAAA,879954,21.226921,SAMD11
3,chr1,879583,880383,SAMD11.5,3_most_exon,+,AAUAAA,879983,1.619696,SAMD11
4,chr1,879825,880625,SAMD11.6,3_most_exon,+,OtherPAS,880225,1.197331,SAMD11
5,chr1,881762,882562,SAMD11.7,3_most_exon,+,AAUAAA,882162,0.621291,SAMD11
6,chr1,882082,882882,SAMD11.8,3_most_exon,+,AAUAAA,882482,2.529296,SAMD11
7,chr1,900694,901494,KLHL17.1,3_most_exon,+,AAUAAA,901094,68.402824,KLHL17
8,chr1,909649,910449,PLEKHN1.1,3_most_exon,+,NoPAS,910049,0.684012,PLEKHN1
9,chr1,910211,911011,PLEKHN1.2,3_most_exon,+,OtherPAS,910611,0.338859,PLEKHN1


In [13]:

filtered = filtered.query("genefam != 'na'").copy().reset_index(drop=True)


In [14]:

filtered_plus = filtered.query("strand == '+'").copy()
filtered_minus = filtered.query("strand == '-'").copy()

filtered_plus_agg = filtered_plus.groupby(['genefam']).agg({'mode' : 'min'}).reset_index()
filtered_minus_agg = filtered_minus.groupby(['genefam']).agg({'mode' : 'max'}).reset_index()

filtered_agg = pd.concat([filtered_plus_agg, filtered_minus_agg])


In [15]:
filtered_agg

Unnamed: 0,genefam,mode
0,A2ML1,9029379
1,AACS,125626870
2,AADAC,151546276
3,AADACL3,12788725
4,AADACP1,151502618
5,AAMDC,77617070
6,AANAT,74466199
7,AAR2,34843840
8,AARD,117955499
9,AASDHPPT,105967718


In [16]:

df_stops = df_stops.join(filtered_agg.set_index("genefam"), on='gene', how='inner').copy().reset_index(drop=True)


In [17]:

df_stops


Unnamed: 0,chr,stop_codon_start,stop_codon_end,strand,gene,stop_codon_end_stranded,mode
0,chr1,879531,879533,+,SAMD11,879533,879954
1,chr1,880074,880076,-,NOC2L,880074,880126
2,chr1,900569,900571,+,KLHL17,900571,901094
3,chr1,898764,898766,+,KLHL17,898766,901094
4,chr1,909953,909955,+,PLEKHN1,909955,910049
5,chr1,934439,934441,-,HES4,934439,934344
6,chr1,949856,949858,+,ISG15,949858,949835
7,chr1,990359,990361,+,AGRN,990361,990426
8,chr1,1007197,1007199,-,RNF223,1007197,1006347
9,chr1,1018273,1018275,-,C1orf159,1018273,1017651


In [18]:

df_stops = df_stops.query("(strand == '+' and stop_codon_end_stranded < mode - 20) or (strand == '-' and stop_codon_end_stranded > mode + 20)").copy().reset_index(drop=True)
df_stops['stop_codon_distance'] = numpy.abs(df_stops['stop_codon_end_stranded'] - df_stops['mode'])

df_stops = df_stops.sort_values(by='stop_codon_distance', ascending=True).drop_duplicates(subset=['gene'], keep='first').copy().reset_index(drop=True)


In [19]:
df_stops

Unnamed: 0,chr,stop_codon_start,stop_codon_end,strand,gene,stop_codon_end_stranded,mode,stop_codon_distance
0,chrX,114347779,114347781,-,LRCH2,114347779,114347758,21
1,chr13,37583434,37583436,+,EXOSC8,37583436,37583457,21
2,chr2,99936145,99936147,-,TXNDC9,99936145,99936124,21
3,chr15,38643863,38643865,+,SPRED1,38643865,38643886,21
4,chr11,125453454,125453456,+,EI24,125453456,125453477,21
5,chr12,91497942,91497944,-,LUM,91497942,91497921,21
6,chr11,124502132,124502134,+,TBRG1,124502134,124502155,21
7,chr1,167887561,167887563,-,MPC2,167887561,167887540,21
8,chr22,43559075,43559077,+,TSPO,43559077,43559098,21
9,chr7,138711290,138711292,-,ZC3HAV1L,138711290,138711269,21


In [20]:

filtered = filtered.join(df_stops[['gene', 'stop_codon_end_stranded']].set_index("gene"), on='genefam', how='inner').copy().reset_index(drop=True)


In [21]:

filtered


Unnamed: 0,chr,start,end,gene,feature,strand,pas,mode,count,genefam,stop_codon_end_stranded
0,chr1,879554,880354,SAMD11.4,3_most_exon,+,AAUAAA,879954,21.226921,SAMD11,879533
1,chr1,879583,880383,SAMD11.5,3_most_exon,+,AAUAAA,879983,1.619696,SAMD11,879533
2,chr1,879825,880625,SAMD11.6,3_most_exon,+,OtherPAS,880225,1.197331,SAMD11,879533
3,chr1,881762,882562,SAMD11.7,3_most_exon,+,AAUAAA,882162,0.621291,SAMD11,879533
4,chr1,882082,882882,SAMD11.8,3_most_exon,+,AAUAAA,882482,2.529296,SAMD11,879533
5,chr1,900694,901494,KLHL17.1,3_most_exon,+,AAUAAA,901094,68.402824,KLHL17,900571
6,chr1,909649,910449,PLEKHN1.1,3_most_exon,+,NoPAS,910049,0.684012,PLEKHN1,909955
7,chr1,910211,911011,PLEKHN1.2,3_most_exon,+,OtherPAS,910611,0.338859,PLEKHN1,909955
8,chr1,910845,911645,PLEKHN1.3,3_most_exon,+,OtherPAS,911245,2.716108,PLEKHN1,909955
9,chr1,990026,990826,AGRN.1,3_most_exon,+,NoPAS,990426,1.308165,AGRN,990361


In [22]:

filtered['isoform_len'] = numpy.abs(filtered['mode'] - filtered['stop_codon_end_stranded'])

filtered = filtered.query("isoform_len < 10000").copy().reset_index(drop=True)


In [23]:
filtered

Unnamed: 0,chr,start,end,gene,feature,strand,pas,mode,count,genefam,stop_codon_end_stranded,isoform_len
0,chr1,879554,880354,SAMD11.4,3_most_exon,+,AAUAAA,879954,21.226921,SAMD11,879533,421
1,chr1,879583,880383,SAMD11.5,3_most_exon,+,AAUAAA,879983,1.619696,SAMD11,879533,450
2,chr1,879825,880625,SAMD11.6,3_most_exon,+,OtherPAS,880225,1.197331,SAMD11,879533,692
3,chr1,881762,882562,SAMD11.7,3_most_exon,+,AAUAAA,882162,0.621291,SAMD11,879533,2629
4,chr1,882082,882882,SAMD11.8,3_most_exon,+,AAUAAA,882482,2.529296,SAMD11,879533,2949
5,chr1,900694,901494,KLHL17.1,3_most_exon,+,AAUAAA,901094,68.402824,KLHL17,900571,523
6,chr1,909649,910449,PLEKHN1.1,3_most_exon,+,NoPAS,910049,0.684012,PLEKHN1,909955,94
7,chr1,910211,911011,PLEKHN1.2,3_most_exon,+,OtherPAS,910611,0.338859,PLEKHN1,909955,656
8,chr1,910845,911645,PLEKHN1.3,3_most_exon,+,OtherPAS,911245,2.716108,PLEKHN1,909955,1290
9,chr1,990026,990826,AGRN.1,3_most_exon,+,NoPAS,990426,1.308165,AGRN,990361,65


In [24]:

filtered2 = filtered.copy().reset_index(drop=True)

filtered2_plus = filtered2.query("strand == '+'").copy().reset_index(drop=True)
filtered2_minus = filtered2.query("strand == '-'").copy().reset_index(drop=True)

filtered2_plus = filtered2_plus[['chr', 'stop_codon_end_stranded', 'mode', 'gene', 'feature', 'strand']]
filtered2_minus = filtered2_minus[['chr', 'stop_codon_end_stranded', 'mode', 'gene', 'feature', 'strand']]

filtered2_minus['temp1'] = filtered2_minus['stop_codon_end_stranded']
filtered2_minus['stop_codon_end_stranded'] = filtered2_minus['mode']
filtered2_minus['mode'] = filtered2_minus['temp1']

filtered2 = pd.concat([filtered2_plus, filtered2_minus]).copy().reset_index(drop=True)[['chr', 'stop_codon_end_stranded', 'mode', 'gene', 'feature', 'strand']]

filtered2.loc[filtered2['strand'] == '+', 'stop_codon_end_stranded'] = filtered2.loc[filtered2['strand'] == '+']['stop_codon_end_stranded'] - 3
filtered2.loc[filtered2['strand'] == '-', 'mode'] = filtered2.loc[filtered2['strand'] == '-']['mode'] + 2

filtered2 = filtered2.rename(columns={'stop_codon_end_stranded' : 'start', 'mode' : 'end'})


In [25]:
filtered2

Unnamed: 0,chr,start,end,gene,feature,strand
0,chr1,879530,879954,SAMD11.4,3_most_exon,+
1,chr1,879530,879983,SAMD11.5,3_most_exon,+
2,chr1,879530,880225,SAMD11.6,3_most_exon,+
3,chr1,879530,882162,SAMD11.7,3_most_exon,+
4,chr1,879530,882482,SAMD11.8,3_most_exon,+
5,chr1,900568,901094,KLHL17.1,3_most_exon,+
6,chr1,909952,910049,PLEKHN1.1,3_most_exon,+
7,chr1,909952,910611,PLEKHN1.2,3_most_exon,+
8,chr1,909952,911245,PLEKHN1.3,3_most_exon,+
9,chr1,990358,990426,AGRN.1,3_most_exon,+


In [26]:
output_id = 'polyadb_v3_utr3_isoform_seqs'

hg19_fai     = 'unprocessed_data/hg19/hg19.fa.fai'
hg19_fa      = 'unprocessed_data/hg19/hg19.fa'

# bed
output_bed = output_id + '.bed'
bed_columns = ['chr', 'start', 'end', 'gene', 'feature', 'strand']
filtered2.to_csv(output_bed, sep='\t', header=False, columns=bed_columns, index=False)

# fasta
output_fa = output_id + '.fa'
!bedtools getfasta -name -s -fi "$hg19_fa" -bed "$output_bed" -fo "$output_fa"

# file tops
!head -5 "$output_bed" | column -t ; echo
!head -10 "$output_fa" ; echo


chr1  879530  879954  SAMD11.4  3_most_exon  +
chr1  879530  879983  SAMD11.5  3_most_exon  +
chr1  879530  880225  SAMD11.6  3_most_exon  +
chr1  879530  882162  SAMD11.7  3_most_exon  +
chr1  879530  882482  SAMD11.8  3_most_exon  +

>SAMD11.4
TGAGGTTGCCGGGGGTAGGGGTGGGGCCACACAAATCTCCAGGAGCCACCACTCAACACAATGGCCCTGCCTCCCACCGCTTTATTTCTTTCGGTTTCGGATGCAAAACAAAAAATTTTAAAAGAAAATGTGACTTCAAAGGAAAGGAACAAATTTTCAAAGACTTGGGGGAGTGAAGGCAGAGCCTGGTGCAGATGGACGAGGTCTGCAGACGGAGGGCAGAGGTGGTGGAAGGGGCCAGGGGCCTGCAGGCCTCCCCCTGGAACTGGGACTGGTCTCGGTCTGCTGACGTCAGGGTCAGCTCCCCCGCGGAGCTGACTTCAGCAGCCCACAGCTGTGGGGCTTCAGCAGCCACAccagcccagcccagcccagcTCTCGATACGTTTGGTCTTTCATGCTGAAAAATAAATAATAAAGCCTG
>SAMD11.5
TGAGGTTGCCGGGGGTAGGGGTGGGGCCACACAAATCTCCAGGAGCCACCACTCAACACAATGGCCCTGCCTCCCACCGCTTTATTTCTTTCGGTTTCGGATGCAAAACAAAAAATTTTAAAAGAAAATGTGACTTCAAAGGAAAGGAACAAATTTTCAAAGACTTGGGGGAGTGAAGGCAGAGCCTGGTGCAGATGGACGAGGTCTGCAGACGGAGGGCAGAGGTGGTGGAAGGGGCCAGGGGCCTGCAGGCCTCCCCCTGGAACTGGGACTGGTCTCGGTCTGCTGACGTCAGGGTCAGCTCCCCCGCGGAGCTGAC

In [5]:
#Re-load coordinate dataframe

output_id = 'polyadb_v3_utr3_isoform_seqs'
output_bed = output_id + '.bed'

polyadb_bed_df = pd.read_csv(output_bed, names=['chr', 'start', 'end', 'gene', 'feature', 'strand'], sep='\t')

print(polyadb_bed_df.head())


    chr   start     end      gene      feature strand
0  chr1  879530  879954  SAMD11.4  3_most_exon      +
1  chr1  879530  879983  SAMD11.5  3_most_exon      +
2  chr1  879530  880225  SAMD11.6  3_most_exon      +
3  chr1  879530  882162  SAMD11.7  3_most_exon      +
4  chr1  879530  882482  SAMD11.8  3_most_exon      +


In [27]:

gene_ids = []
seqs = []

with open(output_fa, 'r') as f :
    i = 0
    seq_id = ''
    for line in f :
        if i % 2 == 0 :
            seq_id = line[1:].rstrip()
        else :
            gene_ids.append(seq_id)
            seqs.append(line.rstrip().upper())
        
        i += 1

iso_df = pd.DataFrame({
    'gene_id' : gene_ids,
    'seq' : seqs,
}).sort_values(by='gene_id').copy().reset_index(drop=True)

iso_df.to_csv('polyadb_processed_v3_utr3_isoforms.csv', header=True, index=False, sep='\t')

print(len(iso_df))
print(iso_df.head())


84723
  gene_id  \
0  A1BG.1   
1  A1BG.2   
2  A1BG.3   
3  A1BG.4   
4  A1BG.5   

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   

In [6]:
#Re-load sequence dataframe and re-save with coordinates

iso_df = pd.read_csv('polyadb_processed_v3_utr3_isoforms.csv', sep='\t')

iso_df = iso_df.join(polyadb_bed_df.set_index("gene"), on='gene_id', how='inner').copy().reset_index(drop=True)

iso_df['gene'] = iso_df['gene_id'].apply(lambda x: ".".join(x.split(".")[:-1]))
iso_df = iso_df[['chr', 'gene', 'gene_id', 'seq', 'start', 'end', 'strand']].rename(columns={'chr' : 'chrom'})

iso_df.to_csv('polyadb_processed_v3_utr3_isoforms_and_coords.csv', header=True, index=False, sep='\t')

print(iso_df.head())


   chrom  gene gene_id                                                seq  \
0  chr19  A1BG  A1BG.1  TGATGCAGCCGCGGGCCCAGGGTGCTGTTGGTGTCCTCAGAAGTGC...   
1  chr19  A1BG  A1BG.2  TGATGCAGCCGCGGGCCCAGGGTGCTGTTGGTGTCCTCAGAAGTGC...   
2  chr19  A1BG  A1BG.3  TGATGCAGCCGCGGGCCCAGGGTGCTGTTGGTGTCCTCAGAAGTGC...   
3  chr19  A1BG  A1BG.4  TGATGCAGCCGCGGGCCCAGGGTGCTGTTGGTGTCCTCAGAAGTGC...   
4  chr19  A1BG  A1BG.5  TGATGCAGCCGCGGGCCCAGGGTGCTGTTGGTGTCCTCAGAAGTGC...   

      start       end strand  
0  58853731  58858390      -  
1  58853899  58858390      -  
2  58853924  58858390      -  
3  58855285  58858390      -  
4  58856545  58858390      -  
