In [1]:
import pandas as pd
import xml.etree.ElementTree as et

import numpy as np
import scipy.sparse as sp
import scipy.io as spio


In [2]:

# Input tables: tab-separated text files, one per polyadb table.
polyadb_site_file = 'polyAsite.db2'
polyadb_gene_file = 'gene.db2'
polyadb_pas_file = 'PAS.db2'

# Read all three tables with the same parser settings.
polyadb_site_df, polyadb_gene_df, polyadb_pas_df = (
    pd.read_csv(table_path, sep='\t')
    for table_path in (polyadb_site_file, polyadb_gene_file, polyadb_pas_file)
)


In [3]:
# Show the first rows of each loaded table, one table after another.
print(
    *(table.head() for table in (polyadb_site_df, polyadb_gene_df, polyadb_pas_df)),
    sep='\n'
)

         siteid   llid  chr  sitenum   position  supporting EST  cleavage
0      Dr.1.1.2  30686   24        2    6919195               1         1
1  Dr.10005.1.1      0   Un        1   11820591               6         2
2   Dr.1002.1.1      0   Un        1  143246768               3         2
3  Dr.10024.1.1      0  NaN        1  384763971               3         1
4   Dr.1003.1.1      0   Un        1  113763249               1         1
     llid chr  str     symbol                                   name  \
0    1018  17    1       CDK3              cyclin-dependent kinase 3   
1  115098  19    1  LOC115098          hypothetical protein BC013949   
2  140901  20    1      STK35             serine/threonine kinase 35   
3   10570  10    1     DPYSL4             dihydropyrimidinase-like 4   
4   57165   1    1      GJA12  gap junction protein, alpha 12, 47kDa   

  unigene id organism     chr_fr     chr_to  
0  Hs.100009       Hs   71486896   71513675  
1  Hs.100043       Hs   1790482

In [7]:

#Create polyadb data dicts

# gene_dict maps a unigene id (as str) to a record with the keys
# 'chr', 'strand', 'gene', 'llid', 'start' and 'end'.
# Only rows with organism 'Hs' whose chromosome name does not contain
# 'random' are kept.
gene_dict = {}

# Number of rows whose unigene id was already present in gene_dict.
# NOTE: despite the name, this counts duplicated unigene ids, not llids;
# the name is kept so that other cells referring to it keep working.
num_dup_llid = 0

for index, row in polyadb_gene_df.iterrows() :
    
    # Coerce to str before the substring test: pandas yields NaN (a float)
    # for a missing value and may yield a number for a numeric-looking one,
    # and "'random' in <non-str>" raises TypeError. This matches the str()
    # coercion already applied when the chromosome is stored below.
    chrom = str(row['chr'])
    
    if row['organism'] != 'Hs' or 'random' in chrom :
        continue
    
    gene_id = str(row['unigene id'])
    
    if gene_id in gene_dict :
        print('Warning! Dup gene ID (unigene id) = ' + gene_id)
        num_dup_llid += 1
    
    # A repeated id replaces the earlier record (the last row wins).
    gene_dict[gene_id] = {
        'chr' : 'chr' + chrom,
        # 'str' == -1 marks the minus strand; anything else is treated as plus.
        'strand' : '-' if row['str'] == -1 else '+',
        'gene' : row['symbol'],
        'llid' : row['llid'],
        'start' : row['chr_fr'],
        'end' : row['chr_to'],
    }
    
    
print('Number of dup gene ids = ' + str(num_dup_llid))
print('Number of genes = ' + str(len(gene_dict)))


# site_dict maps each siteid to its annotation record. Sites whose parent
# gene is not present in gene_dict are skipped.
site_dict = {}

for _, site_row in polyadb_site_df.iterrows() :
    
    # The gene key is the siteid without its last two dot-separated fields,
    # e.g. 'Hs.547350.1.2' -> 'Hs.547350'.
    parent_gene = site_row['siteid'].rsplit('.', 2)[0]
    
    if parent_gene in gene_dict :
        site_dict[str(site_row['siteid'])] = {
            'llid' : str(site_row['llid']),
            'geneid' : parent_gene,
            'chr' : 'chr' + str(site_row['chr']),
            'num_sites' : site_row['sitenum'],
            'position' : site_row['position'],
            'support' : site_row['supporting EST'],
        }

print('Number of sites = ' + str(len(site_dict)))


Number of dup gene ids = 76
Number of genes = 22322
Number of sites = 43183


In [None]:
# Preview a few entries only: printing the whole dict would flood the cell
# output with one record per gene.
for gene_id, gene_info in list(gene_dict.items())[:5] :
    print(gene_id, gene_info)

In [None]:
# Preview a few entries only: printing the whole dict would flood the cell
# output with one record per site.
for site_id, site_info in list(site_dict.items())[:5] :
    print(site_id, site_info)

In [13]:
#Create bed file of polyadb sites

bed_columns = ['chr', 'start', 'end', 'gene', 'reads', 'strand']

# One row per site: a 300-wide window around the site position, with the
# siteid in the 'gene' column and the supporting EST count in 'reads'.
bed_records = []

for site_id, site in site_dict.items() :
    site_strand = gene_dict[site['geneid']]['strand']
    
    # The window extends 175 to one side and 125 to the other; the two
    # flanks swap sides for minus-strand genes.
    if site_strand == '-' :
        left_flank, right_flank = 125, 175
    else :
        left_flank, right_flank = 175, 125
    
    bed_records.append({
        'chr' : site['chr'],
        'start' : site['position'] - left_flank,
        'end' : site['position'] + right_flank,
        'gene' : site_id,
        'reads' : site['support'],
        'strand' : site_strand,
    })

bed_df = pd.DataFrame(bed_records, columns=bed_columns)

print(bed_df.head())

# Written without header or index, tab-separated, in bed_columns order.
output_bed = 'polyadb_sites.bed'
bed_df.to_csv(output_bed, sep='\t', header=False, columns=bed_columns, index=False)


     chr      start        end           gene  reads strand
0  chr11    1288338    1288638  Hs.547350.1.2     15      +
1   chrX   53178406   53178706   Hs.7483.1.11      2      -
2  chr13   94166326   94166626  Hs.552626.1.2      1      +
3   chrX    8308031    8308331  Hs.521869.1.3      1      -
4   chr7  150691602  150691902  Hs.549162.1.9      8      -


In [14]:
hg17_fai     = 'hg17.fa.fai'
hg17_fa      = 'hg17.fa'

# fasta
output_fa = 'polyadb_seqs.fa'
#!bedtools getfasta -name -s -fi "$hg19_fa" -bed "$output_bed" -fo | cut -d : -f-4 > "$output_fa"
!bedtools getfasta -name -s -fi "$hg17_fa" -bed "$output_bed" -fo "$output_fa"
    
# file tops
!head -5 "$output_bed" | column -t ; echo
!head -10 "$output_fa" ; echo




chr11  1288338    1288638    Hs.547350.1.2  15  +
chrX   53178406   53178706   Hs.7483.1.11   2   -
chr13  94166326   94166626   Hs.552626.1.2  1   +
chrX   8308031    8308331    Hs.521869.1.3  1   -
chr7   150691602  150691902  Hs.549162.1.9  8   -

>Hs.547350.1.2
acgagcccaggtagtacagcacgtctcccccgtgatgttttttggcttttatcttacatataaacaagcgtacccaggtggacgccttcctcctcgtgcttttgggaacaccctgctctatggagtagccattcttttattccttcactgccttaataaacttgctttcactttactctgtggactggcctggaattctttcttccacaagatccacgaatcctcttttggggtctggatcaggacccttttccggtaacaCAaccactccactaaaaactctttgtcagtgatctctat
>Hs.7483.1.11
GTCGTTTGTTCCCTTGCTGTGGCCATTGCTGCCATCTCCTCCACTGCTTGAAGGCCTCACCCCATGCCCTCTGCCACTCCCATAGTGCTCTGTAAATATTATCAGGAGGAAAAGGCCTCTCAGAGTGCGTGTTGCTGTGTACAAAGGAATTTCCATCAATAAAAGCTGATCTCTTCTCTCTGTCTGATGTATATTCCACCCACCCCCACTTCCCTCTTCCAATCCCCACCAGGGCCTTGCCTTCTTTCCTATCCAGGGCTGGGAATAGGGGCCGGGAAAGAGAGTAGGGTTTGATTCGTG
>Hs.552626.1.2
GTATCTAGTCGGAAACTGAGTGACACTCCCTCAGAAAAAGTTGTGGGCCTGTGTAAATTGAGTTTCCTCTTTTCCCCCTTCAATTCTCCCCAGAAAATAGTCT