In [1]:
import numpy
import pandas as pd

In [5]:

# Load the PolyA_DB v3 human PAS table (tab-separated) and keep only the rows
# that carry a gene symbol, renumbering the index from 0.
# NOTE(review): other columns in the printed preview use the literal string
# 'na' for missing values, which pandas does not parse as null by default --
# confirm that 'Gene Symbol' never uses that placeholder.
polyadb_df = (
    pd.read_csv('unprocessed_data/polyadb/human_v3.PAS.txt', sep='\t')
    .dropna(subset=['Gene Symbol'])
    .reset_index(drop=True)
)

print(polyadb_df.head())


          PAS_ID Chromosome  Position Strand   Mean RPM Intron/exon location  \
0  chr1:564599:+       chr1    564599      +   3.333397          Single exon   
1  chr1:564629:+       chr1    564629      +   1.201574          Single exon   
2  chr1:564664:+       chr1    564664      +  14.856402          Single exon   
3  chr1:564708:+       chr1    564708      +   1.872293          Single exon   
4  chr1:564952:+       chr1    564952      +   1.991760          Single exon   

       Ensemble ID RefSeq Gene ID Gene Symbol  \
0  ENSG00000225972             na    MTND1P23   
1  ENSG00000225972             na    MTND1P23   
2  ENSG00000225972             na    MTND1P23   
3  ENSG00000225972             na    MTND1P23   
4  ENSG00000225972             na    MTND1P23   

                                           Gene Name FAMTOM ID  \
0  mitochondrially encoded NADH:ubiquinone oxidor...        na   
1  mitochondrially encoded NADH:ubiquinone oxidor...        na   
2  mitochondrially encoded

In [6]:


# Build one BED record per unique PAS_ID: a window of `flank` nt on each side of
# the annotated position, named '<gene symbol>.<n>' where n counts the sites of
# that gene in the order they appear in polyadb_df.
flank = 400  # an earlier revision of this cell used 250

chr_list, start_list, end_list = [], [], []
gene_list, strand_list, feature_list = [], [], []
pas_list, mode_list = [], []

gene_next_id = {}  # gene symbol -> next per-gene site number to hand out
visited_dict = {}  # PAS_IDs that have already produced a record

for _, site in polyadb_df.iterrows() :
    
    pas_id = site['PAS_ID']
    if pas_id in visited_dict :
        # Only the first row of a repeated PAS_ID is used.
        continue
    visited_dict[pas_id] = True
    
    symbol = site['Gene Symbol']
    position = int(site['Position'])
    
    site_number = gene_next_id.setdefault(symbol, 1)
    gene_next_id[symbol] = site_number + 1
    
    chr_list.append(site['Chromosome'])
    start_list.append(position - flank)
    end_list.append(position + flank)
    gene_list.append(symbol + '.' + str(site_number))
    strand_list.append(site['Strand'])
    # Feature labels become single tokens: spaces -> '_', apostrophes removed.
    feature_list.append(site['Intron/exon location'].replace(' ', '_').replace('\'', ''))
    pas_list.append(site['PAS Signal'])
    mode_list.append(position)


# Column order written to disk: the strand sits in the 6th column, which is
# where `bedtools getfasta -s` looks for it.
bed_columns = ['chr', 'start', 'end', 'gene', 'feature', 'strand', 'pas', 'mode']

polyadb_bed = pd.DataFrame({
    'chr' : chr_list,
    'start' : start_list,
    'end' : end_list,
    'gene' : gene_list,
    'feature' : feature_list,
    'strand' : strand_list,
    'pas' : pas_list,
    'mode' : mode_list,
})[bed_columns]

polyadb_bed.to_csv('polyadb_bed.bed', sep='\t', header=False, index=False)

print(polyadb_bed.head())


    chr   start     end        gene      feature strand    pas    mode
0  chr1  564199  564999  MTND1P23.1  Single_exon      +  NoPAS  564599
1  chr1  564229  565029  MTND1P23.2  Single_exon      +  NoPAS  564629
2  chr1  564264  565064  MTND1P23.3  Single_exon      +  NoPAS  564664
3  chr1  564308  565108  MTND1P23.4  Single_exon      +  NoPAS  564708
4  chr1  564552  565352  MTND1P23.5  Single_exon      +  NoPAS  564952


In [7]:
# Reference genome inputs. NOTE(review): hg19_fai is assigned here but not
# referenced by any command in this cell.
hg19_fai     = 'unprocessed_data/hg19/hg19.fa.fai'
hg19_fa      = 'unprocessed_data/hg19/hg19.fa'

# Extract the sequence of every BED interval into a FASTA file.
#   -name : label each record with the BED name column (the '<gene>.<n>' site id)
#   -s    : honour the strand column (minus-strand intervals are reverse-complemented)
# The output file name is spelled 'poladb' (no 'y'); the same spelling is used
# where the file is read back later in this notebook.
output_fa = 'poladb_seqs.fa'
output_bed = 'polyadb_bed.bed'
!bedtools getfasta -name -s -fi "$hg19_fa" -bed "$output_bed" -fo "$output_fa"
    
# Preview the first lines of the BED input and of the FASTA output.
!head -5 "$output_bed" | column -t ; echo
!head -10 "$output_fa" ; echo


chr1  564199  564999  MTND1P23.1  Single_exon  +  NoPAS  564599
chr1  564229  565029  MTND1P23.2  Single_exon  +  NoPAS  564629
chr1  564264  565064  MTND1P23.3  Single_exon  +  NoPAS  564664
chr1  564308  565108  MTND1P23.4  Single_exon  +  NoPAS  564708
chr1  564552  565352  MTND1P23.5  Single_exon  +  NoPAS  564952

>MTND1P23.1
ATCTATTGGCCATTCACAGCATAGCGTATAAACCTAGCTCATGATTTCTTTGCAATAGAAGTGTACTTTTTCATCACATTCCCTTCACAACTTACTCACCAGATCAGACTTTGAGCTCTCCTCCTGGCTTAGCCTGGATCGTTTGAAATGGTCATCCATCCTTTGGCCCCAATACCTAAACTAAGGTCTATGAACAATAAGATGATTTTCTTCAGTGGGACTTTTTTGTTTAATATATTAGATTTGACCTTCAGCAAGGTCAAAGGGAGTCCGAACTAGTCTCAGGCTTCAACATCGAATACGCCGCAGGCCCCTTCGCCCTATTCTTCATAGCCGAATACACAAACATTATTATAATAAACACCCTCACCACTACAATCTTCCTAGGAACAACATATAACGCACTCTCCCCTGAACTCTACACAACATATTTTGTCACCAAGACCCTACTTCTGACCTCCCTGTTCTTATGAATTCGAACAGCATACCCCCGATTCCGCTACGACCAACTCATACACCTCCTATGAAAAAACTTCCTACCACTCACCCTAGCATTACTTATATGATATGTCTCCATACCCATTACAATCTCCAGCATTCCCCCTCAAACCTAAGAAATATGTCTGATAAAAGAGTTACTTTGATAGAGTAAATAATAGGAGTTTAA

In [8]:

# Parse the FASTA written by `bedtools getfasta` into
# {record name: upper-case sequence}.
#
# Compared with a strict "header line, sequence line" reader this parser:
#   * strips line endings with rstrip, so a final line without a trailing
#     newline does not lose its last base;
#   * recognises headers by their leading '>', so sequences wrapped over
#     several lines are concatenated instead of being mistaken for headers;
#   * removes the '::chr:start-end' and '(+)' / '(-)' decorations that some
#     bedtools releases append to `-name` headers, so the keys always match
#     polyadb_bed['gene'].
seq_dict = {}
with open('poladb_seqs.fa', 'r') as f :
    seq_id = None
    for line in f :
        line = line.rstrip('\r\n')
        
        if line.startswith('>') :
            seq_id = line[1:].split('::')[0]
            if seq_id.endswith(('(+)', '(-)')) :
                seq_id = seq_id[:-3]
            seq_dict[seq_id] = ''
        elif seq_id is not None :
            seq_dict[seq_id] += line.upper()

# gene_dict : gene symbol -> number of sites recorded for that gene
# df_dict   : site id ('<gene>.<n>') -> per-site record
gene_dict = {}

df_dict = {}
for index, row in polyadb_bed.iterrows() :
    site_id = row['gene']
    
    df_dict[site_id] = {
        'seq' : seq_dict[site_id],
        'chr' : row['chr'],
        'seq_start' : row['start'],
        'seq_end' : row['end'],
        'strand' : row['strand'],
        'type' : row['feature'],
        'pas' : row['pas'],
        'cut_mode' : row['mode'],
        'sort_order' : index,  # row position in polyadb_bed
    }
    
    # NOTE(review): taking the text before the first '.' assumes gene symbols
    # never contain a '.' themselves -- confirm against the source table.
    gene_id = site_id.split('.')[0]
    gene_dict[gene_id] = gene_dict.get(gene_id, 0) + 1


print(len(df_dict))
print(df_dict['WASH7P.1'])


228500
{'seq': 'TCTGGGATGGGAGCTGGGCCGGGGACCTCCCTGGTCACACACCTTCTTCCCTAGACACCCCACACTTTGTGTTTCAGACCTACAAGATGGGGTACTAACACCACCCCCACCGCCCCCACCACCACCCCCAGCTCCTGAGGTGCTGGCCAGTGCACCCCCACTCCCACCCTCAACCGCGGCCCCTGTAGGCCAAGGCGCCAGGCAGGACGACAGCAGCAGCAGCGCGTCTCCTTCAGGTGGGAGCAGCTCTTTGAGGCCACCTGATTTCTGGCGTGCTCAGTGCACTCGGGTGGATTTTCTGTGGGTTTGTTAAGTGGTCAGAAATTCTCAATTTTTTGAATAGTTTCCATTTCAAATATCTTGTTCTACTTGGTTCATAAAATAGTGGTTTTCAAACTGTAGAGCTCTGGACTTCTCACTTCTAGGGCAGAGGGAGCCTGAACAAGTGAGGCTCTGGGTTCCCCATTCCTAATTAAACCAATGGAAAGAAGGGGTCTAATAACAAACTACAGCAACACATTTTTCATTTCAGCTTCACTGCTGTGTCTCCCAGTGTAACCCTAGCATCCAGAAGTGGCACAAAACCCCTCTGCTGGCTCGTGTGTGCAACTGAGACTGTCAGAGCATGGCTAGCTCAGGGGTCCAGCTCTGCAGGGTGGGGGCTAGAGAGGAAGCAGGGAGTATCTGCACACAGGATGCCCGCGCTCAGGTGGTTGCAGAAGTCAGTGCCCAGGCCCCCACACACAGTCTCCAAAGGTCCGGCCTCCCCAGCGCAGGGCTCCTCGTTTGAGGGGAGGTGA', 'chr': 'chr1', 'seq_start': 16042, 'seq_end': 16842, 'strand': '-', 'type': 'Intron', 'pas': 'OtherPAS', 'cut_mode': 16442, 'sort_order': 10925}


In [9]:

#Make Valid PAS lookup hierarchy
#
# valid_pas is a list of lookup tables ordered from most to least preferred:
#   [0] AATAAA
#   [1] ATTAAA
#   [2] four hand-picked single-base variants of AATAAA
#   [3] AATAAA with any one position replaced by any base
#   [4] AATAAA with any two positions replaced by any bases
# Tiers 3 and 4 also contain the hexamers of the earlier tiers, because a
# position may be "replaced" by the base it already holds.

cano_pas1 = 'AATAAA'
cano_pas2 = 'ATTAAA'

valid_pas = [
    {'AATAAA' : True},
    {'ATTAAA' : True},
    {'AGTAAA' : True, 'TATAAA' : True, 'CATAAA' : True, 'GATAAA' : True},
    {
        cano_pas1[:pos] + base + cano_pas1[pos+1:] : True
        for pos in range(0, 6)
        for base in ['A', 'C', 'G', 'T']
    },
    {
        cano_pas1[:pos1] + base1 + cano_pas1[pos1+1:pos2] + base2 + cano_pas1[pos2+1:] : True
        for pos1 in range(0, 6)
        for pos2 in range(pos1 + 1, 6)
        for base1 in ['A', 'C', 'G', 'T']
        for base2 in ['A', 'C', 'G', 'T']
    },
]


In [10]:


def search_seq(seq, cut_start, cut_end, search_pas, before_cut = 45, after_cut = 1) :
    """
    Align `seq` on the first occurrence of one specific hexamer near the cut site.

    Window starts from cut_start - before_cut up to (but not including)
    cut_start + after_cut are scanned left to right, and the first start whose
    6-mer equals `search_pas` becomes the anchor. Without a hit the anchor
    defaults to cut_start - 25.

    `cut_end` is accepted but not used.

    Returns a 6-tuple:
        aligned_seq           -- 186 nt starting 50 nt before the anchor
        aligned_seq_long      -- 286 nt starting 100 nt before the anchor
        aligned               -- 0 if the hexamer was found, otherwise -1
        shift                 -- anchor - cut_start
        aligned_wide_seq      -- 256 nt starting 125 nt before the anchor
        aligned_wide_seq_long -- 356 nt starting 175 nt before the anchor

    A warning is printed when either "long" slice is shorter than expected.
    """
    
    scan_positions = range(cut_start - before_cut, cut_start + after_cut)
    first_hit = next((j for j in scan_positions if seq[j:j+6] == search_pas), None)
    
    if first_hit is None :
        align_j, aligned = cut_start - 25, -1
    else :
        align_j, aligned = first_hit, 0
    
    def window(upstream, length) :
        # `length` nt beginning `upstream` nt before the anchor.
        return (seq[align_j - upstream:])[:length]
    
    aligned_seq = window(50, 186)
    aligned_seq_long = window(100, 186 + 50 + 50)
    
    if len(aligned_seq_long) != 186 + 50 + 50 :
        print('WARNING! Aligned seq not long enough. Length = ' + str(len(aligned_seq_long)))
    
    aligned_wide_seq = window(125, 256)
    aligned_wide_seq_long = window(175, 256 + 50 + 50)
    
    if len(aligned_wide_seq_long) != 256 + 50 + 50 :
        print('WARNING! Aligned seq not long enough. Length = ' + str(len(aligned_wide_seq_long)))
    
    return aligned_seq, aligned_seq_long, aligned, (align_j - cut_start), aligned_wide_seq, aligned_wide_seq_long

def align_seq(seq, cut_start, cut_end, before_cut = 45, after_cut = 1, pas_hierarchy = None) :
    """
    Align `seq` on the best-ranked PAS hexamer found near the cut site.

    The tiers of the hierarchy are tried in order and the first tier with any
    hit is used. Within a tier, window starts from cut_start - before_cut up to
    (but not including) cut_start + after_cut are scanned left to right: every
    hit replaces the previous one, and the scan stops at the first hit located
    at or after cut_start. Without a hit in any tier the anchor defaults to
    cut_start - 25.

    Parameters
        seq           -- sequence to search
        cut_start     -- index of the cut site within `seq`
        cut_end       -- accepted but not used
        before_cut    -- how far upstream of cut_start the scan begins
        after_cut     -- how far past cut_start the scan extends (exclusive)
        pas_hierarchy -- ordered sequence of containers of 6-mers, most
                         preferred first; defaults to the notebook-level
                         `valid_pas` when omitted

    Returns a 6-tuple:
        aligned_seq           -- 186 nt starting 50 nt before the anchor
        aligned_seq_long      -- 286 nt starting 100 nt before the anchor
        aligned               -- index of the tier that matched, or -1
        shift                 -- anchor - cut_start
        aligned_wide_seq      -- 256 nt starting 125 nt before the anchor
        aligned_wide_seq_long -- 356 nt starting 175 nt before the anchor

    A warning is printed when either "long" slice is shorter than expected.
    """
    
    if pas_hierarchy is None :
        # Keep the original behaviour for existing callers.
        pas_hierarchy = valid_pas
    
    align_j = cut_start - 25
    aligned = -1
    
    for i in range(0, len(pas_hierarchy)) :
        for j in range(cut_start - before_cut, cut_start + after_cut) :

            candidate_pas = seq[j:j+6]

            if candidate_pas in pas_hierarchy[i] :
                align_j = j
                aligned = i
                
                # A hit at or after the cut site ends the scan of this tier.
                if j >= cut_start :
                    break
        if aligned != -1 :
            break
    
    aligned_seq = (seq[align_j-50:])[:186]
    aligned_seq_long = (seq[align_j-100:])[:186 + 50 + 50]
    
    if len(aligned_seq_long) != 186 + 50 + 50 :
        print('WARNING! Aligned seq not long enough. Length = ' + str(len(aligned_seq_long)))
    
    
    aligned_wide_seq = (seq[align_j-125:])[:256]
    aligned_wide_seq_long = (seq[align_j-175:])[:256 + 50 + 50]
    
    if len(aligned_wide_seq_long) != 256 + 50 + 50 :
        print('WARNING! Aligned seq not long enough. Length = ' + str(len(aligned_wide_seq_long)))
    
    return aligned_seq, aligned_seq_long, aligned, (align_j - cut_start), aligned_wide_seq, aligned_wide_seq_long


In [11]:
#Global PolyaDB dataframe generation
#
# One output row per site in df_dict. Each site sequence is re-aligned around
# index 400 (the BED windows were built as position +/- 400): sites labelled
# 'NoPAS' / 'OtherPAS' go through the tiered align_seq search, every other site
# is searched for its own annotated hexamer with search_seq.

gene = []

prox_id, prox_seq, prox_seq_ext = [], [], []
prox_wide_seq, prox_wide_seq_ext = [], []

cut_mode = []

prox_type, prox_pas, prox_sitenum = [], [], []
num_sites = []

prox_chrom, prox_strand = [], []

pas_pos = []

for p_id, site in df_dict.items() :
    
    name_parts = p_id.split('.')
    gene_id = name_parts[0]
    site_num = int(name_parts[1])
    
    site_strand = site['strand']
    p_mode = site['cut_mode']
    
    # Annotated signals are written with U; the sequences use T.
    p_pas = site['pas'].replace('U', 'T')
    raw_seq = site['seq']
    
    if p_pas in ['NoPAS', 'OtherPAS'] :
        alignment = align_seq(raw_seq, 400, len(raw_seq) - 800, before_cut = 45, after_cut = 1) #250, ..., 500
    else :
        alignment = search_seq(raw_seq, 400, len(raw_seq) - 800, p_pas, before_cut = 45, after_cut = 1)
    p_seq, p_seq_long, p_aligned, p_shift, p_wide_seq, p_wide_seq_long = alignment
    
    # The shift is added to the cut position on '+' and subtracted otherwise.
    if site_strand == '+' :
        pas_pos.append(p_shift + p_mode)
    else :
        pas_pos.append(-p_shift + p_mode)
    
    # Site numbers are reversed within the gene on '-'; -1 marks any other strand value.
    if site_strand == '-' :
        p_sitenum = gene_dict[gene_id] - site_num + 1
    elif site_strand == '+' :
        p_sitenum = site_num
    else :
        p_sitenum = -1
    
    gene.append(gene_id)
    prox_id.append(p_id)
    prox_sitenum.append(p_sitenum)
    num_sites.append(gene_dict[gene_id])
    prox_pas.append(p_aligned)
    prox_seq.append(p_seq)
    prox_seq_ext.append(p_seq_long)
    prox_wide_seq.append(p_wide_seq)
    prox_wide_seq_ext.append(p_wide_seq_long)
    cut_mode.append(p_mode)
    prox_chrom.append(site['chr'])
    prox_strand.append(site_strand)
    prox_type.append(site['type'])


global_columns = [
    'gene',
    'gene_id',
    'sitenum',
    'num_sites',
    'pas',
    'seq',
    'seq_ext',
    'wide_seq',
    'wide_seq_ext',
    'pas_pos',
    'cut_mode',
    'chrom',
    'strand',
    'site_type',
]

df = pd.DataFrame({
    'gene' : gene,
    'gene_id' : prox_id,
    'sitenum' : prox_sitenum,
    'num_sites' : num_sites,
    'pas' : prox_pas,
    'seq' : prox_seq,
    'seq_ext' : prox_seq_ext,
    'wide_seq' : prox_wide_seq,
    'wide_seq_ext' : prox_wide_seq_ext,
    'pas_pos' : pas_pos,
    'cut_mode' : cut_mode,
    'chrom' : prox_chrom,
    'strand' : prox_strand,
    'site_type' : prox_type,
})

df = df[global_columns].sort_values(by=['chrom', 'gene', 'sitenum'])

print(df.head())
print(df.tail())

print('Total number of members: ' + str(len(df)))


df.to_csv('polyadb_processed_v3.csv', header=True, index=False, sep=',')



          gene    gene_id  sitenum  num_sites  pas  \
876    AADACL3  AADACL3.1        1          2    0   
877    AADACL3  AADACL3.2        2          2    0   
15892    ABCA4    ABCA4.5        1          5    0   
15891    ABCA4    ABCA4.4        2          5    2   
15890    ABCA4    ABCA4.3        3          5    4   

                                                     seq  \
876    GTGCTTTCTAATCTGTGGAATGCCAGGGTCCCAGTGTGGGAGCCTT...   
877    GCTGCAGGTGGTGGTTGCTGAAGGTGGGGGAGGCTGTGGCAATTTC...   
15892  CAGTAAAATATTTTCTGCATTTGCCCAAGGACACATTCCCAACGAA...   
15891  GTTTATCAAATACAACTCAGACGTCAGTCTCCTGGCCCCTTTGAGA...   
15890  ATTCAAATATGTGAAGAGCATCCACTTTAAAATATTTAAAATGCAG...   

                                                 seq_ext  \
876    TTACCCCCAGGGAATTAGGGGAGAGGAAACACCTTTATTTGCTTTC...   
877    TAAATTGTCGTAATCTTTTTGCTGGTTGATGGTCTTGCCTTGATGT...   
15892  CCCCAGTATCCCCATCTTGGTGGGACAACAGAACCCAAGAACTGGC...   
15891  CCTCTGATCCATCTCTGTCTGCATGAGTGACAGCTGGCAGAGTCCT...   
15890  CCTGGCT

In [20]:
#Pairwise PolyaDB dataframe generation
#
# One output row per pair of neighbouring sites of the same gene. The site being
# visited is recorded as "proximal"; its partner ("distal") is the site numbered
# one higher on the '+' strand and one lower on the '-' strand. Sites without
# such a partner are skipped.

def _align_site_seq(site_seq, site_pas) :
    """
    Align one 800-nt site sequence around index 400.

    'NoPAS' / 'OtherPAS' sites use the tiered align_seq search; any other label
    is searched for literally with search_seq. Returns (aligned_seq, aligned).
    """
    if site_pas in ['NoPAS', 'OtherPAS'] :
        alignment = align_seq(site_seq, 400, len(site_seq) - 800, before_cut = 45, after_cut = 1)
    else :
        alignment = search_seq(site_seq, 400, len(site_seq) - 800, site_pas, before_cut = 45, after_cut = 1)
    
    # Both helpers return six values; only the 186-nt sequence and the match
    # indicator are needed here.
    return alignment[0], alignment[2]


chrom = []
strand = []
gene = []

prox_id = []
dist_id = []

prox_seq = []
dist_seq = []

site_distance = []

prox_cut_mode = []
dist_cut_mode = []

prox_type = []
dist_type = []

prox_pas = []
dist_pas = []

prox_sitenum = []
dist_sitenum = []

num_sites = []

for site_id in df_dict :
    
    gene_id = site_id.split('.')[0]
    site_num = int(site_id.split('.')[1])
    
    site_strand = df_dict[site_id]['strand']
    
    p_id = site_id
    if site_strand == '-' :
        d_id = gene_id + '.' + str(site_num - 1)
    elif site_strand == '+' :
        d_id = gene_id + '.' + str(site_num + 1)
    else :
        # No partner can be defined without a '+' / '-' strand; skip the site
        # rather than fail on an empty partner id.
        continue
    
    if d_id not in df_dict :
        continue
    
    
    gene.append(gene_id)
    chrom.append(df_dict[p_id]['chr'])
    strand.append(site_strand)
    
    p_mode = df_dict[p_id]['cut_mode']
    p_type = df_dict[p_id]['type']
    p_pas = df_dict[p_id]['pas'].replace('U', 'T')
    p_seq, p_aligned = _align_site_seq(df_dict[p_id]['seq'], p_pas)
    
    prox_id.append(p_id)
    prox_seq.append(p_seq)
    prox_cut_mode.append(p_mode)
    prox_type.append(p_type)
    prox_pas.append(p_aligned)
    
    d_mode = df_dict[d_id]['cut_mode']
    d_type = df_dict[d_id]['type']
    d_pas = df_dict[d_id]['pas'].replace('U', 'T')
    # The distal site is aligned exactly like the proximal one. It previously
    # used the stale 250 / 500 offsets and unpacked four values from helpers
    # that return six, which raises ValueError.
    d_seq, d_aligned = _align_site_seq(df_dict[d_id]['seq'], d_pas)
    
    dist_id.append(d_id)
    dist_seq.append(d_seq)
    dist_cut_mode.append(d_mode)
    dist_type.append(d_type)
    dist_pas.append(d_aligned)
    
    # Site numbers are reversed within the gene on the '-' strand.
    if site_strand == '-' :
        prox_sitenum.append(gene_dict[gene_id] - site_num + 1)
        dist_sitenum.append(gene_dict[gene_id] - site_num + 2)
    else :
        prox_sitenum.append(site_num)
        dist_sitenum.append(site_num + 1)
    
    site_dist = numpy.abs(p_mode - d_mode)
    site_distance.append(site_dist)
    num_sites.append(gene_dict[gene_id])
        


df = pd.DataFrame({
        'gene' : gene,
        'chrom' : chrom,
        'strand' : strand,
        'proximal_id'  : prox_id,
        'distal_id'  : dist_id,
        'proximal_sitenum'  : prox_sitenum,
        'distal_sitenum'  : dist_sitenum,
        'proximal_pas'  : prox_pas,
        'distal_pas'  : dist_pas,
        'proximal_seq'  : prox_seq,
        'distal_seq'  : dist_seq,
        'site_distance'  : site_distance,
        'proximal_cut_mode'  : prox_cut_mode,
        'distal_cut_mode'  : dist_cut_mode,
        'proximal_type'  : prox_type,
        'distal_type'  : dist_type,
        'num_sites' : num_sites
    })

df = df[['gene',
        'chrom',
        'strand',
        'proximal_id',
        'distal_id',
        'proximal_sitenum',
        'distal_sitenum',
        'proximal_pas',
        'distal_pas',
        'proximal_seq',
        'distal_seq',
        'site_distance',
        'proximal_cut_mode',
        'distal_cut_mode',
        'proximal_type',
        'distal_type',
        'num_sites']]

df = df.sort_values(by=['chrom', 'gene', 'proximal_sitenum'])

print(df.head())
print(df.tail())

print('Total number of members: ' + str(len(df)))


df.to_csv('polyadb_processed_v3_pairs.csv', header=True, index=False, sep=',')



         gene chrom strand proximal_id  distal_id  proximal_sitenum  \
507   AADACL3  chr1      +   AADACL3.1  AADACL3.2                 1   
2815    ABCA4  chr1      -     ABCA4.2    ABCA4.1                 1   
5752   ABCB10  chr1      -    ABCB10.2   ABCB10.1                 1   
2825    ABCD3  chr1      +     ABCD3.1    ABCD3.2                 1   
2826    ABCD3  chr1      +     ABCD3.2    ABCD3.3                 2   

      distal_sitenum  proximal_pas  distal_pas  \
507                2             0           0   
2815               2             2           0   
5752               2             0           0   
2825               2            -1           0   
2826               3             0           2   

                                           proximal_seq  \
507   GTGCTTTCTAATCTGTGGAATGCCAGGGTCCCAGTGTGGGAGCCTT...   
2815  GTTTATCAAATACAACTCAGACGTCAGTCTCCTGGCCCCTTTGAGA...   
5752  TCTGATACATGATGTTCAATTTTATCTTTAGGTAATATTTTATATC...   
2825  TTCTTGAATTTTATAAGTATCTCTAGCTTC