In [1]:
######### ::: create promoter prediction training data ::: ##########

In [2]:
import re
import os
import sys
import math
import random
import numpy as np
import pandas as pd
from Bio import SeqIO
from Bio import motifs
from Bio.Seq import Seq

In [3]:
# Directory part of the notebook's resolved path. The relative file name is
# resolved against the current working directory; the file is never opened.
dir_path = os.path.split(os.path.realpath('create_data.ipynb'))[0]

In [4]:
## read in fasta
# NOTE: despite the `_dir` suffix these are paths to FASTA *files*, built
# relative to the notebook directory.
TATA_pos_dir = f"{dir_path}/../data/promoter/human_epdnew_hg38_TATA_199to300.fa"
noTATA_pos_dir = f"{dir_path}/../data/promoter/human_epdnew_hg38_noTATA_199to300.fa"

In [5]:
## parse
# Load each FASTA file into a dict of records (SeqIO.to_dict keys by record id
# by default). The `with` blocks close the file handles as soon as all records
# have been read instead of leaving them open for the life of the kernel.
with open(TATA_pos_dir) as tata_pos_handle:
    TATA_pos = SeqIO.to_dict(SeqIO.parse(tata_pos_handle, 'fasta'))
with open(noTATA_pos_dir) as notata_pos_handle:
    noTATA_pos = SeqIO.to_dict(SeqIO.parse(notata_pos_handle, 'fasta'))

In [6]:
## positive df
# One row per parsed record: the index is the record key, `seq` holds the bases
# as a plain string, `label` is 1 (positive class) and `TATA` flags the subset.
# The strings are taken directly from each record rather than expanding every
# record into one DataFrame column per base and joining the rows back together,
# which is slow and fails when records differ in length.
# TATA
TATA_pos_df = pd.DataFrame({'seq': pd.Series({k: str(v.seq) for k, v in TATA_pos.items()}, dtype=object),
                            'label': 1,
                            'TATA': 1})
print(len(TATA_pos_df))
# no TATA
noTATA_pos_df = pd.DataFrame({'seq': pd.Series({k: str(v.seq) for k, v in noTATA_pos.items()}, dtype=object),
                              'label': 1,
                              'TATA': 0})
print(len(noTATA_pos_df))

3065
26533


In [7]:
## create negative df
# TATA
TATA_neg_dir = dir_path + "/../data/promoter/human_epdnew_hg38_TATA_neg_199to300.fa"
# Read every candidate negative record; the context manager closes the file
# once all records have been loaded.
with open(TATA_neg_dir) as tata_neg_handle:
    TATA_neg = SeqIO.to_dict(SeqIO.parse(tata_neg_handle, 'fasta'))

# Draw as many negatives as there are TATA positives (seeded so the draw is
# repeatable). random.sample raises ValueError if there are fewer candidates
# than positives.
random.seed(123)
keep = random.sample(list(TATA_neg), len(TATA_pos_df))
# Filter through a set: testing membership against the list would rescan it
# for every record. Iterating TATA_neg keeps the original file order.
keep_ids = set(keep)
TATA_neg = {k: v for k, v in TATA_neg.items() if k in keep_ids}

In [8]:
# Negative TATA frame: index is the record key, `seq` the bases as a string,
# `label` 0 (negative class), `TATA` 1. The strings are read directly from the
# records instead of exploding each record into per-base columns and
# re-joining them row by row.
TATA_neg_df = pd.DataFrame({'seq': pd.Series({k: str(v.seq) for k, v in TATA_neg.items()}, dtype=object),
                            'label': 0,
                            'TATA': 1})

In [9]:
# Sanity check: the negative sample was sized to len(TATA_pos_df), so this
# count should equal the number of TATA positives.
len(TATA_neg_df)

3065

In [10]:
# noTATA
def random_substitution(seq, fold=20, pct=0.6, **kwargs):
    """Build a synthetic negative by randomising a share of a sequence's chunks.

    The sequence is cut into ``fold`` consecutive chunks of
    ``len(seq) // fold`` bases each; any leftover bases form a trailing
    remainder that is always kept unchanged. ``int(fold * pct)`` of the
    full-size chunks are picked at random and each is overwritten with
    random A/T/C/G bases of the same length, so the result has the same
    length as the input.

    Parameters
    ----------
    seq : record-like or str
        Either an object exposing its bases through a ``.seq`` attribute
        (as the parsed FASTA records passed in by this notebook do) or a
        plain string.
    fold : int
        Number of equal-sized chunks to cut the sequence into.
    pct : float
        Fraction of the ``fold`` chunks to replace.
    **kwargs
        Accepted for call compatibility and ignored.

    Returns
    -------
    str
        The sequence with the selected chunks replaced.

    Raises
    ------
    ValueError
        Propagated from ``random.sample`` when ``int(fold * pct)`` is
        negative or larger than ``fold``.
    ZeroDivisionError
        If ``fold`` is 0.
    """
    # Work on a plain string; records carry their bases in `.seq`.
    bases = str(getattr(seq, 'seq', seq))
    seq_len = len(bases)
    subseq_len = seq_len // fold

    seq_list = [bases[i * subseq_len:(i + 1) * subseq_len] for i in range(fold)]
    remainder = bases[subseq_len * fold:seq_len]
    if remainder != "":
        seq_list.append(remainder)

    # All `fold` full-size chunks are eligible. The previous population,
    # range(0, fold-1), left out the last chunk, so it was never substituted
    # and pct=1.0 raised ValueError. NOTE: this changes which chunks are drawn
    # for a given seed compared with the earlier version.
    sub_idx = random.sample(range(fold), int(fold * pct))

    # substitute
    for i in sub_idx:
        seq_list[i] = ''.join(random.choice('ATCG') for _ in range(subseq_len))

    return ''.join(seq_list)

In [11]:
# Reseed so the synthetic negatives are repeatable, then derive one negative
# per noTATA positive record, keyed by the original key plus a '_neg' suffix.
random.seed(123)
noTATA_neg = dict(
    (k + '_neg', random_substitution(v))
    for k, v in noTATA_pos.items()
)

In [12]:
# Negative noTATA frame: index is the '<key>_neg' id, `seq` the substituted
# sequence, `label` 0 (negative class), `TATA` 0. noTATA_neg already maps ids
# to plain strings, so the Series is built from it directly; routing it
# through a one-column DataFrame and a row-wise ''.join returned the same
# strings at much higher cost.
noTATA_neg_df = pd.DataFrame({'seq': pd.Series(noTATA_neg, dtype=object),
                              'label': 0,
                              'TATA': 0})

In [13]:
# Sanity check: one negative is generated per noTATA positive record, so this
# count should match the number of noTATA positives.
len(noTATA_neg_df)

26533

In [14]:
# Concatenate row-wise: positives first, then negatives, one frame per subset.
TATA_df = pd.concat([TATA_pos_df, TATA_neg_df], axis=0)
noTATA_df = pd.concat([noTATA_pos_df, noTATA_neg_df], axis=0)

In [15]:
# CSV copies keep the frame index (the record keys) as the first column.
TATA_df.to_csv(f"{dir_path}/../data/promoter/TATA_199to300.csv")
noTATA_df.to_csv(f"{dir_path}/../data/promoter/noTATA_199to300.csv")

In [16]:
# Tab-separated copy without the index column. pandas writes straight to the
# path, so the table is never rendered into one in-memory string and the line
# endings are not translated a second time by a text-mode handle (which
# doubles carriage returns on Windows).
TATA_df.to_csv(dir_path + "/../data/promoter/TATA_199to300.tsv", sep='\t', index=False)

In [17]:
# Tab-separated copy without the index column. pandas writes straight to the
# path, so the table is never rendered into one in-memory string and the line
# endings are not translated a second time by a text-mode handle (which
# doubles carriage returns on Windows).
noTATA_df.to_csv(dir_path + "/../data/promoter/noTATA_199to300.tsv", sep='\t', index=False)