In [90]:
from Bio import SeqIO

rna_dict = {}
protein_dict = {}

# RefSeq
ref_seq_acc = {}
with open('hg19_refseq_metadata.txt', "r") as handle:
    for line in handle.readlines()[1:]:
        tokens = line.split('\t')
        ref_seq_acc[tokens[5]] = tokens[4]
        
with open('GRCh37_latest_cds.fna', "r") as handle:
    for record in SeqIO.parse(handle, "fasta"):
        header = record.description
        start_pos = header.find("protein_id=")
        end_pos = header[start_pos:].find("]")
        protein_id = header[start_pos:][:end_pos].split('=')[-1].strip()
        transcript_id = ref_seq_acc[protein_id]
        rna_dict[transcript_id.split('.')[0]] = str(record.seq).strip()
    
with open('GRCh37_latest_protein.faa', "r") as handle:
    for record in SeqIO.parse(handle, "fasta"):
        protein_id = record.description.split()[0].strip()
        transcript_id = ref_seq_acc[protein_id]
        protein_dict[transcript_id.split('.')[0]] = str(record.seq).strip()
        
# Ensembl
with open('Homo_sapiens.GRCh37.cds.all.fa', "r") as handle:
    for record in SeqIO.parse(handle, "fasta"):
        rna_dict[record.description.split()[0].split('.')[0]] = str(record.seq).strip()
    
with open('Homo_sapiens.GRCh37.pep.all.fa', "r") as handle:
    for record in SeqIO.parse(handle, "fasta"):
        protein = record.description.split()[4].split(':')[1]
        protein_dict[protein.split('.')[0]] = str(record.seq).strip()
        
# UCSC
with open('knownGene_cds.fna', "r") as handle:
    for record in SeqIO.parse(handle, "fasta"):
        transcript_id = record.description.split()[0].split("_")[-1].strip()
        rna_dict[transcript_id] = str(record.seq).strip()
    
with open('knownGene_protein.faa', "r") as handle:
    for record in SeqIO.parse(handle, "fasta"):
        transcript_id = record.description.split()[0].strip()
        protein_dict[transcript_id] = str(record.seq).strip()
        
# Write dictionaries
with open('hg19_cDNA_DICT.dict', 'w') as handle:
    for key,value in rna_dict.items():
        handle.write("{}:{}\n".format(key,value))
    
with open('hg19_PROTEIN_DICT.dict', 'w') as handle:
    for key,value in protein_dict.items():
        handle.write("{}:{}\n".format(key,value))

In [93]:
test1 = {}
with open('hg19_cDNA_DICT.dict', 'r') as handle:
    for line in handle.readlines():
        tokens = line.split(":")
        test1[tokens[0]] = tokens[1]
        
test2 = {}
with open('FASTA_cDNA.dict', 'r') as handle:
    for line in handle.readlines():
        tokens = line.split(":")
        test2[tokens[0]] = tokens[1]

In [86]:
shared = list(set(test1.keys()).intersection(set(test2.keys())))
print("Shared {}".format(len(shared)))
for i in [0,5,10,15,20,25,50,100,200,500,1000]:
    key = shared[i]
    if test1[key] != test2[key]:
        print("Fail at {}".format(key))

Shared 202850
Fail at uc010pkw.1
Fail at uc010zyf.2
Fail at ENST00000512127
Fail at uc004fxl.3
Fail at uc009zfr.3
Fail at uc002qow.2
Fail at NM_018461
Fail at ENST00000366617
Fail at ENST00000497240
Fail at uc004bfa.3


In [67]:
test1 = {}
with open('hg19_PROTEIN_DICT.dict', 'r') as handle:
    for line in handle.readlines():
        tokens = line.split(":")
        test1[tokens[0]] = tokens[1]
        
test2 = {}
with open('FASTA_AA.dict', 'r') as handle:
    for line in handle.readlines():
        tokens = line.split(":")
        test2[tokens[0]] = tokens[1]

In [79]:
shared = list(set(test1.keys()).intersection(set(test2.keys())))
print("Shared {}".format(len(shared)))
for i in [0,5,10,15,20,25,50,100,200,500,1000]:
    key = shared[i]
    if test1[key] != test2[key]:
        print("Fail at {}".format(key))

Shared 204916


In [87]:
test1['uc010pkw.1']

'AGAGCAAATTACTTCTTCCAGAGTGCTAAATGATCCCCTGGTGCCCTGGAAAGAGACCGTTTACCTCCAAAAGTTTTGAGTATCCGAGGAGCCCAGGAGGAGGAACCCACAGACCCCCAGCTGATGCGGCTGGACAACATGCTGTTAGCGGAAGGCGTGGCGGGGCCTGAGAAGGGCGGAGGGTCGGCGGCAGCGGCGGCAGCGGCGGCGGCTTCTGGAGGGGCAGGTTCAGACAACTCAGTGGAGCATTCAGATTACAGAGCCAAACTCTCACAGATCAGACAAATCTACCATACGGAGCTGGAGAAATACGAGCAGGCCTGCAACGAGTTCACCACCCACGTGATGAATCTCCTGCGAGAGCAAAGCCGGACCAGGCCCATCTCCCCAAAGGAGATTGAGCGGATGGTCAGCATCATCCACCGCAAGTTCAGCTCCATCCAGATGCAGCTCAAGCAGAGCACGTGCGAGGCGGTGATGATCCTGCGTTCCCGATTTCTGGATGCGCGGCGGAAGAGACGGAATTTCAACAAGCAAGCGACAGAAATCCTGAATGAATATTTCTATTCCCATCTCAGCAACCCTTACCCCAGTGAGGAAGCCAAAGAGGAGTTAGCCAAGAAGTGTGGCATCACAGTCTCCCAGGTATCAAACTGGTTTGGAAATAAGCGAATCCGGTACAAGAAGAACATAGGTAAATTTCAAGAGGAAGCCAATATTTATGCTGCCAAAACAGCTGTCACTGCTACCAATGTGTCAGCCCATGGAAGCCAAGCTAACTCGCCCTCAACTCCCAACTCGGCTGGTTCTTCCAGTTCTTTTAACATGTCAAACTCTGGAGATTTGTTCATGAGCGTGCAGTCACTCAATGGGGATTCTTACCAAGGGGCCCAGGTTGGAGCCAACGTGCAATCACAGGTGGATACCCTTCGCCATGTTATCAGCCAGACAGGAGGATACAGTGATGGACTCGCAGCCAGTCAGATGTACAGTCCGCAG

In [88]:
test2['uc010pkw.1']

'ATGCTGTTAGCGGAAGGCGTGGCGGGGCCTGAGAAGGGCGGAGGGTCGGCGGCAGCGGCGGCAGCGGCGGCGGCTTCTGGAGGGGCAGGTTCAGACAACTCAGTGGAGCATTCAGATTACAGAGCCAAACTCTCACAGATCAGACAAATCTACCATACGGAGCTGGAGAAATACGAGCAGGCCTGCAACGAGTTCACCACCCACGTGATGAATCTCCTGCGAGAGCAAAGCCGGACCAGGCCCATCTCCCCAAAGGAGATTGAGCGGATGGTCAGCATCATCCACCGCAAGTTCAGCTCCATCCAGATGCAGCTCAAGCAGAGCACGTGCGAGGCGGTGATGATCCTGCGTTCCCGATTTCTGGATGCGCGGCGGAAGAGACGGAATTTCAACAAGCAAGCGACAGAAATCCTGAATGAATATTTCTATTCCCATCTCAGCAACCCTTACCCCAGTGAGGAAGCCAAAGAGGAGTTAGCCAAGAAGTGTGGCATCACAGTCTCCCAGGTATCAAACTGGTTTGGAAATAAGCGAATCCGGTACAAGAAGAACATAGGTAAATTTCAAGAGGAAGCCAATATTTATGCTGCCAAAACAGCTGTCACTGCTACCAATGTGTCAGCCCATGGAAGCCAAGCTAACTCGCCCTCAACTCCCAACTCGGCTGGTTCTTCCAGTTCTTTTAACATGTCAAACTCTGGAGATTTGTTCATGAGCGTGCAGTCACTCAATGGGGATTCTTACCAAGGGGCCCAGGTTGGAGCCAACGTGCAATCACAGGTGGATACCCTTCGCCATGTTATCAGCCAGACAGGAGGATACAGTGATGGACTCGCAGCCAGTCAGATGTACAGTCCGCAGGGCATCAGTGAAATTGCAAGAACCTATTGA\n'

In [95]:
anno_var = set()
with open("hg19_ensGene.txt", "r") as handle:
    for line in handle.readlines():
        anno_var.add(line.split("\t")[1])
        
# Ensembl
gencode = set()
with open('gencode.v19.pc_transcripts.fa', "r") as handle:
    for record in SeqIO.parse(handle, "fasta"):
        gencode.add(record.description.split("|")[0].split('.')[0])

In [96]:
print(len(anno_var))
print(len(gencode))

204940
95309


In [94]:
len(list(set(test1.keys()).intersection(anno_var)))

99380

In [97]:
len(gencode.intersection(anno_var))

95234

In [98]:
len(list(set(test2.keys()).intersection(anno_var)))

99380