# GenBank 파일에서 CDS만 골라 protein 서열로 번역하고, 'gene × 종' 형태의 matrix 만들기

In [1]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import pandas as pd

### TOMATO

In [2]:
# Read every record of the tomato GenBank file into a list so it can be iterated more than once.
t_gb_parse = list(SeqIO.parse("gb file/tomato.gb", "genbank"))

In [3]:
# Show the parsed tomato records (a list of SeqRecord objects) to confirm the file was read.
print(t_gb_parse)

[SeqRecord(seq=Seq('TGGGCGAACGACGGGAATTGAACCCGCGCATGGTGGATTCACAATCCACTGCCT...TAA', IUPACAmbiguousDNA()), id='DQ347959.1', name='DQ347959', description='Solanum lycopersicum cultivar LA3023 chloroplast, complete genome', dbxrefs=[])]


In [4]:
# Summarise each tomato record: its name, its number of annotated features,
# and a shortened repr of its sequence.
for t_gb in t_gb_parse:
    feature_count = len(t_gb.features)
    print("Name %s, %i features" % (t_gb.name, feature_count))
    print(repr(t_gb.seq))

Name DQ347959, 307 features
Seq('TGGGCGAACGACGGGAATTGAACCCGCGCATGGTGGATTCACAATCCACTGCCT...TAA', IUPACAmbiguousDNA())


In [5]:
# Build {gene name: translated protein sequence} for every CDS feature in the tomato records.
t_CDS_dic = {}

for t_gb in t_gb_parse:
    # Keep only the coding-sequence features of this record.
    t_feats = [t_feat for t_feat in t_gb.features if t_feat.type == "CDS"]
    for t_feat in t_feats:
        # A CDS without a /gene qualifier has no key for the gene-by-species matrix;
        # skip it instead of letting qualifiers['gene'] raise KeyError and abort the cell.
        gene_names = t_feat.qualifiers.get('gene')
        if not gene_names:
            continue
        key = gene_names[0]
        # Cut the CDS nucleotides out of the record sequence, translate them with
        # genetic-code table 11, and stop at the first stop codon (to_stop=True).
        value = str(t_feat.location.extract(parent_sequence=t_gb.seq).translate(table=11, to_stop=True))
        # NOTE: if a gene name occurs in more than one CDS feature, the last one wins.
        t_CDS_dic[key] = value

In [6]:
# Display the tomato gene-name -> protein-sequence dictionary.
t_CDS_dic

{'accD': 'MTIHLLYFHANRGQENSMERWWFNSMLFKKEFERRCGLNKSMGSLGPIENTSEDPNLKVKNIHSCSNVDYLFGVKDIWNFISNDTFLVSDRNGDSYSIYFDIENHIFEVDNDHSFLSELESSFYSYRNSSYLNNGFRGEDPYYNSYMSYMYDTQYSWNNHINSCIDNYLQSQICIDTSIISGSESNGDSYIYRAICSGQSLNSSENEGSSRRTRTKDSDLTIRESSNDLEVTQKYKHLWVQCENCYGLNYKKFLKSKMNICEQCGYHLKMSSSDRIELLIDPGTWDPMDEDMVSLDPIEFHSEEEPYKDRIDSYQRKTGLTEAVQTGIGQLNGIPVAIGVMDFQFMGGSMGSVVGEKITRLIEHAANQNLPLMIVCASGGARMQEGSLSLMQMAKISSALYDYQLNKKLFYVSILTSPTTGGVTASFGMLGDIIIAEPNAYIAFAGKRVIEQTLNKTVPEGSQAAEYLFQKGLFDLIVPRNLLKSVLSELFKLHAFFPLNQKSSKIK',
 'atpA': 'MVTIRADEISNIIRERIEQYNREVKIVNTGTVLQVGDGIARIHGLDEVMAGELVEFEEGTIGIALNLESNNVGVVLMGDGLLIQEGSSVKATGRIAQIPVSEAYLGRVVNALAKPIDGRGEISASEFRLIESAAPGIISRRSVYEPLQTGLIAIDSMIPIGRGQRELIIGDRQTGKTAVATDTILNQQGQNVICVYVAIGQKASSVAQVVTTLQERGAMEYTIVVAETADSPATLQYLAPYTGAALAEYFMYRERHTLIIYDDLSKQAQAYRQMSLLLRRPPGREAYPGDVFYLHSRLLERAAKLSSSLGEGSMTALPIVETQSGDVSAYIPTNVISITDGQIFLSADLFNSGIRPAINVGISVSRVGSAAQIKAMKQVAGKLKLELAQFAELEAFAQFASDLDKATQNQLARGQRLRELLKQSQSAPLTVEEQIMTIYTGTNGYLDSLEVGQVRKFLVELRTYLKTTKP

1_intro의 translation 문항을 응용한 코드임. value 값(번역된 단백질 서열)을 만드는 방법은 위 코드 외에도 여러 가지가 있음.

### Arabidopsis thaliana

In [7]:
# Read every record of the A. thaliana GenBank file into a list so it can be iterated more than once.
a_gb_parse = list(SeqIO.parse("gb file/A.thaliana.gb", "genbank"))

In [8]:
# Show the parsed A. thaliana records (a list of SeqRecord objects) to confirm the file was read.
print(a_gb_parse)

[SeqRecord(seq=Seq('ATGGGCGAACGACGGGAATTGAACCCGCGATGGTGAATTCACAATCCACTGCCT...CAT', IUPACAmbiguousDNA()), id='KX551970.1', name='KX551970', description='Arabidopsis thaliana chloroplast, complete genome', dbxrefs=[])]


In [9]:
# Summarise each A. thaliana record: its name, its number of annotated features,
# and a shortened repr of its sequence.
for a_gb in a_gb_parse:
    feature_count = len(a_gb.features)
    print("Name %s, %i features" % (a_gb.name, feature_count))
    print(repr(a_gb.seq))

Name KX551970, 244 features
Seq('ATGGGCGAACGACGGGAATTGAACCCGCGATGGTGAATTCACAATCCACTGCCT...CAT', IUPACAmbiguousDNA())


In [10]:
# Build {gene name: translated protein sequence} for every CDS feature in the A. thaliana records.
a_CDS_dic = {}

for a_gb in a_gb_parse:
    # Keep only the coding-sequence features of this record.
    a_feats = [a_feat for a_feat in a_gb.features if a_feat.type == "CDS"]
    for a_feat in a_feats:
        # A CDS without a /gene qualifier has no key for the gene-by-species matrix;
        # skip it instead of letting qualifiers['gene'] raise KeyError and abort the cell.
        gene_names = a_feat.qualifiers.get('gene')
        if not gene_names:
            continue
        key = gene_names[0]
        # Cut the CDS nucleotides out of the record sequence, translate them with
        # genetic-code table 11, and stop at the first stop codon (to_stop=True).
        value = str(a_feat.location.extract(parent_sequence=a_gb.seq).translate(table=11, to_stop=True))
        # NOTE: if a gene name occurs in more than one CDS feature, the last one wins.
        a_CDS_dic[key] = value

In [11]:
# Display the A. thaliana gene-name -> protein-sequence dictionary.
a_CDS_dic

{'accD': 'MEKSWFNFMFSKGELEYRGELSKAMDSFAPGEKTTISQDRFIYDMDKNFYGWDERSSYSSSYSNNVDLLVSSKDIRNFISDDTFFVRDSNKNSYSIFFDKKKKIFEIDNDFSDLEKFFYSYCSSSYLNNRSKGDNDLHYDPYIKDTKYNCTNHINSCIDSYFRSYICIDNNFLIDSNNFNESYIYNFICSESGKIRESKNYKIRTNRNRSNLISSKDFDITQNYNQLWIQCDNCYGLMYKKVKMNVCEQCGHYLKMSSSERIELSIDPGTWNPMDEDMVSADPIKFHSKEEPYKNRIDSAQKTTGLTDAVQTGTGQLNGIPVALGVMDFRFMGGSMGSVVGEKITRLIEYATNQCLPLILVCSSGGARMQEGSLSLMQMAKISSVLCDYQSSKKLFYISILTSPTTGGVTASFGMLGDIIIAEPYAYIAFAGKRVIEQTLKKAVPEGSQAAESLLRKGLLDAIVPRNLLKGVLSELFQLHAFFPLNTN',
 'atpA': 'MVTIRADEISNIIRERIEQYNREVTIVNTGTVLQVGDGIARIYGLDEVMAGELVEFEEGTIGIALNLESNNVGVVLMGDGLMIQEGSSVKATGKIAQIPVSEAYLGRVINALANPIDGRGKISASESRLIESPAPGIISRRSVYEPLQTGLIAIDSMIPIGRGQRELIIGDRQTGKTAVATDTILNQQGQNVICVYVAIGQKASSVAQVVTSLQERGAMEYTIVVAETADSPATLQYLAPYTGAALAEYFMYREQHTLIIYDDLSKQAQAYRQMSLLLRRPPGREAYPGDVFYLHSRLLERAAKLSSQLGEGSMTALPIVETQSGDVSAYIPTNVISITDGQIFLSADLFNAGIRPAINVGISVSRVGSAAQIKAMKQVAGKLKLELAQFAELEAFSQFSSDLDKATQNQLARGQRLRELLKQSQSAPLTVEEQIMTIYTGTNGYLDGLEIGQVRKFLVQLRTYLKTNKPQFQEIIASTKTLTAEAESF

## Matrix