**Finding a Protein Motif**

To allow for the presence of its varying forms, a protein motif is represented by a shorthand as follows: [XY] means "either X or Y" and {X} means "any amino acid except X." For example, the N-glycosylation motif is written as N{P}[ST]{P}.

You can see the complete description and features of a particular protein by its access ID "uniprot_id" in the UniProt database, by inserting the ID number into

http://www.uniprot.org/uniprot/uniprot_id
Alternatively, you can obtain a protein sequence in FASTA format by following

http://www.uniprot.org/uniprot/uniprot_id.fasta
For example, the data for protein B5ZC00 can be found at http://www.uniprot.org/uniprot/B5ZC00.

In [1]:
import requests
import re

In [2]:
def uniprot_access(seq_id):
    seq_id = seq_id.split('_')[0] if '_' in seq_id else seq_id # ID에 ID만 있는게 아니라 부가설명이 있는 경우도 있음. P07204_TRBM_HUMAN가 그런 경우.
    url = f"http://www.uniprot.org/uniprot/{seq_id}.fasta" 
    response = requests.get(url)
    return response.text

def fasta_processing(fasta_seq):
    fasta_list = fasta_seq.split('\n')
    return {fasta_list[0][1:] : "".join(fasta_list[1:])}


In [3]:
def uniprot_process(seq_id):
    fasta_dict = fasta_processing(uniprot_access(seq_id))
    seq = list(fasta_dict.values())[0]
    return {seq_id:seq}

In [4]:
one_letter_aa = "ADEFGHIKLMNPQRSTVW" # 정규표현식에서 필요한지 아닌지 몰라 일단 만들어둠.
p = re.compile(f'N[^P][ST][^P]')

In [5]:
filepath = "/mnt/c/Data/ROSALIND_download/rosalind_mprt.txt"
seq_dict = {}
with open(filepath) as f:
    for line in f.readlines():
        line = line.rstrip()
        con_dict = uniprot_process(line)
        seq_dict[line] = con_dict[line]

seq_dict

{'A4TEW1': 'MTTPLTLENIRRAPKALLHDHLDGGLRPSTVLELAEQYGYDDLPAHDADELAEFFRTAAHSGSLVRYLEPFAHTVGVMQNHDALHRVARECVEDLADDNVVYAEIRFAPELHIDGGLSLDAVVEAVLAGFADGEKAAAAAGRTITVRCLVTAMRHAARSREIAALAIRFRDQGVVGFDIAGAEAGYPPSRHLDAFEYMRSNNARFTIHAGEAFGLPSIHEAIAFCGADRLGHGVRIVDDIDMDAEGGPKLGRLAALLRDKRIPFEMCPSSNVQTGAVASIAEHPFDRLARLRFRVTVNTDNRLMSDTTMSLEMLRLVEAFGYGWSDLERFTINAMKSAFISFPERLAIIDEVIKPRYAVLVG',
 'A0QQ98': 'MLLSDRDIRAEIAAKRLALEPFDDALVQPSSIDVRLDRMFRVFNNTRYTHIDPAMQQDELTTLVEPAEGEPFVLHPGEFVLGSTLELCTLPDDLAGRLEGKSSLGRLGLLTHSTAGFIDPGFSGHITLELSNVANLPITLWPGMKIGQLCLLRLTSPAENPYGSAAVGSKYQGQRGPTPSRSHLNFIKS',
 'Q5PA87': 'MKRTFQPSRIVRKRRHGFRARMSTRWGRKILNRRRAKGRCLLCA',
 'A5GVY9': 'MDQSFVTERLEATCRTFNALERQLADPSVAADPEQLLTLAKERSRLEPLVLDYQRLQQLHAEHQQAQQLLKESKGDAELEALAQEELQQLSSEQEQLNQRLKVALLPSDPRDERSVMLEIRAGAGGDEACLWAGDLARMYERHAQTCGWQVNPVSASEAELGGFKELILAIRGDAVFSQLKYEAGVHRVQRVPATESQGRVHTSTATVAVMPEADPVDVQIDPKDLDISTARSGGAGGQNVNKVETAVDLLHKPTGIRVFCTQERSQLQNRERAMEILRAKLLAKEEEEAAAAESSARRAQVGSGDRSEKIRTYNYKDNRTTDHRLGKNFPLETVLNGQLSDLIEACT

In [6]:
seq_list = list(seq_dict.values())
id_list = list(seq_dict.keys())
seq_list

['MTTPLTLENIRRAPKALLHDHLDGGLRPSTVLELAEQYGYDDLPAHDADELAEFFRTAAHSGSLVRYLEPFAHTVGVMQNHDALHRVARECVEDLADDNVVYAEIRFAPELHIDGGLSLDAVVEAVLAGFADGEKAAAAAGRTITVRCLVTAMRHAARSREIAALAIRFRDQGVVGFDIAGAEAGYPPSRHLDAFEYMRSNNARFTIHAGEAFGLPSIHEAIAFCGADRLGHGVRIVDDIDMDAEGGPKLGRLAALLRDKRIPFEMCPSSNVQTGAVASIAEHPFDRLARLRFRVTVNTDNRLMSDTTMSLEMLRLVEAFGYGWSDLERFTINAMKSAFISFPERLAIIDEVIKPRYAVLVG',
 'MLLSDRDIRAEIAAKRLALEPFDDALVQPSSIDVRLDRMFRVFNNTRYTHIDPAMQQDELTTLVEPAEGEPFVLHPGEFVLGSTLELCTLPDDLAGRLEGKSSLGRLGLLTHSTAGFIDPGFSGHITLELSNVANLPITLWPGMKIGQLCLLRLTSPAENPYGSAAVGSKYQGQRGPTPSRSHLNFIKS',
 'MKRTFQPSRIVRKRRHGFRARMSTRWGRKILNRRRAKGRCLLCA',
 'MDQSFVTERLEATCRTFNALERQLADPSVAADPEQLLTLAKERSRLEPLVLDYQRLQQLHAEHQQAQQLLKESKGDAELEALAQEELQQLSSEQEQLNQRLKVALLPSDPRDERSVMLEIRAGAGGDEACLWAGDLARMYERHAQTCGWQVNPVSASEAELGGFKELILAIRGDAVFSQLKYEAGVHRVQRVPATESQGRVHTSTATVAVMPEADPVDVQIDPKDLDISTARSGGAGGQNVNKVETAVDLLHKPTGIRVFCTQERSQLQNRERAMEILRAKLLAKEEEEAAAAESSARRAQVGSGDRSEKIRTYNYKDNRTTDHRLGKNFPLETVLNGQLSDLIEACTHADQQQKLEELAASES',
 'MGLSDGEWELVLKTWGKVE

In [7]:
loc = []
for seq in seq_list:
    result = p.finditer(seq)
    temp = []
    for m in result:
        temp.append(str(m.start()+1))
    loc.append(' '.join(temp))

loc

['',
 '44',
 '',
 '319',
 '',
 '15 27 195',
 '16 140 336 364 406 461',
 '64 187 240 246 453',
 '225 316 367',
 '32 38 49 58 75 101 123 179 229 242 257 275 300 307 317 356',
 '67 121',
 '833',
 '162 183 193 253']

In [8]:
for i in range(len(id_list)):
    if not loc[i]:
        continue
    print(f"{id_list[i]}\n{loc[i]}")

A0QQ98
44
A5GVY9
319
Q5FMJ3
15 27 195
Q60960
16 140 336 364 406 461
P02790_HEMO_HUMAN
64 187 240 246 453
P01880_DTC_HUMAN
225 316 367
P13473_LMP2_HUMAN
32 38 49 58 75 101 123 179 229 242 257 275 300 307 317 356
P02974_FMM1_NEIGO
67 121
P08318_P100_HCMVA
833
P02749_APOH_HUMAN
162 183 193 253


**정규표현식의 finditer는 한번 매칭되면 건너뜀 -> motif가 겹쳐있을 경우 이를 인식하지 못함**
한글자씩 서열 검사하며 겹치는 motif 찾는 방식으로 검사해야 함...

In [9]:
def n_position(seq):
    position = []
    for i in range(len(seq)-3):
        temp = seq[i:i+4]
        if (temp[0] == "N" and
            temp[1] != 'P' and
            temp[2] in 'ST' and
            temp[3] != 'P'
        ):
            position.append(str(i+1))
    return ' '.join(position)
        

In [10]:
loc2 = []
for seq in seq_list:
    loc2.append(n_position(seq))
loc2

['',
 '44',
 '',
 '319',
 '',
 '15 27 195',
 '16 140 336 364 406 461',
 '64 187 240 246 453',
 '225 316 367',
 '32 38 49 58 75 101 123 179 229 242 257 275 300 307 317 356',
 '67 68 121',
 '833',
 '162 183 193 253']

In [11]:
fh = open("/mnt/c/Data/ROSALIND_download/rosalind_output.txt",'w')
for i in range(len(id_list)):
    if not loc2[i]:
        continue
    fh.write(f"{id_list[i]}\n{loc[i]}\n")
    print(f"{id_list[i]}\n{loc[i]}")

A0QQ98
44
A5GVY9
319
Q5FMJ3
15 27 195
Q60960
16 140 336 364 406 461
P02790_HEMO_HUMAN
64 187 240 246 453
P01880_DTC_HUMAN
225 316 367
P13473_LMP2_HUMAN
32 38 49 58 75 101 123 179 229 242 257 275 300 307 317 356
P02974_FMM1_NEIGO
67 121
P08318_P100_HCMVA
833
P02749_APOH_HUMAN
162 183 193 253


**왜 안되는지 모르겠음..**