## About
This code takes a 2-column TSV from UniProt with GO (Biological Function) annotations for all proteins in the yeast proteome and then classifies each protein as true/false for nucleic acid binding based on these GO annotations.


In [1]:
# Load the UniProt GO mapping table as a list of raw lines; each line keeps
# its trailing newline.
with open('go_biological_process_uid_mapping.tsv') as fh:
    content = list(fh)

In [16]:
# Map each UniProt ID (field index 1) to its raw GO annotation string
# (field index 2). Records with no third field map to the empty string.
uid2go = {}
for line in content[1:]:  # content[0] is skipped (presumably the header row - confirm)
    sline = line.strip().split('\t')
    if sline == ['']:
        # Blank line (e.g. a trailing empty line at end of file): there is no
        # record here, and indexing sline[1] below would raise IndexError.
        continue
    uid = sline[1]
    # strip() removes a trailing tab, so an empty GO column appears as a
    # missing third field rather than as an empty string.
    uid2go[uid] = sline[2] if len(sline) > 2 else ''
        

## We define nucleic acid binding based on the following GO codes...

    # DNA binding
    GO:0003677 - DNA binding
    GO:0043565 - Sequence-specific DNA binding
    GO:0003700 - DNA-binding transcription factor activity
    GO:0140110 - Transcription regulator activity, DNA-binding
    GO:0000981 - RNA polymerase II transcription factor activity, sequence-specific DNA binding
    GO:0000977 - RNA polymerase II regulatory region sequence-specific DNA binding
    GO:0000978 - RNA polymerase II proximal promoter sequence-specific DNA binding
    GO:0003682 - Chromatin binding
    GO:0044212 - Transcription regulatory region DNA binding

    # RNA binding
    GO:0003723 - RNA binding
    GO:0008135 - Translation factor activity, RNA binding
    GO:0003727 - mRNA binding
    GO:0035198 - miRNA binding
    GO:0003729 - mRNA 3'-UTR binding
    GO:1990907 - RISC complex RNA binding
    GO:0019843 - rRNA binding
    GO:0031077 - rRNA 5S binding
    GO:0031076 - rRNA 5.8S binding
    GO:0031075 - rRNA 28S binding
    GO:0031074 - rRNA 18S binding
    GO:0031369 - tRNA binding
    GO:0017080 - snRNA binding
    GO:0008497 - tRNA methyltransferase activity (binds tRNA to modify it)
    


In [17]:
# All GO identifiers (bare 7-digit form, without the 'GO:' prefix) that count
# as evidence of nucleic acid binding. This mirrors the DNA/RNA binding list in
# the markdown cell above; the original list repeated '0003682' and omitted
# '0044212' (transcription regulatory region DNA binding).
# NOTE(review): verify the ID/name pairs in that markdown list against the GO
# database before relying on them.
nucleic_acid_binding_codes = [
    # DNA binding
    '0003677', '0043565', '0003700', '0140110', '0000981',
    '0000977', '0000978', '0003682', '0044212',
    # RNA binding
    '0003723', '0008135', '0003727', '0035198', '0003729',
    '1990907', '0019843', '0031077', '0031076', '0031075',
    '0031074', '0031369', '0017080', '0008497',
]

# Write one line per protein: '<uid>\tnucelic_acid_binding:<True|False>'.
# NOTE(review): 'nucelic' is misspelled in the emitted label; it is left
# unchanged here because downstream consumers may match on the exact string -
# confirm before correcting.
with open('shprd_nucleic_acid_binding.tsv', 'w') as fh:
    for uid, annotation in uid2go.items():
        # A protein is a hit if any code occurs as a substring of its raw
        # GO annotation string.
        hit = any(code in annotation for code in nucleic_acid_binding_codes)
        fh.write(f'{uid}\tnucelic_acid_binding:{hit}\n')
                
    