Here I generate fasta files of all of the human proteins in the orthoDB database. There are duplicates/isoforms present in the database, so I also make a file of clustered sequences as well.

In [2]:
import orthodb_tools.sql_queries as sql_queries
from Bio import SeqIO
import orthodb_tools.env_variables.env_variables as env
from orthodb_tools.tools import cli_wrappers as cli
from orthodb_tools.tools import alignment_tools as aln_tools
import numpy as np
import copy
from pathlib import Path
import json
from pyprojroot import here

%load_ext autoreload
%autoreload 2

In [3]:
ODBDATABASE = env.orthoDBDatabase()

In [4]:
OUTPUT_DIR = here() / "data" / "orthodb_clustered_species_proteins" / "9606_6"
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

# Get all human proteins in orthoDB - SQLite

In [5]:
gene_list = sql_queries.get_all_odb_gene_ids_from_species_id('9606_0')

In [6]:
print(len(gene_list))

20675


# load all human proteins from orthoDB

In [7]:
missing_seqs = []
all_human_odb_seqs_dict = {}
for i in gene_list:
    try:
        seq = copy.deepcopy(ODBDATABASE.data_all_seqrecords_dict[i])
        seq.name = ''
        seq.description = ''
    except KeyError:
        missing_seqs.append(i)
        print(i)
        continue
    all_human_odb_seqs_dict[i] = seq
print(len(missing_seqs))

9606_0:00332d
9606_0:000339
9606_0:000320
9606_0:00327c
9606_0:000b56
9606_0:003f96
9606_0:002f2e
9606_0:002700
9606_0:002f06
9606_0:003f19
9606_0:00402a
9606_0:002652
9606_0:003353
9606_0:004b3c
9606_0:003e8a
9606_0:00420f
9606_0:0009be
9606_0:00064b
9606_0:0012ba
9606_0:000a51
9606_0:003fec
9606_0:004274
9606_0:001a54
9606_0:00366d
9606_0:004abd
9606_0:001a94
9606_0:002ef2
9606_0:000ae0
9606_0:0048b0
9606_0:0042cc
9606_0:000b00
9606_0:0002a6
9606_0:0003cf
9606_0:001cde
9606_0:00339f
9606_0:001f07
9606_0:004a93
9606_0:0002e9
9606_0:002d73
9606_0:0006a9
9606_0:001768
9606_0:002120
9606_0:004a17
9606_0:001330
9606_0:003b4e
9606_0:000cbb
9606_0:000740
9606_0:003923
9606_0:0001b0
9606_0:0050ba
9606_0:0004ef
9606_0:0019d8
9606_0:004491
9606_0:003db4
9606_0:0039ee
9606_0:003cc8
9606_0:004a6d
9606_0:000021
9606_0:000c1d
9606_0:0021ec
9606_0:000bcd
9606_0:0033c6
9606_0:004263
9606_0:003ab7
9606_0:001374
9606_0:001c64
9606_0:003e25
9606_0:001cf3
9606_0:003fd7
9606_0:000384
9606_0:002994
9606_0

## key error 
- is because the fasta file that I have is for all sequences that are in ortholog groups. If a sequence is not in an ortholog group, it is not in the fasta file. I'm not interested in those sequences anyway, so I can ignore the key error.
- odbid=9606_0:001081 is an example of a sequence that is not in an ortholog group

In [8]:
ODBDATABASE.data_all_seqrecords_dict['9606_0:001081']

KeyError: 

# write all human proteins from orthoDB to file

In [9]:
with open(OUTPUT_DIR / 'all_human_proteins_in_odb.fasta', 'w') as f:
    SeqIO.write(list(all_human_odb_seqs_dict.values()), f, 'fasta')

# issues
I've noticed some issues with this "proteome": 
There is an issue with ID `9606_0:004fc3`

/home/jch/Documents/SLiM_bioinformatics_workspace/data/orthoDB_ortholog_group_runs/2023-07-19-whole_proteome/orthoDB_analysis_multiprocessed_minfrac-0.75/msa_by_organism/9606_0:004fc3

```
>Homo_sapiens|9606_0:004d85
MAAAAAGLGGGGAGPGPEAGDFLARYRLVSNKLKKRFLRKPNVAEAGEQFGQLGRELRAQ
ECLPYAAWCQLAVARCQQALFHGPGEALALTEAARLFLRQERDARQRLVCPAAYGEPLQA
AASALGAAVRLHLELGQPAAAAALCLELAAALRDLGQPAAAAGHFQRAAQLQLPQLPLAA
LQALGEAASCQLLARDYTGALAVFTRMQRLAREHGSHPVQSLPPPPPPAPQPGPGATPAL
PAALLPPNSGSAAPSPAALGAFSDVLVRCEVSRVLLLLLLQPPPAKLLPEHAQTLEKYSW
EAFDSHGQESSGQLPEELFLLLQSLVMATHEKDTEAIKSLQVEMWPLLTAEQNHLLHLVL
QETISPSGQGV
>Homo_sapiens|9606_0:004f34
MAAAAAGLGGGGAGPGPEAGDFLARYRLVSNKLKKRFLRKPNVAEAGEQFGQLGRELRAQ
ECLPYAAWCQLAVARCQQALFHGPGEALALTEAARLFLRQERDARQRLVCPAAYGEPLQA
AASALGAAVRLHLELGQPAAAAALCLELAAALRDLGQPAAAAGHFQRAAQLQLPQLPLAA
LQALGEAASCQLLARDYTGALAVFTRMQRLAREHGSHPVQSLPPPPPPAPQPGPGATPAL
PAALLPPNSGSAAPSPAALGAFSDVLVRCEVSRVLLLLLLQPPPAKLLPEHAQTLEKYSW
EAFDSHGQESSGQLPEELFLLLQSLVMATHEKDTEAIKSLQVEMWPLLTAEQNHLLHLVL
QETISPSGQGV
>Homo_sapiens|9606_0:004fc3
MAAAAAGLGGGGAGPGPEAGDFLARYRLVSNKLKKRFLRKPNVAEAGEQFGQLGRELRAQ
ECLPYAAWCQLAVARCQQALFHGPGEALALTEAARLFLRQERDARQRLVCPAAYGEPLQA
AASALGAAVRLHLELGQPAAAAALCLELAAALRDLGQPAAAAGHFQRAAQLQLPQLPLAA
LQALGEAASCQLLARDYTGALAVFTRMQRLAREHGSHPVQSLPPPPPPAPQPGPGATPAL
PAALLPPNSGSAAPSPAALGAFSDVLVRCEVSRVLLLLLLQPPPAKLLPEHAQTLEKYSW
EAFDSHGQESSGQLPEELFLLLQSLVMATHEKDTEAIKSLQVEMWPLLTAEQNHLLHLVL
QETISPSGQGV
```

- these 3 sequences are identitcal.
---

I think that there are also isoforms in orthoDB even though I think that those are supposed to be removed according to the documentation.<br>
example
```
>9544_0:003203 {"pub_og_id":"4636942at2759","og_name":"xin actin binding repeat containing 2 ","level_taxid":2759,"organism_taxid":"9544_0","organism_name":"Macaca mulatta","pub_gene_id":"XIRP2","description":"XIRP2"}
MFPMQKGSLNLLWQKWESCDYQRSECYPRDSRCTILQPQESKLLEPEEEVVSAPEPLDPTSLPCSGELTLSSKPEGKDSVDKSNTTREYGRPEVLKEDSLSSRRRIERFSIALDELRSVFEAPKSGNKSAEHGGKEVEIERSLCSPAFKSHPGSQLEDSVKDSDEKGEETSCDKMSPESGHSHIFEAIAGPNKPESGFAEDSAALGEVVSDLHEVVSLKERMARYQAAVSRGDCRSFSANMMEESEMCTVPGGLAKVKKQFEDEITSSRNTFAQYQYQHQNRSEQEAIHSSQVGTSKSSQELARNEQEGSKVQKIDVHGTEMVSHLEKHTKEINQASQFHQYVQETVIDTPEDEEIPKVSTKLLKEQFEKSAQEKILYSDKEMTTPAKQIKIESECEETLKPSSVVSTSSTSCISTSQRKETSTTRYSDHSVTSSTLAQINATSSGMTEEFPPPPPDVLQTSVDATAFSQSPELPSPPRRLPVPKDVYSKQRNLYELNRLYKHIHPELRKNLEKDYISEVSEIVSSQMNSGSSVSADVQQARYVFENTNDSSQKDLNSEREYLEWDEILKGEVQSIRWIFENQPLDSINNGSPDEGDISRGVADQEIIAGGDVKYTTWMFETQPIDTLGAHSSDTVENAEKIPELARGDVCTARWMFETRPLDSMNKMHQSQEESAVTISKDITGGNVKTVRYMFETQHLDQLGQLHSVDEVHLLQLRSELKEIKGNVKRSIKCFETQPLYVIRDGSGQMLEIKTVHREDVEKGDVRTARWMFETQPLDTINKDITEIKIVRGISMEENVKGGVSKAKWLFETQPLEKIKESEEVIIEKETIIGTDVSRKCWMFETQPLDILKEVPDADPLQHEEIIGGDVQTTKHLFETLPIEALKDSPDIGKLQKITASEEEKGDVRHQKWIFETQPLEDIRKDKKEYTRTVKLEEVDRGDVKNYTHIFESNNLIKFDASHKIEVEGVTRGAVELNKSLFETTPLYAIQDPLGKYHQVKTVQQEEIIRGDVRSCRWLFETRPIDQFDESIHKFQIIRGISAQEIQTGNVKSAKWLFETQPLDSIKYFSDVEETESKTEQARDIIKGDVKTCKWLFETQPMESLYEKVSLMTSSEEIHKGDVKTCTWLFETQPLDAIKDDSETKVKLQTVKQEEIQGGDVRTTCFLFETENLDSIQGEEVKEIKPVEMDIQAGDVSSMRYKFENQSLDSISSSSEEVLKKIKTLKTEDIQKGNVLNCRWLFENQPIDKIKESQEGDECVKTVTDIQGGDVRKGCFIFETFSLDEIKEESDYISTKKTITEEVMQGDVKSYRMLFETQPLYAIQDREGSYHEVTTVKKEEVIHGDVRGTRWLFETKPLDSINKSETVYVIKAVTQEDIQKGDVSSVRYRFETQPLDQISEESHNIVPTVDHIQGGNVKTSKQFFESENFDKNNYIRTVSVNEIQKGNVKTSTWLFETHTIDELRGEGLEYENIKTVTQEDVQKGDVKQAVWLFENQTFDSIMEAHKGVTKVTKEEIPPSDVKTTTWLFETTPLHEFNENRIEKIEIIGKSIKKTLEDLYSQKVIQAPGIIIEADEVGDVRMAKYKLMNQASPEIQKEEIIRADLRNIMVNLLSKRDYTKREILVSEEEKGNVNLTKTQLLNKSTEFHAEKEEIVKGDVQHAIKNLFSEERSVKKGILIQEDERGDINMTIYCLLHENDGDTIEREEVIGGDVRRTIHNLLSSTSNNKISERAKFDASERGNVQFFTTCIEAGALDYLKQLHTESNETLTAKKQEGEKEIIGGDVEGTKLLLKKRQSLVERTVSETDIIPGDVHNTVKVFMTEPQSTLGKIPKEEIIKGDLASTLNSLSQAVNQKTVTKTEEIIKGDMLATLKSLKESTRRWKESKQPDAIPGDIEKAIECLEKATNTRTEILKKELLKDDLETSLRSLKEAQRSFKEVDKEGVIKKDAHVVMAGSSGEQKTDIHQVAVQRNKNSLLQPKSGPFEPAAEWQGGADTLSQTMGKSCHGNLVEERTEVNLPKAPKGSVKIVIDREQNNDALEKSLRRLSNSHHKSIKNVLESGDKMGVWTDITGEQHLRDEYMSRQLTSTVSVKNNLKTKESDREVRELKKDDDFNSVQSADKTVGKQQTYERRNDHQKTEAFHIKSPKKTENIKILTDTQNSKPSPTQHPVSMPVGGTYKISGDFQKQTLLKQETKYSNKNIKQKNINFQPMWQPLPVEQDTTSVTEVKVSEKNHNTFKTTNKKQETDVHLKSQDFLMKTNTSTDLKTAMERSLNPINFNPENNVKESECPLPPPSPPPPPPSNASSEIEFPLPPPPPLMMFPEKNGFLPSLSTEKIKAEFESFPGLPLPPPSVDEKSERESPSMFLPPPPPPTPSQNPAHLLSSSAPEKHSGAFMQQYSQKEASNSQNSQAKIITGKSGVLPPPTLPKPKLPKHIKDNKNHFSPKVELTNSLSDMECKITTSKDQKKVMMMTSSEHIETKQNVISKSLDERKQLFVDSANCLSHTVPGTSAPKKKQIAPLIKSHSFPESSGQQSPKPYMRKFKTPLMIAEEKYRQQKEELEKQKQESSYYNIVKTESQNQHISEVEKEMPLRKTNEEVSVSGIDSERTVVQPNPGSQSNARVLGVCSDNQLSTTSPVTVTAKRLHHVLAASEDKDKMKKEVLQSARDIMQSKSACEIKQSHQECSTQQTQQNQYLEQLHLPQSKPISPNFKVKTIKLPTLDHTLNETDHSYESHKQQSEVDVQTFAKQQYLETKKTEASTECSHKQSLAERHYQLPKKEKRVTIKLPTESIQKNHEDKLQIVPGKQEEFAGSDRGKLPGSEEKNKGPSMISRKEERLITERKQEVLKNKSAPKVVKQKVIDAHLDSQTQNFQQTQIQTSESKAEHKKWPQPYNSLQEEKCLQVKGIQQKQVFSNTKDSKQEITQNKSFFSAVKESQQDDGKCAVNIVEFLRKREELQQILSRVKQFEAEPNKSGLKTFQTLLNTIPGWLISEEKREYAVHIAMENNLEKVKEEITHIKTQAEDMLVSYENIIQTAMMSSKTGKSGNKPTSLDETSSKVSNVHVSNNKNSEQKENKIAKEKTGQHQVAAHREAAVHSHVKTHQEIKLDESNIPPPSLKTRPPSPTFITIESTVRRTETPTKDELSQSPKKDSYVEPPPRRPKSQTSEIHRANTSPSPPRSRSEQLVRLKDTTAKLSKGAIPCPAVTPVPVVEKRSEIIMSPATLRRQIKIETRGRDSPPTITIPVNIHHTPSGSSRESTEAQEEIRKVEKRATYVHNAGLNSTDPIVPDTESYDAVEIIRKVEVPPRLSEHTQRYEAANRTVQMAENFVDDRENEINRWFREFEHGPVSEAKSNRRVYANGETNHNIQQESHTFCQEEFGLTSLGNTSFTDFSCTHPRELQEKIPVKQPRICSETRSLSEHFSGMDAFESQIVESKMKTSSSHSSEAGKSGCDFKHAPPTYEDVIAGHILDISDSPKEVRKNFQNTWQESGRVFKSLGYATSDSSATEMRTAFQEESAFISETAAPRQGNMYTLSKDSLSNGVPSGRQAEFS

>9544_0:0032c6 {"pub_og_id":"4636942at2759","og_name":"xin actin binding repeat containing 2 ","level_taxid":2759,"organism_taxid":"9544_0","organism_name":"Macaca mulatta","pub_gene_id":"XIRP2","description":"xin actin-binding repeat-containing protein 2 isoform X2"}
MARYQAAVSRGDCRSFSANMMEESEMCTVPGGLAKVKKQFEDEITSSRNTFAQYQYQHQNRSEQEAIHSSQVGTSKSSQELARNEQEGSKVQKIDVHGTEMVSHLEKHTKEINQASQFHQYVQETVIDTPEDEEIPKVSTKLLKEQFEKSAQEKILYSDKEMTTPAKQIKIESECEETLKPSSVVSTSSTSCISTSQRKETSTTRYSDHSVTSSTLAQINATSSGMTEEFPPPPPDVLQTSVDATAFSQSPELPSPPRRLPVPKDVYSKQRNLYELNRLYKHIHPELRKNLEKDYISEVSEIVSSQMNSGSSVSADVQQARYVFENTNDSSQKDLNSEREYLEWDEILKGEVQSIRWIFENQPLDSINNGSPDEGDISRGVADQEIIAGGDVKYTTWMFETQPIDTLGAHSSDTVENAEKIPELARGDVCTARWMFETRPLDSMNKMHQSQEESAVTISKDITGGNVKTVRYMFETQHLDQLGQLHSVDEVHLLQLRSELKEIKGNVKRSIKCFETQPLYVIRDGSGQMLEIKTVHREDVEKGDVRTARWMFETQPLDTINKDITEIKIVRGISMEENVKGGVSKAKWLFETQPLEKIKESEEVIIEKETIIGTDVSRKCWMFETQPLDILKEVPDADPLQHEEIIGGDVQTTKHLFETLPIEALKDSPDIGKLQKITASEEEKGDVRHQKWIFETQPLEDIRKDKKEYTRTVKLEEVDRGDVKNYTHIFESNNLIKFDASHKIEVEGVTRGAVELNKSLFETTPLYAIQDPLGKYHQVKTVQQEEIIRGDVRSCRWLFETRPIDQFDESIHKFQIIRGISAQEIQTGNVKSAKWLFETQPLDSIKYFSDVEETESKTEQARDIIKGDVKTCKWLFETQPMESLYEKVSLMTSSEEIHKGDVKTCTWLFETQPLDAIKDDSETKVKLQTVKQEEIQGGDVRTTCFLFETENLDSIQGEEVKEIKPVEMDIQAGDVSSMRYKFENQSLDSISSSSEEVLKKIKTLKTEDIQKGNVLNCRWLFENQPIDKIKESQEGDECVKTVTDIQGGDVRKGCFIFETFSLDEIKEESDYISTKKTITEEVMQGDVKSYRMLFETQPLYAIQDREGSYHEVTTVKKEEVIHGDVRGTRWLFETKPLDSINKSETVYVIKAVTQEDIQKGDVSSVRYRFETQPLDQISEESHNIVPTVDHIQGGNVKTSKQFFESENFDKNNYIRTVSVNEIQKGNVKTSTWLFETHTIDELRGEGLEYENIKTVTQEDVQKGDVKQAVWLFENQTFDSIMEAHKGVTKVTKEEIPPSDVKTTTWLFETTPLHEFNENRIEKIEIIGKSIKKTLEDLYSQKVIQAPGIIIEADEVGDVRMAKYKLMNQASPEIQKEEIIRADLRNIMVNLLSKRDYTKREILVSEEEKGNVNLTKTQLLNKSTEFHAEKEEIVKGDVQHAIKNLFSEERSVKKGILIQEDERGDINMTIYCLLHENDGDTIEREEVIGGDVRRTIHNLLSSTSNNKISERAKFDASERGNVQFFTTCIEAGALDYLKQLHTESNETLTAKKQEGEKEIIGGDVEGTKLLLKKRQSLVERTVSETDIIPGDVHNTVKVFMTEPQSTLGKIPKEEIIKGDLASTLNSLSQAVNQKTVTKTEEIIKGDMLATLKSLKESTRRWKESKQPDAIPGDIEKAIECLEKATNTRTEILKKELLKDDLETSLRSLKEAQRSFKEVDKEGVIKKDAHVVMAGSSGEQKTDIHQVAVQRNKNSLLQPKSGPFEPAAEWQGGADTLSQTMGKSCHGNLVEERTEVNLPKAPKGSVKIVIDREQNNDALEKSLRRLSNSHHKSIKNVLESGDKMGVWTDITGEQHLRDEYMSRQLTSTVSVKNNLKTKESDREVRELKKDDDFNSVQSADKTVGKQQTYERRNDHQKTEAFHIKSPKKTENIKILTDTQNSKPSPTQHPVSMPVGGTYKISGDFQKQTLLKQETKYSNKNIKQKNINFQPMWQPLPVEQDTTSVTEVKVSEKNHNTFKTTNKKQETDVHLKSQDFLMKTNTSTDLKTAMERSLNPINFNPENNVKESECPLPPPSPPPPPPSNASSEIEFPLPPPPPLMMFPEKNGFLPSLSTEKIKAEFESFPGLPLPPPSVDEKSERESPSMFLPPPPPPTPSQNPAHLLSSSAPEKHSGAFMQQYSQKEASNSQNSQAKIITGKSGVLPPPTLPKPKLPKHIKDNKNHFSPKVELTNSLSDMECKITTSKDQKKVMMMTSSEHIETKQNVISKSLDERKQLFVDSANCLSHTVPGTSAPKKKQIAPLIKSHSFPESSGQQSPKPYMRKFKTPLMIAEEKYRQQKEELEKQKQESSYYNIVKTESQNQHISEVEKEMPLRKTNEEVSVSGIDSERTVVQPNPGSQSNARVLGVCSDNQLSTTSPVTVTAKRLHHVLAASEDKDKMKKEVLQSARDIMQSKSACEIKQSHQECSTQQTQQNQYLEQLHLPQSKPISPNFKVKTIKLPTLDHTLNETDHSYESHKQQSEVDVQTFAKQQYLETKKTEASTECSHKQSLAERHYQLPKKEKRVTIKLPTESIQKNHEDKLQIVPGKQEEFAGSDRGKLPGSEEKNKGPSMISRKEERLITERKQEVLKNKSAPKVVKQKVIDAHLDSQTQNFQQTQIQTSESKAEHKKWPQPYNSLQEEKCLQVKGIQQKQVFSNTKDSKQEITQNKSFFSAVKESQQDDGKCAVNIVEFLRKREELQQILSRVKQFEAEPNKSGLKTFQTLLNTIPGWLISEEKREYAVHIAMENNLEKVKEEITHIKTQAEDMLVSYENIIQTAMMSSKTGKSGNKPTSLDETSSKVSNVHVSNNKNSEQKENKIAKEKTGQHQVAAHREAAVHSHVKTHQEIKLDESNIPPPSLKTRPPSPTFITIESTVRRTETPTKDELSQSPKKDSYVEPPPRRPKSQTSEIHRANTSPSPPRSRSEQLVRLKDTTAKLSKGAIPCPAVTPVPVVEKRSEIIMSPATLRRQIKIETRGRDSPPTITIPVNIHHTPSGSSRESTEAQEEIRKVEKRATYVHNAGLNSTDPIVPDTESYDAVEIIRKVEVPPRLSEHTQRYEAANRTVQMAENFVDDRENEINRWFREFEHGPVSEAKSNRRVYANGETNHNIQQESHTFCQEEFGLTSLGNTSFTDFSCTHPRELQEKIPVKQPRICSETRSLSEHFSGMDAFESQIVESKMKTSSSHSSEAGKSGCDFKHAPPTYEDVIAGHILDISDSPKEVRKNFQNTWQESGRVFKSLGYATSDSSATEMRTAFQEESAFISETAAPRQGNMYTLSKDSLSNGVPSGRQAEFS

>9606_0:000a46 {"pub_og_id":"4636942at2759","og_name":"xin actin binding repeat containing 2 ","level_taxid":2759,"organism_taxid":"9606_0","organism_name":"Homo sapiens","pub_gene_id":"XIRP2","description":"xin actin-binding repeat-containing protein 2 isoform 4"}
MARYQAAVSRGDCRSFSANMMEESEMCAVPGGLAKVKKQFEDEITSSRNTFAQYQYQHQNRSEQEAIHSSQVGTSRSSQEMARNEQEGSKVQKIDVHGTEMVSHLEKHTEEVNQASQFHQYVQETVIDTPEDEEIPKVSTKLLKEQFEKSAQEKILYSDKEMTTPAKQIKTESEYEETFKPSSVVSTSSTSCVSTSQRKETSTTRYSDHSVTSSTLAQINATSSGMTEEFPPPPPDVLQTSVDVTAFSQSPELPSPPRRLPVPKDVYSKQRNLYELNRLYKHIHPELRKNLEKDYISEVSEIVSSQMNSGSSVSADVQQARYVFENTNDSSQKDLNSEREYLEWDEILKGEVQSIRWIFENQPLDSINNGSPDEGDISRGIADQEIIAGGDVKYTTWMFETQPIDTLGAYSSDTVENAEKIPELARGDVCTARWMFETRPLDSMNKMHQSQEESAVTISKDITGGDVKTVRYMFETQHLDQLGQLHSVDEVHLLQLRSELKEIKGNVKRSIKCFETQPLYVIRDGSGQMLEIKTVHREDVEKGDVRTARWMFETQPLDTINKDITEIKVVRGISMEENVKGGVSKAKWLFETQPLEKIKESEEVIIEKEKIIGTDVSRKCWMFETQPLDILKEVPDADSLQREEIIGGDVQTTKHLFETLPIEALKDSPDIGKLQKITASEEEKGDVRHQKWIFETQPLEDIRKDKKEYTRTVKLEEVDRGDVKNYTHIFESNNLIKFDASHKIEVEGVTRGAVELNKSLFETTPLYAIQDPLGKYHQVKTVQQEEIVRGDVRSCRWLFETRPIDQFDESIHKFQIIRGISAQEIQTGNVKSAKWLFETQPLDSIKYFSDVEETESKTEQTRDIVKGDVKTCKWLFETQPMESLYEKVSLMTSSEEIHKGDVKTCTWLFETQPLDTIKDDSETAVKLQTVKQEEIQGGDVRTACFLFETENLDSIQGEEVKEIKPVEMDIQAGDVSSMRYKFENQSLDSISSSSEEVLKKIKTLKTEDIQKGNVLNCRWLFENQPIDKIKESQEGDECVKTVTDIQGGDVRKGCFIFETFSLDEIKEESDYISTKKTITEEVIQGDVKSYRMLFETQPLYAIQDREGSYHEVTTVKKEEVIHGDVRGTRWLFETKPLDSINKSETVYVIKSVTQEDIQKGDVSSVRYRFETQPLDQISEESHNIMPSIDHIQGGNVKTSRQFFESENFDKNNYIRTVSVNEIQKGNVKTSTWLFETHTMDELRGEGLEYENIKTVTQEDVQKGDVKQAVWLFENRTFDSIMEAHKGITKMTKEEIPPSDVKTTTWLFETTPLHEFNETRVEKIEIIGKSIKETLEDLYSQKVIQAPGIIIEADEIGDVRMAKYKLMNQASPEIQKEEIIRADLRNIMVNLLSKRDCTEREILISEEEKGNVNLTKTQLLNRSTEFHAEKEEIVKGDVQQAIKNLFSEERSVKKGILIQEDEKGDINMTIYCLLHENDGDTIEREEVIGGDVKRTIHNLLSSTSNNKISERAKIDASERGNVQFFTTCIEAGALDYLKQLHTESNETLTAKKQEGEKEIIGGDVEGTKLLLKKRQSLVERTVSETDIIPGDVHNTVKVFMTEPQSTFGKIPKEEIIKGDLTSTLNSLSQAVNQKTVTKTEEIIKGNMLATLKSLKESSHRWKESKQPDAIPGDIEKAIECLEKATNTKTEILKKELLKDDLETSLRSLKEAQRSFKEVHKEGVIKKDAKAVMAGSSGEQKTDIHQVAVQRNKNSLLQPKPGPFEPAAKWQGGADTLSQTMGKSCHGNLVEERTEVNLPKAPKGTVKIVIDREQNNDALEKSLRRLSNSHHKSNVLESGDKTGVWTDTTGEQHLRDEYMSRQLTSTVSVKNNLTTKESDRAVRELKKDDVFNSIQSAGKTVGKQQTYELRNDHQKMEGFHIKSPKKTKNIKILTDTQSSKPSPTQHPVSMPVGGTYDLSGDFQKQTLLKQETKYSNKDIKKKNINLQPMWQLLPVEQDTSNVTEMKVSEKSHNTFKATNKKRETDVHLKSQDFLMKTNTSTGLKMAMERSLNPINFNPENNVKESECPLPPPSPPPPPPSNASSEIEFPLPPPPPLMMFPEKNGFLPSLSTEKIKAEFESFPGLPLPPPPVDEKSERESSSMFLPPPPPPTPSQKPAHLLSSSAPEKHSGDFMQQYSQKEASNSQNSQAKIITGKTGVLPPPTLPKPKLPKHIKDNKNDFSPKVELATSLSDMECKITTSKDQKKVMVMTSSEHTETKQNVISKSLDERKQLSIDSANCLSHTVPGTSAPRKKQIAPLIKSHSFPESSGQQNPKPYMRKFKTPLMIAEEKYRQQKEEIEKQKQESSYYNIVKTQSQNQHITEVEKEMPLQKTNEEVSLSGIDSECTVVQPSPGSQSNARILGVCSDNQLSTTSPETVAAKRLHHVLAASEDKDKMKKEVLQSSRDIMQSKSACEIKQSHQECSTQQTQQKKYLEQLHLPQSKPISPNFKVKTIKLPTLDHTLNETDHSYESHKQQSEIDVQTFTKKQYLKTKKTEASTECSHKQSLAERHYQLPKKEKRVTVQLPTESIQKNQEDKLKMVPRKQREFSGSDRGKLPGSEEKNQGPSMIGRKEERLITERKHEHLKNKSAPKVVKQKVIDAHLDSQTQNFQQTQIQTAESKAEHKKLPQPYNSLQEEKCLEVKGIQEKQVFSNTKDSKQEITQNKSFFSSVKESQRDDGKGALNIVEFLRKREELQQILSRVKQFEAEPNKSGLKTFQTLLNTIPGWLISEDKREYAVHIAMENNLEKVKEEITHIKTQAEDMLVSYENIIQTAMMSSKTGKPGNKPTSLDETSSKVSNVHVSNNKNSEQKENKIAKEKTVQHQVAAHHEATVRSHVKTHQEIKLDDSNIPPPSLKTRPPSPTFITIESTARRTENPTKNELSQSPKKDSYVEPPPRRPMSQKSEIHRANTSPSPPRSRSEQLVRLKDTTAKLSKGAIPCPAATPVPIVEKRSEIIMSPATLRRQIKIETRGRDSPPTITIPVNINHAASGSFRESVDAQEEIRKVEKRATYVHKDGLNSTDHMVPDTESYDAVEIIRKVAVPPRLSEHTQRYEAANRTVQMAENFVNDPENEINRWFREFEHGPVSEAKSNRRVYAKGETNHNIQQESRTFCKEEFGLTSLGNTSFTDFSCKHPRELREKIPVKQPRICSETRSLSEHFSGMDAFESQIVESKMKTSSSHSSEAGKSGCDFKHAPPTYEDVIAGHILDISDSPKEVRKNFQKTWQESGRVFKGLGYATADASATEMRTTFQEESAFISEAAAPRQGNMYTLSKDSLSNGVPSGRQAEFS

>9606_0:000b0b {"pub_og_id":"4636942at2759","og_name":"xin actin binding repeat containing 2 ","level_taxid":2759,"organism_taxid":"9606_0","organism_name":"Homo sapiens","pub_gene_id":"XIRP2","description":"Xin actin-binding repeat-containing protein 2"}
MFPMQKGSLNLLRQKWESCDYQRSECHPRDSHCTIFQPQESKLLAPEGEVVSAPQSLDPTSLPYSTGEEMWSSKPEEKDSVDKSNNTREYGRPEVLKEDSLSSRRRIERFSIALDELRSVFEAPKSGNKPAEYGGKEVEIERSLCSPAFKSHPGSQLEDSVKDSDKKGKETSFDKMSPESGHSRIFEATAGPNKPESGFAEDSAARGEGVSDLHEVVSLKERMARYQAAVSRGDCRSFSANMMEESEMCAVPGGLAKVKKQFEDEITSSRNTFAQYQYQHQNRSEQEAIHSSQVGTSRSSQEMARNEQEGSKVQKIDVHGTEMVSHLEKHTEEVNQASQFHQYVQETVIDTPEDEEIPKVSTKLLKEQFEKSAQEKILYSDKEMTTPAKQIKTESEYEETFKPSSVVSTSSTSCVSTSQRKETSTTRYSDHSVTSSTLAQINATSSGMTEEFPPPPPDVLQTSVDVTAFSQSPELPSPPRRLPVPKDVYSKQRNLYELNRLYKHIHPELRKNLEKDYISEVSEIVSSQMNSGSSVSADVQQARYVFENTNDSSQKDLNSEREYLEWDEILKGEVQSIRWIFENQPLDSINNGSPDEGDISRGIADQEIIAGGDVKYTTWMFETQPIDTLGAYSSDTVENAEKIPELARGDVCTARWMFETRPLDSMNKMHQSQEESAVTISKDITGGDVKTVRYMFETQHLDQLGQLHSVDEVHLLQLRSELKEIKGNVKRSIKCFETQPLYVIRDGSGQMLEIKTVHREDVEKGDVRTARWMFETQPLDTINKDITEIKVVRGISMEENVKGGVSKAKWLFETQPLEKIKESEEVIIEKEKIIGTDVSRKCWMFETQPLDILKEVPDADSLQREEIIGGDVQTTKHLFETLPIEALKDSPDIGKLQKITASEEEKGDVRHQKWIFETQPLEDIRKDKKEYTRTVKLEEVDRGDVKNYTHIFESNNLIKFDASHKIEVEGVTRGAVELNKSLFETTPLYAIQDPLGKYHQVKTVQQEEIVRGDVRSCRWLFETRPIDQFDESIHKFQIIRGISAQEIQTGNVKSAKWLFETQPLDSIKYFSDVEETESKTEQTRDIVKGDVKTCKWLFETQPMESLYEKVSLMTSSEEIHKGDVKTCTWLFETQPLDTIKDDSETAVKLQTVKQEEIQGGDVRTACFLFETENLDSIQGEEVKEIKPVEMDIQAGDVSSMRYKFENQSLDSISSSSEEVLKKIKTLKTEDIQKGNVLNCRWLFENQPIDKIKESQEGDECVKTVTDIQGGDVRKGCFIFETFSLDEIKEESDYISTKKTITEEVIQGDVKSYRMLFETQPLYAIQDREGSYHEVTTVKKEEVIHGDVRGTRWLFETKPLDSINKSETVYVIKSVTQEDIQKGDVSSVRYRFETQPLDQISEESHNIMPSIDHIQGGNVKTSRQFFESENFDKNNYIRTVSVNEIQKGNVKTSTWLFETHTMDELRGEGLEYENIKTVTQEDVQKGDVKQAVWLFENRTFDSIMEAHKGITKMTKEEIPPSDVKTTTWLFETTPLHEFNETRVEKIEIIGKSIKETLEDLYSQKVIQAPGIIIEADEIGDVRMAKYKLMNQASPEIQKEEIIRADLRNIMVNLLSKRDCTEREILISEEEKGNVNLTKTQLLNRSTEFHAEKEEIVKGDVQQAIKNLFSEERSVKKGILIQEDEKGDINMTIYCLLHENDGDTIEREEVIGGDVKRTIHNLLSSTSNNKISERAKIDASERGNVQFFTTCIEAGALDYLKQLHTESNETLTAKKQEGEKEIIGGDVEGTKLLLKKRQSLVERTVSETDIIPGDVHNTVKVFMTEPQSTFGKIPKEEIIKGDLTSTLNSLSQAVNQKTVTKTEEIIKGNMLATLKSLKESSHRWKESKQPDAIPGDIEKAIECLEKATNTKTEILKKELLKDDLETSLRSLKEAQRSFKEVHKEGVIKKDAKAVMAGSSGEQKTDIHQVAVQRNKNSLLQPKPGPFEPAAKWQGGADTLSQTMGKSCHGNLVEERTEVNLPKAPKGTVKIVIDREQNNDALEKSLRRLSNSHHKSNVLESGDKTGVWTDTTGEQHLRDEYMSRQLTSTVSVKNNLTTKESDRAVRELKKDDVFNSIQSAGKTVGKQQTYELRNDHQKMEGFHIKSPKKTKNIKILTDTQSSKPSPTQHPVSMPVGGTYDLSGDFQKQTLLKQETKYSNKDIKKKNINLQPMWQLLPVEQDTSNVTEMKVSEKSHNTFKATNKKRETDVHLKSQDFLMKTNTSTGLKMAMERSLNPINFNPENNVKESECPLPPPSPPPPPPSNASSEIEFPLPPPPPLMMFPEKNGFLPSLSTEKIKAEFESFPGLPLPPPPVDEKSERESSSMFLPPPPPPTPSQKPAHLLSSSAPEKHSGDFMQQYSQKEASNSQNSQAKIITGKTGVLPPPTLPKPKLPKHIKDNKNDFSPKVELATSLSDMECKITTSKDQKKVMVMTSSEHTETKQNVISKSLDERKQLSIDSANCLSHTVPGTSAPRKKQIAPLIKSHSFPESSGQQNPKPYMRKFKTPLMIAEEKYRQQKEEIEKQKQESSYYNIVKTQSQNQHITEVEKEMPLQKTNEEVSLSGIDSECTVVQPSPGSQSNARILGVCSDNQLSTTSPETVAAKRLHHVLAASEDKDKMKKEVLQSSRDIMQSKSACEIKQSHQECSTQQTQQKKYLEQLHLPQSKPISPNFKVKTIKLPTLDHTLNETDHSYESHKQQSEIDVQTFTKKQYLKTKKTEASTECSHKQSLAERHYQLPKKEKRVTVQLPTESIQKNQEDKLKMVPRKQREFSGSDRGKLPGSEEKNQGPSMIGRKEERLITERKHEHLKNKSAPKVVKQKVIDAHLDSQTQNFQQTQIQTAESKAEHKKLPQPYNSLQEEKCLEVKGIQEKQVFSNTKDSKQEITQNKSFFSSVKESQRDDGKGALNIVEFLRKREELQQILSRVKQFEAEPNKSGLKTFQTLLNTIPGWLISEDKREYAVHIAMENNLEKVKEEITHIKTQAEDMLVSYENIIQTAMMSSKTGKPGNKPTSLDETSSKVSNVHVSNNKNSEQKENKIAKEKTVQHQVAAHHEATVRSHVKTHQEIKLDDSNIPPPSLKTRPPSPTFITIESTARRTENPTKNELSQSPKKDSYVEPPPRRPMSQKSEIHRANTSPSPPRSRSEQLVRLKDTTAKLSKGAIPCPAATPVPIVEKRSEIIMSPATLRRQIKIETRGRDSPPTITIPVNINHAASGSFRESVDAQEEIRKVEKRATYVHKDGLNSTDHMVPDTESYDAVEIIRKVAVPPRLSEHTQRYEAANRTVQMAENFVNDPENEINRWFREFEHGPVSEAKSNRRVYAKGETNHNIQQESRTFCKEEFGLTSLGNTSFTDFSCKHPRELREKIPVKQPRICSETRSLSEHFSGMDAFESQIVESKMKTSSSHSSEAGKSGCDFKHAPPTYEDVIAGHILDISDSPKEVRKNFQKTWQESGRVFKGLGYATADASATEMRTTFQEESAFISEAAAPRQGNMYTLSKDSLSNGVPSGRQAEFS
```

also check out 
B4DDR5	9606_0:001d74
Q96HA1	9606_0:001e36


# removing identical sequences - cd-hit

c parameter:
>   -c	sequence identity threshold, default 0.9
 	this is the default cd-hit's "global sequence identity" calculated as:
 	number of identical amino acids or bases in alignment
 	divided by the full length of the shorter sequence

In [10]:
_, clustered_seqs_c1, cdhit_dict_c1 = cli.cd_hit_wrapper(list(all_human_odb_seqs_dict.values()), extra_args='-c 1.0')

Program: CD-HIT, V4.8.1 (+OpenMP), May 15 2023, 22:49:31
Command: cd-hit -i /tmp/tmpd15f3wsa -o
         /tmp/tmpd15f3wsa-cdhit.fa -M 0 -d 0 -g 1 -c 1.0

Started: Tue Jul  9 01:35:53 2024
                            Output                              
----------------------------------------------------------------
total seq: 20345
longest and shortest : 35991 and 31
Total letters: 11959655
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 14M
Buffer          : 1 X 23M = 23M
Table           : 1 X 65M = 65M
Miscellaneous   : 0M
Total           : 104M

Table limit with the given memory limit:
Max number of representatives: 4000000
Max number of word counting entries: 444495500

comparing sequences from          0  to      20345
..........    10000  finished       9789  clusters
..........    20000  finished      19442  clusters

    20345  finished      19763  clusters

Approximated maximum memory consumption: 195M
writing new database
writing cluste

In [11]:
for k in cdhit_dict_c1:
    if len(cdhit_dict_c1[k]['all_members']) > 1:
        # print(k, cdhit_dict[k]['all_members'])
        lens = np.array([len(all_human_odb_seqs_dict[j]) for j in cdhit_dict_c1[k]['all_members']])
        print(np.diff(lens), [j for j in cdhit_dict_c1[k]['all_members']], cdhit_dict_c1[k]['representative_seq'])

[217] ['9606_0:001ac3', '9606_0:001c1e'] 9606_0:001c1e
[3688] ['9606_0:003f13', '9606_0:003d50'] 9606_0:003d50
[-1200] ['9606_0:0012c6', '9606_0:0013ae'] 9606_0:0012c6
[222] ['9606_0:000a46', '9606_0:000b0b'] 9606_0:000b0b
[2240] ['9606_0:00271a', '9606_0:00293e'] 9606_0:00293e
[140] ['9606_0:001b3e', '9606_0:001a92'] 9606_0:001a92
[-126] ['9606_0:00163f', '9606_0:0017d5'] 9606_0:00163f
[632] ['9606_0:004913', '9606_0:004975'] 9606_0:004975
[-833] ['9606_0:00192c', '9606_0:001c38'] 9606_0:00192c
[-1173] ['9606_0:000c19', '9606_0:000d15'] 9606_0:000c19
[-408] ['9606_0:001aeb', '9606_0:001aff'] 9606_0:001aeb
[-502] ['9606_0:002612', '9606_0:0024ec'] 9606_0:002612
[-866] ['9606_0:003372', '9606_0:00336a'] 9606_0:003372
[ 1393 -1186] ['9606_0:000af4', '9606_0:000969', '9606_0:00093c'] 9606_0:000969
[-513] ['9606_0:000494', '9606_0:00015a'] 9606_0:000494
[0] ['9606_0:000a08', '9606_0:000bfa'] 9606_0:000a08
[-1075] ['9606_0:001c36', '9606_0:001a0d'] 9606_0:001c36
[-1473] ['9606_0:001902', '9

In [12]:
print(len(all_human_odb_seqs_dict['9606_0:003f13']))
print(len(all_human_odb_seqs_dict['9606_0:003d50']))

798
4486


In [13]:
aln=aln_tools.pairwise_alignment(
    all_human_odb_seqs_dict['9606_0:003f13'],
    all_human_odb_seqs_dict['9606_0:003d50'] 
)
print(aln)

9606_0:00         0 ------------------------------------------------------------
                  0 ------------------------------------------------------------
9606_0:00         0 MRLAEERAALAAENADGEPGADRRLRLLGTYVAMSLRPAAGAWERCAGSAEAEQLLQAFL

9606_0:00         0 ------------------------------------------------------------
                 60 ------------------------------------------------------------
9606_0:00        60 GRDAAEGPRPLLVVRPGPRGLAIRPGLEVGPESGLAGAKALFFLRTGPEPPGPDSFRGAV

9606_0:00         0 ------------------------------------------------------------
                120 ------------------------------------------------------------
9606_0:00       120 VCGDLPAAPLEHLAALFSEVVLPVLANEKNRLNWPHMICEDVRRHAHSLQCDLSVILEQV

9606_0:00         0 ------------------------------------------------------------
                180 ------------------------------------------------------------
9606_0:00       180 KGKTLLPLPAGSEKMEFADSKSETVLDSIDKSVIYAIESAVIKWSYQVQVVLKRESSQPL

9606_0:00         0 ----

In [14]:
with open(OUTPUT_DIR / 'all_human_proteins_in_odb_clustered_c1.fasta', 'w') as f:
    SeqIO.write(list(clustered_seqs_c1.values()), f, 'fasta')

with open(OUTPUT_DIR / 'all_human_proteins_in_odb_clustered_c1.json', 'w') as f:
    json.dump(cdhit_dict_c1, f, indent=4)

# removing 95% identical sequences

In [15]:
_, clustered_seqs_c095, cdhit_dict_c095 = cli.cd_hit_wrapper(list(all_human_odb_seqs_dict.values()), extra_args='-c 0.95')

Program: CD-HIT, V4.8.1 (+OpenMP), May 15 2023, 22:49:31
Command: cd-hit -i /tmp/tmpofiwdsmu -o
         /tmp/tmpofiwdsmu-cdhit.fa -M 0 -d 0 -g 1 -c 0.95

Started: Tue Jul  9 01:36:14 2024
                            Output                              
----------------------------------------------------------------
total seq: 20345
longest and shortest : 35991 and 31
Total letters: 11959655
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 14M
Buffer          : 1 X 23M = 23M
Table           : 1 X 65M = 65M
Miscellaneous   : 0M
Total           : 104M

Table limit with the given memory limit:
Max number of representatives: 4000000
Max number of word counting entries: 444495500

comparing sequences from          0  to      20345
..........    10000  finished       9426  clusters
..........    20000  finished      18716  clusters

    20345  finished      19027  clusters

Approximated maximum memory consumption: 192M
writing new database
writing clust

In [16]:
with open(OUTPUT_DIR / 'all_human_proteins_in_odb_clustered_c0_95.fasta', 'w') as f:
    SeqIO.write(list(clustered_seqs_c095.values()), f, 'fasta')

with open(OUTPUT_DIR / 'all_human_proteins_in_odb_clustered_c0_95.json', 'w') as f:
    json.dump(cdhit_dict_c095, f, indent=4)

In [17]:
cdhit_dict_c095

{'Cluster 0': {'all_members': ['9606_0:000cca'],
  'representative_seq': '9606_0:000cca'},
 'Cluster 1': {'all_members': ['9606_0:004691'],
  'representative_seq': '9606_0:004691'},
 'Cluster 2': {'all_members': ['9606_0:00024e'],
  'representative_seq': '9606_0:00024e'},
 'Cluster 3': {'all_members': ['9606_0:001c07'],
  'representative_seq': '9606_0:001c07'},
 'Cluster 4': {'all_members': ['9606_0:000cc4'],
  'representative_seq': '9606_0:000cc4'},
 'Cluster 5': {'all_members': ['9606_0:0030f3'],
  'representative_seq': '9606_0:0030f3'},
 'Cluster 6': {'all_members': ['9606_0:001e65'],
  'representative_seq': '9606_0:001e65'},
 'Cluster 7': {'all_members': ['9606_0:00195e'],
  'representative_seq': '9606_0:00195e'},
 'Cluster 8': {'all_members': ['9606_0:0006b5'],
  'representative_seq': '9606_0:0006b5'},
 'Cluster 9': {'all_members': ['9606_0:00112c'],
  'representative_seq': '9606_0:00112c'},
 'Cluster 10': {'all_members': ['9606_0:00347d'],
  'representative_seq': '9606_0:00347d'}

In [18]:
print(len(clustered_seqs_c095))

19027
