In [1]:
import local_env_variables.project_filepaths as fp
import local_orthoDB_group_pipeline.sql_queries as sql_queries
from Bio import SeqIO
import local_env_variables.env_variables as env
from local_seqtools import cli_wrappers as cli
from local_seqtools import alignment_tools as aln_tools
import numpy as np
import copy
from pathlib import Path
import json

%load_ext autoreload
%autoreload 2

In [2]:
# Project-level orthoDB database handle. Later cells read per-gene sequence
# records from its `data_all_seqrecords_dict` attribute.
ODBDATABASE = env.orthoDB_database()

In [3]:
# Directory that receives every file this notebook writes; created on demand
# (including missing parents) and left alone if it already exists.
# NOTE(review): the folder is named "9606_6" while the species queried below is
# '9606_0' -- confirm the name is intentional.
OUTPUT_DIR = fp.data.joinpath("orthodb_clustered_species_proteins", "9606_6")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Get all human proteins in orthoDB - SQLite

In [4]:
# Gene IDs for species '9606_0' (human, per the section heading), fetched
# through the project's SQL query helper.
gene_list = sql_queries.get_all_odb_gene_ids_from_species_id('9606_0')

In [5]:
# Sanity check: how many gene IDs the query returned.
print(len(gene_list))

20675


# load all human proteins from orthoDB

In [6]:
# Build {gene ID -> sequence record} for every human gene that has a record in
# the loaded orthoDB sequence dictionary. IDs without a record are printed as
# they are encountered and collected in `missing_seqs`; the final line prints
# how many were missing.
missing_seqs = []
all_human_odb_seqs_dict = {}
for odb_gene_id in gene_list:
    try:
        # Work on a deep copy so the edits below never touch the database's own record.
        record = copy.deepcopy(ODBDATABASE.data_all_seqrecords_dict[odb_gene_id])
    except KeyError:
        # Listed for the species, but no sequence record is available.
        missing_seqs.append(odb_gene_id)
        print(odb_gene_id)
    else:
        # Blank the name/description on the copy
        # (presumably to keep FASTA headers to the bare ID -- TODO confirm).
        record.name = ''
        record.description = ''
        all_human_odb_seqs_dict[odb_gene_id] = record
print(len(missing_seqs))

9606_0:003cf2
9606_0:0028a4
9606_0:0033f4
9606_0:000740
9606_0:0004ef
9606_0:003e1f
9606_0:001768
9606_0:002a1e
9606_0:004b26
9606_0:0001b0
9606_0:0033d0
9606_0:0002a6
9606_0:0025d5
9606_0:000bf2
9606_0:003869
9606_0:000ae0
9606_0:000745
9606_0:003f25
9606_0:004bad
9606_0:001afd
9606_0:0042cc
9606_0:0033a0
9606_0:00420f
9606_0:000fba
9606_0:001374
9606_0:0019ba
9606_0:0004bf
9606_0:00320c
9606_0:003cc8
9606_0:003e84
9606_0:001081
9606_0:004274
9606_0:003623
9606_0:001b4c
9606_0:004ac7
9606_0:00044f
9606_0:000f98
9606_0:0050ae
9606_0:003e49
9606_0:003424
9606_0:003a5c
9606_0:0044a6
9606_0:00332d
9606_0:0040f2
9606_0:003d7e
9606_0:003df0
9606_0:001330
9606_0:0033c6
9606_0:001c9d
9606_0:00053d
9606_0:000339
9606_0:00174a
9606_0:00069c
9606_0:0017da
9606_0:003fc0
9606_0:00460b
9606_0:0029e5
9606_0:001a54
9606_0:003e8a
9606_0:0040f9
9606_0:004b3e
9606_0:003354
9606_0:00491f
9606_0:002896
9606_0:0037bb
9606_0:004277
9606_0:001a8b
9606_0:001c64
9606_0:004a69
9606_0:00508d
9606_0:002994
9606_0

## key error
- The KeyError occurs because the FASTA file I have only contains sequences that belong to an ortholog group. A sequence that is not in any ortholog group is absent from the FASTA file, so looking it up fails. I'm not interested in those sequences anyway, so the KeyError can be ignored.
- odbid=9606_0:001081 is an example of a sequence that is not in an ortholog group

In [7]:
# Demonstrate the lookup failure for a gene ID that is in the species gene list
# but has no sequence record. The exception is caught and shown rather than left
# to propagate, so "Restart Kernel -> Run All" continues past this cell.
try:
    example_record = ODBDATABASE.data_all_seqrecords_dict['9606_0:001081']
except KeyError as missing_key:
    print("KeyError:", missing_key)
else:
    # Only reached if the record exists after all; show it as the cell's result.
    print(example_record)

KeyError: 

# write all human proteins from orthoDB to file

In [8]:
# Write every collected human record to a single FASTA file in OUTPUT_DIR.
all_human_fasta_path = OUTPUT_DIR / 'all_human_proteins_in_odb.fasta'
with all_human_fasta_path.open('w') as fasta_handle:
    SeqIO.write(list(all_human_odb_seqs_dict.values()), fasta_handle, 'fasta')

# issues
I've noticed some issues with this "proteome": 
There is an issue with ID `9606_0:004fc3`

/home/jch/Documents/SLiM_bioinformatics_workspace/data/orthoDB_ortholog_group_runs/2023-07-19-whole_proteome/orthoDB_analysis_multiprocessed_minfrac-0.75/msa_by_organism/9606_0:004fc3

>Homo_sapiens|9606_0:004d85
MAAAAAGLGGGGAGPGPEAGDFLARYRLVSNKLKKRFLRKPNVAEAGEQFGQLGRELRAQ
ECLPYAAWCQLAVARCQQALFHGPGEALALTEAARLFLRQERDARQRLVCPAAYGEPLQA
AASALGAAVRLHLELGQPAAAAALCLELAAALRDLGQPAAAAGHFQRAAQLQLPQLPLAA
LQALGEAASCQLLARDYTGALAVFTRMQRLAREHGSHPVQSLPPPPPPAPQPGPGATPAL
PAALLPPNSGSAAPSPAALGAFSDVLVRCEVSRVLLLLLLQPPPAKLLPEHAQTLEKYSW
EAFDSHGQESSGQLPEELFLLLQSLVMATHEKDTEAIKSLQVEMWPLLTAEQNHLLHLVL
QETISPSGQGV
>Homo_sapiens|9606_0:004f34
MAAAAAGLGGGGAGPGPEAGDFLARYRLVSNKLKKRFLRKPNVAEAGEQFGQLGRELRAQ
ECLPYAAWCQLAVARCQQALFHGPGEALALTEAARLFLRQERDARQRLVCPAAYGEPLQA
AASALGAAVRLHLELGQPAAAAALCLELAAALRDLGQPAAAAGHFQRAAQLQLPQLPLAA
LQALGEAASCQLLARDYTGALAVFTRMQRLAREHGSHPVQSLPPPPPPAPQPGPGATPAL
PAALLPPNSGSAAPSPAALGAFSDVLVRCEVSRVLLLLLLQPPPAKLLPEHAQTLEKYSW
EAFDSHGQESSGQLPEELFLLLQSLVMATHEKDTEAIKSLQVEMWPLLTAEQNHLLHLVL
QETISPSGQGV
>Homo_sapiens|9606_0:004fc3
MAAAAAGLGGGGAGPGPEAGDFLARYRLVSNKLKKRFLRKPNVAEAGEQFGQLGRELRAQ
ECLPYAAWCQLAVARCQQALFHGPGEALALTEAARLFLRQERDARQRLVCPAAYGEPLQA
AASALGAAVRLHLELGQPAAAAALCLELAAALRDLGQPAAAAGHFQRAAQLQLPQLPLAA
LQALGEAASCQLLARDYTGALAVFTRMQRLAREHGSHPVQSLPPPPPPAPQPGPGATPAL
PAALLPPNSGSAAPSPAALGAFSDVLVRCEVSRVLLLLLLQPPPAKLLPEHAQTLEKYSW
EAFDSHGQESSGQLPEELFLLLQSLVMATHEKDTEAIKSLQVEMWPLLTAEQNHLLHLVL
QETISPSGQGV


- these 3 sequences are identical.
---

I think that there are also isoforms in orthoDB even though those are supposed to be removed.
example
```
>9544_0:003203 {"pub_og_id":"4636942at2759","og_name":"xin actin binding repeat containing 2 ","level_taxid":2759,"organism_taxid":"9544_0","organism_name":"Macaca mulatta","pub_gene_id":"XIRP2","description":"XIRP2"}
MFPMQKGSLNLLWQKWESCDYQRSECYPRDSRCTILQPQESKLLEPEEEVVSAPEPLDPTSLPCSGELTLSSKPEGKDSVDKSNTTREYGRPEVLKEDSLSSRRRIERFSIALDELRSVFEAPKSGNKSAEHGGKEVEIERSLCSPAFKSHPGSQLEDSVKDSDEKGEETSCDKMSPESGHSHIFEAIAGPNKPESGFAEDSAALGEVVSDLHEVVSLKERMARYQAAVSRGDCRSFSANMMEESEMCTVPGGLAKVKKQFEDEITSSRNTFAQYQYQHQNRSEQEAIHSSQVGTSKSSQELARNEQEGSKVQKIDVHGTEMVSHLEKHTKEINQASQFHQYVQETVIDTPEDEEIPKVSTKLLKEQFEKSAQEKILYSDKEMTTPAKQIKIESECEETLKPSSVVSTSSTSCISTSQRKETSTTRYSDHSVTSSTLAQINATSSGMTEEFPPPPPDVLQTSVDATAFSQSPELPSPPRRLPVPKDVYSKQRNLYELNRLYKHIHPELRKNLEKDYISEVSEIVSSQMNSGSSVSADVQQARYVFENTNDSSQKDLNSEREYLEWDEILKGEVQSIRWIFENQPLDSINNGSPDEGDISRGVADQEIIAGGDVKYTTWMFETQPIDTLGAHSSDTVENAEKIPELARGDVCTARWMFETRPLDSMNKMHQSQEESAVTISKDITGGNVKTVRYMFETQHLDQLGQLHSVDEVHLLQLRSELKEIKGNVKRSIKCFETQPLYVIRDGSGQMLEIKTVHREDVEKGDVRTARWMFETQPLDTINKDITEIKIVRGISMEENVKGGVSKAKWLFETQPLEKIKESEEVIIEKETIIGTDVSRKCWMFETQPLDILKEVPDADPLQHEEIIGGDVQTTKHLFETLPIEALKDSPDIGKLQKITASEEEKGDVRHQKWIFETQPLEDIRKDKKEYTRTVKLEEVDRGDVKNYTHIFESNNLIKFDASHKIEVEGVTRGAVELNKSLFETTPLYAIQDPLGKYHQVKTVQQEEIIRGDVRSCRWLFETRPIDQFDESIHKFQIIRGISAQEIQTGNVKSAKWLFETQPLDSIKYFSDVEETESKTEQARDIIKGDVKTCKWLFETQPMESLYEKVSLMTSSEEIHKGDVKTCTWLFETQPLDAIKDDSETKVKLQTVKQEEIQGGDVRTTCFLFETENLDSIQGEEVKEIKPVEMDIQAGDVSSMRYKFENQSLDSISSSSEEVLKKIKTLKTEDIQKGNVLNCRWLFENQPIDKIKESQEGDECVKTVTDIQGGDVRKGCFIFETFSLDEIKEESDYISTKKTITEEVMQGDVKSYRMLFETQPLYAIQDREGSYHEVTTVKKEEVIHGDVRGTRWLFETKPLDSINKSETVYVIKAVTQEDIQKGDVSSVRYRFETQPLDQISEESHNIVPTVDHIQGGNVKTSKQFFESENFDKNNYIRTVSVNEIQKGNVKTSTWLFETHTIDELRGEGLEYENIKTVTQEDVQKGDVKQAVWLFENQTFDSIMEAHKGVTKVTKEEIPPSDVKTTTWLFETTPLHEFNENRIEKIEIIGKSIKKTLEDLYSQKVIQAPGIIIEADEVGDVRMAKYKLMNQASPEIQKEEIIRADLRNIMVNLLSKRDYTKREILVSEEEKGNVNLTKTQLLNKSTEFHAEKEEIVKGDVQHAIKNLFSEERSVKKGILIQEDERGDINMTIYCLLHENDGDTIEREEVIGGDVRRTIHNLLSSTSNNKISERAKFDASERGNVQFFTTCIEAGALDYLKQLHTESNETLTAKKQEGEKEIIGGDVEGTKLLLKKRQSLVERTVSETDIIPGDVHNTVKVFMTEPQSTLGKIPKEEIIKGDLASTLNSLSQAVNQKTVTKTEEIIKGDMLATLKSLKESTRRWKESKQPDAIPGDIEKAIECLEKATNTRTEILKKELLKDDLETSLRSLKEAQRSFKEVDKEGVIKKDAHVVMAGSSGEQKTDIHQVAVQRNKNSLLQPKSGPFEPAAEWQGGADTLSQTM
GKSCHGNLVEERTEVNLPKAPKGSVKIVIDREQNNDALEKSLRRLSNSHHKSIKNVLESGDKMGVWTDITGEQHLRDEYMSRQLTSTVSVKNNLKTKESDREVRELKKDDDFNSVQSADKTVGKQQTYERRNDHQKTEAFHIKSPKKTENIKILTDTQNSKPSPTQHPVSMPVGGTYKISGDFQKQTLLKQETKYSNKNIKQKNINFQPMWQPLPVEQDTTSVTEVKVSEKNHNTFKTTNKKQETDVHLKSQDFLMKTNTSTDLKTAMERSLNPINFNPENNVKESECPLPPPSPPPPPPSNASSEIEFPLPPPPPLMMFPEKNGFLPSLSTEKIKAEFESFPGLPLPPPSVDEKSERESPSMFLPPPPPPTPSQNPAHLLSSSAPEKHSGAFMQQYSQKEASNSQNSQAKIITGKSGVLPPPTLPKPKLPKHIKDNKNHFSPKVELTNSLSDMECKITTSKDQKKVMMMTSSEHIETKQNVISKSLDERKQLFVDSANCLSHTVPGTSAPKKKQIAPLIKSHSFPESSGQQSPKPYMRKFKTPLMIAEEKYRQQKEELEKQKQESSYYNIVKTESQNQHISEVEKEMPLRKTNEEVSVSGIDSERTVVQPNPGSQSNARVLGVCSDNQLSTTSPVTVTAKRLHHVLAASEDKDKMKKEVLQSARDIMQSKSACEIKQSHQECSTQQTQQNQYLEQLHLPQSKPISPNFKVKTIKLPTLDHTLNETDHSYESHKQQSEVDVQTFAKQQYLETKKTEASTECSHKQSLAERHYQLPKKEKRVTIKLPTESIQKNHEDKLQIVPGKQEEFAGSDRGKLPGSEEKNKGPSMISRKEERLITERKQEVLKNKSAPKVVKQKVIDAHLDSQTQNFQQTQIQTSESKAEHKKWPQPYNSLQEEKCLQVKGIQQKQVFSNTKDSKQEITQNKSFFSAVKESQQDDGKCAVNIVEFLRKREELQQILSRVKQFEAEPNKSGLKTFQTLLNTIPGWLISEEKREYAVHIAMENNLEKVKEEITHIKTQAEDMLVSYENIIQTAMMSSKTGKSGNKPTSLDETSSKVSNVHVSNNKNSEQKENKIAKEKTGQHQVAAHREAAVHSHVKTHQEIKLDESNIPPPSLKTRPPSPTFITIESTVRRTETPTKDELSQSPKKDSYVEPPPRRPKSQTSEIHRANTSPSPPRSRSEQLVRLKDTTAKLSKGAIPCPAVTPVPVVEKRSEIIMSPATLRRQIKIETRGRDSPPTITIPVNIHHTPSGSSRESTEAQEEIRKVEKRATYVHNAGLNSTDPIVPDTESYDAVEIIRKVEVPPRLSEHTQRYEAANRTVQMAENFVDDRENEINRWFREFEHGPVSEAKSNRRVYANGETNHNIQQESHTFCQEEFGLTSLGNTSFTDFSCTHPRELQEKIPVKQPRICSETRSLSEHFSGMDAFESQIVESKMKTSSSHSSEAGKSGCDFKHAPPTYEDVIAGHILDISDSPKEVRKNFQNTWQESGRVFKSLGYATSDSSATEMRTAFQEESAFISETAAPRQGNMYTLSKDSLSNGVPSGRQAEFS

>9544_0:0032c6 {"pub_og_id":"4636942at2759","og_name":"xin actin binding repeat containing 2 ","level_taxid":2759,"organism_taxid":"9544_0","organism_name":"Macaca mulatta","pub_gene_id":"XIRP2","description":"xin actin-binding repeat-containing protein 2 isoform X2"}
MARYQAAVSRGDCRSFSANMMEESEMCTVPGGLAKVKKQFEDEITSSRNTFAQYQYQHQNRSEQEAIHSSQVGTSKSSQELARNEQEGSKVQKIDVHGTEMVSHLEKHTKEINQASQFHQYVQETVIDTPEDEEIPKVSTKLLKEQFEKSAQEKILYSDKEMTTPAKQIKIESECEETLKPSSVVSTSSTSCISTSQRKETSTTRYSDHSVTSSTLAQINATSSGMTEEFPPPPPDVLQTSVDATAFSQSPELPSPPRRLPVPKDVYSKQRNLYELNRLYKHIHPELRKNLEKDYISEVSEIVSSQMNSGSSVSADVQQARYVFENTNDSSQKDLNSEREYLEWDEILKGEVQSIRWIFENQPLDSINNGSPDEGDISRGVADQEIIAGGDVKYTTWMFETQPIDTLGAHSSDTVENAEKIPELARGDVCTARWMFETRPLDSMNKMHQSQEESAVTISKDITGGNVKTVRYMFETQHLDQLGQLHSVDEVHLLQLRSELKEIKGNVKRSIKCFETQPLYVIRDGSGQMLEIKTVHREDVEKGDVRTARWMFETQPLDTINKDITEIKIVRGISMEENVKGGVSKAKWLFETQPLEKIKESEEVIIEKETIIGTDVSRKCWMFETQPLDILKEVPDADPLQHEEIIGGDVQTTKHLFETLPIEALKDSPDIGKLQKITASEEEKGDVRHQKWIFETQPLEDIRKDKKEYTRTVKLEEVDRGDVKNYTHIFESNNLIKFDASHKIEVEGVTRGAVELNKSLFETTPLYAIQDPLGKYHQVKTVQQEEIIRGDVRSCRWLFETRPIDQFDESIHKFQIIRGISAQEIQTGNVKSAKWLFETQPLDSIKYFSDVEETESKTEQARDIIKGDVKTCKWLFETQPMESLYEKVSLMTSSEEIHKGDVKTCTWLFETQPLDAIKDDSETKVKLQTVKQEEIQGGDVRTTCFLFETENLDSIQGEEVKEIKPVEMDIQAGDVSSMRYKFENQSLDSISSSSEEVLKKIKTLKTEDIQKGNVLNCRWLFENQPIDKIKESQEGDECVKTVTDIQGGDVRKGCFIFETFSLDEIKEESDYISTKKTITEEVMQGDVKSYRMLFETQPLYAIQDREGSYHEVTTVKKEEVIHGDVRGTRWLFETKPLDSINKSETVYVIKAVTQEDIQKGDVSSVRYRFETQPLDQISEESHNIVPTVDHIQGGNVKTSKQFFESENFDKNNYIRTVSVNEIQKGNVKTSTWLFETHTIDELRGEGLEYENIKTVTQEDVQKGDVKQAVWLFENQTFDSIMEAHKGVTKVTKEEIPPSDVKTTTWLFETTPLHEFNENRIEKIEIIGKSIKKTLEDLYSQKVIQAPGIIIEADEVGDVRMAKYKLMNQASPEIQKEEIIRADLRNIMVNLLSKRDYTKREILVSEEEKGNVNLTKTQLLNKSTEFHAEKEEIVKGDVQHAIKNLFSEERSVKKGILIQEDERGDINMTIYCLLHENDGDTIEREEVIGGDVRRTIHNLLSSTSNNKISERAKFDASERGNVQFFTTCIEAGALDYLKQLHTESNETLTAKKQEGEKEIIGGDVEGTKLLLKKRQSLVERTVSETDIIPGDVHNTVKVFMTEPQSTLGKIPKEEIIKGDLASTLNSLSQAVNQKTVTKTEEIIKGDMLATLKSLKESTRRWKESKQPDAIPGDIEKAIECLEKATNTRTEILKKELLKDDLETSLRSLKEAQRSFKEVDKEGVIKKDAHVVMAGSSGEQKTDIHQVAVQRNKNSLLQPKSGPFEPAAEWQGGADTLSQTMGKSCHGNLVEERTEVNLPKAPKGSVKIVIDREQNNDALEKSLRRLSNSHHKSIKNVLESGDKMGVWTDITGEQHLRDEYMSRQLTSTVSVKNNLKTKESDREVRELKKDDDFNSVQSADKTVGKQQTYERRNDHQKTEAFHIKSPKKTENIKILTDTQNSKPSPTQHPVSMPVGGTYKISGDFQKQTLLKQETKYSNKNIKQKNINFQPMWQPLPVEQDTT
SVTEVKVSEKNHNTFKTTNKKQETDVHLKSQDFLMKTNTSTDLKTAMERSLNPINFNPENNVKESECPLPPPSPPPPPPSNASSEIEFPLPPPPPLMMFPEKNGFLPSLSTEKIKAEFESFPGLPLPPPSVDEKSERESPSMFLPPPPPPTPSQNPAHLLSSSAPEKHSGAFMQQYSQKEASNSQNSQAKIITGKSGVLPPPTLPKPKLPKHIKDNKNHFSPKVELTNSLSDMECKITTSKDQKKVMMMTSSEHIETKQNVISKSLDERKQLFVDSANCLSHTVPGTSAPKKKQIAPLIKSHSFPESSGQQSPKPYMRKFKTPLMIAEEKYRQQKEELEKQKQESSYYNIVKTESQNQHISEVEKEMPLRKTNEEVSVSGIDSERTVVQPNPGSQSNARVLGVCSDNQLSTTSPVTVTAKRLHHVLAASEDKDKMKKEVLQSARDIMQSKSACEIKQSHQECSTQQTQQNQYLEQLHLPQSKPISPNFKVKTIKLPTLDHTLNETDHSYESHKQQSEVDVQTFAKQQYLETKKTEASTECSHKQSLAERHYQLPKKEKRVTIKLPTESIQKNHEDKLQIVPGKQEEFAGSDRGKLPGSEEKNKGPSMISRKEERLITERKQEVLKNKSAPKVVKQKVIDAHLDSQTQNFQQTQIQTSESKAEHKKWPQPYNSLQEEKCLQVKGIQQKQVFSNTKDSKQEITQNKSFFSAVKESQQDDGKCAVNIVEFLRKREELQQILSRVKQFEAEPNKSGLKTFQTLLNTIPGWLISEEKREYAVHIAMENNLEKVKEEITHIKTQAEDMLVSYENIIQTAMMSSKTGKSGNKPTSLDETSSKVSNVHVSNNKNSEQKENKIAKEKTGQHQVAAHREAAVHSHVKTHQEIKLDESNIPPPSLKTRPPSPTFITIESTVRRTETPTKDELSQSPKKDSYVEPPPRRPKSQTSEIHRANTSPSPPRSRSEQLVRLKDTTAKLSKGAIPCPAVTPVPVVEKRSEIIMSPATLRRQIKIETRGRDSPPTITIPVNIHHTPSGSSRESTEAQEEIRKVEKRATYVHNAGLNSTDPIVPDTESYDAVEIIRKVEVPPRLSEHTQRYEAANRTVQMAENFVDDRENEINRWFREFEHGPVSEAKSNRRVYANGETNHNIQQESHTFCQEEFGLTSLGNTSFTDFSCTHPRELQEKIPVKQPRICSETRSLSEHFSGMDAFESQIVESKMKTSSSHSSEAGKSGCDFKHAPPTYEDVIAGHILDISDSPKEVRKNFQNTWQESGRVFKSLGYATSDSSATEMRTAFQEESAFISETAAPRQGNMYTLSKDSLSNGVPSGRQAEFS

>9606_0:000a46 {"pub_og_id":"4636942at2759","og_name":"xin actin binding repeat containing 2 ","level_taxid":2759,"organism_taxid":"9606_0","organism_name":"Homo sapiens","pub_gene_id":"XIRP2","description":"xin actin-binding repeat-containing protein 2 isoform 4"}
MARYQAAVSRGDCRSFSANMMEESEMCAVPGGLAKVKKQFEDEITSSRNTFAQYQYQHQNRSEQEAIHSSQVGTSRSSQEMARNEQEGSKVQKIDVHGTEMVSHLEKHTEEVNQASQFHQYVQETVIDTPEDEEIPKVSTKLLKEQFEKSAQEKILYSDKEMTTPAKQIKTESEYEETFKPSSVVSTSSTSCVSTSQRKETSTTRYSDHSVTSSTLAQINATSSGMTEEFPPPPPDVLQTSVDVTAFSQSPELPSPPRRLPVPKDVYSKQRNLYELNRLYKHIHPELRKNLEKDYISEVSEIVSSQMNSGSSVSADVQQARYVFENTNDSSQKDLNSEREYLEWDEILKGEVQSIRWIFENQPLDSINNGSPDEGDISRGIADQEIIAGGDVKYTTWMFETQPIDTLGAYSSDTVENAEKIPELARGDVCTARWMFETRPLDSMNKMHQSQEESAVTISKDITGGDVKTVRYMFETQHLDQLGQLHSVDEVHLLQLRSELKEIKGNVKRSIKCFETQPLYVIRDGSGQMLEIKTVHREDVEKGDVRTARWMFETQPLDTINKDITEIKVVRGISMEENVKGGVSKAKWLFETQPLEKIKESEEVIIEKEKIIGTDVSRKCWMFETQPLDILKEVPDADSLQREEIIGGDVQTTKHLFETLPIEALKDSPDIGKLQKITASEEEKGDVRHQKWIFETQPLEDIRKDKKEYTRTVKLEEVDRGDVKNYTHIFESNNLIKFDASHKIEVEGVTRGAVELNKSLFETTPLYAIQDPLGKYHQVKTVQQEEIVRGDVRSCRWLFETRPIDQFDESIHKFQIIRGISAQEIQTGNVKSAKWLFETQPLDSIKYFSDVEETESKTEQTRDIVKGDVKTCKWLFETQPMESLYEKVSLMTSSEEIHKGDVKTCTWLFETQPLDTIKDDSETAVKLQTVKQEEIQGGDVRTACFLFETENLDSIQGEEVKEIKPVEMDIQAGDVSSMRYKFENQSLDSISSSSEEVLKKIKTLKTEDIQKGNVLNCRWLFENQPIDKIKESQEGDECVKTVTDIQGGDVRKGCFIFETFSLDEIKEESDYISTKKTITEEVIQGDVKSYRMLFETQPLYAIQDREGSYHEVTTVKKEEVIHGDVRGTRWLFETKPLDSINKSETVYVIKSVTQEDIQKGDVSSVRYRFETQPLDQISEESHNIMPSIDHIQGGNVKTSRQFFESENFDKNNYIRTVSVNEIQKGNVKTSTWLFETHTMDELRGEGLEYENIKTVTQEDVQKGDVKQAVWLFENRTFDSIMEAHKGITKMTKEEIPPSDVKTTTWLFETTPLHEFNETRVEKIEIIGKSIKETLEDLYSQKVIQAPGIIIEADEIGDVRMAKYKLMNQASPEIQKEEIIRADLRNIMVNLLSKRDCTEREILISEEEKGNVNLTKTQLLNRSTEFHAEKEEIVKGDVQQAIKNLFSEERSVKKGILIQEDEKGDINMTIYCLLHENDGDTIEREEVIGGDVKRTIHNLLSSTSNNKISERAKIDASERGNVQFFTTCIEAGALDYLKQLHTESNETLTAKKQEGEKEIIGGDVEGTKLLLKKRQSLVERTVSETDIIPGDVHNTVKVFMTEPQSTFGKIPKEEIIKGDLTSTLNSLSQAVNQKTVTKTEEIIKGNMLATLKSLKESSHRWKESKQPDAIPGDIEKAIECLEKATNTKTEILKKELLKDDLETSLRSLKEAQRSFKEVHKEGVIKKDAKAVMAGSSGEQKTDIHQVAVQRNKNSLLQPKPGPFEPAAKWQGGADTLSQTMGKSCHGNLVEERTEVNLPKAPKGTVKIVIDREQNNDALEKSLRRLSNSHHKSNVLESGDKTGVWTDTTGEQHLRDEYMSRQLTSTVSVKNNLTTKESDRAVRELKKDDVFNSIQSAGKTVGKQQTYELRNDHQKMEGFHIKSPKKTKNIKILTDTQSSKPSPTQHPVSMPVGGTYDLSGDFQKQTLLKQETKYSNKDIKKKNINLQPMWQLLPVEQDTSNV
TEMKVSEKSHNTFKATNKKRETDVHLKSQDFLMKTNTSTGLKMAMERSLNPINFNPENNVKESECPLPPPSPPPPPPSNASSEIEFPLPPPPPLMMFPEKNGFLPSLSTEKIKAEFESFPGLPLPPPPVDEKSERESSSMFLPPPPPPTPSQKPAHLLSSSAPEKHSGDFMQQYSQKEASNSQNSQAKIITGKTGVLPPPTLPKPKLPKHIKDNKNDFSPKVELATSLSDMECKITTSKDQKKVMVMTSSEHTETKQNVISKSLDERKQLSIDSANCLSHTVPGTSAPRKKQIAPLIKSHSFPESSGQQNPKPYMRKFKTPLMIAEEKYRQQKEEIEKQKQESSYYNIVKTQSQNQHITEVEKEMPLQKTNEEVSLSGIDSECTVVQPSPGSQSNARILGVCSDNQLSTTSPETVAAKRLHHVLAASEDKDKMKKEVLQSSRDIMQSKSACEIKQSHQECSTQQTQQKKYLEQLHLPQSKPISPNFKVKTIKLPTLDHTLNETDHSYESHKQQSEIDVQTFTKKQYLKTKKTEASTECSHKQSLAERHYQLPKKEKRVTVQLPTESIQKNQEDKLKMVPRKQREFSGSDRGKLPGSEEKNQGPSMIGRKEERLITERKHEHLKNKSAPKVVKQKVIDAHLDSQTQNFQQTQIQTAESKAEHKKLPQPYNSLQEEKCLEVKGIQEKQVFSNTKDSKQEITQNKSFFSSVKESQRDDGKGALNIVEFLRKREELQQILSRVKQFEAEPNKSGLKTFQTLLNTIPGWLISEDKREYAVHIAMENNLEKVKEEITHIKTQAEDMLVSYENIIQTAMMSSKTGKPGNKPTSLDETSSKVSNVHVSNNKNSEQKENKIAKEKTVQHQVAAHHEATVRSHVKTHQEIKLDDSNIPPPSLKTRPPSPTFITIESTARRTENPTKNELSQSPKKDSYVEPPPRRPMSQKSEIHRANTSPSPPRSRSEQLVRLKDTTAKLSKGAIPCPAATPVPIVEKRSEIIMSPATLRRQIKIETRGRDSPPTITIPVNINHAASGSFRESVDAQEEIRKVEKRATYVHKDGLNSTDHMVPDTESYDAVEIIRKVAVPPRLSEHTQRYEAANRTVQMAENFVNDPENEINRWFREFEHGPVSEAKSNRRVYAKGETNHNIQQESRTFCKEEFGLTSLGNTSFTDFSCKHPRELREKIPVKQPRICSETRSLSEHFSGMDAFESQIVESKMKTSSSHSSEAGKSGCDFKHAPPTYEDVIAGHILDISDSPKEVRKNFQKTWQESGRVFKGLGYATADASATEMRTTFQEESAFISEAAAPRQGNMYTLSKDSLSNGVPSGRQAEFS

>9606_0:000b0b {"pub_og_id":"4636942at2759","og_name":"xin actin binding repeat containing 2 ","level_taxid":2759,"organism_taxid":"9606_0","organism_name":"Homo sapiens","pub_gene_id":"XIRP2","description":"Xin actin-binding repeat-containing protein 2"}
MFPMQKGSLNLLRQKWESCDYQRSECHPRDSHCTIFQPQESKLLAPEGEVVSAPQSLDPTSLPYSTGEEMWSSKPEEKDSVDKSNNTREYGRPEVLKEDSLSSRRRIERFSIALDELRSVFEAPKSGNKPAEYGGKEVEIERSLCSPAFKSHPGSQLEDSVKDSDKKGKETSFDKMSPESGHSRIFEATAGPNKPESGFAEDSAARGEGVSDLHEVVSLKERMARYQAAVSRGDCRSFSANMMEESEMCAVPGGLAKVKKQFEDEITSSRNTFAQYQYQHQNRSEQEAIHSSQVGTSRSSQEMARNEQEGSKVQKIDVHGTEMVSHLEKHTEEVNQASQFHQYVQETVIDTPEDEEIPKVSTKLLKEQFEKSAQEKILYSDKEMTTPAKQIKTESEYEETFKPSSVVSTSSTSCVSTSQRKETSTTRYSDHSVTSSTLAQINATSSGMTEEFPPPPPDVLQTSVDVTAFSQSPELPSPPRRLPVPKDVYSKQRNLYELNRLYKHIHPELRKNLEKDYISEVSEIVSSQMNSGSSVSADVQQARYVFENTNDSSQKDLNSEREYLEWDEILKGEVQSIRWIFENQPLDSINNGSPDEGDISRGIADQEIIAGGDVKYTTWMFETQPIDTLGAYSSDTVENAEKIPELARGDVCTARWMFETRPLDSMNKMHQSQEESAVTISKDITGGDVKTVRYMFETQHLDQLGQLHSVDEVHLLQLRSELKEIKGNVKRSIKCFETQPLYVIRDGSGQMLEIKTVHREDVEKGDVRTARWMFETQPLDTINKDITEIKVVRGISMEENVKGGVSKAKWLFETQPLEKIKESEEVIIEKEKIIGTDVSRKCWMFETQPLDILKEVPDADSLQREEIIGGDVQTTKHLFETLPIEALKDSPDIGKLQKITASEEEKGDVRHQKWIFETQPLEDIRKDKKEYTRTVKLEEVDRGDVKNYTHIFESNNLIKFDASHKIEVEGVTRGAVELNKSLFETTPLYAIQDPLGKYHQVKTVQQEEIVRGDVRSCRWLFETRPIDQFDESIHKFQIIRGISAQEIQTGNVKSAKWLFETQPLDSIKYFSDVEETESKTEQTRDIVKGDVKTCKWLFETQPMESLYEKVSLMTSSEEIHKGDVKTCTWLFETQPLDTIKDDSETAVKLQTVKQEEIQGGDVRTACFLFETENLDSIQGEEVKEIKPVEMDIQAGDVSSMRYKFENQSLDSISSSSEEVLKKIKTLKTEDIQKGNVLNCRWLFENQPIDKIKESQEGDECVKTVTDIQGGDVRKGCFIFETFSLDEIKEESDYISTKKTITEEVIQGDVKSYRMLFETQPLYAIQDREGSYHEVTTVKKEEVIHGDVRGTRWLFETKPLDSINKSETVYVIKSVTQEDIQKGDVSSVRYRFETQPLDQISEESHNIMPSIDHIQGGNVKTSRQFFESENFDKNNYIRTVSVNEIQKGNVKTSTWLFETHTMDELRGEGLEYENIKTVTQEDVQKGDVKQAVWLFENRTFDSIMEAHKGITKMTKEEIPPSDVKTTTWLFETTPLHEFNETRVEKIEIIGKSIKETLEDLYSQKVIQAPGIIIEADEIGDVRMAKYKLMNQASPEIQKEEIIRADLRNIMVNLLSKRDCTEREILISEEEKGNVNLTKTQLLNRSTEFHAEKEEIVKGDVQQAIKNLFSEERSVKKGILIQEDEKGDINMTIYCLLHENDGDTIEREEVIGGDVKRTIHNLLSSTSNNKISERAKIDASERGNVQFFTTCIEAGALDYLKQLHTESNETLTAKKQEGEKEIIGGDVEGTKLLLKKRQSLVERTVSETDIIPGDVHNTVKVFMTEPQSTFGKIPKEEIIKGDLTSTLNSLSQAVNQKTVTKTEEIIKGNMLATLKSLKESSHRWKESKQPDAIPGDIEKAIECLEKATNTKTEILKKELLKDDLETSLRSLKEAQRSFKEVHKEGVIKKDAKAVMAGSSGEQKTDIHQVAVQRNKNSLLQPKPGPFEPAAKWQGGADTLSQT
MGKSCHGNLVEERTEVNLPKAPKGTVKIVIDREQNNDALEKSLRRLSNSHHKSNVLESGDKTGVWTDTTGEQHLRDEYMSRQLTSTVSVKNNLTTKESDRAVRELKKDDVFNSIQSAGKTVGKQQTYELRNDHQKMEGFHIKSPKKTKNIKILTDTQSSKPSPTQHPVSMPVGGTYDLSGDFQKQTLLKQETKYSNKDIKKKNINLQPMWQLLPVEQDTSNVTEMKVSEKSHNTFKATNKKRETDVHLKSQDFLMKTNTSTGLKMAMERSLNPINFNPENNVKESECPLPPPSPPPPPPSNASSEIEFPLPPPPPLMMFPEKNGFLPSLSTEKIKAEFESFPGLPLPPPPVDEKSERESSSMFLPPPPPPTPSQKPAHLLSSSAPEKHSGDFMQQYSQKEASNSQNSQAKIITGKTGVLPPPTLPKPKLPKHIKDNKNDFSPKVELATSLSDMECKITTSKDQKKVMVMTSSEHTETKQNVISKSLDERKQLSIDSANCLSHTVPGTSAPRKKQIAPLIKSHSFPESSGQQNPKPYMRKFKTPLMIAEEKYRQQKEEIEKQKQESSYYNIVKTQSQNQHITEVEKEMPLQKTNEEVSLSGIDSECTVVQPSPGSQSNARILGVCSDNQLSTTSPETVAAKRLHHVLAASEDKDKMKKEVLQSSRDIMQSKSACEIKQSHQECSTQQTQQKKYLEQLHLPQSKPISPNFKVKTIKLPTLDHTLNETDHSYESHKQQSEIDVQTFTKKQYLKTKKTEASTECSHKQSLAERHYQLPKKEKRVTVQLPTESIQKNQEDKLKMVPRKQREFSGSDRGKLPGSEEKNQGPSMIGRKEERLITERKHEHLKNKSAPKVVKQKVIDAHLDSQTQNFQQTQIQTAESKAEHKKLPQPYNSLQEEKCLEVKGIQEKQVFSNTKDSKQEITQNKSFFSSVKESQRDDGKGALNIVEFLRKREELQQILSRVKQFEAEPNKSGLKTFQTLLNTIPGWLISEDKREYAVHIAMENNLEKVKEEITHIKTQAEDMLVSYENIIQTAMMSSKTGKPGNKPTSLDETSSKVSNVHVSNNKNSEQKENKIAKEKTVQHQVAAHHEATVRSHVKTHQEIKLDDSNIPPPSLKTRPPSPTFITIESTARRTENPTKNELSQSPKKDSYVEPPPRRPMSQKSEIHRANTSPSPPRSRSEQLVRLKDTTAKLSKGAIPCPAATPVPIVEKRSEIIMSPATLRRQIKIETRGRDSPPTITIPVNINHAASGSFRESVDAQEEIRKVEKRATYVHKDGLNSTDHMVPDTESYDAVEIIRKVAVPPRLSEHTQRYEAANRTVQMAENFVNDPENEINRWFREFEHGPVSEAKSNRRVYAKGETNHNIQQESRTFCKEEFGLTSLGNTSFTDFSCKHPRELREKIPVKQPRICSETRSLSEHFSGMDAFESQIVESKMKTSSSHSSEAGKSGCDFKHAPPTYEDVIAGHILDISDSPKEVRKNFQKTWQESGRVFKGLGYATADASATEMRTTFQEESAFISEAAAPRQGNMYTLSKDSLSNGVPSGRQAEFS
```

also check out 
B4DDR5	9606_0:001d74
Q96HA1	9606_0:001e36


# Try removing extremely similar sequences - cd-hit

c parameter:
>   -c	sequence identity threshold, default 0.9
 	this is the default cd-hit's "global sequence identity" calculated as:
 	number of identical amino acids or bases in alignment
 	divided by the full length of the shorter sequence

In [9]:
# Cluster the human records with CD-HIT at identity threshold 1.0 (`-c 1.0`).
# Per the CD-HIT note above, identity is measured over the full length of the
# shorter sequence, so sequences of different lengths can share a cluster.
# The wrapper returns three values; the first is unused here. The other two are
# used below: `clustered_seqs_c1` (its .values() are written out as FASTA) and
# `cdhit_dict_c1` (per-cluster entries with 'all_members' and 'representative_seq').
_, clustered_seqs_c1, cdhit_dict_c1 = cli.cd_hit_wrapper(list(all_human_odb_seqs_dict.values()), extra_args='-c 1.0')

Program: CD-HIT, V4.8.1, Aug 07 2022, 07:05:53
Command: /Users/jackson/mambaforge/envs/cd_hit_x86/bin/cd-hit
         -i
         /var/folders/q4/k476_qrd3jvdvzwd6lq30kqc0000gn/T/tmpk_1nubzu
         -o
         /var/folders/q4/k476_qrd3jvdvzwd6lq30kqc0000gn/T/tmpk_1nubzu-cdhit.fa
         -M 0 -d 0 -g 1 -c 1.0

Started: Tue Feb 13 11:48:40 2024
                            Output                              
----------------------------------------------------------------
total seq: 20345
longest and shortest : 35991 and 31
Total letters: 11959655
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 14M
Buffer          : 1 X 23M = 23M
Table           : 1 X 65M = 65M
Miscellaneous   : 0M
Total           : 104M

Table limit with the given memory limit:
Max number of representatives: 4000000
Max number of word counting entries: 444495500

comparing sequences from          0  to      20345
..........    10000  finished       9789  clusters
..........    2

In [11]:
# For every cluster with more than one member, print the successive length
# differences between its members, the member IDs, and the representative ID.
for cluster_key, cluster_info in cdhit_dict_c1.items():
    member_ids = cluster_info['all_members']
    if len(member_ids) <= 1:
        # Singletons carry no redundancy information.
        continue
    member_lengths = np.array([len(all_human_odb_seqs_dict[member_id]) for member_id in member_ids])
    print(np.diff(member_lengths), list(member_ids), cluster_info['representative_seq'])

[-217] ['9606_0:001c1e', '9606_0:001ac3'] 9606_0:001c1e
[3688] ['9606_0:003f13', '9606_0:003d50'] 9606_0:003d50
[-1200] ['9606_0:0012c6', '9606_0:0013ae'] 9606_0:0012c6
[-222] ['9606_0:000b0b', '9606_0:000a46'] 9606_0:000b0b
[2240] ['9606_0:00271a', '9606_0:00293e'] 9606_0:00293e
[-140] ['9606_0:001a92', '9606_0:001b3e'] 9606_0:001a92
[126] ['9606_0:0017d5', '9606_0:00163f'] 9606_0:00163f
[632] ['9606_0:004913', '9606_0:004975'] 9606_0:004975
[833] ['9606_0:001c38', '9606_0:00192c'] 9606_0:00192c
[-1173] ['9606_0:000c19', '9606_0:000d15'] 9606_0:000c19
[-408] ['9606_0:001aeb', '9606_0:001aff'] 9606_0:001aeb
[-502] ['9606_0:002612', '9606_0:0024ec'] 9606_0:002612
[-866] ['9606_0:003372', '9606_0:00336a'] 9606_0:003372
[ 207 1186] ['9606_0:000af4', '9606_0:00093c', '9606_0:000969'] 9606_0:000969
[513] ['9606_0:00015a', '9606_0:000494'] 9606_0:000494
[0] ['9606_0:000bfa', '9606_0:000a08'] 9606_0:000bfa
[1075] ['9606_0:001a0d', '9606_0:001c36'] 9606_0:001c36
[-1473] ['9606_0:001902', '9606

In [13]:
# Sequence lengths of the two members of one example cluster.
for example_id in ('9606_0:003f13', '9606_0:003d50'):
    print(len(all_human_odb_seqs_dict[example_id]))

798
4486


In [14]:
# Align the two members of the example cluster to see how the shorter one
# relates to the longer one.
shorter_record = all_human_odb_seqs_dict['9606_0:003f13']
longer_record = all_human_odb_seqs_dict['9606_0:003d50']
aln = aln_tools.pairwise_alignment(shorter_record, longer_record)
print(aln)

9606_0:00         0 ------------------------------------------------------------
                  0 ------------------------------------------------------------
9606_0:00         0 MRLAEERAALAAENADGEPGADRRLRLLGTYVAMSLRPAAGAWERCAGSAEAEQLLQAFL

9606_0:00         0 ------------------------------------------------------------
                 60 ------------------------------------------------------------
9606_0:00        60 GRDAAEGPRPLLVVRPGPRGLAIRPGLEVGPESGLAGAKALFFLRTGPEPPGPDSFRGAV

9606_0:00         0 ------------------------------------------------------------
                120 ------------------------------------------------------------
9606_0:00       120 VCGDLPAAPLEHLAALFSEVVLPVLANEKNRLNWPHMICEDVRRHAHSLQCDLSVILEQV

9606_0:00         0 ------------------------------------------------------------
                180 ------------------------------------------------------------
9606_0:00       180 KGKTLLPLPAGSEKMEFADSKSETVLDSIDKSVIYAIESAVIKWSYQVQVVLKRESSQPL

9606_0:00         0 ----

In [15]:
# Persist the 100%-identity clustering: the clustered sequences as FASTA and the
# cluster membership dictionary as indented JSON.
clustered_fasta_path = OUTPUT_DIR / 'all_human_proteins_in_odb_clustered_c1.fasta'
clustered_json_path = OUTPUT_DIR / 'all_human_proteins_in_odb_clustered_c1.json'

with clustered_fasta_path.open('w') as fasta_handle:
    SeqIO.write(list(clustered_seqs_c1.values()), fasta_handle, 'fasta')

with clustered_json_path.open('w') as json_handle:
    json.dump(cdhit_dict_c1, json_handle, indent=4)