In [1]:
import os.path
import json
import statistics
from collections import Counter

from orthologue_analysis.orthogroups import init_orthogroup_df
from orthologue_analysis.species import HaemonchusFromTool, SpeciesList
from orthologue_analysis.utils import SequenceIDMapping, orthofinder_paths
from reannotation.analysis import (
    interpro_accessions_frequently_missed_by_each_tool,
    interpro_accessions_frequently_missed_by_all_tools,
    interpro_accessions_in_novel_transcripts,
    interpro_accessions_in_missed_transcripts,
    missed_transcripts_with_significantly_more_frequent_accessions
)
from reannotation.constants import ASPARTIC_PEPTIDASE_SUPERFAMILY
from reannotation.pipelines import (
    interpro_accession_pipeline,
    interpro_accession_pipeline_all_tools,
    suspicious_orthologue_pipeline,
    pickle_cache_suspicious_orthologue_pipeline,
    novel_orthologue_pipeline
)
from reannotation.statistics import fisher_exact_for_two_lists_of_accessions
from utils.esm import extract_esm_means
from utils.generic import flatten_nested_dict
from utils.gffutils import init_db


results_label = "Results_Aug08"
wbps_ann_path = "data/from_WBPS/haemonchus_contortus.PRJEB506.WBPS19.annotations.gff3"
braker_path = "data/from_MARS/Haemonchus_contortus_braker3_full.gff3"
helixer_path = "data/from_MARS/Haemonchus_contortus_helixer_full.gff3"
anno_path = "data/from_EBI/haemonchus_contortus_gca000469685v2.gff3"
db = init_db(wbps_ann_path, "db/Hcon_wbps.db")
of = orthofinder_paths(results_label, subdir="Orthogroups")

wbps_col = "Hcon_LT"
braker_col = "Hcon_braker3_LT"
helixer_col = "Hcon_helixer_LT"
anno_col = "Hcon_anno_LT"

og_df = init_orthogroup_df(of["orthogroups"])
no_og_df = init_orthogroup_df(of["orthogroups_unassigned_genes"])
seq_id_map = SequenceIDMapping(of["wd"])
mars_data_dir = os.path.join("data", "from_MARS", "")
ebi_data_dir = os.path.join("data", "from_EBI", "")

species_list = SpeciesList([
    HaemonchusFromTool("contortus", data_dir=mars_data_dir, data_label="Hcon_LT", prot_filename_suffix=".fa"),
    HaemonchusFromTool("contortus_braker3_reann", data_dir=mars_data_dir, data_label="Hcon_braker3_LT", prot_filename_suffix=".fa"),
    HaemonchusFromTool("contortus_helixer_reann", data_dir=mars_data_dir, data_label="Hcon_helixer_LT", prot_filename_suffix=".fa"),
    HaemonchusFromTool("contortus_anno_reann", data_dir=ebi_data_dir, data_label="Hcon_anno_LT", prot_filename_suffix=".fa")],
    wd_path=of["wd"],
    load_blast=True
)

wbps_species = species_list.get_species_with_data_label(wbps_col)
braker_species = species_list.get_species_with_data_label(braker_col)
helixer_species = species_list.get_species_with_data_label(helixer_col)
anno_species = species_list.get_species_with_data_label(anno_col)

with open("data/acc_product_hcontortus.json", "r") as f:
    acc_product = json.loads(f.read())

min_freq = 10

interproscan_dir = "data/from_MARS/interproscan/hcon"

loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug08/Blast0_1.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug08/Blast0_2.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug08/Blast0_3.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug08/Blast0_2.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug08/Blast0_3.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug08/Blast0_1.txt...


# General Statistics

In [2]:
print("Shared orthologues with WBPS:")
print("WBPS: {}".format(len(og_df[~og_df[wbps_col].isna()])))
print("BRAKER3: {}".format(len(og_df[~og_df[wbps_col].isna() & ~og_df[braker_col].isna()])))
print("Helixer: {}".format(len(og_df[~og_df[wbps_col].isna() & ~og_df[helixer_col].isna()])))
print("Anno: {}".format(len(og_df[~og_df[wbps_col].isna() & ~og_df[anno_col].isna()])))

Shared orthologues with WBPS:
WBPS: 14248
BRAKER3: 11998
Helixer: 13154
Anno: 13088


# Assessing merged/split genes

In [9]:
braker_merged, braker_split = pickle_cache_suspicious_orthologue_pipeline("braker", "hcon", og_df, wbps_col, braker_col, species_list, seq_id_map)
anno_merged, anno_split = pickle_cache_suspicious_orthologue_pipeline("anno", "hcon", og_df, wbps_col, anno_col, species_list, seq_id_map)
helixer_merged, helixer_split = pickle_cache_suspicious_orthologue_pipeline("helixer", "hcon", og_df, wbps_col, helixer_col, species_list, seq_id_map)

100%|██████████| 15714/15714 [06:46<00:00, 38.69it/s]  
100%|██████████| 15714/15714 [08:46<00:00, 29.85it/s] 
100%|██████████| 15714/15714 [11:45<00:00, 22.27it/s]  


In [10]:
num_genes = len(list(braker_species.db.all_features(featuretype="gene")))
print(f"BRAKER3: merged={len(braker_merged)}, split={len(braker_split)}, total={round(100*(len(braker_split) + len(braker_merged)*2)/num_genes, 2)}")
num_genes = len(list(helixer_species.db.all_features(featuretype="gene")))
print(f"Helixer: merged={len(helixer_merged)}, split={len(helixer_split)}, total={round(100*(len(helixer_split) + len(helixer_merged)*2)/num_genes, 2)}")
num_genes = len(list(anno_species.db.all_features(featuretype="gene")))
print(f"Anno: merged={len(anno_merged)}, split={len(anno_split)}, total={round(100*(len(anno_split) + len(anno_merged)*2)/num_genes, 2)}")



BRAKER3: merged=42, split=38, total=0.84
Helixer: merged=176, split=71, total=2.21
Anno: merged=297, split=47, total=3.06


# InterPro accession investigation

### BRAKER3

In [2]:
acc_tally_shared_braker3, acc_tally_missed_braker3, acc_tally_novel_braker3, missed_transcripts = interpro_accession_pipeline(db, og_df, wbps_col, braker_col, interproscan_dir)
_, acc_tally_missed_braker3_unassigned, acc_tally_novel_braker3_unassigned, _ = interpro_accession_pipeline(db, no_og_df, wbps_col, braker_col, os.path.join(interproscan_dir, "unassigned_genes"))
acc_tally_missed_braker3 += acc_tally_missed_braker3_unassigned
acc_tally_novel_braker3 += acc_tally_novel_braker3_unassigned

# Find InterPro accessions occurring with significantly different frequency than in control (acc_tally_shared_braker3)
braker3_novel_results = fisher_exact_for_two_lists_of_accessions(acc_tally_novel_braker3, acc_tally_shared_braker3 + acc_tally_missed_braker3)
braker3_missed_results = fisher_exact_for_two_lists_of_accessions(acc_tally_missed_braker3, acc_tally_shared_braker3 + acc_tally_novel_braker3)

In [3]:
list(interpro_accessions_in_novel_transcripts(flatten_nested_dict(acc_product), acc_tally_novel_braker3, braker3_novel_results, min_freq=10))

InterPro accessions occurring with significantly higher frequency in novel transcripts than in shared transcripts:
	IPR002486: Nematode cuticle collagen, N-terminal (16 occurrences, 2 expected)
	IPR008160: Collagen triple helix repeat (15 occurrences, 2 expected)
	IPR035940: CAP superfamily (14 occurrences, 4 expected)

InterPro accessions that are completely missing from shared transcripts, with high frequency in novel transcripts:



['IPR002486', 'IPR008160', 'IPR035940']

In [4]:
interpro_accessions_in_missed_transcripts(flatten_nested_dict(acc_product), acc_tally_missed_braker3, acc_tally_novel_braker3, braker3_missed_results, braker3_novel_results, min_freq=75)

InterPro accessions that are completely missing from shared transcripts, with high frequency in missed transcripts:

InterPro accessions occurring with significantly higher frequency in missed transcripts than in shared transcripts:
	IPR005312: Protein of unknown function DUF1759 (157 occurrences, 5 expected)
	IPR001584: Integrase, catalytic core (164 occurrences, 6 expected)
	IPR041588: Integrase zinc-binding domain (152 occurrences, 6 expected)
	IPR001995: Peptidase A2A, retrovirus, catalytic (94 occurrences, 4 expected)
	IPR008737: Peptidase aspartic, putative (94 occurrences, 4 expected)
	IPR008042: Retrotransposon, Pao (124 occurrences, 6 expected)
	IPR040676: Domain of unknown function DUF5641 (87 occurrences, 4 expected)
	IPR036397: Ribonuclease H superfamily (403 occurrences, 24 expected)
	IPR001878: Zinc finger, CCHC-type (101 occurrences, 7 expected)
	IPR018244: Allergen V5/Tpx-1-related, conserved site (96 occurrences, 7 expected)
	IPR012337: Ribonuclease H-like superfamily 

In [5]:
missed_transcripts_with_significantly_more_frequent_accessions(wbps_species.db, missed_transcripts, acc_tally_missed_braker3, braker3_missed_results, min_freq)

hcontortus_chr4_Celeg_TT_arrow_pilon - HCON_00127470-00001 - {'IPR036397', 'IPR038717'}
hcontortus_chr4_Celeg_TT_arrow_pilon - HCON_00120440-00001 - {'IPR035940'}
hcontortus_chr4_Celeg_TT_arrow_pilon - HCON_00112380-00001 - {'IPR035940'}
hcontortus_chr4_Celeg_TT_arrow_pilon - HCON_00110860-00001 - {'IPR001969', 'IPR021109'}
hcontortus_chr4_Celeg_TT_arrow_pilon - HCON_00110640-00001 - {'IPR001969', 'IPR021109'}
hcontortus_chr4_Celeg_TT_arrow_pilon - HCON_00110340-00001 - {'IPR036397', 'IPR048703'}
hcontortus_chr4_Celeg_TT_arrow_pilon - HCON_00113120-00001 - {'IPR001969', 'IPR021109'}
hcontortus_chr4_Celeg_TT_arrow_pilon - HCON_00110900-00001 - {'IPR035940'}
hcontortus_chr4_Celeg_TT_arrow_pilon - HCON_00100285-00001 - {'IPR021109'}
hcontortus_chr4_Celeg_TT_arrow_pilon - HCON_00192680-00001 - {'IPR001283', 'IPR035940', 'IPR014044'}
hcontortus_chr4_Celeg_TT_arrow_pilon - HCON_00121040-00001 - {'IPR036397', 'IPR012337'}
hcontortus_chr4_Celeg_TT_arrow_pilon - HCON_00110400-00001 - {'IPR00128

### Helixer

In [6]:
acc_tally_shared_helixer, acc_tally_missed_helixer, acc_tally_novel_helixer, missed_transcripts = interpro_accession_pipeline(db, og_df, wbps_col, helixer_col, interproscan_dir)
_, acc_tally_missed_helixer_unassigned, acc_tally_novel_helixer_unassigned, _ = interpro_accession_pipeline(db, no_og_df, wbps_col, helixer_col, os.path.join(interproscan_dir, "unassigned_genes"))
acc_tally_missed_helixer += acc_tally_missed_helixer_unassigned
acc_tally_novel_helixer += acc_tally_novel_helixer_unassigned

# Find InterPro accessions occurring with significantly different frequency than in control (acc_tally_shared_helixer)
helixer_novel_results = fisher_exact_for_two_lists_of_accessions(acc_tally_novel_helixer, acc_tally_shared_helixer + acc_tally_missed_helixer)
helixer_missed_results = fisher_exact_for_two_lists_of_accessions(acc_tally_missed_helixer, acc_tally_shared_helixer + acc_tally_novel_helixer)

In [7]:
list(interpro_accessions_in_novel_transcripts(flatten_nested_dict(acc_product), acc_tally_novel_helixer, helixer_novel_results, min_freq))

InterPro accessions occurring with significantly higher frequency in novel transcripts than in shared transcripts:
	IPR019421: 7TM GPCR, serpentine receptor class d (Srd) (12 occurrences, 1 expected)
	IPR019424: Serpentine type 7TM GPCR chemoreceptor Srsx (11 occurrences, 2 expected)
	IPR019426: 7TM GPCR, serpentine receptor class v (Srv) (14 occurrences, 2 expected)
	IPR002486: Nematode cuticle collagen, N-terminal (19 occurrences, 4 expected)
	IPR008160: Collagen triple helix repeat (17 occurrences, 4 expected)
	IPR035940: CAP superfamily (16 occurrences, 7 expected)
	IPR003582: ShKT domain (10 occurrences, 5 expected)
	IPR000276: G protein-coupled receptor, rhodopsin-like (10 occurrences, 5 expected)
	IPR017452: GPCR, rhodopsin-like, 7TM (12 occurrences, 6 expected)

InterPro accessions that are completely missing from shared transcripts, with high frequency in novel transcripts:



['IPR019421',
 'IPR019424',
 'IPR019426',
 'IPR002486',
 'IPR008160',
 'IPR035940',
 'IPR003582',
 'IPR000276',
 'IPR017452']

In [8]:
interpro_accessions_in_missed_transcripts(flatten_nested_dict(acc_product), acc_tally_missed_helixer, acc_tally_novel_helixer, helixer_missed_results, helixer_novel_results, min_freq=30)

InterPro accessions that are completely missing from shared transcripts, with high frequency in missed transcripts:

InterPro accessions occurring with significantly higher frequency in missed transcripts than in shared transcripts:
	IPR027124: SWR1-complex protein 5/Craniofacial development protein 1/2 (38 occurrences, 2 expected)
	IPR036691: Endonuclease/exonuclease/phosphatase superfamily (151 occurrences, 10 expected)
	IPR005135: Endonuclease/exonuclease/phosphatase (55 occurrences, 6 expected)
	IPR000477: Reverse transcriptase domain (245 occurrences, 33 expected)
	IPR001461: Aspartic peptidase A1 family (32 occurrences, 5 expected)
	IPR033121: Peptidase family A1 domain (32 occurrences, 5 expected)
	IPR043502: DNA/RNA polymerase superfamily (218 occurrences, 33 expected)
	IPR043128: Reverse transcriptase/Diguanylate cyclase domain (120 occurrences, 20 expected)
	IPR008042: Retrotransposon, Pao (44 occurrences, 8 expected)
	IPR001969: Aspartic peptidase, active site (49 occurrence

### Anno

In [9]:
acc_tally_shared_anno, acc_tally_missed_anno, acc_tally_novel_anno, missed_transcripts = interpro_accession_pipeline(db, og_df, wbps_col, anno_col, interproscan_dir)
_, acc_tally_missed_anno_unassigned, acc_tally_novel_anno_unassigned, _ = interpro_accession_pipeline(db, no_og_df, wbps_col, anno_col, os.path.join(interproscan_dir, "unassigned_genes"))
acc_tally_missed_anno += acc_tally_missed_anno_unassigned
acc_tally_novel_anno += acc_tally_novel_anno_unassigned

# Find InterPro accessions occurring with significantly different frequency than in control (acc_tally_shared_anno)
anno_novel_results = fisher_exact_for_two_lists_of_accessions(acc_tally_novel_anno, acc_tally_shared_anno + acc_tally_missed_anno)
anno_missed_results = fisher_exact_for_two_lists_of_accessions(acc_tally_missed_anno, acc_tally_shared_anno + acc_tally_novel_anno)

In [10]:
list(interpro_accessions_in_novel_transcripts(flatten_nested_dict(acc_product), acc_tally_novel_anno, anno_novel_results, min_freq=10))

InterPro accessions occurring with significantly higher frequency in novel transcripts than in shared transcripts:
	IPR019426: 7TM GPCR, serpentine receptor class v (Srv) (13 occurrences, 3 expected)
	IPR002486: Nematode cuticle collagen, N-terminal (18 occurrences, 4 expected)
	IPR008160: Collagen triple helix repeat (16 occurrences, 4 expected)
	IPR000742: EGF-like domain (11 occurrences, 4 expected)
	IPR003582: ShKT domain (13 occurrences, 5 expected)
	IPR017452: GPCR, rhodopsin-like, 7TM (17 occurrences, 7 expected)

InterPro accessions that are completely missing from shared transcripts, with high frequency in novel transcripts:



['IPR019426', 'IPR002486', 'IPR008160', 'IPR000742', 'IPR003582', 'IPR017452']

In [11]:
interpro_accessions_in_missed_transcripts(flatten_nested_dict(acc_product), acc_tally_missed_anno, acc_tally_novel_anno, anno_missed_results, anno_novel_results, min_freq=50)

InterPro accessions that are completely missing from shared transcripts, with high frequency in missed transcripts:

InterPro accessions occurring with significantly higher frequency in missed transcripts than in shared transcripts:
	IPR040676: Domain of unknown function DUF5641 (52 occurrences, 5 expected)
	IPR008737: Peptidase aspartic, putative (54 occurrences, 5 expected)
	IPR001995: Peptidase A2A, retrovirus, catalytic (53 occurrences, 6 expected)
	IPR036691: Endonuclease/exonuclease/phosphatase superfamily (128 occurrences, 14 expected)
	IPR041588: Integrase zinc-binding domain (78 occurrences, 9 expected)
	IPR005312: Protein of unknown function DUF1759 (79 occurrences, 10 expected)
	IPR021109: Aspartic peptidase domain superfamily (130 occurrences, 16 expected)
	IPR001584: Integrase, catalytic core (79 occurrences, 10 expected)
	IPR008042: Retrotransposon, Pao (61 occurrences, 8 expected)
	IPR005135: Endonuclease/exonuclease/phosphatase (56 occurrences, 7 expected)
	IPR001969: A

### General

Find accessions that are significantly commonly missed by **all** tools

In [12]:
acc_tally_no_tool, acc_tally_one_plus_tool_shared, acc_tally_one_plus_tool_novel = interpro_accession_pipeline_all_tools(
    wbps_species,
    og_df,
    wbps_col,
    [braker_col, helixer_col, anno_col],
    interproscan_dir
)

acc_tally_no_tool_unassigned, _, acc_tally_one_plus_tool_novel_unassigned = interpro_accession_pipeline_all_tools(
    wbps_species,
    no_og_df,
    wbps_col,
    [braker_col, helixer_col, anno_col],
    os.path.join(interproscan_dir, "unassigned_genes")
)
acc_tally_no_tool += acc_tally_no_tool_unassigned
acc_tally_one_plus_tool_novel += acc_tally_one_plus_tool_novel_unassigned

no_tools_results = fisher_exact_for_two_lists_of_accessions(acc_tally_no_tool, acc_tally_one_plus_tool_shared + acc_tally_one_plus_tool_novel)
only_tools_results = fisher_exact_for_two_lists_of_accessions(acc_tally_one_plus_tool_novel, acc_tally_one_plus_tool_shared + acc_tally_no_tool)

In [13]:
overrepd_accs = list(interpro_accessions_in_novel_transcripts(acc_product["domain"], acc_tally_one_plus_tool_novel, only_tools_results, min_freq=10))

InterPro accessions occurring with significantly higher frequency in novel transcripts than in shared transcripts:
	IPR019430: 7TM GPCR, serpentine receptor class x (Srx) (14 occurrences, 3 expected)
	IPR002486: Nematode cuticle collagen, N-terminal (54 occurrences, 11 expected)
	IPR011527: ABC transporter type 1, transmembrane domain (10 occurrences, 2 expected)
	IPR003439: ABC transporter-like, ATP-binding domain (14 occurrences, 4 expected)
	IPR003599: Immunoglobulin subtype (10 occurrences, 4 expected)
	IPR000535: Major sperm protein (MSP) domain (10 occurrences, 4 expected)
	IPR003582: ShKT domain (34 occurrences, 13 expected)
	IPR017452: GPCR, rhodopsin-like, 7TM (45 occurrences, 18 expected)
	IPR007110: Immunoglobulin-like domain (11 occurrences, 5 expected)
	IPR014044: CAP domain (35 occurrences, 17 expected)
	IPR000742: EGF-like domain (22 occurrences, 11 expected)
	IPR001506: Peptidase M12A (17 occurrences, 9 expected)

InterPro accessions that are completely missing from sha

In [14]:
interpro_accessions_frequently_missed_by_all_tools(flatten_nested_dict(acc_product), acc_tally_no_tool, no_tools_results)

InterPro accessions occurring with significantly higher frequency in transcripts that were missed by all tools, than in transcripts shared by at least 1 tool:
	IPR049163: DNA helicase Pif1-like, 2B domain (5 occurrences, 0 expected)
	IPR048998: STPR domain (2 occurrences, 0 expected)
	IPR036691: Endonuclease/exonuclease/phosphatase superfamily (87 occurrences, 6 expected)
	IPR005135: Endonuclease/exonuclease/phosphatase (40 occurrences, 3 expected)
	IPR001461: Aspartic peptidase A1 family (26 occurrences, 2 expected)
	IPR033121: Peptidase family A1 domain (26 occurrences, 2 expected)
	IPR000477: Reverse transcriptase domain (161 occurrences, 15 expected)
	IPR010285: DNA helicase Pif1-like (3 occurrences, 0 expected)
	IPR043502: DNA/RNA polymerase superfamily (145 occurrences, 15 expected)
	IPR018244: Allergen V5/Tpx-1-related, conserved site (29 occurrences, 3 expected)
	IPR001969: Aspartic peptidase, active site (37 occurrences, 4 expected)
	IPR035940: CAP superfamily (57 occurrences,

Find accessions that are significantly commonly missed by each tool.

In [16]:
interpro_accessions_frequently_missed_by_each_tool(
    flatten_nested_dict(acc_product),
    {
        "BRAKER3": braker3_missed_results,
        "Helixer": helixer_missed_results,
        "Anno": anno_missed_results
    },
    {
        "BRAKER3": acc_tally_missed_braker3,
        "Helixer": acc_tally_missed_helixer,
        "Anno": acc_tally_missed_anno
    }
)

IPR001969: Aspartic peptidase, active site
9.04 times more likely than with BRAKER3 (117 occurrences, 13 expected)
5.21 times more likely than with Helixer (49 occurrences, 9 expected)
7.36 times more likely than with Anno (73 occurrences, 10 expected)
~~~~~~~~~~
IPR041373: Reverse transcriptase, RNase H-like domain
23.29 times more likely than with BRAKER3 (56 occurrences, 2 expected)
5.13 times more likely than with Helixer (18 occurrences, 4 expected)
9.76 times more likely than with Anno (32 occurrences, 3 expected)
~~~~~~~~~~
IPR001584: Integrase, catalytic core
25.94 times more likely than with BRAKER3 (164 occurrences, 6 expected)
3.76 times more likely than with Helixer (41 occurrences, 11 expected)
7.63 times more likely than with Anno (79 occurrences, 10 expected)
~~~~~~~~~~
IPR041426: Mos1 transposase, HTH domain
5.55 times more likely than with BRAKER3 (49 occurrences, 9 expected)
2.81 times more likely than with Helixer (16 occurrences, 6 expected)
2.45 times more likely t

In [17]:
import re
MIN_WORD_LEN = 5
tools_missed_results = {
    "BRAKER3": braker3_missed_results,
    "Helixer": helixer_missed_results,
    "Anno": anno_missed_results
}
tools_novel_results = {
    "BRAKER3": braker3_novel_results,
    "Helixer": helixer_novel_results,
    "Anno": anno_novel_results
}
acc_tally_missed_tools = {
    "BRAKER3": acc_tally_missed_braker3,
    "Helixer": acc_tally_missed_helixer,
    "Anno": acc_tally_missed_anno
}
acc_tally_novel_tools = {
    "BRAKER3": acc_tally_novel_braker3,
    "Helixer": acc_tally_novel_helixer,
    "Anno": acc_tally_novel_anno
}
seen_words = set()
all_words = set()
for acc in set.intersection(*[set(results["more_frequent"].keys()) for results in tools_missed_results.values()]):
    prod = flatten_nested_dict(acc_product)[acc]
    words = [w for w in re.split(r'\W', prod.lower()) if w and len(w) >= MIN_WORD_LEN]
    all_words.update(words)
    # Check if words occur more frequently in novel-only accessions
    for novel_acc in set.intersection(*[set(results["more_frequent"].keys()) for results in tools_novel_results.values()]):
        novel_prod = flatten_nested_dict(acc_product)[novel_acc]
        for w in words:
            if w in novel_prod.lower():
                print(f"{w} appears in {novel_acc}: {novel_prod}")
                seen_words.add(w)

domain appears in IPR003582: ShKT domain
domain appears in IPR003582: ShKT domain
terminal appears in IPR002486: Nematode cuticle collagen, N-terminal
domain appears in IPR003582: ShKT domain
domain appears in IPR003582: ShKT domain
domain appears in IPR003582: ShKT domain
domain appears in IPR003582: ShKT domain
domain appears in IPR003582: ShKT domain
domain appears in IPR003582: ShKT domain
domain appears in IPR003582: ShKT domain
domain appears in IPR003582: ShKT domain
domain appears in IPR003582: ShKT domain


In [18]:
from reannotation.pipelines import count_prod_word_occurrence_for_signif_accs
count_prod_word_occurrence_for_signif_accs(
    acc_tally_no_tool,
    acc_tally_no_tool + acc_tally_one_plus_tool_shared + acc_tally_one_plus_tool_novel,
    flatten_nested_dict(acc_product)
)

[('superfamily', 218291), ('domain', 194630), ('reverse', 154764), ('transcriptase', 154764), ('dna', 108305), ('rna', 106720), ('polymerase', 106720), ('h', 44458), ('ribonuclease', 43650), ('endonuclease', 33908), ('exonuclease', 33264), ('phosphatase', 33264), ('peptidase', 31386), ('', 30629), ('diguanylate', 30168), ('cyclase', 30168), ('cap', 26998), ('aspartic', 25652), ('like', 15825), ('protein', 14191), ('related', 11585), ('site', 11210), ('allergen', 8768), ('cysteine', 7447), ('rich', 7447), ('secretory', 7409), ('active', 6972), ('integrase', 6930), ('catalytic', 6519), ('zinc', 5486), ('hydrolase', 5450), ('of', 5449), ('unknown', 5441), ('function', 5441), ('family', 5363), ('a1', 5356), ('1', 5301), ('p', 5017), ('loop', 5016), ('containing', 5016), ('nucleoside', 5016), ('triphosphate', 5016), ('5', 4988), ('binding', 4985), ('venom', 4592), ('conserved', 4207), ('v5', 4176), ('tpx', 4176), ('duf1759', 3900), ('core', 3564), ('type', 3181), ('retrotransposon', 3161), 

In [19]:
all_words.difference(seen_words)

{'active',
 'allergen',
 'aspartic',
 'binding',
 'catalytic',
 'complex',
 'conserved',
 'craniofacial',
 'cyclase',
 'cysteine',
 'development',
 'diguanylate',
 'duf1759',
 'duf5641',
 'endonuclease',
 'exonuclease',
 'family',
 'finger',
 'function',
 'fusion',
 'glycoprotein',
 'helicase',
 'integrase',
 'peptidase',
 'phlebovirus',
 'phosphatase',
 'polymerase',
 'protein',
 'putative',
 'related',
 'retrotransposon',
 'retrovirus',
 'reverse',
 'ribonuclease',
 'rnase',
 'secretory',
 'superfamily',
 'transcriptase',
 'transposase',
 'unknown',
 'venom'}

In [20]:
word = "aspartic"

def frequency_of_word(word, tool_tally, acc_product):
    for tool, acc_tally in tool_tally.items():
        for acc, freq in Counter(acc_tally).items():
            try:
                prod = acc_product[acc]
            except Exception as e:
                print(e)
            if word in prod.lower():
                print(f"{tool}: word '{word}' occurs in {acc} with frequency {freq}")
    

def find_word_in_accessions(word, acc_product):
    for acc, prod in acc_product.items():
        if word in prod.lower():
            print(f"{acc}: {prod}")

            
find_word_in_accessions(word, flatten_nested_dict(acc_product))
print("MISSED")
frequency_of_word(word, acc_tally_missed_tools, flatten_nested_dict(acc_product))
print("NOVEL")
frequency_of_word(word, acc_tally_novel_tools, flatten_nested_dict(acc_product))


IPR001969: Aspartic peptidase, active site
IPR033112: Phospholipase A2, aspartic acid active site
IPR008737: Peptidase aspartic, putative
IPR012848: Aspartic peptidase, N-terminal
IPR019103: Aspartic peptidase, DDI1-type
IPR001461: Aspartic peptidase A1 family
IPR021109: Aspartic peptidase domain superfamily
MISSED
BRAKER3: word 'aspartic' occurs in IPR001461 with frequency 41
BRAKER3: word 'aspartic' occurs in IPR001969 with frequency 117
BRAKER3: word 'aspartic' occurs in IPR021109 with frequency 213
BRAKER3: word 'aspartic' occurs in IPR008737 with frequency 94
Helixer: word 'aspartic' occurs in IPR001969 with frequency 49
Helixer: word 'aspartic' occurs in IPR021109 with frequency 64
Helixer: word 'aspartic' occurs in IPR008737 with frequency 26
Helixer: word 'aspartic' occurs in IPR001461 with frequency 32
'IPR039037'
Anno: word 'aspartic' occurs in IPR001461 with frequency 34
Anno: word 'aspartic' occurs in IPR001969 with frequency 73
Anno: word 'aspartic' occurs in IPR021109 wit

In [21]:
for ap_acc in ASPARTIC_PEPTIDASE_SUPERFAMILY + ["IPR001969", "IPR008737"]:
    if ap_acc in flatten_nested_dict(acc_product):
        print(f"{ap_acc}: {flatten_nested_dict(acc_product)[ap_acc]}")
    for tool, results in tools_missed_results.items():
        for cat, accs in results.items():
            for acc in accs:
                if acc == ap_acc:
                    print(f"\t{tool} missed - {cat}")
    for tool, results in tools_novel_results.items():
        for cat, accs in results.items():
            for acc in accs:
                if acc == ap_acc:
                    print(f"\t{tool} novel - {cat}")
    

IPR021109: Aspartic peptidase domain superfamily
	BRAKER3 missed - more_frequent
	Helixer missed - more_frequent
	Anno missed - more_frequent
	BRAKER3 novel - as_expected
	Helixer novel - as_expected
	Anno novel - as_expected
IPR001995: Peptidase A2A, retrovirus, catalytic
	BRAKER3 missed - more_frequent
	Helixer missed - more_frequent
	Anno missed - more_frequent
	Anno novel - as_expected
IPR012848: Aspartic peptidase, N-terminal
IPR018061: Retropepsins
	BRAKER3 missed - not_occurring
IPR019103: Aspartic peptidase, DDI1-type
IPR033121: Peptidase family A1 domain
	BRAKER3 missed - more_frequent
	Helixer missed - more_frequent
	Anno missed - more_frequent
	BRAKER3 novel - as_expected
	Helixer novel - as_expected
	Anno novel - as_expected
IPR034164: Pepsin-like domain
	BRAKER3 missed - as_expected
	Helixer missed - as_expected
	Anno missed - as_expected
	BRAKER3 novel - as_expected
IPR001461: Aspartic peptidase A1 family
	BRAKER3 missed - more_frequent
	Helixer missed - more_frequent
	An

In [24]:
from reannotation.utils import extract_accessions_from_transcript
from collections import defaultdict

all_cds_counts = []
ap_cds_counts = []
all_prot_lens = []
ap_prot_lens = []
transcript_count = 0
for t in wbps_species.db.all_features(featuretype="mRNA"):
    cds_exons = list(wbps_species.db.children(t, featuretype="CDS"))
    prot_len = wbps_species.get_amino_acid_count(cds_exons)
    all_cds_counts.append(len(cds_exons))
    all_prot_lens.append(prot_len)
    transcript_has_ap_acc = False
    for acc, prod in extract_accessions_from_transcript(t):
        if acc in ASPARTIC_PEPTIDASE_SUPERFAMILY:
            ap_cds_counts.append(len(cds_exons))
            ap_prot_lens.append(prot_len)
            transcript_has_ap_acc = True
    if transcript_has_ap_acc:
        transcript_count += 1

In [25]:
print(statistics.median(all_cds_counts))
print(statistics.median(ap_cds_counts))
print(statistics.mean(all_prot_lens))
print(statistics.mean(ap_prot_lens))

transcript_count

7.0
4.0
407.471904315197
731.6088957055215


321

# Novel orthologues

In [7]:
print("Novel transcripts (% of total predicted by tool)")

shared_braker_genes = len(og_df[~og_df[wbps_col].isna() & ~og_df[braker_col].isna()][braker_col].str.split(",").explode().unique())
novel_braker_orths = len(og_df[og_df[wbps_col].isna() & ~og_df[braker_col].isna()][braker_col].str.split(",").explode().unique())
novel_braker_ungs = len(no_og_df[no_og_df[wbps_col].isna() & ~no_og_df[braker_col].isna()][braker_col].str.split(",").explode().unique())
assert shared_braker_genes + novel_braker_orths + novel_braker_ungs == len(braker_species.all_transcript_ids)
print(f"BRAKER3: {novel_braker_orths + novel_braker_ungs} ({round(100*(novel_braker_orths + novel_braker_ungs)/len(braker_species.all_transcript_ids), 2)}%)")


shared_helixer_genes = len(og_df[~og_df[wbps_col].isna() & ~og_df[helixer_col].isna()][helixer_col].str.split(",").explode().unique())
novel_helixer_orths = len(og_df[og_df[wbps_col].isna() & ~og_df[helixer_col].isna()][helixer_col].str.split(",").explode().unique())
novel_helixer_ungs = len(no_og_df[no_og_df[wbps_col].isna() & ~no_og_df[helixer_col].isna()][helixer_col].str.split(",").explode().unique())
assert shared_helixer_genes + novel_helixer_orths + novel_helixer_ungs == len(helixer_species.all_transcript_ids)
print(f"Helixer: {novel_helixer_orths + novel_helixer_ungs} ({round(100*(novel_helixer_orths + novel_helixer_ungs)/len(helixer_species.all_transcript_ids), 2)}%)")

shared_anno_genes = len(og_df[~og_df[wbps_col].isna() & ~og_df[anno_col].isna()][anno_col].str.split(",").explode().unique())
novel_anno_orths = len(og_df[og_df[wbps_col].isna() & ~og_df[anno_col].isna()][anno_col].str.split(",").explode().unique())
novel_anno_ungs = len(no_og_df[no_og_df[wbps_col].isna() & ~no_og_df[anno_col].isna()][anno_col].str.split(",").explode().unique())
assert shared_anno_genes + novel_anno_orths + novel_anno_ungs == len(anno_species.all_transcript_ids)
print(f"Anno: {novel_anno_orths + novel_anno_ungs} ({round(100*(novel_anno_orths + novel_anno_ungs)/len(anno_species.all_transcript_ids), 2)}%)")


Novel transcripts (% of total predicted by tool)
BRAKER3: 1458 (10.07%)
Helixer: 3565 (18.61%)
Anno: 4917 (23.48%)


In [None]:
## Run once to populate
# novel_orthologue_pipeline(no_og_df, wbps_col, anno_col, species_list, out_dir="data/novel_orthologue_sequences/hcon/anno/unassigned_genes/")
# novel_orthologue_pipeline(no_og_df, wbps_col, braker_col, species_list, out_dir="data/novel_orthologue_sequences/hcon/braker3/unassigned_genes/")
# novel_orthologue_pipeline(no_og_df, wbps_col, helixer_col, species_list, out_dir="data/novel_orthologue_sequences/hcon/helixer/unassigned_genes/")

In [None]:
## Run once to populate
# novel_orthologue_pipeline(og_df, wbps_col, anno_col, species_list, out_dir="data/novel_orthologue_sequences/hcon/anno/")
# novel_orthologue_pipeline(og_df, wbps_col, braker_col, species_list, out_dir="data/novel_orthologue_sequences/hcon/braker3/")
# novel_orthologue_pipeline(og_df, wbps_col, helixer_col, species_list, out_dir="data/novel_orthologue_sequences/hcon/helixer/")
anno_esm_means = extract_esm_means("data/from_MARS/Hcon_esm_pLDDTs_anno.txt").values()
braker3_esm_means = extract_esm_means("data/from_MARS/Hcon_esm_pLDDTs_braker3.txt").values()
helixer_esm_means = extract_esm_means("data/from_MARS/Hcon_esm_pLDDTs_helixer.txt").values()

print(statistics.mean(map(float, anno_esm_means)))
print(statistics.mean(map(float, braker3_esm_means)))
print(statistics.mean(map(float, helixer_esm_means)))

In [None]:
import pandas as pd
cols = (
    "fn",
    "mean",
    "median",
    "stdev",
    "var",
    "max",
    "min",
    "perc_confident"
)
df = pd.read_csv("data/from_MARS/pLDDT_hcon.csv", names=cols)
print(f"Mean of means: {df['mean'].mean()}")
print(f"% that are \"Confident\": {100*df[df['mean'] >= 70].shape[0]/df.shape[0]}")


In [29]:

df_anno = pd.read_csv("data/from_MARS/pLDDT_hcon_anno.csv", names=cols)
df_braker3 = pd.read_csv("data/from_MARS/pLDDT_hcon_braker3.csv", names=cols)
df_helixer = pd.read_csv("data/from_MARS/pLDDT_hcon_helixer.csv", names=cols)

In [None]:
print(f"Mean of WBPS % that are \"Confident\" residues: {df['perc_confident'].mean()}")
print(f"Mean of BRAKER3 % that are \"Confident\" residues: {df_braker3['perc_confident'].mean()}")
print(f"Mean of Anno % that are \"Confident\" residues: {df_anno['perc_confident'].mean()}")
print(f"Mean of Helixer % that are \"Confident\" residues: {df_helixer['perc_confident'].mean()}")