# Setup

In [42]:
import os.path
from collections import defaultdict
import contextlib
import json
import statistics
from collections import Counter
from operator import itemgetter

import matplotlib
from matplotlib import pyplot as plt
import numpy as np
from tqdm import tqdm

from orthologue_analysis.orthogroups import init_orthogroup_df
from orthologue_analysis.species import HaemonchusFromTool, MansoniCladeFromTool, PristionchusFromTool, SpeciesList
from orthologue_analysis.utils import SequenceIDMapping, orthofinder_paths
from reannotation.analysis import (
    interpro_accessions_frequently_missed_by_all_tools,
    interpro_accessions_frequently_missed_by_each_tool,
    interpro_accessions_in_novel_transcripts,
    interpro_accessions_in_missed_transcripts,
    missed_transcripts_with_significantly_more_frequent_accessions
)
from reannotation.constants import ASPARTIC_PEPTIDASE_SUPERFAMILY
from reannotation.pipelines import (
    interpro_accession_pipeline_all_tools,
    interpro_accession_pipeline,
    suspicious_orthologue_pipeline,
    pickle_cache_suspicious_orthologue_pipeline,
    novel_orthologue_pipeline
)
from reannotation.statistics import fisher_exact_for_two_lists_of_accessions, count_transcripts_with_accession
from reannotation.utils import extract_accessions_from_transcript
from utils.esm import extract_esm_means
from utils.generic import flatten_nested_dict, flatten_list_to_set
from utils.gffutils import init_db

# Select the Tk GUI backend for every matplotlib figure drawn by this notebook.
matplotlib.use("TkAgg")

# Data roots. Each one ends with a path separator so that later cells can
# append a file or directory name by plain string concatenation.
mars_data_dir = os.path.join("data", "from_MARS") + os.sep
ebi_data_dir = os.path.join("data", "from_EBI") + os.sep
interproscan_dir = os.path.join(mars_data_dir, "interproscan") + os.sep


def breakdown_for_tool(tool_col, orthogroups=None, unassigned=None, reference_col=None):
    """Count how a tool's gene set overlaps with the reference annotation.

    Cells of the orthogroup table are treated as comma-separated lists of gene
    IDs; a missing (NaN) cell means that annotation has no gene in that row.

    Args:
        tool_col: Name of the column holding the tool's gene IDs.
        orthogroups: Orthogroup table to analyse. When omitted, the
            notebook-level ``og_df`` is looked up at call time.
        unassigned: Table of genes that were not placed in any orthogroup.
            When omitted, the notebook-level ``no_og_df`` is used.
        reference_col: Name of the column holding the reference annotation's
            gene IDs. When omitted, the notebook-level ``wbps_col`` is used.

    Returns:
        dict with three integer counts:
            "shared":  unique tool IDs in rows that also have reference IDs.
            "missing": unique reference IDs in rows with no tool IDs.
            "novel":   unique tool IDs in rows with no reference IDs, plus the
                       number of rows of ``unassigned`` that have a tool entry.
    """
    # Fall back to the notebook globals only when the caller gives nothing, so
    # existing one-argument calls behave exactly as before.
    orthogroups = og_df if orthogroups is None else orthogroups
    unassigned = no_og_df if unassigned is None else unassigned
    reference_col = wbps_col if reference_col is None else reference_col

    # Build each presence mask once instead of once per category.
    has_reference = orthogroups[reference_col].notna()
    has_tool = orthogroups[tool_col].notna()

    def count_unique_ids(cells):
        # Split every comma-separated cell into one ID per element, then count
        # the distinct values.
        return len(cells.str.split(",").explode().unique())

    return {
        "shared": count_unique_ids(orthogroups.loc[has_reference & has_tool, tool_col]),
        "missing": count_unique_ids(orthogroups.loc[has_reference & ~has_tool, reference_col]),
        "novel": count_unique_ids(orthogroups.loc[~has_reference & has_tool, tool_col])
        + int(unassigned[tool_col].notna().sum()),
    }

# S. mansoni

In [47]:
# Inputs for the S. mansoni comparison: the OrthoFinder results label and the
# reference (WBPS) and tool-generated annotation files.
results_label = "Results_Aug05"
wbps_ann_path = "data/from_WBPS/schistosoma_mansoni.PRJEA36577.WBPS19.annotations.gff3"
braker_path = "data/from_MARS/Schistosoma_mansoni_braker3_full.gff3"
helixer_path = "data/from_MARS/Schistosoma_mansoni_helixer_full.gff3"
anno_path = "data/from_EBI/schistosoma_mansoni_gca000000000v1.gff3"
db = init_db(wbps_ann_path, "db/Sman_wbps.db")
of = orthofinder_paths(results_label, subdir="Orthogroups")

# One column name per annotation source; the same strings double as the
# species data labels below, so they are defined once and reused.
wbps_col = "Sman_LT"
braker_col = "Sman_braker3_LT"
helixer_col = "Sman_helixer_LT"
anno_col = "Sman_anno_LT"

og_df = init_orthogroup_df(of["orthogroups"])
no_og_df = init_orthogroup_df(of["orthogroups_unassigned_genes"])
seq_id_map = SequenceIDMapping(of["wd"])

species_list = SpeciesList([
    MansoniCladeFromTool("mansoni", data_dir=mars_data_dir, data_label=wbps_col, prot_filename_suffix=".fa"),
    MansoniCladeFromTool("mansoni_braker3_reann", data_dir=mars_data_dir, data_label=braker_col, prot_filename_suffix=".fa"),
    MansoniCladeFromTool("mansoni_helixer_reann", data_dir=mars_data_dir, data_label=helixer_col, prot_filename_suffix=".fa"),
    MansoniCladeFromTool("mansoni_anno_reann", data_dir=ebi_data_dir, data_label=anno_col, prot_filename_suffix=".fa")],
    wd_path=of["wd"],
    load_blast=True
)

wbps_species = species_list.get_species_with_data_label(wbps_col)
braker_species = species_list.get_species_with_data_label(braker_col)
helixer_species = species_list.get_species_with_data_label(helixer_col)
anno_species = species_list.get_species_with_data_label(anno_col)

with open("data/acc_product.json", "r") as f:
    acc_product = json.load(f)

min_freq = 10

# Assigned outright rather than appended with `+=`, so that re-running this
# cell does not keep growing the path ("...smansman"). On a first run the
# value is identical to the appended form.
interproscan_dir = os.path.join(mars_data_dir, "interproscan", "sman")

loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug05/Blast0_1.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug05/Blast0_2.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug05/Blast0_3.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug05/Blast0_2.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug05/Blast0_3.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug05/Blast0_1.txt...


In [48]:
print("Total protein-coding genes (transcripts have been filtered 1 per gene):")

# A gene total is the number of header lines (starting with ">") in the
# species' protein file. Each file is opened in a `with` block so the handle is
# closed as soon as it has been counted.
with open(wbps_species.prot_meta.file_path) as fasta_handle:
    total_wbps_genes = sum(1 for line in fasta_handle if line.startswith(">"))
print(f"WBPS: {total_wbps_genes}")

with open(braker_species.prot_meta.file_path) as fasta_handle:
    total_braker_genes = sum(1 for line in fasta_handle if line.startswith(">"))
print(f"BRAKER3: {total_braker_genes}")

with open(helixer_species.prot_meta.file_path) as fasta_handle:
    total_helixer_genes = sum(1 for line in fasta_handle if line.startswith(">"))
print(f"Helixer: {total_helixer_genes}")

with open(anno_species.prot_meta.file_path) as fasta_handle:
    total_anno_genes = sum(1 for line in fasta_handle if line.startswith(">"))
print(f"Anno: {total_anno_genes}")

Total protein-coding genes (transcripts have been filtered 1 per gene):
WBPS: 9896
BRAKER3: 9092
Helixer: 9901
Anno: 12798


In [49]:
# Sanity check per tool: the "shared" and "novel" counts must add up to the
# number of protein sequences counted for that tool. The breakdown is computed
# once per tool and reused for both the check and the printout.
for tool_name, tool_column, tool_total in (
    ("BRAKER3", braker_col, total_braker_genes),
    ("Helixer", helixer_col, total_helixer_genes),
    ("Anno", anno_col, total_anno_genes),
):
    tool_breakdown = breakdown_for_tool(tool_column)
    accounted_for = tool_breakdown["shared"] + tool_breakdown["novel"]
    assert accounted_for == tool_total, (
        f"{tool_name}: shared + novel = {accounted_for}, expected {tool_total}"
    )
    print(f"{tool_name}: {tool_breakdown}")

BRAKER3: {'shared': 8584, 'missing': 760, 'novel': 508}
Helixer: {'shared': 8737, 'missing': 767, 'novel': 1164}
Anno: {'shared': 10078, 'missing': 527, 'novel': 2720}


Shared as a % of total predicted

In [50]:
# "shared" count as a percentage of each tool's total predicted genes,
# rounded to two decimal places.
for tool_name, tool_column, tool_total in (
    ("BRAKER3", braker_col, total_braker_genes),
    ("Helixer", helixer_col, total_helixer_genes),
    ("Anno", anno_col, total_anno_genes),
):
    shared_count = breakdown_for_tool(tool_column)['shared']
    print(f"{tool_name}: {round(100 * shared_count / tool_total, 2)}%")

BRAKER3: 94.41%
Helixer: 88.24%
Anno: 78.75%


Novel as a % of total predicted

In [51]:
# "novel" count as a percentage of each tool's total predicted genes,
# rounded to two decimal places.
for tool_name, tool_column, tool_total in (
    ("BRAKER3", braker_col, total_braker_genes),
    ("Helixer", helixer_col, total_helixer_genes),
    ("Anno", anno_col, total_anno_genes),
):
    novel_count = breakdown_for_tool(tool_column)['novel']
    print(f"{tool_name}: {round(100 * novel_count / tool_total, 2)}%")

BRAKER3: 5.59%
Helixer: 11.76%
Anno: 21.25%


# H. contortus


In [52]:
# Inputs for the H. contortus comparison: the OrthoFinder results label and
# the reference (WBPS) and tool-generated annotation files.
results_label = "Results_Aug08"
wbps_ann_path = "data/from_WBPS/haemonchus_contortus.PRJEB506.WBPS19.annotations.gff3"
braker_path = "data/from_MARS/Haemonchus_contortus_braker3_full.gff3"
helixer_path = "data/from_MARS/Haemonchus_contortus_helixer_full.gff3"
anno_path = "data/from_EBI/haemonchus_contortus_gca000469685v2.gff3"

# One column name per annotation source; the same strings are used as the
# species data labels below.
wbps_col, braker_col, helixer_col, anno_col = (
    "Hcon_LT",
    "Hcon_braker3_LT",
    "Hcon_helixer_LT",
    "Hcon_anno_LT",
)

db = init_db(wbps_ann_path, "db/Hcon_wbps.db")
of = orthofinder_paths(results_label, subdir="Orthogroups")

og_df = init_orthogroup_df(of["orthogroups"])
no_og_df = init_orthogroup_df(of["orthogroups_unassigned_genes"])
seq_id_map = SequenceIDMapping(of["wd"])

# The reference and the two MARS-hosted tools read from mars_data_dir; the
# "anno" annotation reads from ebi_data_dir.
species_list = SpeciesList(
    [
        HaemonchusFromTool(species_name, data_dir=source_dir, data_label=label, prot_filename_suffix=".fa")
        for species_name, source_dir, label in (
            ("contortus", mars_data_dir, wbps_col),
            ("contortus_braker3_reann", mars_data_dir, braker_col),
            ("contortus_helixer_reann", mars_data_dir, helixer_col),
            ("contortus_anno_reann", ebi_data_dir, anno_col),
        )
    ],
    wd_path=of["wd"],
    load_blast=True,
)

wbps_species, braker_species, helixer_species, anno_species = (
    species_list.get_species_with_data_label(label)
    for label in (wbps_col, braker_col, helixer_col, anno_col)
)

loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug08/Blast0_1.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug08/Blast0_2.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug08/Blast0_3.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug08/Blast0_2.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug08/Blast0_3.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug08/Blast0_1.txt...


In [53]:
print("Total protein-coding genes (transcripts have been filtered 1 per gene):")

# A gene total is the number of header lines (starting with ">") in the
# species' protein file. Each file is opened in a `with` block so the handle is
# closed as soon as it has been counted.
with open(wbps_species.prot_meta.file_path) as fasta_handle:
    total_wbps_genes = sum(1 for line in fasta_handle if line.startswith(">"))
print(f"WBPS: {total_wbps_genes}")

with open(braker_species.prot_meta.file_path) as fasta_handle:
    total_braker_genes = sum(1 for line in fasta_handle if line.startswith(">"))
print(f"BRAKER3: {total_braker_genes}")

with open(helixer_species.prot_meta.file_path) as fasta_handle:
    total_helixer_genes = sum(1 for line in fasta_handle if line.startswith(">"))
print(f"Helixer: {total_helixer_genes}")

with open(anno_species.prot_meta.file_path) as fasta_handle:
    total_anno_genes = sum(1 for line in fasta_handle if line.startswith(">"))
print(f"Anno: {total_anno_genes}")

Total protein-coding genes (transcripts have been filtered 1 per gene):
WBPS: 19621
BRAKER3: 14476
Helixer: 19153
Anno: 20939


In [54]:
# Sanity check per tool: the "shared" and "novel" counts must add up to the
# number of protein sequences counted for that tool. The breakdown is computed
# once per tool and reused for both the check and the printout.
for tool_name, tool_column, tool_total in (
    ("BRAKER3", braker_col, total_braker_genes),
    ("Helixer", helixer_col, total_helixer_genes),
    ("Anno", anno_col, total_anno_genes),
):
    tool_breakdown = breakdown_for_tool(tool_column)
    accounted_for = tool_breakdown["shared"] + tool_breakdown["novel"]
    assert accounted_for == tool_total, (
        f"{tool_name}: shared + novel = {accounted_for}, expected {tool_total}"
    )
    print(f"{tool_name}: {tool_breakdown}")

BRAKER3: {'shared': 13018, 'missing': 4548, 'novel': 1458}
Helixer: {'shared': 15588, 'missing': 2307, 'novel': 3565}
Anno: {'shared': 16022, 'missing': 2287, 'novel': 4917}


Shared as a % of total predicted

In [55]:
# "shared" count as a percentage of each tool's total predicted genes,
# rounded to two decimal places.
for tool_name, tool_column, tool_total in (
    ("BRAKER3", braker_col, total_braker_genes),
    ("Helixer", helixer_col, total_helixer_genes),
    ("Anno", anno_col, total_anno_genes),
):
    shared_count = breakdown_for_tool(tool_column)['shared']
    print(f"{tool_name}: {round(100 * shared_count / tool_total, 2)}%")

BRAKER3: 89.93%
Helixer: 81.39%
Anno: 76.52%


Novel as a % of total predicted

In [56]:
# "novel" count as a percentage of each tool's total predicted genes,
# rounded to two decimal places.
for tool_name, tool_column, tool_total in (
    ("BRAKER3", braker_col, total_braker_genes),
    ("Helixer", helixer_col, total_helixer_genes),
    ("Anno", anno_col, total_anno_genes),
):
    novel_count = breakdown_for_tool(tool_column)['novel']
    print(f"{tool_name}: {round(100 * novel_count / tool_total, 2)}%")

BRAKER3: 10.07%
Helixer: 18.61%
Anno: 23.48%


# P. pacificus

In [57]:
# Inputs for the P. pacificus comparison: the OrthoFinder results label and
# the reference (WBPS) and tool-generated annotation files.
results_label = "Results_Aug21"
wbps_ann_path = "data/from_WBPS/pristionchus_pacificus.PRJNA12644.WBPS19.annotations.gff3"
braker_path = "data/from_MARS/Pristionchus_pacificus_braker3_full.gff3"
helixer_path = "data/from_MARS/Pristionchus_pacificus_helixer_full.gff3"
anno_path = "data/from_EBI/pristionchus_pacificus_gca000180635v4.gff3"

# One column name per annotation source; the same strings are used as the
# species data labels below.
wbps_col, braker_col, helixer_col, anno_col = (
    "Ppac_LT",
    "Ppac_braker3_LT",
    "Ppac_helixer_LT",
    "Ppac_anno_LT",
)

db = init_db(wbps_ann_path, "db/Ppac_wbps.db")
of = orthofinder_paths(results_label, subdir="Orthogroups")

og_df = init_orthogroup_df(of["orthogroups"])
no_og_df = init_orthogroup_df(of["orthogroups_unassigned_genes"])
seq_id_map = SequenceIDMapping(of["wd"])

# The reference and the two MARS-hosted tools read from mars_data_dir; the
# "anno" annotation reads from ebi_data_dir.
species_list = SpeciesList(
    [
        PristionchusFromTool(species_name, data_dir=source_dir, data_label=label, prot_filename_suffix=".fa")
        for species_name, source_dir, label in (
            ("pacificus", mars_data_dir, wbps_col),
            ("pacificus_braker3_reann", mars_data_dir, braker_col),
            ("pacificus_helixer_reann", mars_data_dir, helixer_col),
            ("pacificus_anno_reann", ebi_data_dir, anno_col),
        )
    ],
    wd_path=of["wd"],
    load_blast=True,
)

wbps_species, braker_species, helixer_species, anno_species = (
    species_list.get_species_with_data_label(label)
    for label in (wbps_col, braker_col, helixer_col, anno_col)
)

loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug21/Blast0_1.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug21/Blast0_2.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug21/Blast0_3.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug21/Blast0_2.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug21/Blast0_3.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug21/Blast0_1.txt...


In [58]:
print("Total protein-coding genes (transcripts have been filtered 1 per gene):")

# A gene total is the number of header lines (starting with ">") in the
# species' protein file. Each file is opened in a `with` block so the handle is
# closed as soon as it has been counted.
with open(wbps_species.prot_meta.file_path) as fasta_handle:
    total_wbps_genes = sum(1 for line in fasta_handle if line.startswith(">"))
print(f"WBPS: {total_wbps_genes}")

with open(braker_species.prot_meta.file_path) as fasta_handle:
    total_braker_genes = sum(1 for line in fasta_handle if line.startswith(">"))
print(f"BRAKER3: {total_braker_genes}")

with open(helixer_species.prot_meta.file_path) as fasta_handle:
    total_helixer_genes = sum(1 for line in fasta_handle if line.startswith(">"))
print(f"Helixer: {total_helixer_genes}")

with open(anno_species.prot_meta.file_path) as fasta_handle:
    total_anno_genes = sum(1 for line in fasta_handle if line.startswith(">"))
print(f"Anno: {total_anno_genes}")

Total protein-coding genes (transcripts have been filtered 1 per gene):
WBPS: 26342
BRAKER3: 24077
Helixer: 32221
Anno: 28283


In [59]:
# Sanity check per tool: the "shared" and "novel" counts must add up to the
# number of protein sequences counted for that tool. The breakdown is computed
# once per tool and reused for both the check and the printout.
for tool_name, tool_column, tool_total in (
    ("BRAKER3", braker_col, total_braker_genes),
    ("Helixer", helixer_col, total_helixer_genes),
    ("Anno", anno_col, total_anno_genes),
):
    tool_breakdown = breakdown_for_tool(tool_column)
    accounted_for = tool_breakdown["shared"] + tool_breakdown["novel"]
    assert accounted_for == tool_total, (
        f"{tool_name}: shared + novel = {accounted_for}, expected {tool_total}"
    )
    print(f"{tool_name}: {tool_breakdown}")

BRAKER3: {'shared': 19278, 'missing': 4505, 'novel': 4799}
Helixer: {'shared': 17702, 'missing': 5231, 'novel': 14519}
Anno: {'shared': 21226, 'missing': 3584, 'novel': 7057}


Shared as a % of total predicted

In [60]:
# "shared" count as a percentage of each tool's total predicted genes,
# rounded to two decimal places.
for tool_name, tool_column, tool_total in (
    ("BRAKER3", braker_col, total_braker_genes),
    ("Helixer", helixer_col, total_helixer_genes),
    ("Anno", anno_col, total_anno_genes),
):
    shared_count = breakdown_for_tool(tool_column)['shared']
    print(f"{tool_name}: {round(100 * shared_count / tool_total, 2)}%")

BRAKER3: 80.07%
Helixer: 54.94%
Anno: 75.05%


Novel as a % of total predicted

In [61]:
# "novel" count as a percentage of each tool's total predicted genes,
# rounded to two decimal places.
for tool_name, tool_column, tool_total in (
    ("BRAKER3", braker_col, total_braker_genes),
    ("Helixer", helixer_col, total_helixer_genes),
    ("Anno", anno_col, total_anno_genes),
):
    novel_count = breakdown_for_tool(tool_column)['novel']
    print(f"{tool_name}: {round(100 * novel_count / tool_total, 2)}%")

BRAKER3: 19.93%
Helixer: 45.06%
Anno: 24.95%
