In [1]:
import os.path
from collections import defaultdict
import contextlib
import statistics

import matplotlib
from matplotlib import pyplot as plt
import numpy as np

from orthologue_analysis.orthogroups import init_orthogroup_df
from orthologue_analysis.species import AltSourceMixin, MansoniClade, SpeciesList
from orthologue_analysis.utils import SequenceIDMapping, orthofinder_paths
from reannotation.analysis import (
    interpro_accessions_frequently_missed_by_all_tools,
    interpro_accessions_frequently_missed_by_each_tool,
    interpro_accessions_in_novel_transcripts,
    interpro_accessions_in_missed_transcripts,
    missed_transcripts_with_significantly_more_frequent_accessions
)
from reannotation.pipelines import (
    interpro_accession_pipeline_all_tools,
    interpro_accession_pipeline,
    suspicious_orthologue_pipeline,
    novel_orthologue_pipeline
)
from reannotation.statistics import fisher_exact_for_two_lists_of_accessions
from reannotation.utils import extract_accessions_from_transcript
from utils.esm import extract_esm_means
from utils.gffutils import init_db

# Select Matplotlib's Tk GUI backend for every figure drawn below.
# NOTE(review): with this backend figures presumably open in separate windows on plt.show()
# rather than rendering inline in the notebook — confirm this is intended.
matplotlib.use("TkAgg")


class MansoniCladeFromTool(AltSourceMixin, MansoniClade):
    """``MansoniClade`` combined with ``AltSourceMixin``.

    The class adds no members of its own; it only names the mixin/base
    combination that is instantiated for every annotation source below.
    """


# --- Input locations -----------------------------------------------------------
results_label = "Results_Aug05"
wbps_ann_path = "data/from_WBPS/schistosoma_mansoni.PRJEA36577.WBPS19.annotations.gff3"
braker_path = "data/from_MARS/Schistosoma_mansoni_braker3_full.gff3"
helixer_path = "data/from_MARS/Schistosoma_mansoni_helixer_full.gff3"
anno_path = "data/from_EBI/schistosoma_mansoni_gca000000000v1.gff3"
interproscan_dir = "data/from_MARS/interproscan/sman"
# The trailing empty component makes os.path.join end each directory with a separator.
mars_data_dir = os.path.join("data", "from_MARS", "")
ebi_data_dir = os.path.join("data", "from_EBI", "")

# --- Orthogroup-table column (also used as the species data label) per source ---
wbps_col = "Sman_LT"
braker_col = "Sman_braker3_LT"
helixer_col = "Sman_helixer_LT"
anno_col = "Sman_anno_LT"

# --- Analysis parameters / shared state ----------------------------------------
# Frequency threshold handed to the InterPro reporting helpers further down.
min_freq = 10
# InterPro accession -> product description; filled in by later cells.
acc_product = {}

# --- Loaded resources ----------------------------------------------------------
db = init_db(wbps_ann_path, "db/Sman_wbps.db")
of = orthofinder_paths(results_label)
hog_df = init_orthogroup_df(of["orthogroups"])
seq_id_map = SequenceIDMapping(of["wd"])

# (species name, protein data directory, data label) for each annotation source.
_species_sources = (
    ("mansoni", mars_data_dir, wbps_col),
    ("mansoni_braker3_reann", mars_data_dir, braker_col),
    ("mansoni_helixer_reann", mars_data_dir, helixer_col),
    ("mansoni_anno_reann", ebi_data_dir, anno_col),
)
species_list = SpeciesList(
    [
        MansoniCladeFromTool(name, data_dir=data_dir, data_label=data_label, prot_filename_suffix=".fa")
        for name, data_dir, data_label in _species_sources
    ],
    wd_path=of["wd"],
    load_blast=True,
)

wbps_species = species_list.get_species_with_data_label(wbps_col)

loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug05/Blast0_1.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug05/Blast0_2.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug05/Blast0_3.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug05/Blast0_2.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug05/Blast0_3.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug05/Blast0_1.txt...


# General Statistics

In [2]:
# Directory that receives every figure saved by this notebook.
plot_dir = "plots/reannotation/"

from utils.generic import makedirs

makedirs(plot_dir)

In [3]:
# Count orthogroups that contain a WBPS member, alone and together with each tool.
print("Shared orthologues with WBPS:")
in_wbps = hog_df[wbps_col].notna()
print(f"WBPS: {len(hog_df[in_wbps])}")
for tool_name, tool_col in (("BRAKER3", braker_col), ("Helixer", helixer_col), ("Anno", anno_col)):
    shared_with_tool = hog_df[in_wbps & hog_df[tool_col].notna()]
    print(f"{tool_name}: {len(shared_with_tool)}")

Shared orthologues with WBPS:
WBPS: 8970
BRAKER3: 8203
Helixer: 8332
Anno: 8392


In [4]:
def count_all_mRNA_exons(label):
    """Return the exon count of every mRNA feature of one species.

    Args:
        label: data label used to look the species up in ``species_list``.

    Returns:
        A list with one integer per mRNA feature, in the order the species'
        feature database yields them.
    """
    species = species_list.get_species_with_data_label(label)
    return [
        len(list(species.db.children(transcript, featuretype="exon")))
        for transcript in species.db.all_features(featuretype="mRNA")
    ]


# Exon counts per mRNA, keyed by annotation source.
exon_count = defaultdict(list)
exon_count.update(
    wbps=count_all_mRNA_exons(wbps_col),
    anno=count_all_mRNA_exons(anno_col),
    braker=count_all_mRNA_exons(braker_col),
    helixer=count_all_mRNA_exons(helixer_col),
)

In [5]:
def count_all_mRNA_amino_acids(label):
    """Return the amino-acid count of every mRNA feature of one species.

    Args:
        label: data label used to look the species up in ``species_list``.

    Returns:
        A list with one entry per mRNA feature: whatever the species'
        ``get_amino_acid_count`` returns for the list of that mRNA's CDS children.
    """
    species = species_list.get_species_with_data_label(label)
    return [
        species.get_amino_acid_count(list(species.db.children(transcript, featuretype="CDS")))
        for transcript in species.db.all_features(featuretype="mRNA")
    ]

# Amino-acid counts per mRNA, keyed by annotation source.
protein_lengths = defaultdict(list)
protein_lengths.update(
    wbps=count_all_mRNA_amino_acids(wbps_col),
    anno=count_all_mRNA_amino_acids(anno_col),
    braker=count_all_mRNA_amino_acids(braker_col),
    helixer=count_all_mRNA_amino_acids(helixer_col),
)

In [6]:
def boxplot_axes(idx, label, data, ylim=(0, 25)):
    """Draw one boxplot panel in a 1x4 subplot grid.

    Args:
        idx: 1-based position of the panel within the grid.
        label: key into ``data``; its upper-cased form becomes the panel title.
        data: mapping from label to the sequence of values to plot.
        ylim: ``(bottom, top)`` y-axis limits; a falsy value leaves the limits untouched.

    The mean of the plotted values, rounded to two decimals, is shown as the x-axis label.
    """
    panel = plt.subplot(1, 4, idx)
    values = data[label]
    panel.boxplot(values)
    if ylim:
        lower, upper = ylim[0], ylim[1]
        panel.set_ylim(bottom=lower, top=upper)
    panel.grid(which="both")
    panel.set_title(label.upper())
    # No tick labels on the x-axis; the axis label carries the mean instead.
    panel.set_xticklabels([])
    panel.set_xlabel(round(statistics.mean(values), 2))

# One boxplot panel per annotation source, saved to disk.
for panel_idx, source in enumerate(("wbps", "anno", "braker", "helixer"), start=1):
    boxplot_axes(panel_idx, source, exon_count)
plt.suptitle("Exon counts")
# Positioned in the data coordinates of the last panel; the values suit the default ylim=(0, 25).
plt.text(-1.05, -1.7, 'Means')
plt.savefig(os.path.join(plot_dir, "sman_exon_count_boxplots.png"))
# The figure is only written to disk, never shown. Close it explicitly: otherwise it stays the
# current figure and the next cell's plt.subplot(1, 4, idx) calls return these same axes, so the
# next plot would be drawn on top of this one.
plt.close()

In [7]:
# Start from a fresh figure so the panels can never land on axes left over from an earlier
# cell whose figure was saved but not shown or closed.
plt.figure()
for panel_idx, source in enumerate(("wbps", "anno", "braker", "helixer"), start=1):
    boxplot_axes(panel_idx, source, protein_lengths, ylim=(0, 3000))
plt.suptitle("Amino acid counts")
# Anchor the "Means" caption in axes-fraction coordinates of the last panel. The previous
# data-coordinate position (-1.05, -1.7) was tuned for a (0, 25) y-range; on this (0, 3000)
# range it sat on the axis line. (-1.55, -0.068) is that same position expressed as a fraction
# of a panel with x-range (0.5, 1.5) and y-range (0, 25).
last_panel = plt.gca()
last_panel.text(-1.55, -0.068, 'Means', transform=last_panel.transAxes)
plt.savefig(os.path.join(plot_dir, "sman_prot_length_boxplots.png"))
plt.show()

In [8]:
wbps_species = species_list.get_species_with_data_label(wbps_col)
anno_species = species_list.get_species_with_data_label(anno_col)
braker_species = species_list.get_species_with_data_label(braker_col)
helixer_species = species_list.get_species_with_data_label(helixer_col)

# Paired observations for orthogroups shared between WBPS and a tool:
#   x<n>  / y<n>  : amino-acid count of the WBPS / tool transcript
#   x<n>e / y<n>e : number of CDS features of the WBPS / tool transcript
# with n = 1 (Anno), 2 (BRAKER3), 3 (Helixer).
x1, x2, x3 = [], [], []
x1e, x2e, x3e = [], [], []
y1, y2, y3 = [], [], []
y1e, y2e, y3e = [], [], []


def _first_orthologue(cell):
    """Return the first member of a comma-separated orthogroup cell, stripped of whitespace."""
    return cell.split(",")[0].strip()


def _transcript_key(member):
    """Map a ``...transcript_<id>`` member name to the ``transcript:<id>`` database key."""
    return "transcript:" + member.split("transcript_")[1]


def _protein_length_and_cds_count(species, feature_key):
    """Return ``(amino-acid count, number of CDS features)`` for one transcript of ``species``."""
    transcript = species.db[feature_key]
    cds_exons = list(species.db.children(transcript, featuretype="CDS"))
    return species.get_amino_acid_count(cds_exons), len(cds_exons)


# (orthogroup column, species, member -> db key converter or None, output lists)
# BRAKER3 and Helixer members are used as database keys unchanged.
_tool_series = (
    (anno_col, anno_species, _transcript_key, (x1, x1e, y1, y1e)),
    (braker_col, braker_species, None, (x2, x2e, y2, y2e)),
    (helixer_col, helixer_species, None, (x3, x3e, y3, y3e)),
)

for _, row in hog_df[hog_df[wbps_col].notna()].iterrows():
    wbps_prot_length, wbps_cds_count = _protein_length_and_cds_count(
        wbps_species, _transcript_key(_first_orthologue(row[wbps_col]))
    )
    for tool_col, tool_species, to_key, (xs, xes, ys, yes) in _tool_series:
        cell = row[tool_col]
        # Populated cells hold a comma-separated string; anything else is a missing value.
        # A type check is used instead of `is np.nan`, which only works when the missing
        # value happens to be the very same object as numpy's NaN singleton.
        if not isinstance(cell, str):
            continue
        member = _first_orthologue(cell)
        tool_prot_length, tool_cds_count = _protein_length_and_cds_count(
            tool_species, to_key(member) if to_key else member
        )
        xs.append(wbps_prot_length)
        xes.append(wbps_cds_count)
        ys.append(tool_prot_length)
        yes.append(tool_cds_count)

In [9]:
def scatter_axes(idx, label, x, y, log=True):
    """Draw one scatter panel in a 1x3 subplot grid, with WBPS values on the x-axis.

    Args:
        idx: 1-based position of the panel within the grid.
        label: y-axis label (name of the tool being compared).
        x: values plotted on the x-axis.
        y: values plotted on the y-axis, paired with ``x``.
        log: when true, both axes use a logarithmic scale.

    The title reports the correlation coefficient of ``x`` and ``y`` (from
    ``np.corrcoef``), rounded to two decimals, and the number of points.
    """
    panel = plt.subplot(1, 3, idx)
    panel.scatter(x, y, linewidths=0.1, marker=".")
    if log:
        for set_scale in (panel.set_yscale, panel.set_xscale):
            set_scale('log')
    panel.set_xlabel("WBPS")
    panel.set_ylabel(label, rotation=90)
    correlation = round(np.corrcoef(x, y)[1, 0], 2)
    panel.set_title(f"r={correlation} (\N{GREEK CAPITAL LETTER SIGMA}={len(x)})")

In [10]:
# WBPS vs tool amino-acid counts, one log-log panel per tool.
for panel_idx, (tool_name, wbps_values, tool_values) in enumerate(
    (("Anno", x1, y1), ("BRAKER3", x2, y2), ("Helixer", x3, y3)), start=1
):
    scatter_axes(panel_idx, tool_name, wbps_values, tool_values)
plt.suptitle("Amino acid count correlations")
plt.tight_layout()
plt.savefig(os.path.join(plot_dir, "sman_prot_length_correlations.png"))
plt.show()

In [12]:
# WBPS vs tool CDS-feature counts, one linear-scale panel per tool.
for panel_idx, (tool_name, wbps_values, tool_values) in enumerate(
    (("Anno", x1e, y1e), ("BRAKER3", x2e, y2e), ("Helixer", x3e, y3e)), start=1
):
    scatter_axes(panel_idx, tool_name, wbps_values, tool_values, log=False)
plt.suptitle("Exon count correlations")
plt.tight_layout()
plt.savefig(os.path.join(plot_dir, "sman_exon_count_correlations.png"))
plt.show()

In [None]:
# statistics_pipeline()

# Assessing merged/split genes

In [5]:
def _suspicious_orthologues_vs_wbps(tool_col):
    """Run ``suspicious_orthologue_pipeline`` for one tool's orthogroup column against WBPS."""
    return suspicious_orthologue_pipeline(hog_df, wbps_col, tool_col, species_list, seq_id_map)


braker_merged, braker_split = _suspicious_orthologues_vs_wbps(braker_col)
anno_merged, anno_split = _suspicious_orthologues_vs_wbps(anno_col)
helixer_merged, helixer_split = _suspicious_orthologues_vs_wbps(helixer_col)

100%|██████████| 9600/9600 [01:31<00:00, 105.09it/s] 
100%|██████████| 9600/9600 [08:02<00:00, 19.91it/s] 
100%|██████████| 9600/9600 [03:13<00:00, 49.49it/s] 


In [14]:
# TODO Tabulate totals for each, and compare commonly merged/split transcripts between tools.
wbps_species = species_list.get_species_with_data_label(wbps_col)
# Collect the InterPro accessions (and their product descriptions) of every WBPS transcript
# that BRAKER3 appears to have merged. Note that this resets acc_product.
acc_product, acc_list = {}, []
for merged_id in braker_merged.keys():
    merged_transcript = wbps_species.db["transcript:" + merged_id]
    for accession, product in extract_accessions_from_transcript(merged_transcript):
        acc_product[accession] = product
        acc_list.append(accession)

method:InterPro accession:IPR011333 description:SKP1/BTB/POZ domain superfamily 
method:InterPro accession:IPR021777 description:SANT and BTB domain regulator of CSR, BTB domain 
method:InterPro accession:IPR045902 description:SANBR-like
No InterPro accessions found for transcript transcript:Smp_246020.1
method:InterPro accession:IPR001412 description:Aminoacyl-tRNA synthetase, class I, conserved site 
method:InterPro accession:IPR002300 description:Aminoacyl-tRNA synthetase, class Ia 
method:InterPro accession:IPR002302 description:Leucine-tRNA ligase 
method:InterPro accession:IPR009080 description:Aminoacyl-tRNA synthetase, class Ia, anticodon-binding 
method:InterPro accession:IPR014729 description:Rossmann-like alpha/beta/alpha sandwich fold


In [6]:
# Per tool: number of merged and split calls, and the share of that tool's genes involved.
for tool_name, data_label, merged, split in (
    ("BRAKER3", "Sman_braker3_LT", braker_merged, braker_split),
    ("Helixer", "Sman_helixer_LT", helixer_merged, helixer_split),
    ("Anno", "Sman_anno_LT", anno_merged, anno_split),
):
    tool_db = species_list.get_species_with_data_label(data_label).db
    num_genes = len(list(tool_db.all_features(featuretype="gene")))
    # Each merged entry is weighted twice — presumably because a merge involves two genes; verify.
    affected_pct = round(100*(len(split) + len(merged)*2)/num_genes, 2)
    print(f"{tool_name}: merged={len(merged)}, split={len(split)}, total={affected_pct}")



BRAKER3: merged=24, split=11, total=0.65
Helixer: merged=170, split=23, total=3.67
Anno: merged=209, split=26, total=3.47


In [7]:
# Total number of gene features in the BRAKER3 annotation.
braker3_db = species_list.get_species_with_data_label("Sman_braker3_LT").db
len(list(braker3_db.all_features(featuretype="gene")))

9092

# InterPro accession investigation

### BRAKER3

In [8]:
# NOTE: acc_product, acc_tally_shared and missed_transcripts are overwritten by each tool's run.
(
    acc_product,
    acc_tally_shared,
    acc_tally_missed_braker3,
    acc_tally_novel_braker3,
    missed_transcripts,
) = interpro_accession_pipeline(
    wbps_species.db, hog_df, wbps_col, braker_col, interproscan_dir, acc_product
)

# Find InterPro accessions occurring with significantly different frequency than in control (acc_tally_shared)
braker3_novel_results = fisher_exact_for_two_lists_of_accessions(
    acc_tally_novel_braker3, acc_tally_shared
)
braker3_missed_results = fisher_exact_for_two_lists_of_accessions(
    acc_tally_missed_braker3, acc_tally_shared
)

In [9]:
# Report accessions over-represented in BRAKER3's novel transcripts relative to shared ones.
interpro_accessions_in_novel_transcripts(
    acc_product,
    acc_tally_novel_braker3,
    braker3_novel_results,
    min_freq,
)

InterPro accessions occurring with significantly higher frequency in novel transcripts than in shared transcripts:
	IPR000009: Protein phosphatase 2A regulatory subunit PR55  (14 occurrences, 0 expected)
	IPR001461: Aspartic peptidase A1 family  (10 occurrences, 0 expected)
	IPR000217: Tubulin  (10 occurrences, 0 expected)
	IPR021109: Aspartic peptidase domain superfamily (11 occurrences, 0 expected)
	IPR002110: Ankyrin repeat  (47 occurrences, 2 expected)
	IPR001623: DnaJ domain  (20 occurrences, 1 expected)
	IPR003961: Fibronectin type III  (22 occurrences, 1 expected)
	IPR013087: Zinc finger C2H2-type  (31 occurrences, 4 expected)
	IPR036770: Ankyrin repeat-containing domain superfamily (10 occurrences, 2 expected)
	IPR013783: Immunoglobulin-like fold  (11 occurrences, 3 expected)
	IPR001680: WD40 repeat  (14 occurrences, 4 expected)

InterPro accessions that are completely missing from shared transcripts, with high frequency in novel transcripts:
	IPR002453: Beta tubulin  (13 occur

In [10]:
# Report accessions over-represented in WBPS transcripts that BRAKER3 missed.
interpro_accessions_in_missed_transcripts(
    acc_product,
    acc_tally_missed_braker3,
    acc_tally_novel_braker3,
    braker3_missed_results,
    braker3_novel_results,
    min_freq,
)

InterPro accessions that are completely missing from shared transcripts, with high frequency in missed transcripts:
	IPR041036: Glycoside hydrolase family 5, C-terminal domain (12 occurrences)

InterPro accessions occurring with significantly higher frequency in missed transcripts than in shared transcripts:
	IPR001547: Glycoside hydrolase, family 5  (11 occurrences, 0 expected)
	IPR009003: Peptidase S1, PA clan  (21 occurrences, 0 expected)
	IPR001254: Serine proteases, trypsin domain  (20 occurrences, 0 expected)
	IPR043504: Peptidase S1, PA clan, chymotrypsin-like fold (20 occurrences, 0 expected)
	IPR013780: Glycosyl hydrolase, all-beta  (13 occurrences, 1 expected)
	IPR017853: Glycoside hydrolase superfamily  (14 occurrences, 1 expected)
	IPR000276: G protein-coupled receptor, rhodopsin-like  (26 occurrences, 4 expected)
		IPR000276 also significantly more frequent in novel transcripts (5 occurrences, 2 expected)
	IPR017452: GPCR, rhodopsin-like, 7TM (30 occurrences, 4 expected)



In [11]:
# List the missed transcripts that carry one of the over-represented accessions.
missed_transcripts_with_significantly_more_frequent_accessions(
    wbps_species.db,
    missed_transcripts,
    acc_tally_missed_braker3,
    braker3_missed_results,
    min_freq,
)

SM_V10_1 - Smp_027940.1 - {'IPR017452', 'IPR000276'}
SM_V10_1 - Smp_126730.1 - {'IPR017452', 'IPR000276'}
SM_V10_1 - Smp_170610.1 - {'IPR017452', 'IPR000276'}
SM_V10_1 - Smp_178420.1 - {'IPR017452'}
SM_V10_1 - Smp_315690.2 - {'IPR001547', 'IPR017853', 'IPR013780'}
SM_V10_1 - Smp_316850.1 - {'IPR017452', 'IPR000276'}
SM_V10_1 - Smp_317470.2 - {'IPR001547', 'IPR017853', 'IPR013780'}
SM_V10_1 - Smp_319310.2 - {'IPR017452', 'IPR000276'}
SM_V10_1 - Smp_324100.1 - {'IPR017452'}
SM_V10_1 - Smp_325900.1 - {'IPR017452'}
SM_V10_Z - Smp_162980.1 - {'IPR000276'}
SM_V10_Z - Smp_167870.1 - {'IPR017452', 'IPR000276'}
SM_V10_Z - Smp_204060.1 - {'IPR017452', 'IPR000276'}
SM_V10_Z - Smp_303010.1 - {'IPR001547', 'IPR017853', 'IPR013780'}
SM_V10_Z - Smp_322380.1 - {'IPR017452'}
SM_V10_Z - Smp_323980.1 - {'IPR017452'}
SM_V10_3 - Smp_083940.1 - {'IPR017452', 'IPR000276'}
SM_V10_3 - Smp_091950.1 - {'IPR017452', 'IPR000276'}
SM_V10_3 - Smp_117340.1 - {'IPR017452', 'IPR000276'}
SM_V10_3 - Smp_177720.1 - {'IPR0

### Helixer

In [12]:
# NOTE: acc_product, acc_tally_shared and missed_transcripts are overwritten by each tool's run.
(
    acc_product,
    acc_tally_shared,
    acc_tally_missed_helixer,
    acc_tally_novel_helixer,
    missed_transcripts,
) = interpro_accession_pipeline(
    wbps_species.db, hog_df, wbps_col, helixer_col, interproscan_dir, acc_product
)

# Find InterPro accessions occurring with significantly different frequency than in control (acc_tally_shared)
helixer_novel_results = fisher_exact_for_two_lists_of_accessions(
    acc_tally_novel_helixer, acc_tally_shared
)
helixer_missed_results = fisher_exact_for_two_lists_of_accessions(
    acc_tally_missed_helixer, acc_tally_shared
)

In [13]:
# Report accessions over-represented in Helixer's novel transcripts relative to shared ones.
interpro_accessions_in_novel_transcripts(
    acc_product,
    acc_tally_novel_helixer,
    helixer_novel_results,
    min_freq,
)

InterPro accessions occurring with significantly higher frequency in novel transcripts than in shared transcripts:
	IPR000009: Protein phosphatase 2A regulatory subunit PR55  (12 occurrences, 0 expected)
	IPR001461: Aspartic peptidase A1 family  (13 occurrences, 0 expected)
	IPR003961: Fibronectin type III  (72 occurrences, 1 expected)
	IPR021109: Aspartic peptidase domain superfamily (13 occurrences, 0 expected)
	IPR000217: Tubulin  (10 occurrences, 0 expected)
	IPR000008: C2 domain  (18 occurrences, 1 expected)
	IPR013098: Immunoglobulin I-set  (14 occurrences, 1 expected)
	IPR001623: DnaJ domain  (12 occurrences, 1 expected)
	IPR013783: Immunoglobulin-like fold  (39 occurrences, 5 expected)
	IPR002126: Cadherin-like  (13 occurrences, 2 expected)
	IPR003599: Immunoglobulin subtype  (18 occurrences, 2 expected)
	IPR036116: Fibronectin type III superfamily  (11 occurrences, 2 expected)
	IPR007110: Immunoglobulin-like domain  (20 occurrences, 3 expected)
	IPR013087: Zinc finger C2H2-typ

In [14]:
# Report accessions over-represented in WBPS transcripts that Helixer missed.
# NOTE(review): this call uses a threshold of 1 while the BRAKER3 and Anno missed-transcript
# reports pass the shared min_freq — confirm the difference is intentional.
interpro_accessions_in_missed_transcripts(
    acc_product,
    acc_tally_missed_helixer,
    acc_tally_novel_helixer,
    helixer_missed_results,
    helixer_novel_results,
    min_freq=1,
)

InterPro accessions that are completely missing from shared transcripts, with high frequency in missed transcripts:
	IPR002453: Beta tubulin  (7 occurrences)
	IPR013838: Beta tubulin, autoregulation binding site  (6 occurrences)
	IPR026183: Taxilin family (4 occurrences)
	IPR001951: Histone H4  (3 occurrences)
	IPR019809: Histone H4, conserved site  (3 occurrences)
	IPR035425: CENP-T/Histone H4, histone fold (3 occurrences)
	IPR011331: Large ribosomal subunit protein eL37/eL43  (2 occurrences)
	IPR029040: RNA polymerase subunit RPABC4/transcription elongation factor Spt4  (2 occurrences)
	IPR001971: Small ribosomal subunit protein uS11  (2 occurrences)
	IPR005336: Mitochondrial pyruvate carrier (2 occurrences)
	IPR016487: Sm-like protein Lsm6/SmF  (2 occurrences)
	IPR005710: Ribosomal protein S4/S9, eukaryotic/archaeal  (2 occurrences)
	IPR018079: Small ribosomal subunit protein uS4, conserved site  (2 occurrences)
	IPR000892: Small ribosomal subunit protein eS26  (2 occurrences)
	IPR0

In [15]:
from collections import Counter
# Scratch check: how often IPR017452 occurs in the Helixer-missed tally and in the shared tally.
# Only the value of a cell's final expression is displayed, so the first count below is
# computed and then discarded; the number shown is the count in acc_tally_shared.
Counter(acc_tally_missed_helixer)["IPR017452"]
# helixer_missed_results["as_expected"]["IPR017452"]
Counter(acc_tally_shared)["IPR017452"]
# helixer_novel_results["as_expected"]["IPR017452"]

104

### Anno

In [16]:
# NOTE: acc_product, acc_tally_shared and missed_transcripts are overwritten by each tool's run.
(
    acc_product,
    acc_tally_shared,
    acc_tally_missed_anno,
    acc_tally_novel_anno,
    missed_transcripts,
) = interpro_accession_pipeline(
    wbps_species.db, hog_df, wbps_col, anno_col, interproscan_dir, acc_product
)

# Find InterPro accessions occurring with significantly different frequency than in control (acc_tally_shared)
anno_novel_results = fisher_exact_for_two_lists_of_accessions(
    acc_tally_novel_anno, acc_tally_shared
)
anno_missed_results = fisher_exact_for_two_lists_of_accessions(
    acc_tally_missed_anno, acc_tally_shared
)

In [17]:
# Report accessions over-represented in Anno's novel transcripts relative to shared ones.
# NOTE(review): this call uses a threshold of 1 while the BRAKER3 and Helixer novel-transcript
# reports pass the shared min_freq — confirm the difference is intentional.
interpro_accessions_in_novel_transcripts(
    acc_product,
    acc_tally_novel_anno,
    anno_novel_results,
    min_freq=1,
)

InterPro accessions occurring with significantly higher frequency in novel transcripts than in shared transcripts:
	IPR008335: Eukaryotic molybdopterin oxidoreductase  (9 occurrences, 0 expected)
	IPR015925: Ryanodine/Inositol 1,4,5-trisphosphate receptor  (3 occurrences, 0 expected)
	IPR000477: Reverse transcriptase domain (10 occurrences, 0 expected)
	IPR005446: Voltage-dependent calcium channel, L-type, alpha-1 subunit  (4 occurrences, 0 expected)
	IPR023509: D-aminoacyl-tRNA deacylase-like superfamily (2 occurrences, 0 expected)
	IPR036374: Oxidoreductase, molybdopterin-binding domain superfamily  (2 occurrences, 0 expected)
	IPR002156: Ribonuclease H domain  (2 occurrences, 0 expected)
	IPR011030: Lipovitellin-phosvitin complex, superhelical domain  (2 occurrences, 0 expected)
	IPR001747: Vitellogenin, N-terminal  (2 occurrences, 0 expected)
	IPR006586: ADAM, cysteine-rich domain  (2 occurrences, 0 expected)
	IPR005016: Serine incorporator/TMS membrane protein (2 occurrences, 0 ex

In [18]:
# Report accessions over-represented in WBPS transcripts that Anno missed.
interpro_accessions_in_missed_transcripts(
    acc_product,
    acc_tally_missed_anno,
    acc_tally_novel_anno,
    anno_missed_results,
    anno_novel_results,
    min_freq,
)

InterPro accessions that are completely missing from shared transcripts, with high frequency in missed transcripts:

InterPro accessions occurring with significantly higher frequency in missed transcripts than in shared transcripts:
	IPR009003: Peptidase S1, PA clan  (16 occurrences, 0 expected)
	IPR001254: Serine proteases, trypsin domain  (15 occurrences, 0 expected)
	IPR043504: Peptidase S1, PA clan, chymotrypsin-like fold (15 occurrences, 0 expected)
	IPR001372: Dynein light chain, type 1/2  (10 occurrences, 1 expected)
	IPR037177: Dynein light chain superfamily (10 occurrences, 1 expected)
	IPR017452: GPCR, rhodopsin-like, 7TM (18 occurrences, 3 expected)
	IPR000276: G protein-coupled receptor, rhodopsin-like  (13 occurrences, 3 expected)

InterPro accessions occurring as expected in missed transcripts with high frequency:

InterPro accessions occurring less frequently in missed transcripts than expected:
	IPR013083: Zinc finger, RING/FYVE/PHD-type  (1 occurrences, 6 expected)
	IP

### General

Find accessions that occur significantly more often than expected in transcripts missed by **all** tools.

In [19]:
# Tally accessions of WBPS transcripts found by no tool versus those found by at least one.
_all_tool_cols = [braker_col, helixer_col, anno_col]
acc_tally_no_tool, acc_tally_one_plus_tool_shared = interpro_accession_pipeline_all_tools(
    wbps_species.db, hog_df, wbps_col, _all_tool_cols
)

no_tools_results = fisher_exact_for_two_lists_of_accessions(
    acc_tally_no_tool, acc_tally_one_plus_tool_shared
)

In [20]:
# Report accessions over-represented in transcripts that every tool missed.
interpro_accessions_frequently_missed_by_all_tools(
    acc_product,
    acc_tally_no_tool,
    no_tools_results,
)

InterPro accessions occurring with significantly higher frequency in transcripts that were missed by all tools, than in transcripts shared by at least 1 tool:
	IPR001254: Serine proteases, trypsin domain  (15 occurrences, 0 expected)
	IPR043504: Peptidase S1, PA clan, chymotrypsin-like fold (15 occurrences, 0 expected)
	IPR009003: Peptidase S1, PA clan  (15 occurrences, 0 expected)
	IPR004254: AdipoR/Haemolysin-III-related (2 occurrences, 0 expected)

InterPro accessions that are completely missing from transcripts shared by at least 1 tool, but present in transcripts that were missed by all tools:
	IPR019404: Mediator complex, subunit Med11 (2 occurrences)

InterPro accessions occurring as expected in transcripts that were missed by all tools.
	IPR000276: G protein-coupled receptor, rhodopsin-like  (1 occurrences, 0 expected)
	IPR017452: GPCR, rhodopsin-like, 7TM (1 occurrences, 0 expected)


Find accessions that each tool misses significantly more often than expected.

In [21]:
# Per-tool Fisher test results and missed-accession tallies, keyed by tool display name.
_missed_results_by_tool = {
    "BRAKER3": braker3_missed_results,
    "Helixer": helixer_missed_results,
    "Anno": anno_missed_results,
}
_missed_tallies_by_tool = {
    "BRAKER3": acc_tally_missed_braker3,
    "Helixer": acc_tally_missed_helixer,
    "Anno": acc_tally_missed_anno,
}
interpro_accessions_frequently_missed_by_each_tool(
    acc_product, _missed_results_by_tool, _missed_tallies_by_tool
)

IPR043504: Peptidase S1, PA clan, chymotrypsin-like fold
46.29 times more likely than with BRAKER3 (20 occurrences, 0 expected)
33.44 times more likely than with Helixer (16 occurrences, 0 expected)
34.35 times more likely than with Anno (15 occurrences, 0 expected)
~~~~~~~~~~
IPR017853: Glycoside hydrolase superfamily 
9.92 times more likely than with BRAKER3 (14 occurrences, 1 expected)
12.03 times more likely than with Helixer (13 occurrences, 1 expected)
5.2 times more likely than with Anno (6 occurrences, 1 expected)
~~~~~~~~~~
IPR021712: Uncharacterised domain UPF0506
7.32 times more likely than with BRAKER3 (4 occurrences, 1 expected)
9.92 times more likely than with Helixer (4 occurrences, 0 expected)
53.24 times more likely than with Anno (9 occurrences, 0 expected)
~~~~~~~~~~
IPR001254: Serine proteases, trypsin domain 
46.29 times more likely than with BRAKER3 (20 occurrences, 0 expected)
33.44 times more likely than with Helixer (16 occurrences, 0 expected)
34.35 times more

In [22]:
from collections import Counter 

# Scratch checks around the rhodopsin-like GPCR accession IPR000276. Only the value of a cell's
# final expression is displayed, so the first two expressions are evaluated and discarded
# (the lookup into "as_expected" would still raise KeyError if the accession were absent).
{acc:acc_product[acc] for acc in acc_tally_one_plus_tool_shared if "rhodopsin" in acc_product[acc].lower()}
helixer_missed_results["as_expected"]["IPR000276"]
# Displayed value: occurrences of IPR000276 in the BRAKER3-missed tally.
Counter(acc_tally_missed_braker3)["IPR000276"]

26

In [23]:
import re

# Words shorter than this are ignored when comparing product descriptions.
MIN_WORD_LEN = 5
tools_missed_results = {
    "BRAKER3": braker3_missed_results,
    "Helixer": helixer_missed_results,
    "Anno": anno_missed_results
}
tools_novel_results = {
    "BRAKER3": braker3_novel_results,
    "Helixer": helixer_novel_results,
    "Anno": anno_novel_results
}
acc_tally_missed_tools = {
    "BRAKER3": acc_tally_missed_braker3,
    "Helixer": acc_tally_missed_helixer,
    "Anno": acc_tally_missed_anno
}
acc_tally_novel_tools = {
    "BRAKER3": acc_tally_novel_braker3,
    "Helixer": acc_tally_novel_helixer,
    "Anno": acc_tally_novel_anno
}


def _more_frequent_in_every_tool(results_by_tool):
    """Return the accessions listed under "more_frequent" for every tool in ``results_by_tool``."""
    return set.intersection(
        *[set(results["more_frequent"].keys()) for results in results_by_tool.values()]
    )


commonly_missed_accs = _more_frequent_in_every_tool(tools_missed_results)
# Loop-invariant: build the novel-side intersection once instead of once per missed accession.
commonly_novel_accs = _more_frequent_in_every_tool(tools_novel_results)

# all_words: every sufficiently long word from the descriptions of commonly missed accessions.
# seen_words: the subset that also appears in the description of a commonly novel accession.
seen_words = set()
all_words = set()
for acc in commonly_missed_accs:
    prod = acc_product[acc]
    words = [w for w in re.split(r'\W', prod.lower()) if w and len(w) >= MIN_WORD_LEN]
    all_words.update(words)
    # Check if words occur more frequently in novel-only accessions
    for novel_acc in commonly_novel_accs:
        novel_prod = acc_product[novel_acc]
        novel_prod_lower = novel_prod.lower()
        for w in words:
            # Substring match, so e.g. a word also matches its plural form.
            if w in novel_prod_lower:
                print(f"{w} appears in {novel_acc}: {novel_prod}")
                seen_words.add(w)

peptidase appears in IPR033121: Peptidase family A1 domain
peptidase appears in IPR001969: Aspartic peptidase, active site 
peptidase appears in IPR021109: Aspartic peptidase domain superfamily
peptidase appears in IPR001461: Aspartic peptidase A1 family 
peptidase appears in IPR000718: Peptidase M13 
peptidase appears in IPR018497: Peptidase M13, C-terminal domain 
peptidase appears in IPR001577: Peptidase M8, leishmanolysin
superfamily appears in IPR043502: DNA/RNA polymerase superfamily
superfamily appears in IPR021109: Aspartic peptidase domain superfamily
domain appears in IPR033121: Peptidase family A1 domain
domain appears in IPR000772: Ricin B, lectin domain 
domain appears in IPR002219: Protein kinase C-like, phorbol ester/diacylglycerol-binding domain 
domain appears in IPR021109: Aspartic peptidase domain superfamily
domain appears in IPR002068: Alpha crystallin/Hsp20 domain 
domain appears in IPR001623: DnaJ domain 
domain appears in IPR043128: Reverse transcriptase/Diguany

In [24]:
# Words from commonly missed descriptions that never appear in a commonly novel description.
all_words - seen_words

{'chymotrypsin',
 'glycoside',
 'glycosyl',
 'hydrolase',
 'proteases',
 'serine',
 'trypsin',
 'uncharacterised',
 'upf0506'}

# Novel orthologues

In [25]:
print("Novel orthogroups")
total_orthogroups = hog_df.shape[0]
absent_from_wbps = hog_df[wbps_col].isna()


def _count_novel_orthogroups(tool_col):
    """Number of orthogroups with a member in ``tool_col`` but none in the WBPS column."""
    return hog_df[absent_from_wbps & hog_df[tool_col].notna()].shape[0]


braker3_novel_ogs = _count_novel_orthogroups(braker_col)
helixer_novel_ogs = _count_novel_orthogroups(helixer_col)
anno_novel_ogs = _count_novel_orthogroups(anno_col)
for tool_name, novel_ogs in (
    ("BRAKER3", braker3_novel_ogs),
    ("Helixer", helixer_novel_ogs),
    ("Anno", anno_novel_ogs),
):
    print(f"{tool_name}: {novel_ogs} ({round(100*novel_ogs/total_orthogroups, 2)}%)")

Novel orthogroups
BRAKER3: 274 (2.85%)
Helixer: 407 (4.24%)
Anno: 516 (5.38%)


In [26]:
# novel_orthologue_pipeline(hog_df, wbps_col, anno_col, species_list)
# novel_orthologue_pipeline(hog_df, wbps_col, braker_col, species_list)
# novel_orthologue_pipeline(hog_df, wbps_col, helixer_col, species_list)
anno_esm_means = extract_esm_means("data/from_MARS/Sman_esm_pLDDTs_anno.txt").values()
braker3_esm_means = extract_esm_means("data/from_MARS/Sman_esm_pLDDTs_braker3.txt").values()
helixer_esm_means = extract_esm_means("data/from_MARS/Sman_esm_pLDDTs_helixer.txt").values()

# Convert each set of values to floats once; the means and the plots both reuse these lists.
esm_means_by_tool = {
    "anno": [float(value) for value in anno_esm_means],
    "braker3": [float(value) for value in braker3_esm_means],
    "helixer": [float(value) for value in helixer_esm_means],
}

for tool_values in esm_means_by_tool.values():
    print(statistics.mean(tool_values))


def boxplot_axes(idx, label, data, ylim=(0, 100), ncols=4):
    """Draw one boxplot panel in a 1 x ``ncols`` subplot grid.

    NOTE: this redefinition replaces any earlier ``boxplot_axes`` in the kernel. Here ``data``
    is the sequence of values itself, not a mapping indexed by ``label``.

    Args:
        idx: 1-based position of the panel within the grid.
        label: panel title (shown upper-cased).
        data: sequence of numeric values to plot.
        ylim: ``(bottom, top)`` y-axis limits; a falsy value leaves the limits untouched.
        ncols: number of columns in the grid (defaults to the previous fixed value of 4).

    The mean of ``data``, rounded to two decimals, is shown as the x-axis label.
    """
    ax = plt.subplot(1, ncols, idx)
    ax.boxplot(data)
    if ylim:
        ax.set_ylim(top=ylim[1], bottom=ylim[0])
    ax.grid(which="both")
    ax.set_title(label.upper())
    ax.set_xticklabels([])
    ax.set_xlabel(round(statistics.mean(data), 2))


# Size the grid to the number of tools so no empty panel slot is left in the figure.
for panel_idx, (tool_label, tool_values) in enumerate(esm_means_by_tool.items(), start=1):
    boxplot_axes(idx=panel_idx, label=tool_label, data=tool_values, ncols=len(esm_means_by_tool))
plt.show()


# anno_esm_means

54.20466926070039
57.278228782287826
54.64353233830846


In [27]:
import pandas as pd

# Column names applied to the pLDDT summary CSVs when they are read.
cols = ("fn", "mean", "median", "stdev", "var", "max", "min", "perc_confident")
df = pd.read_csv("data/from_MARS/pLDDT_sman.csv", names=cols)

mean_of_means = df["mean"].mean()
print("Mean of means: {}".format(mean_of_means))
# A row counts as "Confident" when its mean is at least 70.
wbps_confident_rows = df[df["mean"] >= 70].shape[0]
print('% that are "Confident": {}'.format(100*wbps_confident_rows/df.shape[0]))

# df.count()

Mean of means: 70.15563125399191
% that are "Confident": 54.939322972109856


In [28]:
# Share of each tool's ESM mean pLDDT values that reach the "Confident" threshold of 70.
for tool_name, plddt_means in (
    ("BRAKER3", braker3_esm_means),
    ("Anno", anno_esm_means),
    ("Helixer", helixer_esm_means),
):
    confident_count = sum(1 for p in plddt_means if float(p) >= 70)
    print(f'% {tool_name} that are "Confident": {100*confident_count / len(plddt_means)}')

% BRAKER3 that are "Confident": 31.73431734317343
% Anno that are "Confident": 24.90272373540856
% Helixer that are "Confident": 21.890547263681594


In [29]:

# Per-tool pLDDT summary tables, read with the same column names as the WBPS table.
df_anno, df_braker3, df_helixer = (
    pd.read_csv(csv_path, names=cols)
    for csv_path in (
        "data/from_MARS/pLDDT_sman_anno.csv",
        "data/from_MARS/pLDDT_sman_braker3.csv",
        "data/from_MARS/pLDDT_sman_helixer.csv",
    )
)

In [30]:
# Mean of the perc_confident column for each annotation source.
for source_name, plddt_table in (
    ("WBPS", df),
    ("BRAKER3", df_braker3),
    ("Anno", df_anno),
    ("Helixer", df_helixer),
):
    print(f'Mean of {source_name} % that are "Confident" residues: {plddt_table["perc_confident"].mean()}')


Mean of WBPS % that are "Confident" residues: 57.85118160527997
Mean of BRAKER3 % that are "Confident" residues: 35.38376383763838
Mean of Anno % that are "Confident" residues: 30.747081712062258
Mean of Helixer % that are "Confident" residues: 30.46153846153846


In [31]:
# Anno rows with mean above 50 but perc_confident of 0, least variable first.
anno_mean_above_50 = df_anno["mean"] > 50
anno_none_confident = df_anno["perc_confident"] == 0
df_anno[anno_mean_above_50 & anno_none_confident].sort_values("stdev")

Unnamed: 0,fn,mean,median,stdev,var,max,min,perc_confident
319,transcript:ENSSMAT00000005813.pdb,56,57,6,36,65,36,0
186,transcript:ENSSMAT00000008498.pdb,55,55,7,46,69,38,0
215,transcript:ENSSMAT00000034886.pdb,56,58,7,44,65,42,0
80,transcript:ENSSMAT00000012469.pdb,54,54,8,65,67,36,0
47,transcript:ENSSMAT00000034121.pdb,53,55,9,73,67,25,0
146,transcript:ENSSMAT00000026481.pdb,51,54,10,100,66,34,0
242,transcript:ENSSMAT00000028062.pdb,51,47,10,93,69,34,0
362,transcript:ENSSMAT00000030008.pdb,51,52,11,113,69,30,0


In [32]:
# BRAKER3 rows with mean above 50 but perc_confident of 0, least variable first.
braker3_mean_above_50 = df_braker3["mean"] > 50
braker3_none_confident = df_braker3["perc_confident"] == 0
df_braker3[braker3_mean_above_50 & braker3_none_confident].sort_values("stdev")

Unnamed: 0,fn,mean,median,stdev,var,max,min,perc_confident
46,g2824.t1.pdb,55,55,3,12,62,49,0
15,g2879.t1.pdb,57,57,4,17,68,50,0
138,g2535.t1.pdb,61,62,5,25,70,49,0
117,g5842.t1.pdb,54,55,7,45,65,32,0
54,g1749.t1.pdb,51,52,11,113,69,30,0


In [33]:
# Helixer rows with mean below 60 yet more than 40 in perc_confident.
helixer_mean_below_60 = df_helixer["mean"] < 60
helixer_over_40_confident = df_helixer["perc_confident"] > 40
df_helixer[helixer_mean_below_60 & helixer_over_40_confident]

Unnamed: 0,fn,mean,median,stdev,var,max,min,perc_confident
363,Schistosoma_mansoni_SM_V10_Z_001959.1.pdb,58,61,22,474,90,22,41


In [34]:
# Helixer rows with mean above 50 but perc_confident of 0, least variable first.
helixer_mean_above_50 = df_helixer["mean"] > 50
helixer_none_confident = df_helixer["perc_confident"] == 0
df_helixer[helixer_mean_above_50 & helixer_none_confident].sort_values("stdev")

Unnamed: 0,fn,mean,median,stdev,var,max,min,perc_confident
325,Schistosoma_mansoni_SM_V10_4_001163.1.pdb,53,53,3,10,59,46,0
237,Schistosoma_mansoni_SM_V10_6_000170.1.pdb,59,58,5,20,67,52,0
282,Schistosoma_mansoni_SM_V10_2_000542.1.pdb,51,50,6,36,65,41,0
304,Schistosoma_mansoni_SM_V10_2_000788.1.pdb,51,49,6,36,62,41,0
378,Schistosoma_mansoni_SM_V10_5_000309.1.pdb,51,53,6,35,61,36,0
21,Schistosoma_mansoni_SM_V10_1_000685.1.pdb,53,53,7,55,69,42,0
45,Schistosoma_mansoni_SM_V10_4_000066.1.pdb,57,59,8,69,69,19,0
48,Schistosoma_mansoni_SM_V10_4_000006.1.pdb,51,48,8,62,64,39,0
316,Schistosoma_mansoni_SM_V10_2_001157.1.pdb,51,50,8,59,65,35,0
332,Schistosoma_mansoni_SM_V10_2_000183.1.pdb,53,55,8,60,66,35,0
