In [40]:
import os.path
from collections import defaultdict
import contextlib
import json
import statistics
from collections import Counter
from operator import itemgetter

import matplotlib
from matplotlib import pyplot as plt
import numpy as np
from tqdm import tqdm

from orthologue_analysis.orthogroups import init_orthogroup_df
from orthologue_analysis.species import MansoniCladeFromTool, SpeciesList
from orthologue_analysis.utils import SequenceIDMapping, orthofinder_paths
from reannotation.analysis import (
    interpro_accessions_frequently_missed_by_all_tools,
    interpro_accessions_frequently_missed_by_each_tool,
    interpro_accessions_in_novel_transcripts,
    interpro_accessions_in_missed_transcripts,
    missed_transcripts_with_significantly_more_frequent_accessions
)
from reannotation.constants import ASPARTIC_PEPTIDASE_SUPERFAMILY
from reannotation.pipelines import (
    interpro_accession_pipeline_all_tools,
    interpro_accession_pipeline,
    suspicious_orthologue_pipeline,
    novel_orthologue_pipeline
)
from reannotation.statistics import fisher_exact_for_two_lists_of_accessions, count_transcripts_with_accession
from reannotation.utils import extract_accessions_from_transcript
from utils.esm import extract_esm_means
from utils.generic import flatten_nested_dict, flatten_list_to_set
from utils.gffutils import init_db

matplotlib.use("TkAgg")


results_label = "Results_Aug05"
wbps_ann_path = "data/from_WBPS/schistosoma_mansoni.PRJEA36577.WBPS19.annotations.gff3"
braker_path = "data/from_MARS/Schistosoma_mansoni_braker3_full.gff3"
helixer_path = "data/from_MARS/Schistosoma_mansoni_helixer_full.gff3"
anno_path = "data/from_EBI/schistosoma_mansoni_gca000000000v1.gff3"
db = init_db(wbps_ann_path, "db/Sman_wbps.db")
of = orthofinder_paths(results_label, subdir="Orthogroups")

wbps_col = "Sman_LT"
braker_col = "Sman_braker3_LT"
helixer_col = "Sman_helixer_LT"
anno_col = "Sman_anno_LT"

og_df = init_orthogroup_df(of["orthogroups"])
no_og_df = init_orthogroup_df(of["orthogroups_unassigned_genes"])
seq_id_map = SequenceIDMapping(of["wd"])
mars_data_dir = os.path.join("data", "from_MARS", "")
ebi_data_dir = os.path.join("data", "from_EBI", "")

species_list = SpeciesList([
    MansoniCladeFromTool("mansoni", data_dir=mars_data_dir, data_label="Sman_LT", prot_filename_suffix=".fa"),
    MansoniCladeFromTool("mansoni_braker3_reann", data_dir=mars_data_dir, data_label="Sman_braker3_LT", prot_filename_suffix=".fa"),
    MansoniCladeFromTool("mansoni_helixer_reann", data_dir=mars_data_dir, data_label="Sman_helixer_LT", prot_filename_suffix=".fa"),
    MansoniCladeFromTool("mansoni_anno_reann", data_dir=ebi_data_dir, data_label="Sman_anno_LT", prot_filename_suffix=".fa")],
    wd_path=of["wd"],
    load_blast=True
)

wbps_species = species_list.get_species_with_data_label(wbps_col)
braker_species = species_list.get_species_with_data_label(braker_col)
helixer_species = species_list.get_species_with_data_label(helixer_col)
anno_species = species_list.get_species_with_data_label(anno_col)

with open("data/acc_product.json", "r") as f:
    acc_product = json.loads(f.read())

min_freq = 10

interproscan_dir = "data/from_MARS/interproscan/sman"



loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug05/Blast0_1.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug05/Blast0_2.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug05/Blast0_3.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug05/Blast0_2.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug05/Blast0_3.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug05/Blast0_1.txt...


# General Statistics

In [2]:
from utils.generic import makedirs
plot_dir = "plots/reannotation/"
makedirs(plot_dir)

In [3]:
print("Shared orthologues with WBPS:")
print("WBPS: {}".format(len(og_df[~og_df[wbps_col].isna()])))
print("BRAKER3: {}".format(len(og_df[~og_df[wbps_col].isna() & ~og_df[braker_col].isna()])))
print("Helixer: {}".format(len(og_df[~og_df[wbps_col].isna() & ~og_df[helixer_col].isna()])))
print("Anno: {}".format(len(og_df[~og_df[wbps_col].isna() & ~og_df[anno_col].isna()])))

Shared orthologues with WBPS:
WBPS: 8651
BRAKER3: 8003
Helixer: 8029
Anno: 8170


In [4]:
print("Total transcripts that have an orthologue with WBPS:")
print("WBPS: {}".format(og_df[~og_df[wbps_col].isna()][wbps_col].str.split(",").str.len().sum()))
print("BRAKER3: {}".format(og_df[~og_df[wbps_col].isna() & ~og_df[braker_col].isna()][braker_col].str.split(",").str.len().sum()))
print("Helixer: {}".format(og_df[~og_df[wbps_col].isna() & ~og_df[helixer_col].isna()][helixer_col].str.split(",").str.len().sum()))
print("Anno: {}".format(og_df[~og_df[wbps_col].isna() & ~og_df[anno_col].isna()][anno_col].str.split(",").str.len().sum()))

Total transcripts that have an orthologue with WBPS:
WBPS: 9427
BRAKER3: 8584
Helixer: 8737
Anno: 10078


In [5]:
print("Unique orthologues:")
wbps_na = og_df[wbps_col].isna()
braker_na = og_df[braker_col].isna()
helixer_na = og_df[helixer_col].isna()
anno_na = og_df[anno_col].isna()
novel_ogs = og_df[wbps_na & (~braker_na | ~helixer_na | ~anno_na)]
print(f"WBPS only: {len(og_df[~wbps_na & braker_na & helixer_na & anno_na])}")
print(f"BRAKER only: {len(og_df[wbps_na & ~braker_na & helixer_na & anno_na])}")
print(f"BRAKER + Helixer: {len(og_df[wbps_na & ~braker_na & ~helixer_na & anno_na])}")
print(f"BRAKER + Helixer + Anno: {len(og_df[wbps_na & ~braker_na & ~helixer_na & ~anno_na])}")
print(f"Helixer only: {len(og_df[wbps_na & braker_na & ~helixer_na & anno_na])}")
print(f"Helixer + Anno: {len(og_df[wbps_na & braker_na & ~helixer_na & ~anno_na])}")
print(f"Anno only: {len(og_df[wbps_na & braker_na & helixer_na & ~anno_na])}")
print(f"Anno + BRAKER: {len(og_df[wbps_na & ~braker_na & helixer_na & ~anno_na])}")
print(f"Any automated tool only: {len(novel_ogs)}")

Unique orthologues:
WBPS only: 17
BRAKER only: 19
BRAKER + Helixer: 51
BRAKER + Helixer + Anno: 56
Helixer only: 24
Helixer + Anno: 126
Anno only: 100
Anno + BRAKER: 95
Any automated tool only: 471


In [6]:
def count_all_mRNA_exons(label):
    count = []
    sp = species_list.get_species_with_data_label(label)
    for t in sp.db.all_features(featuretype="mRNA"):
        count.append(len(list(sp.db.children(t, featuretype="exon"))))
    return count


exon_count = defaultdict(list)
exon_count["wbps"] = count_all_mRNA_exons(wbps_col)
exon_count["anno"] = count_all_mRNA_exons(anno_col)
exon_count["braker"] = count_all_mRNA_exons(braker_col)
exon_count["helixer"] = count_all_mRNA_exons(helixer_col)

In [7]:
def count_all_mRNA_amino_acids(label):
    count = []
    sp = species_list.get_species_with_data_label(label)
    for t in sp.db.all_features(featuretype="mRNA"):
        cds_exons = list(sp.db.children(t, featuretype="CDS"))
        count.append(sp.get_amino_acid_count(cds_exons))
    return count

protein_lengths = defaultdict(list)
protein_lengths["wbps"] = count_all_mRNA_amino_acids(wbps_col)
protein_lengths["anno"] = count_all_mRNA_amino_acids(anno_col)
protein_lengths["braker"] = count_all_mRNA_amino_acids(braker_col)
protein_lengths["helixer"] = count_all_mRNA_amino_acids(helixer_col)

In [10]:
def boxplot_axes(idx, label, data, ylim=(0, 25)):
    ax = plt.subplot(1, 4, idx)
    ax.boxplot(data[label])
    if ylim:
        ax.set_ylim(top=ylim[1], bottom=ylim[0])
    ax.grid(which="both")
    ax.set_title(label.upper())
    ax.set_xticklabels([])
    ax.set_xlabel(round(statistics.mean(data[label]), 2))

boxplot_axes(1, "wbps", exon_count)
boxplot_axes(2, "anno", exon_count)
boxplot_axes(3, "braker", exon_count)
boxplot_axes(4, "helixer", exon_count)
plt.suptitle("Exon counts")
plt.text(-1.05, -1.7, 'Means')
plt.savefig(os.path.join(plot_dir, "sman_exon_count_boxplots.png"))
plt.show()

In [11]:
boxplot_axes(1, "wbps", protein_lengths, ylim=(0, 3000))
boxplot_axes(2, "anno", protein_lengths, ylim=(0, 3000))
boxplot_axes(3, "braker", protein_lengths, ylim=(0, 3000))
boxplot_axes(4, "helixer", protein_lengths, ylim=(0, 3000))
plt.suptitle("Amino acid counts")
plt.text(-1.05, -1.7, 'Means')
plt.savefig(os.path.join(plot_dir, "sman_prot_length_boxplots.png"))
plt.show()

In [13]:
wbps_species = species_list.get_species_with_data_label(wbps_col)
anno_species = species_list.get_species_with_data_label(anno_col)
braker_species = species_list.get_species_with_data_label(braker_col)
helixer_species = species_list.get_species_with_data_label(helixer_col)
x1, x2, x3 = [], [], []
x1e, x2e, x3e = [], [], []
y1, y2, y3 = [], [], []
y1e, y2e, y3e = [], [], []
for _, row in og_df[~og_df[wbps_col].isna()].iterrows():
    wbps_tran = wbps_species.db["transcript:" + list(map(str.strip, row[wbps_col].split(",")))[0].split("transcript_")[1]]
    wbps_cds_exons = list(wbps_species.db.children(wbps_tran, featuretype="CDS"))
    wbps_prot_length = wbps_species.get_amino_acid_count(wbps_cds_exons)
    if not row[anno_col] is np.nan:
        x1.append(wbps_prot_length)
        x1e.append(len(wbps_cds_exons))
        anno_tran = anno_species.db["transcript:" + list(map(str.strip, row[anno_col].split(",")))[0].split("transcript_")[1]]
        cds_exons = list(anno_species.db.children(anno_tran, featuretype="CDS"))
        y1.append(anno_species.get_amino_acid_count(cds_exons))
        y1e.append(len(cds_exons))
    if not row[braker_col] is np.nan:
        x2.append(wbps_prot_length)
        x2e.append(len(wbps_cds_exons))
        braker_tran = braker_species.db[list(map(str.strip, row[braker_col].split(",")))[0]]
        cds_exons = list(braker_species.db.children(braker_tran, featuretype="CDS"))
        y2.append(braker_species.get_amino_acid_count(cds_exons))
        y2e.append(len(cds_exons))
    if not row[helixer_col] is np.nan:
        x3.append(wbps_prot_length)
        x3e.append(len(wbps_cds_exons))
        helixer_tran = helixer_species.db[list(map(str.strip, row[helixer_col].split(",")))[0]]
        cds_exons = list(helixer_species.db.children(helixer_tran, featuretype="CDS"))
        y3.append(helixer_species.get_amino_acid_count(cds_exons))
        y3e.append(len(cds_exons))

In [14]:
def scatter_axes(idx, label, x, y, log=True):
    ax = plt.subplot(1, 3, idx)
    ax.scatter(x, y, linewidths=0.1, marker=".")
    if log:
        ax.set_yscale('log')
        ax.set_xscale('log')
    ax.set_xlabel("WBPS")
    ax.set_ylabel(label, rotation=90)
    ax.set_title(f"r={round(np.corrcoef(x, y)[1, 0], 2)} (\N{GREEK CAPITAL LETTER SIGMA}={len(x)})")

In [15]:
scatter_axes(1, "Anno", x1, y1)
scatter_axes(2, "BRAKER3", x2, y2)
scatter_axes(3, "Helixer", x3, y3)
plt.suptitle("Amino acid count correlations")
plt.tight_layout()
plt.savefig(os.path.join(plot_dir, "sman_prot_length_correlations.png"))
plt.show()

In [16]:
scatter_axes(1, "Anno", x1e, y1e, log=False)
scatter_axes(2, "BRAKER3", x2e, y2e, log=False)
scatter_axes(3, "Helixer", x3e, y3e, log=False)
plt.suptitle("Exon count correlations")
plt.tight_layout()
plt.savefig(os.path.join(plot_dir, "sman_exon_count_correlations.png"))
plt.show()

# Assessing merged/split genes

In [17]:
braker_merged, braker_split = suspicious_orthologue_pipeline(og_df, wbps_col, braker_col, species_list, seq_id_map)
anno_merged, anno_split = suspicious_orthologue_pipeline(og_df, wbps_col, anno_col, species_list, seq_id_map)
helixer_merged, helixer_split = suspicious_orthologue_pipeline(og_df, wbps_col, helixer_col, species_list, seq_id_map)

100%|██████████| 9122/9122 [00:46<00:00, 195.20it/s] 
100%|██████████| 9122/9122 [04:02<00:00, 37.68it/s] 
100%|██████████| 9122/9122 [01:20<00:00, 113.98it/s]


In [20]:
num_genes = len(list(braker_species.db.all_features(featuretype="gene")))
print(f"BRAKER3: merged={len(braker_merged)}, split={len(braker_split)}, total={round(100*(len(braker_split) + len(braker_merged)*2)/num_genes, 2)}")
num_genes = len(list(helixer_species.db.all_features(featuretype="gene")))
print(f"Helixer: merged={len(helixer_merged)}, split={len(helixer_split)}, total={round(100*(len(helixer_split) + len(helixer_merged)*2)/num_genes, 2)}")
num_genes = len(list(anno_species.db.all_features(featuretype="gene")))
print(f"Anno: merged={len(anno_merged)}, split={len(anno_split)}, total={round(100*(len(anno_split) + len(anno_merged)*2)/num_genes, 2)}")

BRAKER3: merged=17, split=10, total=0.48
Helixer: merged=161, split=20, total=3.45
Anno: merged=204, split=27, total=3.4


# InterPro accession investigation

### BRAKER3

In [21]:
acc_tally_shared_braker3, acc_tally_missed_braker3, acc_tally_novel_braker3, missed_transcripts = interpro_accession_pipeline(db, og_df, wbps_col, braker_col, interproscan_dir)
_, acc_tally_missed_braker3_unassigned, acc_tally_novel_braker3_unassigned, _ = interpro_accession_pipeline(db, no_og_df, wbps_col, braker_col, os.path.join(interproscan_dir, "unassigned_genes"))
acc_tally_missed_braker3 += acc_tally_missed_braker3_unassigned
acc_tally_novel_braker3 += acc_tally_novel_braker3_unassigned

# Find InterPro accessions occurring with significantly different frequency than in control (acc_tally_shared_braker3)
braker3_novel_results = fisher_exact_for_two_lists_of_accessions(acc_tally_novel_braker3, acc_tally_shared_braker3 + acc_tally_missed_braker3)
braker3_missed_results = fisher_exact_for_two_lists_of_accessions(acc_tally_missed_braker3, acc_tally_shared_braker3 + acc_tally_novel_braker3)

In [22]:
list(interpro_accessions_in_novel_transcripts(flatten_nested_dict(acc_product), acc_tally_novel_braker3, braker3_novel_results, 3))

InterPro accessions occurring with significantly higher frequency in novel transcripts than in shared transcripts:
	IPR037104: Annexin superfamily (3 occurrences, 0 expected)
	IPR003961: Fibronectin type III (3 occurrences, 0 expected)
	IPR036116: Fibronectin type III superfamily (3 occurrences, 0 expected)
	IPR036291: NAD(P)-binding domain superfamily (3 occurrences, 1 expected)
	IPR029071: Ubiquitin-like domain superfamily (3 occurrences, 1 expected)

InterPro accessions that are completely missing from shared transcripts, with high frequency in novel transcripts:
	IPR045609: Domain of unknown function DUF6451 (3 occurrences)



['IPR037104', 'IPR003961', 'IPR036116', 'IPR036291', 'IPR029071', 'IPR045609']

In [23]:
interpro_accessions_in_missed_transcripts(flatten_nested_dict(acc_product), acc_tally_missed_braker3, acc_tally_novel_braker3, braker3_missed_results, braker3_novel_results, min_freq)

InterPro accessions that are completely missing from shared transcripts, with high frequency in missed transcripts:

InterPro accessions occurring with significantly higher frequency in missed transcripts than in shared transcripts:
	IPR041036: Glycoside hydrolase family 5, C-terminal domain (10 occurrences, 0 expected)
	IPR009003: Peptidase S1, PA clan (27 occurrences, 0 expected)
	IPR001254: Serine proteases, trypsin domain (25 occurrences, 0 expected)
	IPR043504: Peptidase S1, PA clan, chymotrypsin-like fold (25 occurrences, 0 expected)
	IPR013780: Glycosyl hydrolase, all-beta (11 occurrences, 1 expected)
	IPR017853: Glycoside hydrolase superfamily (13 occurrences, 1 expected)
	IPR000276: G protein-coupled receptor, rhodopsin-like (23 occurrences, 4 expected)
	IPR017452: GPCR, rhodopsin-like, 7TM (26 occurrences, 4 expected)

InterPro accessions occurring as expected in missed transcripts with high frequency:
	IPR027417: P-loop containing nucleoside triphosphate hydrolase (19 occurr

In [24]:
missed_transcripts_with_significantly_more_frequent_accessions(wbps_species.db, missed_transcripts, acc_tally_missed_braker3, braker3_missed_results, min_freq)

SM_V10_1 - Smp_126730.1 - {'IPR000276', 'IPR017452'}
SM_V10_1 - Smp_170610.1 - {'IPR000276', 'IPR017452'}
SM_V10_1 - Smp_315690.2 - {'IPR013780', 'IPR041036', 'IPR017853'}
SM_V10_1 - Smp_316850.1 - {'IPR000276', 'IPR017452'}
SM_V10_1 - Smp_317470.2 - {'IPR013780', 'IPR041036', 'IPR017853'}
SM_V10_1 - Smp_324100.1 - {'IPR017452'}
SM_V10_1 - Smp_325900.1 - {'IPR017452'}
SM_V10_Z - Smp_162980.1 - {'IPR000276'}
SM_V10_Z - Smp_167870.1 - {'IPR000276', 'IPR017452'}
SM_V10_Z - Smp_204060.1 - {'IPR000276', 'IPR017452'}
SM_V10_Z - Smp_303010.1 - {'IPR013780', 'IPR041036', 'IPR017853'}
SM_V10_Z - Smp_322380.1 - {'IPR017452'}
SM_V10_Z - Smp_323980.1 - {'IPR017452'}
SM_V10_3 - Smp_083940.1 - {'IPR000276', 'IPR017452'}
SM_V10_3 - Smp_091950.1 - {'IPR000276', 'IPR017452'}
SM_V10_3 - Smp_177720.1 - {'IPR000276', 'IPR017452'}
SM_V10_3 - Smp_211260.1 - {'IPR009003'}
SM_V10_3 - Smp_241490.1 - {'IPR000276', 'IPR017452'}
SM_V10_3 - Smp_326640.1 - {'IPR000276', 'IPR017452'}
SM_V10_3 - Smp_333690.1 - {'IPR0

### Helixer

In [25]:
acc_tally_shared_helixer, acc_tally_missed_helixer, acc_tally_novel_helixer, missed_transcripts = interpro_accession_pipeline(db, og_df, wbps_col, helixer_col, interproscan_dir)
_, acc_tally_missed_helixer_unassigned, acc_tally_novel_helixer_unassigned, _ = interpro_accession_pipeline(db, no_og_df, wbps_col, helixer_col, os.path.join(interproscan_dir, "unassigned_genes"))
acc_tally_missed_helixer += acc_tally_missed_helixer_unassigned
acc_tally_novel_helixer += acc_tally_novel_helixer_unassigned

# Find InterPro accessions occurring with significantly different frequency than in control (acc_tally_shared_helixer)
helixer_novel_results = fisher_exact_for_two_lists_of_accessions(acc_tally_novel_helixer, acc_tally_shared_helixer + acc_tally_missed_helixer)
helixer_missed_results = fisher_exact_for_two_lists_of_accessions(acc_tally_missed_helixer, acc_tally_shared_helixer + acc_tally_novel_helixer)

In [26]:
list(interpro_accessions_in_novel_transcripts(flatten_nested_dict(acc_product), acc_tally_novel_helixer, helixer_novel_results, 3))

InterPro accessions occurring with significantly higher frequency in novel transcripts than in shared transcripts:
	IPR036964: Ras guanine-nucleotide exchange factor, catalytic domain superfamily (3 occurrences, 0 expected)
	IPR023578: Ras guanine nucleotide exchange factor domain superfamily (3 occurrences, 0 expected)
	IPR036869: Chaperone J-domain superfamily (5 occurrences, 1 expected)
	IPR001623: DnaJ domain (4 occurrences, 1 expected)
	IPR002347: Short-chain dehydrogenase/reductase SDR (3 occurrences, 1 expected)
	IPR013098: Immunoglobulin I-set (3 occurrences, 1 expected)
	IPR003599: Immunoglobulin subtype (6 occurrences, 1 expected)
	IPR003961: Fibronectin type III (4 occurrences, 1 expected)
	IPR036116: Fibronectin type III superfamily (4 occurrences, 1 expected)
	IPR007110: Immunoglobulin-like domain (7 occurrences, 2 expected)
	IPR003598: Immunoglobulin subtype 2 (4 occurrences, 1 expected)
	IPR036179: Immunoglobulin-like domain superfamily (6 occurrences, 2 expected)
	IPR02

['IPR036964',
 'IPR023578',
 'IPR036869',
 'IPR001623',
 'IPR002347',
 'IPR013098',
 'IPR003599',
 'IPR003961',
 'IPR036116',
 'IPR007110',
 'IPR003598',
 'IPR036179',
 'IPR029071',
 'IPR013783',
 'IPR027417',
 'IPR050951']

In [27]:
interpro_accessions_in_missed_transcripts(flatten_nested_dict(acc_product), acc_tally_missed_helixer, acc_tally_novel_helixer, helixer_missed_results, helixer_novel_results, min_freq=1)

InterPro accessions that are completely missing from shared transcripts, with high frequency in missed transcripts:
	IPR002453: Beta tubulin (7 occurrences)
	IPR013838: Beta tubulin, autoregulation binding site (6 occurrences)
	IPR026183: Taxilin family (4 occurrences)
	IPR001951: Histone H4 (3 occurrences)
	IPR019809: Histone H4, conserved site (3 occurrences)
	IPR035425: CENP-T/Histone H4, histone fold (3 occurrences)
	IPR000558: Histone H2B (3 occurrences)
	IPR001971: Small ribosomal subunit protein uS11 (2 occurrences)
	IPR001147: Large ribosomal subunit protein eL21 (2 occurrences)
	IPR018259: Large ribosomal subunit protein eL21, conserved site (2 occurrences)
	IPR036948: Large ribosomal subunit protein eL21 superfamily (2 occurrences)
	IPR005336: Mitochondrial pyruvate carrier (2 occurrences)
	IPR029040: RNA polymerase subunit RPABC4/transcription elongation factor Spt4 (2 occurrences)
	IPR011331: Large ribosomal subunit protein eL37/eL43 (2 occurrences)
	IPR000892: Small riboso

### Anno

In [29]:
acc_tally_shared_anno, acc_tally_missed_anno, acc_tally_novel_anno, missed_transcripts = interpro_accession_pipeline(db, og_df, wbps_col, anno_col, interproscan_dir)
_, acc_tally_missed_anno_unassigned, acc_tally_novel_anno_unassigned, _ = interpro_accession_pipeline(db, no_og_df, wbps_col, anno_col, os.path.join(interproscan_dir, "unassigned_genes"))
acc_tally_missed_anno += acc_tally_missed_anno_unassigned
acc_tally_novel_anno += acc_tally_novel_anno_unassigned

# Find InterPro accessions occurring with significantly different frequency than in control (acc_tally_shared_anno)
anno_novel_results = fisher_exact_for_two_lists_of_accessions(acc_tally_novel_anno, acc_tally_shared_anno + acc_tally_missed_anno)
anno_missed_results = fisher_exact_for_two_lists_of_accessions(acc_tally_missed_anno, acc_tally_shared_anno + acc_tally_novel_anno)

In [30]:
list(interpro_accessions_in_novel_transcripts(flatten_nested_dict(acc_product), acc_tally_novel_anno, anno_novel_results, min_freq=1))

InterPro accessions occurring with significantly higher frequency in novel transcripts than in shared transcripts:
	IPR001995: Peptidase A2A, retrovirus, catalytic (2 occurrences, 0 expected)
	IPR043128: Reverse transcriptase/Diguanylate cyclase domain (5 occurrences, 0 expected)
	IPR000477: Reverse transcriptase domain (4 occurrences, 0 expected)
	IPR027080: Protein Unc-13 (2 occurrences, 0 expected)
	IPR007081: RNA polymerase Rpb1, domain 5 (2 occurrences, 0 expected)
	IPR043502: DNA/RNA polymerase superfamily (6 occurrences, 0 expected)
	IPR005479: Carbamoyl-phosphate synthetase large subunit-like, ATP-binding domain (2 occurrences, 0 expected)
	IPR033749: Polyprenyl synthetase, conserved site (2 occurrences, 0 expected)
	IPR045867: DNA-directed RNA polymerase, subunit beta-prime (2 occurrences, 0 expected)
	IPR038207: DIX domain superfamily (2 occurrences, 0 expected)
	IPR001158: DIX domain (2 occurrences, 0 expected)
	IPR016185: Pre-ATP-grasp domain superfamily (2 occurrences, 0 e

['IPR001995',
 'IPR043128',
 'IPR000477',
 'IPR027080',
 'IPR007081',
 'IPR043502',
 'IPR005479',
 'IPR033749',
 'IPR045867',
 'IPR038207',
 'IPR001158',
 'IPR016185',
 'IPR043154',
 'IPR036045',
 'IPR008949',
 'IPR048684',
 'IPR036430',
 'IPR021109',
 'IPR011761',
 'IPR001300',
 'IPR000306',
 'IPR037272',
 'IPR000175',
 'IPR036880',
 'IPR002223',
 'IPR002126',
 'IPR023298',
 'IPR015919',
 'IPR003599',
 'IPR020894',
 'IPR018490',
 'IPR000595',
 'IPR036179',
 'IPR007110',
 'IPR000387',
 'IPR036259',
 'IPR036770',
 'IPR013783',
 'IPR002110',
 'IPR050174',
 'IPR051877',
 'IPR050098',
 'IPR050599',
 'IPR050951',
 'IPR050327',
 'IPR050122',
 'IPR051131',
 'IPR050382',
 'IPR050302',
 'IPR050216',
 'IPR052993',
 'IPR050927',
 'IPR051835',
 'IPR050217',
 'IPR031703',
 'IPR036906',
 'IPR008218',
 'IPR019186',
 'IPR019269',
 'IPR050964',
 'IPR053025',
 'IPR028119',
 'IPR002056',
 'IPR023392',
 'IPR022422',
 'IPR041577',
 'IPR051165',
 'IPR051783',
 'IPR015816',
 'IPR050733',
 'IPR050668',
 'IPR0

In [31]:
interpro_accessions_in_missed_transcripts(flatten_nested_dict(acc_product), acc_tally_missed_anno, acc_tally_novel_anno, anno_missed_results, anno_novel_results, min_freq)

InterPro accessions that are completely missing from shared transcripts, with high frequency in missed transcripts:

InterPro accessions occurring with significantly higher frequency in missed transcripts than in shared transcripts:
	IPR009003: Peptidase S1, PA clan (22 occurrences, 0 expected)
	IPR001254: Serine proteases, trypsin domain (20 occurrences, 0 expected)
	IPR043504: Peptidase S1, PA clan, chymotrypsin-like fold (20 occurrences, 0 expected)

InterPro accessions occurring as expected in missed transcripts with high frequency:

InterPro accessions occurring less frequently in missed transcripts than expected:
	IPR000719: Protein kinase domain (1 occurrences, 7 expected)
	IPR011009: Protein kinase-like domain superfamily (2 occurrences, 8 expected)
	IPR027417: P-loop containing nucleoside triphosphate hydrolase (6 occurrences, 14 expected)



### General

Find accessions that are significantly commonly missed by **all** tools

In [32]:
acc_tally_no_tool, acc_tally_one_plus_tool_shared, acc_tally_one_plus_tool_novel = interpro_accession_pipeline_all_tools(
    wbps_species,
    og_df,
    wbps_col,
    [braker_col, helixer_col, anno_col],
    interproscan_dir
)

acc_tally_no_tool_unassigned, _, acc_tally_one_plus_tool_novel_unassigned = interpro_accession_pipeline_all_tools(
    wbps_species,
    no_og_df,
    wbps_col,
    [braker_col, helixer_col, anno_col],
    os.path.join(interproscan_dir, "unassigned_genes")
)
acc_tally_no_tool += acc_tally_no_tool_unassigned
acc_tally_one_plus_tool_novel += acc_tally_one_plus_tool_novel_unassigned

no_tools_results = fisher_exact_for_two_lists_of_accessions(acc_tally_no_tool, acc_tally_one_plus_tool_shared + acc_tally_one_plus_tool_novel)
only_tools_results = fisher_exact_for_two_lists_of_accessions(acc_tally_one_plus_tool_novel, acc_tally_one_plus_tool_shared + acc_tally_no_tool)

In [36]:
def is_AP(item):
    return item[0] in ASPARTIC_PEPTIDASE_SUPERFAMILY

def filter_results_for_AP(results):
    output = {}
    for k, v in results.items():
        try:
            output[k] = dict(filter(is_AP, v.items()))
        except AttributeError:
            output[k] = list(filter(is_AP, v))
    return output

only_tools_results_AP = filter_results_for_AP(only_tools_results)
no_tools_results_AP = filter_results_for_AP(no_tools_results)
print(only_tools_results_AP)
print(no_tools_results_AP)

{'more_frequent': {'IPR021109': [11.860385731353473, 4.709663083007144e-07], 'IPR001995': [56.77794793261868, 8.927868241734657e-05]}, 'as_expected': {}, 'less_frequent': {}, 'not_occurring': []}
{'more_frequent': {}, 'as_expected': {}, 'less_frequent': {}, 'not_occurring': []}


In [43]:
overrepd_accs = list(interpro_accessions_in_novel_transcripts(flatten_nested_dict(acc_product), acc_tally_one_plus_tool_novel, only_tools_results, min_freq=1))

InterPro accessions occurring with significantly higher frequency in novel transcripts than in shared transcripts:
	IPR001995: Peptidase A2A, retrovirus, catalytic (4 occurrences, 0 expected)
	IPR000477: Reverse transcriptase domain (13 occurrences, 0 expected)
	IPR043128: Reverse transcriptase/Diguanylate cyclase domain (9 occurrences, 0 expected)
	IPR029008: ER membrane protein complex subunit 6-like (3 occurrences, 0 expected)
	IPR013209: Lipin/Ned1/Smp2 (LNS2) (3 occurrences, 0 expected)
	IPR007651: Lipin, N-terminal (3 occurrences, 0 expected)
	IPR006139: D-isomer specific 2-hydroxyacid dehydrogenase, catalytic domain (3 occurrences, 0 expected)
	IPR006140: D-isomer specific 2-hydroxyacid dehydrogenase, NAD-binding domain (3 occurrences, 0 expected)
	IPR023509: D-aminoacyl-tRNA deacylase-like superfamily (3 occurrences, 0 expected)
	IPR003732: D-aminoacyl-tRNA deacylase DTD (3 occurrences, 0 expected)
	IPR011387: Translation initiation factor 2A (2 occurrences, 0 expected)
	IPR011

There don't appear to be any TEs amongst the overrepresented "homologous superfamily" or "family" accessions

In [44]:
print(len(overrepd_accs))
with open("data/transposable_elements/transposable_element_iprs.json", "r") as f:
    te_accs = json.load(f)
set(te_accs.values()).intersection(overrepd_accs)

265


{'IPR000477', 'IPR001584', 'IPR002156', 'IPR024445'}

In [45]:
anno_prefix = "transcript_ENSSMAT"
braker_prefix = "g"
helixer_prefix = "Schistosoma_mansoni_SM_V10_"
non_te_unique_transcripts = set()
te_unique_transcripts = set()

for acc in tqdm(overrepd_accs, total=len(overrepd_accs)):
    tool_transcript_counts = [count_transcripts_with_accession(acc, p, interproscan_dir) for p in (anno_prefix, braker_prefix, helixer_prefix)]
    unique_transcripts = flatten_list_to_set(list(map(itemgetter(1), tool_transcript_counts)))
    print(f"{acc}: {sum(map(itemgetter(0), tool_transcript_counts))} ({len(unique_transcripts)} unique)")
    if acc in te_accs:
        te_unique_transcripts.update(unique_transcripts)
    else:
        non_te_unique_transcripts.update(unique_transcripts)

  0%|          | 1/265 [00:01<04:31,  1.03s/it]

IPR001995: 4 (4 unique)


  1%|          | 2/265 [00:01<04:19,  1.01it/s]

IPR000477: 13 (13 unique)


  1%|          | 3/265 [00:02<04:18,  1.01it/s]

IPR043128: 8 (8 unique)


  2%|▏         | 4/265 [00:04<04:21,  1.00s/it]

IPR029008: 3 (3 unique)


  2%|▏         | 5/265 [00:04<04:18,  1.00it/s]

IPR013209: 3 (3 unique)


  2%|▏         | 6/265 [00:05<04:16,  1.01it/s]

IPR007651: 3 (3 unique)


  3%|▎         | 7/265 [00:06<04:09,  1.03it/s]

IPR006139: 3 (3 unique)


  3%|▎         | 8/265 [00:07<04:04,  1.05it/s]

IPR006140: 3 (3 unique)


  3%|▎         | 9/265 [00:08<04:02,  1.06it/s]

IPR023509: 3 (3 unique)


  4%|▍         | 10/265 [00:09<03:57,  1.07it/s]

IPR003732: 3 (3 unique)


  4%|▍         | 11/265 [00:10<03:57,  1.07it/s]

IPR011387: 2 (2 unique)


  5%|▍         | 12/265 [00:11<03:53,  1.08it/s]

IPR011030: 2 (2 unique)


  5%|▍         | 13/265 [00:12<03:50,  1.09it/s]

IPR001747: 2 (2 unique)


  5%|▌         | 14/265 [00:13<03:50,  1.09it/s]

IPR007233: 2 (2 unique)


  6%|▌         | 15/265 [00:14<03:50,  1.08it/s]

IPR027120: 2 (2 unique)


  6%|▌         | 16/265 [00:15<03:51,  1.07it/s]

IPR002156: 2 (2 unique)


  6%|▋         | 17/265 [00:16<03:48,  1.08it/s]

IPR040364: 2 (2 unique)


  7%|▋         | 18/265 [00:17<03:51,  1.07it/s]

IPR000572: 2 (2 unique)


  7%|▋         | 19/265 [00:17<03:49,  1.07it/s]

IPR036374: 2 (2 unique)


  8%|▊         | 20/265 [00:18<03:47,  1.08it/s]

IPR008335: 2 (2 unique)


  8%|▊         | 21/265 [00:19<03:44,  1.09it/s]

IPR035959: 2 (2 unique)


  8%|▊         | 22/265 [00:20<03:41,  1.09it/s]

IPR036748: 2 (2 unique)


  9%|▊         | 23/265 [00:21<03:42,  1.09it/s]

IPR007523: 2 (2 unique)


  9%|▉         | 24/265 [00:22<03:40,  1.09it/s]

IPR019564: 2 (2 unique)


  9%|▉         | 25/265 [00:23<03:41,  1.08it/s]

IPR000998: 2 (2 unique)


 10%|▉         | 26/265 [00:24<03:47,  1.05it/s]

IPR049629: 2 (2 unique)


 10%|█         | 27/265 [00:25<03:44,  1.06it/s]

IPR038980: 1 (1 unique)


 11%|█         | 28/265 [00:26<03:43,  1.06it/s]

IPR043502: 18 (18 unique)


 11%|█         | 29/265 [00:27<03:40,  1.07it/s]

IPR031315: 3 (3 unique)


 11%|█▏        | 30/265 [00:28<03:40,  1.07it/s]

IPR027080: 2 (2 unique)


 12%|█▏        | 31/265 [00:29<03:37,  1.08it/s]

IPR005479: 3 (3 unique)


 12%|█▏        | 32/265 [00:30<03:37,  1.07it/s]

IPR026183: 6 (6 unique)


 12%|█▏        | 33/265 [00:30<03:36,  1.07it/s]

IPR032815: 2 (2 unique)


 13%|█▎        | 34/265 [00:31<03:35,  1.07it/s]

IPR011044: 2 (2 unique)


 13%|█▎        | 35/265 [00:32<03:34,  1.07it/s]

IPR006032: 2 (2 unique)


 14%|█▎        | 36/265 [00:33<03:33,  1.07it/s]

IPR018067: 2 (2 unique)


 14%|█▍        | 37/265 [00:34<03:32,  1.07it/s]

IPR013769: 2 (2 unique)


 14%|█▍        | 38/265 [00:35<03:32,  1.07it/s]

IPR008254: 2 (2 unique)


 15%|█▍        | 39/265 [00:36<03:30,  1.07it/s]

IPR029039: 2 (2 unique)


 15%|█▌        | 40/265 [00:37<03:33,  1.05it/s]

IPR005446: 1 (1 unique)


 15%|█▌        | 41/265 [00:38<03:31,  1.06it/s]

IPR042172: 0 (0 unique)


 16%|█▌        | 42/265 [00:39<03:32,  1.05it/s]

IPR000043: 0 (0 unique)


 16%|█▌        | 43/265 [00:40<03:32,  1.04it/s]

IPR035976: 0 (0 unique)


 17%|█▋        | 44/265 [00:41<03:31,  1.04it/s]

IPR023302: 0 (0 unique)


 17%|█▋        | 45/265 [00:42<03:32,  1.03it/s]

IPR021109: 17 (17 unique)


 17%|█▋        | 46/265 [00:43<03:32,  1.03it/s]

IPR013979: 2 (2 unique)


 18%|█▊        | 47/265 [00:44<03:32,  1.03it/s]

IPR038466: 1 (1 unique)


 18%|█▊        | 48/265 [00:45<03:29,  1.04it/s]

IPR014772: 2 (2 unique)


 18%|█▊        | 49/265 [00:46<03:26,  1.05it/s]

IPR016152: 2 (2 unique)


 19%|█▉        | 50/265 [00:47<03:23,  1.05it/s]

IPR003020: 2 (2 unique)


 19%|█▉        | 51/265 [00:48<03:22,  1.06it/s]

IPR004240: 2 (2 unique)


 20%|█▉        | 52/265 [00:49<03:22,  1.05it/s]

IPR004130: 0 (0 unique)


 20%|██        | 53/265 [00:50<03:21,  1.05it/s]

IPR007081: 0 (0 unique)


 20%|██        | 54/265 [00:50<03:21,  1.05it/s]

IPR038207: 2 (2 unique)


 21%|██        | 55/265 [00:51<03:21,  1.04it/s]

IPR001158: 2 (2 unique)


 21%|██        | 56/265 [00:52<03:18,  1.05it/s]

IPR048684: 3 (3 unique)


 22%|██▏       | 57/265 [00:53<03:19,  1.04it/s]

IPR011761: 3 (3 unique)


 22%|██▏       | 58/265 [00:54<03:16,  1.05it/s]

IPR024706: 3 (3 unique)


 22%|██▏       | 59/265 [00:55<03:15,  1.05it/s]

IPR000866: 3 (3 unique)


 23%|██▎       | 60/265 [00:56<03:17,  1.04it/s]

IPR001972: 3 (3 unique)


 23%|██▎       | 61/265 [00:57<03:17,  1.03it/s]

IPR043202: 3 (3 unique)


 23%|██▎       | 62/265 [00:58<03:16,  1.03it/s]

IPR041228: 3 (3 unique)


 24%|██▍       | 63/265 [00:59<03:12,  1.05it/s]

IPR041658: 3 (3 unique)


 24%|██▍       | 64/265 [01:00<03:10,  1.06it/s]

IPR042219: 3 (3 unique)


 25%|██▍       | 65/265 [01:01<03:09,  1.06it/s]

IPR043160: 3 (3 unique)


 25%|██▍       | 66/265 [01:02<03:08,  1.06it/s]

IPR023578: 0 (0 unique)


 25%|██▌       | 67/265 [01:03<03:09,  1.05it/s]

IPR037104: 8 (8 unique)


 26%|██▌       | 68/265 [01:04<03:08,  1.04it/s]

IPR002223: 3 (3 unique)


 26%|██▌       | 69/265 [01:05<03:07,  1.05it/s]

IPR036880: 3 (3 unique)


 26%|██▋       | 70/265 [01:06<03:08,  1.04it/s]

IPR020901: 2 (2 unique)


 27%|██▋       | 71/265 [01:07<03:10,  1.02it/s]

IPR000306: 2 (2 unique)


 27%|██▋       | 72/265 [01:08<03:11,  1.01it/s]

IPR001547: 6 (6 unique)


 28%|██▊       | 73/265 [01:09<03:16,  1.02s/it]

IPR026983: 3 (3 unique)


 28%|██▊       | 74/265 [01:10<03:12,  1.01s/it]

IPR036964: 0 (0 unique)


 28%|██▊       | 75/265 [01:11<03:12,  1.01s/it]

IPR003599: 14 (14 unique)


 29%|██▊       | 76/265 [01:12<03:09,  1.00s/it]

IPR036869: 8 (8 unique)


 29%|██▉       | 77/265 [01:13<03:09,  1.01s/it]

IPR013098: 10 (10 unique)


 29%|██▉       | 78/265 [01:14<03:10,  1.02s/it]

IPR002126: 4 (4 unique)


 30%|██▉       | 79/265 [01:15<03:08,  1.02s/it]

IPR001623: 8 (8 unique)


 30%|███       | 80/265 [01:16<03:10,  1.03s/it]

IPR012337: 5 (5 unique)


 31%|███       | 81/265 [01:17<03:05,  1.01s/it]

IPR015919: 4 (4 unique)


 31%|███       | 82/265 [01:18<03:04,  1.01s/it]

IPR003961: 11 (11 unique)


 31%|███▏      | 83/265 [01:19<03:02,  1.00s/it]

IPR020894: 4 (4 unique)


 32%|███▏      | 84/265 [01:20<02:59,  1.01it/s]

IPR007110: 13 (13 unique)


 32%|███▏      | 85/265 [01:21<02:59,  1.00it/s]

IPR036116: 11 (11 unique)


 32%|███▏      | 86/265 [01:22<02:58,  1.00it/s]

IPR036179: 14 (14 unique)


 33%|███▎      | 87/265 [01:23<02:55,  1.01it/s]

IPR002347: 5 (5 unique)


 33%|███▎      | 88/265 [01:24<02:52,  1.03it/s]

IPR013783: 15 (15 unique)


 34%|███▎      | 89/265 [01:25<02:55,  1.00it/s]

IPR017853: 8 (8 unique)


 34%|███▍      | 90/265 [01:26<02:54,  1.00it/s]

IPR036770: 9 (9 unique)


 34%|███▍      | 91/265 [01:27<02:52,  1.01it/s]

IPR002110: 9 (9 unique)


 35%|███▍      | 92/265 [01:28<02:49,  1.02it/s]

IPR029071: 8 (8 unique)


 35%|███▌      | 93/265 [01:29<02:46,  1.03it/s]

IPR050951: 10 (10 unique)


 35%|███▌      | 94/265 [01:30<02:43,  1.05it/s]

IPR050174: 4 (4 unique)


 36%|███▌      | 95/265 [01:31<02:42,  1.04it/s]

IPR034128: 5 (5 unique)


 36%|███▌      | 96/265 [01:32<02:40,  1.05it/s]

IPR045609: 5 (5 unique)


 37%|███▋      | 97/265 [01:33<02:41,  1.04it/s]

IPR050098: 3 (3 unique)


 37%|███▋      | 98/265 [01:34<02:40,  1.04it/s]

IPR050217: 3 (3 unique)


 37%|███▋      | 99/265 [01:34<02:35,  1.07it/s]

IPR031703: 3 (3 unique)


 38%|███▊      | 100/265 [01:35<02:35,  1.06it/s]

IPR050209: 4 (4 unique)


 38%|███▊      | 101/265 [01:36<02:32,  1.07it/s]

IPR008218: 3 (3 unique)


 38%|███▊      | 102/265 [01:37<02:31,  1.07it/s]

IPR036906: 3 (3 unique)


 39%|███▉      | 103/265 [01:38<02:29,  1.08it/s]

IPR019269: 3 (3 unique)


 39%|███▉      | 104/265 [01:39<02:29,  1.08it/s]

IPR001584: 3 (3 unique)


 40%|███▉      | 105/265 [01:40<02:29,  1.07it/s]

IPR041577: 3 (3 unique)


 40%|████      | 106/265 [01:41<02:28,  1.07it/s]

IPR050964: 5 (5 unique)


 40%|████      | 107/265 [01:42<02:28,  1.06it/s]

IPR051877: 0 (0 unique)


 41%|████      | 108/265 [01:43<02:26,  1.07it/s]

IPR050283: 2 (2 unique)


 41%|████      | 109/265 [01:44<02:24,  1.08it/s]

IPR050857: 2 (2 unique)


 42%|████▏     | 110/265 [01:45<02:23,  1.08it/s]

IPR005772: 2 (2 unique)


 42%|████▏     | 111/265 [01:46<02:21,  1.09it/s]

IPR019186: 2 (2 unique)


 42%|████▏     | 112/265 [01:47<02:23,  1.07it/s]

IPR028119: 2 (2 unique)


 43%|████▎     | 113/265 [01:48<02:24,  1.05it/s]

IPR022422: 2 (2 unique)


 43%|████▎     | 114/265 [01:49<02:26,  1.03it/s]

IPR002056: 2 (2 unique)


 43%|████▎     | 115/265 [01:50<02:26,  1.02it/s]

IPR023392: 2 (2 unique)


 44%|████▍     | 116/265 [01:51<02:27,  1.01it/s]

IPR051165: 2 (2 unique)


 44%|████▍     | 117/265 [01:52<02:28,  1.00s/it]

IPR015816: 2 (2 unique)


 45%|████▍     | 118/265 [01:53<02:28,  1.01s/it]

IPR050733: 2 (2 unique)


 45%|████▍     | 119/265 [01:54<02:25,  1.00it/s]

IPR054694: 2 (2 unique)


 45%|████▌     | 120/265 [01:55<02:31,  1.04s/it]

IPR041170: 2 (2 unique)


 46%|████▌     | 121/265 [01:56<02:33,  1.07s/it]

IPR003977: 2 (2 unique)


 46%|████▌     | 122/265 [01:57<02:29,  1.05s/it]

IPR050889: 2 (2 unique)


 46%|████▋     | 123/265 [01:58<02:26,  1.03s/it]

IPR003377: 2 (2 unique)


 47%|████▋     | 124/265 [01:59<02:32,  1.08s/it]

IPR050668: 2 (2 unique)


 47%|████▋     | 125/265 [02:00<02:40,  1.15s/it]

IPR031790: 2 (2 unique)


 48%|████▊     | 126/265 [02:01<02:33,  1.10s/it]

IPR016818: 2 (2 unique)


 48%|████▊     | 127/265 [02:02<02:26,  1.06s/it]

IPR050758: 2 (2 unique)


 48%|████▊     | 128/265 [02:03<02:21,  1.03s/it]

IPR029248: 2 (2 unique)


 49%|████▊     | 129/265 [02:04<02:16,  1.00s/it]

IPR005813: 2 (2 unique)


 49%|████▉     | 130/265 [02:05<02:18,  1.02s/it]

IPR035566: 2 (2 unique)


 49%|████▉     | 131/265 [02:06<02:14,  1.00s/it]

IPR007242: 2 (2 unique)


 50%|████▉     | 132/265 [02:07<02:16,  1.03s/it]

IPR006175: 2 (2 unique)


 50%|█████     | 133/265 [02:08<02:18,  1.05s/it]

IPR034096: 2 (2 unique)


 51%|█████     | 134/265 [02:09<02:15,  1.04s/it]

IPR053134: 2 (2 unique)


 51%|█████     | 135/265 [02:10<02:13,  1.03s/it]

IPR052066: 6 (6 unique)


 51%|█████▏    | 136/265 [02:11<02:10,  1.01s/it]

IPR052160: 1 (1 unique)


 52%|█████▏    | 137/265 [02:12<02:10,  1.02s/it]

IPR050599: 2 (2 unique)


 52%|█████▏    | 138/265 [02:13<02:08,  1.02s/it]

IPR050327: 0 (0 unique)


 52%|█████▏    | 139/265 [02:15<02:12,  1.05s/it]

IPR050122: 0 (0 unique)


 53%|█████▎    | 140/265 [02:16<02:11,  1.05s/it]

IPR051131: 0 (0 unique)


 53%|█████▎    | 141/265 [02:17<02:08,  1.04s/it]

IPR050382: 0 (0 unique)


 54%|█████▎    | 142/265 [02:18<02:06,  1.03s/it]

IPR050302: 0 (0 unique)


 54%|█████▍    | 143/265 [02:19<02:03,  1.01s/it]

IPR050216: 0 (0 unique)


 54%|█████▍    | 144/265 [02:20<02:03,  1.02s/it]

IPR052993: 0 (0 unique)


 55%|█████▍    | 145/265 [02:21<02:02,  1.02s/it]

IPR050927: 0 (0 unique)


 55%|█████▌    | 146/265 [02:22<02:00,  1.02s/it]

IPR051835: 0 (0 unique)


 55%|█████▌    | 147/265 [02:23<02:00,  1.02s/it]

IPR050173: 1 (1 unique)


 56%|█████▌    | 148/265 [02:24<01:57,  1.01s/it]

IPR053025: 1 (1 unique)


 56%|█████▌    | 149/265 [02:25<01:59,  1.03s/it]

IPR053095: 1 (1 unique)


 57%|█████▋    | 150/265 [02:26<01:57,  1.03s/it]

IPR051783: 1 (1 unique)


 57%|█████▋    | 151/265 [02:27<02:01,  1.06s/it]

IPR050274: 1 (1 unique)


 57%|█████▋    | 152/265 [02:28<01:58,  1.04s/it]

IPR033466: 1 (1 unique)


 58%|█████▊    | 153/265 [02:29<01:56,  1.04s/it]

IPR028064: 1 (1 unique)


 58%|█████▊    | 154/265 [02:30<01:53,  1.02s/it]

IPR039540: 1 (1 unique)


 58%|█████▊    | 155/265 [02:31<01:57,  1.07s/it]

IPR050379: 1 (1 unique)


 59%|█████▉    | 156/265 [02:32<01:56,  1.07s/it]

IPR050989: 1 (1 unique)


 59%|█████▉    | 157/265 [02:33<01:53,  1.05s/it]

IPR050092: 1 (1 unique)


 60%|█████▉    | 158/265 [02:34<01:52,  1.06s/it]

IPR017067: 1 (1 unique)


 60%|██████    | 159/265 [02:35<01:50,  1.04s/it]

IPR042653: 1 (1 unique)


 60%|██████    | 160/265 [02:36<01:52,  1.07s/it]

IPR040459: 1 (1 unique)


 61%|██████    | 161/265 [02:38<01:53,  1.09s/it]

IPR050561: 1 (1 unique)


 61%|██████    | 162/265 [02:39<01:49,  1.07s/it]

IPR006056: 1 (1 unique)


 62%|██████▏   | 163/265 [02:40<01:51,  1.10s/it]

IPR052149: 1 (1 unique)


 62%|██████▏   | 164/265 [02:41<01:51,  1.11s/it]

IPR053040: 1 (1 unique)


 62%|██████▏   | 165/265 [02:42<01:51,  1.11s/it]

IPR029681: 1 (1 unique)


 63%|██████▎   | 166/265 [02:43<01:48,  1.10s/it]

IPR007404: 1 (1 unique)


 63%|██████▎   | 167/265 [02:44<01:46,  1.09s/it]

IPR018552: 1 (1 unique)


 63%|██████▎   | 168/265 [02:45<01:46,  1.10s/it]

IPR034132: 1 (1 unique)


 64%|██████▍   | 169/265 [02:46<01:44,  1.09s/it]

IPR041373: 3 (3 unique)


 64%|██████▍   | 170/265 [02:48<01:52,  1.19s/it]

IPR052652: 1 (1 unique)


 65%|██████▍   | 171/265 [02:49<01:48,  1.16s/it]

IPR041588: 2 (2 unique)


 65%|██████▍   | 172/265 [02:50<01:53,  1.22s/it]

IPR051631: 1 (1 unique)


 65%|██████▌   | 173/265 [02:52<01:56,  1.26s/it]

IPR052269: 1 (1 unique)


 66%|██████▌   | 174/265 [02:53<01:51,  1.23s/it]

IPR024445: 1 (1 unique)


 66%|██████▌   | 175/265 [02:54<01:45,  1.18s/it]

IPR050235: 1 (1 unique)


 66%|██████▋   | 176/265 [02:55<01:40,  1.13s/it]

IPR042237: 0 (0 unique)


 67%|██████▋   | 177/265 [02:56<01:43,  1.18s/it]

IPR049813: 0 (0 unique)


 67%|██████▋   | 178/265 [02:57<01:41,  1.17s/it]

IPR051583: 0 (0 unique)


 68%|██████▊   | 179/265 [02:58<01:38,  1.14s/it]

IPR051833: 0 (0 unique)


 68%|██████▊   | 180/265 [02:59<01:34,  1.11s/it]

IPR050449: 0 (0 unique)


 68%|██████▊   | 181/265 [03:00<01:31,  1.08s/it]

IPR050701: 0 (0 unique)


 69%|██████▊   | 182/265 [03:01<01:30,  1.09s/it]

IPR050082: 0 (0 unique)


 69%|██████▉   | 183/265 [03:03<01:29,  1.09s/it]

IPR050079: 0 (0 unique)


 69%|██████▉   | 184/265 [03:04<01:30,  1.11s/it]

IPR052579: 0 (0 unique)


 70%|██████▉   | 185/265 [03:05<01:24,  1.06s/it]

IPR039870: 0 (0 unique)


 70%|███████   | 186/265 [03:06<01:21,  1.03s/it]

IPR051746: 0 (0 unique)


 71%|███████   | 187/265 [03:07<01:18,  1.00s/it]

IPR051974: 0 (0 unique)


 71%|███████   | 188/265 [03:08<01:15,  1.01it/s]

IPR052094: 0 (0 unique)


 71%|███████▏  | 189/265 [03:09<01:16,  1.00s/it]

IPR028325: 0 (0 unique)


 72%|███████▏  | 190/265 [03:10<01:14,  1.01it/s]

IPR051177: 0 (0 unique)


 72%|███████▏  | 191/265 [03:11<01:13,  1.00it/s]

IPR051857: 0 (0 unique)


 72%|███████▏  | 192/265 [03:12<01:14,  1.02s/it]

IPR051569: 0 (0 unique)


 73%|███████▎  | 193/265 [03:13<01:12,  1.01s/it]

IPR050348: 0 (0 unique)


 73%|███████▎  | 194/265 [03:14<01:11,  1.01s/it]

IPR050699: 0 (0 unique)


 74%|███████▎  | 195/265 [03:15<01:10,  1.01s/it]

IPR053003: 0 (0 unique)


 74%|███████▍  | 196/265 [03:16<01:12,  1.04s/it]

IPR050751: 0 (0 unique)


 74%|███████▍  | 197/265 [03:17<01:10,  1.03s/it]

IPR049883: 0 (0 unique)


 75%|███████▍  | 198/265 [03:18<01:08,  1.02s/it]

IPR050503: 0 (0 unique)


 75%|███████▌  | 199/265 [03:19<01:06,  1.00s/it]

IPR052795: 0 (0 unique)


 75%|███████▌  | 200/265 [03:20<01:04,  1.01it/s]

IPR051280: 0 (0 unique)


 76%|███████▌  | 201/265 [03:21<01:03,  1.01it/s]

IPR050185: 0 (0 unique)


 76%|███████▌  | 202/265 [03:22<01:01,  1.02it/s]

IPR050717: 0 (0 unique)


 77%|███████▋  | 203/265 [03:23<01:00,  1.02it/s]

IPR051281: 0 (0 unique)


 77%|███████▋  | 204/265 [03:24<00:59,  1.02it/s]

IPR051944: 0 (0 unique)


 77%|███████▋  | 205/265 [03:25<00:58,  1.02it/s]

IPR042635: 0 (0 unique)


 78%|███████▊  | 206/265 [03:26<01:00,  1.03s/it]

IPR042406: 0 (0 unique)


 78%|███████▊  | 207/265 [03:27<00:59,  1.03s/it]

IPR012932: 0 (0 unique)


 78%|███████▊  | 208/265 [03:28<00:57,  1.01s/it]

IPR051001: 0 (0 unique)


 79%|███████▉  | 209/265 [03:29<00:56,  1.01s/it]

IPR050606: 0 (0 unique)


 79%|███████▉  | 210/265 [03:30<00:54,  1.01it/s]

IPR009600: 0 (0 unique)


 80%|███████▉  | 211/265 [03:31<00:54,  1.00s/it]

IPR051341: 0 (0 unique)


 80%|████████  | 212/265 [03:32<00:55,  1.05s/it]

IPR051370: 0 (0 unique)


 80%|████████  | 213/265 [03:33<00:53,  1.03s/it]

IPR051578: 0 (0 unique)


 81%|████████  | 214/265 [03:34<00:52,  1.02s/it]

IPR050932: 0 (0 unique)


 81%|████████  | 215/265 [03:35<00:50,  1.01s/it]

IPR034168: 0 (0 unique)


 82%|████████▏ | 216/265 [03:36<00:49,  1.01s/it]

IPR051167: 0 (0 unique)


 82%|████████▏ | 217/265 [03:37<00:48,  1.00s/it]

IPR050409: 0 (0 unique)


 82%|████████▏ | 218/265 [03:38<00:46,  1.00it/s]

IPR054532: 0 (0 unique)


 83%|████████▎ | 219/265 [03:39<00:46,  1.00s/it]

IPR050117: 1 (1 unique)


 83%|████████▎ | 220/265 [03:40<00:45,  1.00s/it]

IPR051618: 0 (0 unique)


 83%|████████▎ | 221/265 [03:41<00:44,  1.02s/it]

IPR051434: 0 (0 unique)


 84%|████████▍ | 222/265 [03:42<00:44,  1.02s/it]

IPR050238: 0 (0 unique)


 84%|████████▍ | 223/265 [03:43<00:42,  1.01s/it]

IPR022324: 0 (0 unique)


 85%|████████▍ | 224/265 [03:44<00:41,  1.01s/it]

IPR039843: 0 (0 unique)


 85%|████████▍ | 225/265 [03:45<00:39,  1.00it/s]

IPR052942: 0 (0 unique)


 85%|████████▌ | 226/265 [03:46<00:38,  1.00it/s]

IPR050698: 0 (0 unique)


 86%|████████▌ | 227/265 [03:47<00:37,  1.02it/s]

IPR052394: 0 (0 unique)


 86%|████████▌ | 228/265 [03:48<00:36,  1.03it/s]

IPR050645: 0 (0 unique)


 86%|████████▋ | 229/265 [03:49<00:34,  1.04it/s]

IPR050745: 0 (0 unique)


 87%|████████▋ | 230/265 [03:50<00:33,  1.03it/s]

IPR050836: 0 (0 unique)


 87%|████████▋ | 231/265 [03:51<00:33,  1.02it/s]

IPR051739: 0 (0 unique)


 88%|████████▊ | 232/265 [03:52<00:32,  1.03it/s]

IPR051025: 0 (0 unique)


 88%|████████▊ | 233/265 [03:53<00:31,  1.03it/s]

IPR054429: 0 (0 unique)


 88%|████████▊ | 234/265 [03:54<00:30,  1.02it/s]

IPR050527: 0 (0 unique)


 89%|████████▊ | 235/265 [03:55<00:29,  1.02it/s]

IPR052311: 0 (0 unique)


 89%|████████▉ | 236/265 [03:56<00:28,  1.01it/s]

IPR050198: 0 (0 unique)


 89%|████████▉ | 237/265 [03:57<00:27,  1.03it/s]

IPR052093: 0 (0 unique)


 90%|████████▉ | 238/265 [03:58<00:26,  1.03it/s]

IPR052954: 0 (0 unique)


 90%|█████████ | 239/265 [03:58<00:24,  1.04it/s]

IPR053879: 0 (0 unique)


 91%|█████████ | 240/265 [03:59<00:23,  1.05it/s]

IPR050866: 0 (0 unique)


 91%|█████████ | 241/265 [04:00<00:23,  1.04it/s]

IPR051092: 0 (0 unique)


 91%|█████████▏| 242/265 [04:01<00:21,  1.05it/s]

IPR050149: 0 (0 unique)


 92%|█████████▏| 243/265 [04:02<00:20,  1.06it/s]

IPR051070: 0 (0 unique)


 92%|█████████▏| 244/265 [04:03<00:20,  1.04it/s]

IPR051519: 0 (0 unique)


 92%|█████████▏| 245/265 [04:04<00:19,  1.04it/s]

IPR053014: 0 (0 unique)


 93%|█████████▎| 246/265 [04:05<00:18,  1.02it/s]

IPR052208: 0 (0 unique)


 93%|█████████▎| 247/265 [04:06<00:17,  1.02it/s]

IPR050776: 0 (0 unique)


 94%|█████████▎| 248/265 [04:07<00:16,  1.03it/s]

IPR050440: 0 (0 unique)


 94%|█████████▍| 249/265 [04:08<00:15,  1.02it/s]

IPR051738: 0 (0 unique)


 94%|█████████▍| 250/265 [04:09<00:14,  1.02it/s]

IPR052434: 0 (0 unique)


 95%|█████████▍| 251/265 [04:10<00:13,  1.02it/s]

IPR050116: 0 (0 unique)


 95%|█████████▌| 252/265 [04:11<00:12,  1.04it/s]

IPR052403: 0 (0 unique)


 95%|█████████▌| 253/265 [04:12<00:11,  1.05it/s]

IPR050818: 0 (0 unique)


 96%|█████████▌| 254/265 [04:13<00:10,  1.06it/s]

IPR051337: 0 (0 unique)


 96%|█████████▌| 255/265 [04:14<00:09,  1.06it/s]

IPR050886: 0 (0 unique)


 97%|█████████▋| 256/265 [04:15<00:08,  1.04it/s]

IPR051382: 0 (0 unique)


 97%|█████████▋| 257/265 [04:16<00:07,  1.04it/s]

IPR051093: 0 (0 unique)


 97%|█████████▋| 258/265 [04:17<00:06,  1.04it/s]

IPR052224: 0 (0 unique)


 98%|█████████▊| 259/265 [04:18<00:05,  1.04it/s]

IPR038441: 0 (0 unique)


 98%|█████████▊| 260/265 [04:19<00:04,  1.07it/s]

IPR036366: 0 (0 unique)


 98%|█████████▊| 261/265 [04:19<00:03,  1.09it/s]

IPR050342: 0 (0 unique)


 99%|█████████▉| 262/265 [04:20<00:02,  1.10it/s]

IPR004323: 0 (0 unique)


 99%|█████████▉| 263/265 [04:21<00:01,  1.08it/s]

IPR011322: 0 (0 unique)


100%|█████████▉| 264/265 [04:22<00:00,  1.07it/s]

IPR000077: 0 (0 unique)


100%|██████████| 265/265 [04:23<00:00,  1.01it/s]

IPR023626: 0 (0 unique)





204 unique novel genes associated with overrepresented IPRs, none of which are associated explicitly with TEs.

- 85 Anno
- 49 BRAKER3
- 70 Helixer

In [46]:
print(len(non_te_unique_transcripts))
print(len([t for t in non_te_unique_transcripts if t.startswith(anno_prefix)]))
print(len([t for t in non_te_unique_transcripts if t.startswith(braker_prefix)]))
print(len([t for t in non_te_unique_transcripts if t.startswith(helixer_prefix)]))

# for t in non_te_unique_transcripts:
#     print(t)

235
93
65
77


In [47]:
te_unique_transcripts

set()

In [50]:
interpro_accessions_frequently_missed_by_all_tools(flatten_nested_dict(acc_product), acc_tally_no_tool, no_tools_results, min_freq=5)

InterPro accessions occurring with significantly higher frequency in transcripts that were missed by all tools, than in transcripts shared by at least 1 tool:
	IPR001254: Serine proteases, trypsin domain (20 occurrences, 0 expected)
	IPR043504: Peptidase S1, PA clan, chymotrypsin-like fold (20 occurrences, 0 expected)
	IPR009003: Peptidase S1, PA clan (21 occurrences, 0 expected)
	IPR007125: Histone H2A/H2B/H3 (5 occurrences, 0 expected)
	IPR009072: Histone-fold (8 occurrences, 0 expected)

InterPro accessions that are completely missing from transcripts shared by at least 1 tool, but present in transcripts that were missed by all tools:

InterPro accessions occurring as expected in transcripts that were missed by all tools.


Find accessions that are significantly commonly missed by each tool.

In [51]:
interpro_accessions_frequently_missed_by_each_tool(
    flatten_nested_dict(acc_product),
    {
        "BRAKER3": braker3_missed_results,
        "Helixer": helixer_missed_results,
        "Anno": anno_missed_results
    },
    {
        "BRAKER3": acc_tally_missed_braker3,
        "Helixer": acc_tally_missed_helixer,
        "Anno": acc_tally_missed_anno
    }
)

IPR015898: G-protein gamma-like domain
40.6 times more likely than with BRAKER3 (2 occurrences, 0 expected)
46.25 times more likely than with Helixer (2 occurrences, 0 expected)
69.27 times more likely than with Anno (2 occurrences, 0 expected)
~~~~~~~~~~
IPR036284: G-protein gamma-like domain superfamily
40.6 times more likely than with BRAKER3 (2 occurrences, 0 expected)
46.25 times more likely than with Helixer (2 occurrences, 0 expected)
69.27 times more likely than with Anno (2 occurrences, 0 expected)
~~~~~~~~~~
IPR002119: Histone H2A
16.26 times more likely than with BRAKER3 (4 occurrences, 0 expected)
69.61 times more likely than with Helixer (6 occurrences, 0 expected)
20.81 times more likely than with Anno (3 occurrences, 0 expected)
~~~~~~~~~~
IPR032458: Histone H2A conserved site
15.24 times more likely than with BRAKER3 (3 occurrences, 0 expected)
115.92 times more likely than with Helixer (5 occurrences, 0 expected)
17.32 times more likely than with Anno (2 occurrences, 0

In [53]:
import re
MIN_WORD_LEN = 5
tools_missed_results = {
    "BRAKER3": braker3_missed_results,
    "Helixer": helixer_missed_results,
    "Anno": anno_missed_results
}
tools_novel_results = {
    "BRAKER3": braker3_novel_results,
    "Helixer": helixer_novel_results,
    "Anno": anno_novel_results
}
acc_tally_missed_tools = {
    "BRAKER3": acc_tally_missed_braker3,
    "Helixer": acc_tally_missed_helixer,
    "Anno": acc_tally_missed_anno
}
acc_tally_novel_tools = {
    "BRAKER3": acc_tally_novel_braker3,
    "Helixer": acc_tally_novel_helixer,
    "Anno": acc_tally_novel_anno
}
seen_words = set()
all_words = set()
for acc in set.intersection(*[set(results["more_frequent"].keys()) for results in tools_missed_results.values()]):
    prod = flatten_nested_dict(acc_product)[acc]
    words = [w for w in re.split(r'\W', prod.lower()) if w and len(w) >= MIN_WORD_LEN]
    all_words.update(words)
    # Check if words occur more frequently in novel-only accessions
    for novel_acc in set.intersection(*[set(results["more_frequent"].keys()) for results in tools_novel_results.values()]):
        novel_prod = flatten_nested_dict(acc_product)[novel_acc]
        for w in words:
            if w in novel_prod.lower():
                print(f"{w} appears in {novel_acc}: {novel_prod}")
                seen_words.add(w)

In [54]:
all_words.difference(seen_words)

{'chymotrypsin',
 'conserved',
 'domain',
 'gamma',
 'histone',
 'patch',
 'peptidase',
 'proteases',
 'protein',
 'serine',
 'superfamily',
 'trypsin',
 'uncharacterised',
 'upf0506'}

# Novel orthologues

In [55]:
print("Novel transcripts (% of total predicted by tool)")

shared_braker_genes = len(og_df[~og_df[wbps_col].isna() & ~og_df[braker_col].isna()][braker_col].str.split(",").explode().unique())
novel_braker_orths = len(og_df[og_df[wbps_col].isna() & ~og_df[braker_col].isna()][braker_col].str.split(",").explode().unique())
novel_braker_ungs = len(no_og_df[no_og_df[wbps_col].isna() & ~no_og_df[braker_col].isna()][braker_col].str.split(",").explode().unique())
assert shared_braker_genes + novel_braker_orths + novel_braker_ungs == len(braker_species.all_transcript_ids)
print(f"BRAKER3: {novel_braker_orths + novel_braker_ungs} ({round(100*(novel_braker_orths + novel_braker_ungs)/len(braker_species.all_transcript_ids), 2)}%)")


shared_helixer_genes = len(og_df[~og_df[wbps_col].isna() & ~og_df[helixer_col].isna()][helixer_col].str.split(",").explode().unique())
novel_helixer_orths = len(og_df[og_df[wbps_col].isna() & ~og_df[helixer_col].isna()][helixer_col].str.split(",").explode().unique())
novel_helixer_ungs = len(no_og_df[no_og_df[wbps_col].isna() & ~no_og_df[helixer_col].isna()][helixer_col].str.split(",").explode().unique())
assert shared_helixer_genes + novel_helixer_orths + novel_helixer_ungs == len(helixer_species.all_transcript_ids)
print(f"Helixer: {novel_helixer_orths + novel_helixer_ungs} ({round(100*(novel_helixer_orths + novel_helixer_ungs)/len(helixer_species.all_transcript_ids), 2)}%)")

shared_anno_genes = len(og_df[~og_df[wbps_col].isna() & ~og_df[anno_col].isna()][anno_col].str.split(",").explode().unique())
novel_anno_orths = len(og_df[og_df[wbps_col].isna() & ~og_df[anno_col].isna()][anno_col].str.split(",").explode().unique())
novel_anno_ungs = len(no_og_df[no_og_df[wbps_col].isna() & ~no_og_df[anno_col].isna()][anno_col].str.split(",").explode().unique())
assert shared_anno_genes + novel_anno_orths + novel_anno_ungs == len(anno_species.all_transcript_ids)
print(f"Anno: {novel_anno_orths + novel_anno_ungs} ({round(100*(novel_anno_orths + novel_anno_ungs)/len(anno_species.all_transcript_ids), 2)}%)")


Novel transcripts (% of total predicted by tool)
BRAKER3: 508 (5.59%)
Helixer: 1164 (11.76%)
Anno: 2720 (21.25%)


In [4]:
## Run once to populate
# novel_orthologue_pipeline(no_og_df, wbps_col, anno_col, species_list, out_dir="data/novel_orthologue_sequences/sman/anno/unassigned_genes/")
# novel_orthologue_pipeline(no_og_df, wbps_col, braker_col, species_list, out_dir="data/novel_orthologue_sequences/sman/braker3/unassigned_genes/")
# novel_orthologue_pipeline(no_og_df, wbps_col, helixer_col, species_list, out_dir="data/novel_orthologue_sequences/sman/helixer/unassigned_genes/")

100%|██████████| 2437/2437 [00:01<00:00, 1677.62it/s]
100%|██████████| 2437/2437 [00:06<00:00, 358.46it/s] 


In [56]:
# novel_orthologue_pipeline(og_df, wbps_col, anno_col, species_list)
# novel_orthologue_pipeline(og_df, wbps_col, braker_col, species_list)
# novel_orthologue_pipeline(og_df, wbps_col, helixer_col, species_list)
anno_esm_means = extract_esm_means("data/from_MARS/Sman_esm_pLDDTs_anno.txt").values()
braker3_esm_means = extract_esm_means("data/from_MARS/Sman_esm_pLDDTs_braker3.txt").values()
helixer_esm_means = extract_esm_means("data/from_MARS/Sman_esm_pLDDTs_helixer.txt").values()

print(statistics.mean(map(float, anno_esm_means)))
print(statistics.mean(map(float, braker3_esm_means)))
print(statistics.mean(map(float, helixer_esm_means)))

def boxplot_axes(idx, label, data, ylim=(0, 100)):
    ax = plt.subplot(1, 4, idx)
    ax.boxplot(data)
    if ylim:
        ax.set_ylim(top=ylim[1], bottom=ylim[0])
    ax.grid(which="both")
    ax.set_title(label.upper())
    ax.set_xticklabels([])
    ax.set_xlabel(round(statistics.mean(data), 2))

boxplot_axes(idx=1, label="anno", data=list(map(float, anno_esm_means)))
boxplot_axes(idx=2, label="braker3", data=list(map(float, braker3_esm_means)))
boxplot_axes(idx=3, label="helixer", data=list(map(float, helixer_esm_means)))
plt.show()


# anno_esm_means

54.20466926070039
57.278228782287826
54.64353233830846


In [57]:
import pandas as pd
cols = (
    "fn",
    "mean",
    "median",
    "stdev",
    "var",
    "max",
    "min",
    "perc_confident"
)
df = pd.read_csv("data/from_MARS/pLDDT_sman.csv", names=cols)
print(f"Mean of means: {df['mean'].mean()}")
print(f"% that are \"Confident\": {100*df[df['mean'] >= 70].shape[0]/df.shape[0]}")

# df.count()

Mean of means: 70.15563125399191
% that are "Confident": 54.939322972109856


In [58]:
print(f"% BRAKER3 that are \"Confident\": {100*len([p for p in braker3_esm_means if float(p) >= 70]) / len(braker3_esm_means)}")
print(f"% Anno that are \"Confident\": {100*len([p for p in anno_esm_means if float(p) >= 70]) / len(anno_esm_means)}")
print(f"% Helixer that are \"Confident\": {100*len([p for p in helixer_esm_means if float(p) >= 70]) / len(helixer_esm_means)}")

% BRAKER3 that are "Confident": 31.73431734317343
% Anno that are "Confident": 24.90272373540856
% Helixer that are "Confident": 21.890547263681594


In [59]:

df_anno = pd.read_csv("data/from_MARS/pLDDT_sman_anno.csv", names=cols)
df_braker3 = pd.read_csv("data/from_MARS/pLDDT_sman_braker3.csv", names=cols)
df_helixer = pd.read_csv("data/from_MARS/pLDDT_sman_helixer.csv", names=cols)

In [60]:
print(f"Mean of WBPS % that are \"Confident\" residues: {df['perc_confident'].mean()}")
print(f"Mean of BRAKER3 % that are \"Confident\" residues: {df_braker3['perc_confident'].mean()}")
print(f"Mean of Anno % that are \"Confident\" residues: {df_anno['perc_confident'].mean()}")
print(f"Mean of Helixer % that are \"Confident\" residues: {df_helixer['perc_confident'].mean()}")


Mean of WBPS % that are "Confident" residues: 57.85118160527997
Mean of BRAKER3 % that are "Confident" residues: 35.38376383763838
Mean of Anno % that are "Confident" residues: 30.747081712062258
Mean of Helixer % that are "Confident" residues: 30.46153846153846


In [61]:
df_anno[(df_anno["mean"] > 50) & (df_anno["perc_confident"] == 0)].sort_values("stdev")

Unnamed: 0,fn,mean,median,stdev,var,max,min,perc_confident
319,transcript:ENSSMAT00000005813.pdb,56,57,6,36,65,36,0
186,transcript:ENSSMAT00000008498.pdb,55,55,7,46,69,38,0
215,transcript:ENSSMAT00000034886.pdb,56,58,7,44,65,42,0
80,transcript:ENSSMAT00000012469.pdb,54,54,8,65,67,36,0
47,transcript:ENSSMAT00000034121.pdb,53,55,9,73,67,25,0
146,transcript:ENSSMAT00000026481.pdb,51,54,10,100,66,34,0
242,transcript:ENSSMAT00000028062.pdb,51,47,10,93,69,34,0
362,transcript:ENSSMAT00000030008.pdb,51,52,11,113,69,30,0


In [32]:
df_braker3[(df_braker3["mean"] > 50) & (df_braker3["perc_confident"] == 0)].sort_values("stdev")

Unnamed: 0,fn,mean,median,stdev,var,max,min,perc_confident
46,g2824.t1.pdb,55,55,3,12,62,49,0
15,g2879.t1.pdb,57,57,4,17,68,50,0
138,g2535.t1.pdb,61,62,5,25,70,49,0
117,g5842.t1.pdb,54,55,7,45,65,32,0
54,g1749.t1.pdb,51,52,11,113,69,30,0


In [33]:
df_helixer[(df_helixer["mean"] < 60) & (df_helixer["perc_confident"] > 40)]

Unnamed: 0,fn,mean,median,stdev,var,max,min,perc_confident
363,Schistosoma_mansoni_SM_V10_Z_001959.1.pdb,58,61,22,474,90,22,41


In [62]:
df_helixer[(df_helixer["mean"] > 50) & (df_helixer["perc_confident"] == 0)].sort_values("stdev")

Unnamed: 0,fn,mean,median,stdev,var,max,min,perc_confident
325,Schistosoma_mansoni_SM_V10_4_001163.1.pdb,53,53,3,10,59,46,0
237,Schistosoma_mansoni_SM_V10_6_000170.1.pdb,59,58,5,20,67,52,0
282,Schistosoma_mansoni_SM_V10_2_000542.1.pdb,51,50,6,36,65,41,0
304,Schistosoma_mansoni_SM_V10_2_000788.1.pdb,51,49,6,36,62,41,0
378,Schistosoma_mansoni_SM_V10_5_000309.1.pdb,51,53,6,35,61,36,0
21,Schistosoma_mansoni_SM_V10_1_000685.1.pdb,53,53,7,55,69,42,0
45,Schistosoma_mansoni_SM_V10_4_000066.1.pdb,57,59,8,69,69,19,0
48,Schistosoma_mansoni_SM_V10_4_000006.1.pdb,51,48,8,62,64,39,0
316,Schistosoma_mansoni_SM_V10_2_001157.1.pdb,51,50,8,59,65,35,0
332,Schistosoma_mansoni_SM_V10_2_000183.1.pdb,53,55,8,60,66,35,0
