In [1]:
import os.path
import statistics

from orthologue_analysis.orthogroups import init_orthogroup_df
from orthologue_analysis.species import MansoniClade, SpeciesList, Species, AltSourceMixin
from orthologue_analysis.utils import SequenceIDMapping, orthofinder_paths
from reannotation.analysis import (
    interpro_accessions_frequently_missed_by_each_tool,
    interpro_accessions_frequently_missed_by_all_tools,
    interpro_accessions_in_novel_transcripts,
    interpro_accessions_in_missed_transcripts,
    missed_transcripts_with_significantly_more_frequent_accessions
)
from reannotation.pipelines import (
    interpro_accession_pipeline,
    interpro_accession_pipeline_all_tools,
    suspicious_orthologue_pipeline,
    novel_orthologue_pipeline
)
from reannotation.statistics import fisher_exact_for_two_lists_of_accessions
from utils.esm import extract_esm_means
from utils.gffutils import init_db


class Haemonchus(Species):
    abbr = "H"
    genus = "haemonchus"
    clade = 0


class HaemonchusFromTool(AltSourceMixin, Haemonchus):
    pass


results_label = "Results_Aug08"
wbps_ann_path = "data/from_WBPS/haemonchus_contortus.PRJEB506.WBPS19.annotations.gff3"
braker_path = "data/from_MARS/Haemonchus_contortus_braker3_full.gff3"
helixer_path = "data/from_MARS/Haemonchus_contortus_helixer_full.gff3"
anno_path = "data/from_EBI/haemonchus_contortus_gca000469685v2.gff3"
db = init_db(wbps_ann_path, "db/Hcon_wbps.db")
of = orthofinder_paths(results_label)

wbps_col = "Hcon_LT"
braker_col = "Hcon_braker3_LT"
helixer_col = "Hcon_helixer_LT"
anno_col = "Hcon_anno_LT"

hog_df = init_orthogroup_df(of["orthogroups"])
seq_id_map = SequenceIDMapping(of["wd"])
mars_data_dir = os.path.join("data", "from_MARS", "")
ebi_data_dir = os.path.join("data", "from_EBI", "")

species_list = SpeciesList([
    HaemonchusFromTool("contortus", data_dir=mars_data_dir, data_label="Hcon_LT", prot_filename_suffix=".fa"),
    HaemonchusFromTool("contortus_braker3_reann", data_dir=mars_data_dir, data_label="Hcon_braker3_LT", prot_filename_suffix=".fa"),
    HaemonchusFromTool("contortus_helixer_reann", data_dir=mars_data_dir, data_label="Hcon_helixer_LT", prot_filename_suffix=".fa"),
    HaemonchusFromTool("contortus_anno_reann", data_dir=ebi_data_dir, data_label="Hcon_anno_LT", prot_filename_suffix=".fa")],
    wd_path=of["wd"],
    load_blast=True
)

wbps_species = species_list.get_species_with_data_label(wbps_col)

min_freq = 10

interproscan_dir = "data/from_MARS/interproscan/hcon"
acc_product = {}

loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug08/Blast0_1.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug08/Blast0_2.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug08/Blast0_3.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug08/Blast0_2.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug08/Blast0_3.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug08/Blast0_1.txt...


# General Statistics

In [2]:
print("Shared orthologues with WBPS:")
print("WBPS: {}".format(len(hog_df[~hog_df[wbps_col].isna()])))
print("BRAKER3: {}".format(len(hog_df[~hog_df[wbps_col].isna() & ~hog_df[braker_col].isna()])))
print("Helixer: {}".format(len(hog_df[~hog_df[wbps_col].isna() & ~hog_df[helixer_col].isna()])))
print("Anno: {}".format(len(hog_df[~hog_df[wbps_col].isna() & ~hog_df[anno_col].isna()])))

Shared orthologues with WBPS:
WBPS: 14973
BRAKER3: 12640
Helixer: 13614
Anno: 13577


Distribution of exon lengths

In [3]:
from matplotlib import pyplot as plt

In [None]:
# statistics_pipeline()

# Assessing merged/split genes

In [2]:
braker_merged, braker_split = suspicious_orthologue_pipeline(hog_df, wbps_col, braker_col, species_list, seq_id_map)
anno_merged, anno_split = suspicious_orthologue_pipeline(hog_df, wbps_col, anno_col, species_list, seq_id_map)
helixer_merged, helixer_split = suspicious_orthologue_pipeline(hog_df, wbps_col, helixer_col, species_list, seq_id_map)

100%|██████████| 16736/16736 [14:04<00:00, 19.82it/s] 
100%|██████████| 16736/16736 [17:13<00:00, 16.19it/s] 
100%|██████████| 16736/16736 [24:06<00:00, 11.57it/s] 


In [3]:
num_genes = len(list(species_list.get_species_with_data_label("Hcon_braker3_LT").db.all_features(featuretype="gene")))
print(f"BRAKER3: merged={len(braker_merged)}, split={len(braker_split)}, total={round(100*(len(braker_split) + len(braker_merged)*2)/num_genes, 2)}")
num_genes = len(list(species_list.get_species_with_data_label("Hcon_helixer_LT").db.all_features(featuretype="gene")))
print(f"Helixer: merged={len(helixer_merged)}, split={len(helixer_split)}, total={round(100*(len(helixer_split) + len(helixer_merged)*2)/num_genes, 2)}")
num_genes = len(list(species_list.get_species_with_data_label("Hcon_anno_LT").db.all_features(featuretype="gene")))
print(f"Anno: merged={len(anno_merged)}, split={len(anno_split)}, total={round(100*(len(anno_split) + len(anno_merged)*2)/num_genes, 2)}")



BRAKER3: merged=57, split=53, total=1.15
Helixer: merged=174, split=80, total=2.23
Anno: merged=306, split=55, total=3.19


# InterPro accession investigation

### BRAKER3

In [4]:
acc_product, acc_tally_shared, acc_tally_missed_braker3, acc_tally_novel_braker3, missed_transcripts = interpro_accession_pipeline(db, hog_df, wbps_col, braker_col, interproscan_dir, acc_product)

# Find InterPro accessions occurring with significantly different frequency than in control (acc_tally_shared)
braker3_novel_results = fisher_exact_for_two_lists_of_accessions(acc_tally_novel_braker3, acc_tally_shared)
braker3_missed_results = fisher_exact_for_two_lists_of_accessions(acc_tally_missed_braker3, acc_tally_shared)

In [5]:
interpro_accessions_in_novel_transcripts(acc_product, acc_tally_novel_braker3, braker3_novel_results, min_freq=10)

InterPro accessions occurring with significantly higher frequency in novel transcripts than in shared transcripts:
	IPR001506: Peptidase M12A  (90 occurrences, 4 expected)
	IPR001283: Cysteine-rich secretory protein-related  (87 occurrences, 4 expected)
	IPR000609: 7TM GPCR, serpentine receptor class g (Srg) (21 occurrences, 1 expected)
	IPR006652: Kelch repeat type 1  (20 occurrences, 1 expected)
	IPR002413: Venom allergen 5-like  (38 occurrences, 3 expected)
	IPR035940: CAP superfamily (54 occurrences, 5 expected)
	IPR014044: CAP domain  (46 occurrences, 4 expected)
	IPR018244: Allergen V5/Tpx-1-related, conserved site  (13 occurrences, 1 expected)
	IPR002486: Nematode cuticle collagen, N-terminal  (51 occurrences, 6 expected)
	IPR008160: Collagen triple helix repeat (40 occurrences, 6 expected)
	IPR002035: von Willebrand factor, type A  (11 occurrences, 2 expected)
	IPR013087: Zinc finger C2H2-type (45 occurrences, 7 expected)
	IPR001628: Zinc finger, nuclear hormone receptor-type  

In [6]:
interpro_accessions_in_missed_transcripts(acc_product, acc_tally_missed_braker3, acc_tally_novel_braker3, braker3_missed_results, braker3_novel_results, min_freq=75)

InterPro accessions that are completely missing from shared transcripts, with high frequency in missed transcripts:

InterPro accessions occurring with significantly higher frequency in missed transcripts than in shared transcripts:
	IPR005312: Protein of unknown function DUF1759  (155 occurrences, 6 expected)
	IPR001584: Integrase, catalytic core  (164 occurrences, 7 expected)
	IPR041588: Integrase zinc-binding domain  (152 occurrences, 6 expected)
	IPR001995: Peptidase A2A, retrovirus, catalytic  (94 occurrences, 4 expected)
	IPR008737: Peptidase aspartic, putative  (94 occurrences, 4 expected)
	IPR008042: Retrotransposon, Pao  (123 occurrences, 6 expected)
	IPR040676: Domain of unknown function DUF5641 (87 occurrences, 4 expected)
	IPR036397: Ribonuclease H superfamily  (402 occurrences, 24 expected)
	IPR018244: Allergen V5/Tpx-1-related, conserved site  (97 occurrences, 6 expected)
		IPR018244 also significantly more frequent in novel transcripts (13 occurrences, 1 expected)
	IPR00

In [7]:
missed_transcripts_with_significantly_more_frequent_accessions(wbps_species.db, missed_transcripts, acc_tally_missed_braker3, braker3_missed_results, min_freq)

hcontortus_chr4_Celeg_TT_arrow_pilon - HCON_00127470-00001 - {'IPR036397', 'IPR038717'}
hcontortus_chr4_Celeg_TT_arrow_pilon - HCON_00120440-00001 - {'IPR035940'}
hcontortus_chr4_Celeg_TT_arrow_pilon - HCON_00112380-00001 - {'IPR035940'}
hcontortus_chr4_Celeg_TT_arrow_pilon - HCON_00110860-00001 - {'IPR021109', 'IPR001969'}
hcontortus_chr4_Celeg_TT_arrow_pilon - HCON_00110640-00001 - {'IPR021109', 'IPR001969'}
hcontortus_chr4_Celeg_TT_arrow_pilon - HCON_00110340-00001 - {'IPR036397', 'IPR025898', 'IPR048703'}
hcontortus_chr4_Celeg_TT_arrow_pilon - HCON_00113120-00001 - {'IPR021109', 'IPR001969'}
hcontortus_chr4_Celeg_TT_arrow_pilon - HCON_00110900-00001 - {'IPR035940'}
hcontortus_chr4_Celeg_TT_arrow_pilon - HCON_00100285-00001 - {'IPR021109'}
hcontortus_chr4_Celeg_TT_arrow_pilon - HCON_00192680-00001 - {'IPR035940', 'IPR001283', 'IPR014044'}
hcontortus_chr4_Celeg_TT_arrow_pilon - HCON_00121040-00001 - {'IPR036397', 'IPR012337'}
hcontortus_chr4_Celeg_TT_arrow_pilon - HCON_00110400-00001

### Helixer

In [8]:
acc_product, acc_tally_shared, acc_tally_missed_helixer, acc_tally_novel_helixer, missed_transcripts = interpro_accession_pipeline(db, hog_df, wbps_col, helixer_col, interproscan_dir, acc_product)

# Find InterPro accessions occurring with significantly different frequency than in control (acc_tally_shared)
helixer_novel_results = fisher_exact_for_two_lists_of_accessions(acc_tally_novel_helixer, acc_tally_shared)
helixer_missed_results = fisher_exact_for_two_lists_of_accessions(acc_tally_missed_helixer, acc_tally_shared)

In [9]:
interpro_accessions_in_novel_transcripts(acc_product, acc_tally_novel_helixer, helixer_novel_results, min_freq)

InterPro accessions occurring with significantly higher frequency in novel transcripts than in shared transcripts:
	IPR002181: Fibrinogen, alpha/beta/gamma chain, C-terminal globular domain  (14 occurrences, 1 expected)
	IPR001506: Peptidase M12A  (87 occurrences, 4 expected)
	IPR001791: Laminin G domain  (11 occurrences, 1 expected)
	IPR006652: Kelch repeat type 1  (35 occurrences, 2 expected)
	IPR001283: Cysteine-rich secretory protein-related  (105 occurrences, 7 expected)
	IPR004000: Actin family  (10 occurrences, 1 expected)
	IPR000609: 7TM GPCR, serpentine receptor class g (Srg) (10 occurrences, 1 expected)
	IPR019421: 7TM GPCR, serpentine receptor class d (Srd) (14 occurrences, 1 expected)
	IPR002413: Venom allergen 5-like  (51 occurrences, 5 expected)
	IPR035940: CAP superfamily (70 occurrences, 8 expected)
	IPR002486: Nematode cuticle collagen, N-terminal  (49 occurrences, 6 expected)
	IPR019426: 7TM GPCR, serpentine receptor class v (Srv) (24 occurrences, 3 expected)
	IPR0013

In [10]:
interpro_accessions_in_missed_transcripts(acc_product, acc_tally_missed_helixer, acc_tally_novel_helixer, helixer_missed_results, helixer_novel_results, min_freq=30)

InterPro accessions that are completely missing from shared transcripts, with high frequency in missed transcripts:

InterPro accessions occurring with significantly higher frequency in missed transcripts than in shared transcripts:
	IPR027124: SWR1-complex protein 5/Craniofacial development protein 1/2  (36 occurrences, 2 expected)
	IPR036691: Endonuclease/exonuclease/phosphatase superfamily  (149 occurrences, 11 expected)
		IPR036691 also significantly more frequent in novel transcripts (20 occurrences, 6 expected)
	IPR005135: Endonuclease/exonuclease/phosphatase  (55 occurrences, 7 expected)
		IPR005135 also significantly more frequent in novel transcripts (11 occurrences, 4 expected)
	IPR000477: Reverse transcriptase domain  (252 occurrences, 36 expected)
		IPR000477 also significantly more frequent in novel transcripts (36 occurrences, 21 expected)
	IPR001461: Aspartic peptidase A1 family  (31 occurrences, 5 expected)
	IPR033121: Peptidase family A1 domain (31 occurrences, 5 expec

### Anno

In [11]:
acc_product, acc_tally_shared, acc_tally_missed_anno, acc_tally_novel_anno, missed_transcripts = interpro_accession_pipeline(db, hog_df, wbps_col, anno_col, interproscan_dir, acc_product)

# Find InterPro accessions occurring with significantly different frequency than in control (acc_tally_shared)
anno_novel_results = fisher_exact_for_two_lists_of_accessions(acc_tally_novel_anno, acc_tally_shared)
anno_missed_results = fisher_exact_for_two_lists_of_accessions(acc_tally_missed_anno, acc_tally_shared)

In [12]:
interpro_accessions_in_novel_transcripts(acc_product, acc_tally_novel_anno, anno_novel_results, min_freq=10)

InterPro accessions occurring with significantly higher frequency in novel transcripts than in shared transcripts:
	IPR004000: Actin family  (16 occurrences, 1 expected)
	IPR006652: Kelch repeat type 1  (32 occurrences, 1 expected)
	IPR001791: Laminin G domain  (11 occurrences, 1 expected)
	IPR000609: 7TM GPCR, serpentine receptor class g (Srg) (17 occurrences, 1 expected)
	IPR001283: Cysteine-rich secretory protein-related  (113 occurrences, 7 expected)
	IPR001506: Peptidase M12A  (67 occurrences, 5 expected)
	IPR002413: Venom allergen 5-like  (48 occurrences, 5 expected)
	IPR013087: Zinc finger C2H2-type (71 occurrences, 8 expected)
	IPR035940: CAP superfamily (60 occurrences, 8 expected)
	IPR003439: ABC transporter-like, ATP-binding domain  (16 occurrences, 2 expected)
	IPR014044: CAP domain  (54 occurrences, 8 expected)
	IPR004151: 7TM GPCR, serpentine receptor class e (Sre) (10 occurrences, 1 expected)
	IPR002486: Nematode cuticle collagen, N-terminal  (42 occurrences, 6 expected)

In [13]:
interpro_accessions_in_missed_transcripts(acc_product, acc_tally_missed_anno, acc_tally_novel_anno, anno_missed_results, anno_novel_results, min_freq=50)

InterPro accessions that are completely missing from shared transcripts, with high frequency in missed transcripts:

InterPro accessions occurring with significantly higher frequency in missed transcripts than in shared transcripts:
	IPR040676: Domain of unknown function DUF5641 (52 occurrences, 6 expected)
	IPR000477: Reverse transcriptase domain  (329 occurrences, 39 expected)
		IPR000477 also significantly more frequent in novel transcripts (33 occurrences, 18 expected)
	IPR008737: Peptidase aspartic, putative  (54 occurrences, 6 expected)
	IPR001995: Peptidase A2A, retrovirus, catalytic  (53 occurrences, 6 expected)
	IPR036691: Endonuclease/exonuclease/phosphatase superfamily  (126 occurrences, 17 expected)
		IPR036691 also significantly more frequent in novel transcripts (14 occurrences, 8 expected)
	IPR043502: DNA/RNA polymerase superfamily (299 occurrences, 40 expected)
	IPR043128: Reverse transcriptase/Diguanylate cyclase domain  (177 occurrences, 24 expected)
	IPR021109: Aspar

### General

Find accessions that are significantly commonly missed by **all** tools

In [14]:
acc_tally_no_tool, acc_tally_one_plus_tool_shared = interpro_accession_pipeline_all_tools(wbps_species.db, hog_df, wbps_col, [braker_col, helixer_col, anno_col])

no_tools_results = fisher_exact_for_two_lists_of_accessions(acc_tally_no_tool, acc_tally_one_plus_tool_shared)

In [15]:
interpro_accessions_frequently_missed_by_all_tools(acc_product, acc_tally_no_tool, no_tools_results)

InterPro accessions occurring with significantly higher frequency in transcripts that were missed by all tools, than in transcripts shared by at least 1 tool:
	IPR049163: DNA helicase Pif1-like, 2B domain (5 occurrences, 0 expected)
	IPR036691: Endonuclease/exonuclease/phosphatase superfamily  (85 occurrences, 5 expected)
	IPR005135: Endonuclease/exonuclease/phosphatase  (40 occurrences, 3 expected)
	IPR001461: Aspartic peptidase A1 family  (24 occurrences, 2 expected)
	IPR033121: Peptidase family A1 domain (24 occurrences, 2 expected)
	IPR048703: Transposable element Tc3 transposase-like, DNA-binding HTH domain (6 occurrences, 0 expected)
	IPR000477: Reverse transcriptase domain  (158 occurrences, 14 expected)
	IPR018244: Allergen V5/Tpx-1-related, conserved site  (29 occurrences, 3 expected)
	IPR043502: DNA/RNA polymerase superfamily (145 occurrences, 14 expected)
	IPR025898: Tc3 transposase, DNA binding domain  (4 occurrences, 0 expected)
	IPR010285: DNA helicase Pif1-like (3 occurr

Find accessions that are significantly commonly missed by each tool.

In [16]:
from reannotation.analysis import interpro_accessions_frequently_missed_by_each_tool
interpro_accessions_frequently_missed_by_each_tool(
    acc_product,
    {
        "BRAKER3": braker3_missed_results,
        "Helixer": helixer_missed_results,
        "Anno": anno_missed_results
    },
    {
        "BRAKER3": acc_tally_missed_braker3,
        "Helixer": acc_tally_missed_helixer,
        "Anno": acc_tally_missed_anno
    }
)

IPR048703: Transposable element Tc3 transposase-like, DNA-binding HTH domain
11.67 times more likely than with BRAKER3 (17 occurrences, 1 expected)
12.19 times more likely than with Helixer (12 occurrences, 1 expected)
6.67 times more likely than with Anno (10 occurrences, 1 expected)
~~~~~~~~~~
IPR005135: Endonuclease/exonuclease/phosphatase 
3.65 times more likely than with BRAKER3 (59 occurrences, 16 expected)
8.27 times more likely than with Helixer (55 occurrences, 7 expected)
6.52 times more likely than with Anno (56 occurrences, 9 expected)
~~~~~~~~~~
IPR005312: Protein of unknown function DUF1759 
28.08 times more likely than with BRAKER3 (155 occurrences, 6 expected)
3.79 times more likely than with Helixer (43 occurrences, 11 expected)
6.94 times more likely than with Anno (77 occurrences, 11 expected)
~~~~~~~~~~
IPR008737: Peptidase aspartic, putative 
21.72 times more likely than with BRAKER3 (94 occurrences, 4 expected)
3.57 times more likely than with Helixer (26 occurren

In [17]:
import re
MIN_WORD_LEN = 4
tools_missed_results = {
    "BRAKER3": braker3_missed_results,
    "Helixer": helixer_missed_results,
    "Anno": anno_missed_results
}
tools_novel_results = {
    "BRAKER3": braker3_novel_results,
    "Helixer": helixer_novel_results,
    "Anno": anno_novel_results
}
acc_tally_missed_tools = {
    "BRAKER3": acc_tally_missed_braker3,
    "Helixer": acc_tally_missed_helixer,
    "Anno": acc_tally_missed_anno
}
acc_tally_novel_tools = {
    "BRAKER3": acc_tally_novel_braker3,
    "Helixer": acc_tally_novel_helixer,
    "Anno": acc_tally_novel_anno
}
seen_words = set()
all_words = set()
for acc in set.intersection(*[set(results["more_frequent"].keys()) for results in tools_missed_results.values()]):
    prod = acc_product[acc]
    words = [w for w in re.split(r'\W', prod.lower()) if w and len(w) >= MIN_WORD_LEN]
    all_words.update(words)
    # Check if words occur more frequently in novel-only accessions
    for novel_acc in set.intersection(*[set(results["more_frequent"].keys()) for results in tools_novel_results.values()]):
        novel_prod = acc_product[novel_acc]
        for w in words:
            if w in novel_prod.lower():
                print(f"{w} appears in {novel_acc}: {novel_prod}")
                seen_words.add(w)

domain appears in IPR000210: BTB/POZ domain 
like appears in IPR011001: Saposin-like
like appears in IPR000276: G protein-coupled receptor, rhodopsin-like 
domain appears in IPR017984: Chromo domain subgroup 
like appears in IPR034035: Astacin-like metallopeptidase domain 
domain appears in IPR034035: Astacin-like metallopeptidase domain 
domain appears in IPR007582: TFIID subunit TAF5, NTD2 domain 
like appears in IPR003439: ABC transporter-like, ATP-binding domain 
binding appears in IPR003439: ABC transporter-like, ATP-binding domain 
domain appears in IPR003439: ABC transporter-like, ATP-binding domain 
domain appears in IPR002181: Fibrinogen, alpha/beta/gamma chain, C-terminal globular domain 
like appears in IPR001304: C-type lectin-like 
like appears in IPR000742: EGF-like domain 
domain appears in IPR000742: EGF-like domain 
like appears in IPR002413: Venom allergen 5-like 
domain appears in IPR014044: CAP domain 
domain appears in IPR003582: ShKT domain
domain appears in IPR01

In [18]:
all_words.difference(seen_words)

{'active',
 'aspartic',
 'associated',
 'catalytic',
 'cchc',
 'complex',
 'core',
 'craniofacial',
 'cyclase',
 'development',
 'diguanylate',
 'duf1759',
 'duf5641',
 'element',
 'function',
 'fusion',
 'glycoprotein',
 'helicase',
 'integrase',
 'mos1',
 'phlebovirus',
 'pif1',
 'polymerase',
 'putative',
 'retrotransposon',
 'retrovirus',
 'ribonuclease',
 'rnase',
 'swr1',
 'transposable',
 'transposase',
 'unknown'}

In [19]:
word = "cchc"

from collections import Counter
def frequency_of_word(word, tool_tally, acc_product):
    for tool, acc_tally in tool_tally.items():
        for acc, freq in Counter(acc_tally).items():
            prod = acc_product[acc]
            if word in prod.lower():
                print(f"{tool}: word '{word}' occurs in {acc} with frequency {freq}")
    

def find_word_in_accessions(word, acc_product):
    for acc, prod in acc_product.items():
        if word in prod.lower():
            print(f"{acc}: {prod}")

            
find_word_in_accessions(word, acc_product)
print("MISSED")
frequency_of_word(word, acc_tally_missed_tools, acc_product)
print("NOVEL")
frequency_of_word(word, acc_tally_novel_tools, acc_product)


IPR001878: Zinc finger, CCHC-type 
IPR036875: Zinc finger, CCHC-type superfamily 
IPR039846: Zinc finger CCHC domain-containing protein 4 
IPR041370: Methyltransferase EEF1AKMT1/ZCCHC4
IPR042246: Zinc finger CCHC domain-containing protein 9
IPR042344: Zinc finger CCHC domain-containing protein 14
MISSED
BRAKER3: word 'cchc' occurs in IPR001878 with frequency 99
BRAKER3: word 'cchc' occurs in IPR036875 with frequency 71
Helixer: word 'cchc' occurs in IPR001878 with frequency 19
Helixer: word 'cchc' occurs in IPR036875 with frequency 12
Anno: word 'cchc' occurs in IPR001878 with frequency 48
Anno: word 'cchc' occurs in IPR036875 with frequency 39
NOVEL
Helixer: word 'cchc' occurs in IPR001878 with frequency 3
Helixer: word 'cchc' occurs in IPR036875 with frequency 1
Anno: word 'cchc' occurs in IPR036875 with frequency 3
Anno: word 'cchc' occurs in IPR001878 with frequency 12


In [20]:
	
aspartic_peptidase_superfamily = [
    "IPR021109",
    "IPR001995",
    "IPR008503",
    "IPR011969",
    "IPR012848",
    "IPR018061",
    "IPR019103",
    "IPR024650",
    "IPR032799",
    "IPR032861",
    "IPR033121",
    "IPR033819",
    "IPR033821",
    "IPR033823",
    "IPR033866",
    "IPR033868",
    "IPR033869",
    "IPR033873",
    "IPR033874",
    "IPR033876",
    "IPR034122",
    "IPR034128",
    "IPR034129",
    "IPR034132",
    "IPR034135",
    "IPR034161",
    "IPR034162",
    "IPR034163",
    "IPR034164",
    "IPR034170",
    "IPR048054",
    "IPR001461",
    "IPR009119",
    "IPR009120",
    "IPR009121",
    "IPR024648",
    "IPR033144",
    "IPR033539",
    "IPR051708",
]

cfdp1 = ["IPR027124"]

rev_transcriptase_cyclase_superfamily = [
    "IPR043128",
    "IPR000160",
    "IPR001126",
    "IPR010659",
    "IPR010661",
    "IPR054767",
    "IPR004004",
    "IPR050469",
    "IPR052163",
]

ribonuclease_h_superfamily = [
    "IPR036397",
    "IPR012337"
]

In [21]:
for ap_acc in aspartic_peptidase_superfamily + ["IPR001969", "IPR008737"]:
    if ap_acc in acc_product:
        print(f"{ap_acc}: {acc_product[ap_acc]}")
    for tool, results in tools_missed_results.items():
        for cat, accs in results.items():
            for acc in accs:
                if acc == ap_acc:
                    print(f"\t{tool} missed - {cat}")
    for tool, results in tools_novel_results.items():
        for cat, accs in results.items():
            for acc in accs:
                if acc == ap_acc:
                    print(f"\t{tool} novel - {cat}")
    

IPR021109: Aspartic peptidase domain superfamily 
	BRAKER3 missed - more_frequent
	Helixer missed - more_frequent
	Anno missed - more_frequent
	BRAKER3 novel - as_expected
	Helixer novel - as_expected
	Anno novel - as_expected
IPR001995: Peptidase A2A, retrovirus, catalytic 
	BRAKER3 missed - more_frequent
	Helixer missed - more_frequent
	Anno missed - more_frequent
	Helixer novel - as_expected
	Anno novel - as_expected
IPR012848: Aspartic peptidase, N-terminal 
IPR018061: Retropepsins 
	BRAKER3 missed - not_occurring
IPR019103: Aspartic peptidase, DDI1-type 
IPR033121: Peptidase family A1 domain
	BRAKER3 missed - more_frequent
	Helixer missed - more_frequent
	Anno missed - more_frequent
	BRAKER3 novel - as_expected
	Helixer novel - as_expected
	Anno novel - as_expected
IPR034164: Pepsin-like domain
	BRAKER3 missed - as_expected
	Helixer missed - as_expected
	Anno missed - as_expected
	BRAKER3 novel - as_expected
IPR001461: Aspartic peptidase A1 family 
	BRAKER3 missed - more_frequent


In [22]:
# print(Counter(acc_tally_one_plus_tool_shared)["IPR034164"])
acc = "IPR012848"
print(Counter(acc_tally_novel_anno)[acc])
print(Counter(acc_tally_novel_braker3)[acc])
print(Counter(acc_tally_novel_helixer)[acc])
for k, v in braker3_novel_results.items():
    if acc in v:
        print(k)

0
0
0


In [23]:
from reannotation.utils import extract_accessions_from_transcript
from collections import defaultdict

all_cds_counts = []
ap_cds_counts = []
all_prot_lens = []
ap_prot_lens = []
transcript_count = 0
for t in wbps_species.db.all_features(featuretype="mRNA"):
    cds_exons = list(wbps_species.db.children(t, featuretype="CDS"))
    prot_len = wbps_species.get_amino_acid_count(cds_exons)
    all_cds_counts.append(len(cds_exons))
    all_prot_lens.append(prot_len)
    transcript_has_ap_acc = False
    for acc, prod in extract_accessions_from_transcript(t):
        # if acc in ["IPR021109", "IPR001995", "IPR033121", "IPR001461"]:
        if acc in aspartic_peptidase_superfamily:
            ap_cds_counts.append(len(cds_exons))
            ap_prot_lens.append(prot_len)
            transcript_has_ap_acc = True
    if transcript_has_ap_acc:
        transcript_count += 1

method:InterPro accession:IPR000924 description:Glutamyl/glutaminyl-tRNA synthetase 
method:InterPro accession:IPR001412 description:Aminoacyl-tRNA synthetase, class I, conserved site 
method:InterPro accession:IPR004514 description:Glutamine-tRNA synthetase 
method:InterPro accession:IPR007638 description:Glutaminyl-tRNA synthetase, class Ib, non-specific RNA-binding domain 2 
method:InterPro accession:IPR007639 description:Glutaminyl-tRNA synthetase, class Ib, non-specific RNA-binding domain, N-terminal 
method:InterPro accession:IPR011035 description:Large ribosomal subunit protein bL25/Gln-tRNA synthetase, anti-codon-binding domain superfamily 
method:InterPro accession:IPR014729 description:Rossmann-like alpha/beta/alpha sandwich fold 
method:InterPro accession:IPR020056 description:Large ribosomal subunit protein bL25/Gln-tRNA synthetase, N-terminal 
method:InterPro accession:IPR020058 description:Glutamyl/glutaminyl-tRNA synthetase, class Ib, catalytic domain 
method:InterPro ac

In [24]:
import statistics
print(statistics.median(all_cds_counts))
print(statistics.median(ap_cds_counts))
print(statistics.mean(all_prot_lens))
print(statistics.mean(ap_prot_lens))

transcript_count

7.0
4.0
407.471904315197
731.6088957055215


321

In [25]:
from bs4 import BeautifulSoup
import requests

url = "https://www.ebi.ac.uk/interpro/entry/InterPro/IPR036397/"
resp = requests.get(url)
soup = BeautifulSoup(resp.text)
soup.find("table", {"class": "vf-table"})

# Novel orthologues

In [26]:
print("Novel orthogroups")
total_orthogroups = hog_df.shape[0]
braker3_novel_ogs = hog_df[(hog_df[wbps_col].isna()) & ~(hog_df[braker_col].isna())].shape[0]
helixer_novel_ogs = hog_df[(hog_df[wbps_col].isna()) & ~(hog_df[helixer_col].isna())].shape[0]
anno_novel_ogs = hog_df[(hog_df[wbps_col].isna()) & ~(hog_df[anno_col].isna())].shape[0]
print(f"BRAKER3: {braker3_novel_ogs} ({round(100*braker3_novel_ogs/total_orthogroups, 2)}%)")
print(f"Helixer: {helixer_novel_ogs} ({round(100*helixer_novel_ogs/total_orthogroups, 2)}%)")
print(f"Anno: {anno_novel_ogs} ({round(100*anno_novel_ogs/total_orthogroups, 2)}%)")

Novel orthogroups
BRAKER3: 970 (5.8%)
Helixer: 1288 (7.7%)
Anno: 1444 (8.63%)


In [27]:
# novel_orthologue_pipeline(hog_df, wbps_col, anno_col, species_list, out_dir="data/novel_orthologue_sequences/hcon/anno/")
# novel_orthologue_pipeline(hog_df, wbps_col, braker_col, species_list, out_dir="data/novel_orthologue_sequences/hcon/braker3/")
# novel_orthologue_pipeline(hog_df, wbps_col, helixer_col, species_list, out_dir="data/novel_orthologue_sequences/hcon/helixer/")
anno_esm_means = extract_esm_means("data/from_MARS/Hcon_esm_pLDDTs_anno.txt").values()
braker3_esm_means = extract_esm_means("data/from_MARS/Hcon_esm_pLDDTs_braker3.txt").values()
helixer_esm_means = extract_esm_means("data/from_MARS/Hcon_esm_pLDDTs_helixer.txt").values()

print(statistics.mean(map(float, anno_esm_means)))
print(statistics.mean(map(float, braker3_esm_means)))
print(statistics.mean(map(float, helixer_esm_means)))

52.54681440443213
57.64515463917526
53.136907536907536


In [28]:
import pandas as pd
cols = (
    "fn",
    "mean",
    "median",
    "stdev",
    "var",
    "max",
    "min",
    "perc_confident"
)
df = pd.read_csv("data/from_MARS/pLDDT_hcon.csv", names=cols)
print(f"Mean of means: {df['mean'].mean()}")
print(f"% that are \"Confident\": {100*df[df['mean'] >= 70].shape[0]/df.shape[0]}")


Mean of means: 71.23547583457898
% that are "Confident": 58.3756851021425


In [29]:

df_anno = pd.read_csv("data/from_MARS/pLDDT_hcon_anno.csv", names=cols)
df_braker3 = pd.read_csv("data/from_MARS/pLDDT_hcon_braker3.csv", names=cols)
df_helixer = pd.read_csv("data/from_MARS/pLDDT_hcon_helixer.csv", names=cols)

In [30]:
print(f"Mean of WBPS % that are \"Confident\" residues: {df['perc_confident'].mean()}")
print(f"Mean of BRAKER3 % that are \"Confident\" residues: {df_braker3['perc_confident'].mean()}")
print(f"Mean of Anno % that are \"Confident\" residues: {df_anno['perc_confident'].mean()}")
print(f"Mean of Helixer % that are \"Confident\" residues: {df_helixer['perc_confident'].mean()}")

Mean of WBPS % that are "Confident" residues: 57.66113602391629
Mean of BRAKER3 % that are "Confident" residues: 36.06907216494845
Mean of Anno % that are "Confident" residues: 29.202908587257618
Mean of Helixer % that are "Confident" residues: 27.52991452991453
