In [1]:
import os.path
from collections import Counter

from orthologue_analysis.orthogroups import init_orthogroup_df
from orthologue_analysis.species import AltSourceMixin, MansoniClade, SpeciesList
from orthologue_analysis.utils import SequenceIDMapping, orthofinder_paths
from reannotation.pipelines import interpro_accession_pipeline, suspicious_orthologue_pipeline
from utils.gffutils import init_db


class MansoniCladeFromTool(AltSourceMixin, MansoniClade):
    pass

In [2]:
results_label = "Results_Aug05"
wbps_ann_path = "data/from_WBPS/schistosoma_mansoni.PRJEA36577.WBPS19.annotations.gff3"
braker_path = "data/from_MARS/Schistosoma_mansoni_braker3_full.gff3"
helixer_path = "data/from_MARS/Schistosoma_mansoni_helixer_full.gff3"
anno_path = "data/from_EBI/schistosoma_mansoni_gca000000000v1.gff3"
db = init_db(wbps_ann_path, "db/Sman_wbps.db")
of = orthofinder_paths(results_label)

wbps_col = "Sman_LT"
braker_col = "Sman_braker3_LT"
helixer_col = "Sman_helixer_LT"
anno_col = "Sman_anno_LT"

hog_df = init_orthogroup_df(of["orthogroups"])
seq_id_map = SequenceIDMapping(of["wd"])
mars_data_dir = os.path.join("data", "from_MARS", "")
ebi_data_dir = os.path.join("data", "from_EBI", "")

# General Statistics

In [3]:
print("Shared orthologues with WBPS:")
print("WBPS: {}".format(len(hog_df[~hog_df[wbps_col].isna()])))
print("BRAKER3: {}".format(len(hog_df[~hog_df[wbps_col].isna() & ~hog_df[braker_col].isna()])))
print("Helixer: {}".format(len(hog_df[~hog_df[wbps_col].isna() & ~hog_df[helixer_col].isna()])))
print("Anno: {}".format(len(hog_df[~hog_df[wbps_col].isna() & ~hog_df[anno_col].isna()])))

Shared orthologues with WBPS:
WBPS: 8970
BRAKER3: 8203
Helixer: 8332
Anno: 8392


In [None]:
# statistics_pipeline()

# Assessing merged/split genes

In [4]:
species_list = SpeciesList([
    MansoniCladeFromTool("mansoni", data_dir=mars_data_dir, data_label="Sman_LT", prot_filename_suffix=".fa"),
    MansoniCladeFromTool("mansoni_braker3_reann", data_dir=mars_data_dir, data_label="Sman_braker3_LT", prot_filename_suffix=".fa"),
    MansoniCladeFromTool("mansoni_helixer_reann", data_dir=mars_data_dir, data_label="Sman_helixer_LT", prot_filename_suffix=".fa"),
    MansoniCladeFromTool("mansoni_anno_reann", data_dir=ebi_data_dir, data_label="Sman_anno_LT", prot_filename_suffix=".fa")],
    wd_path=of["wd"],
    load_blast=True
)

loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug05/Blast0_1.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug05/Blast0_2.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug05/Blast0_3.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug05/Blast0_2.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug05/Blast0_3.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug05/Blast0_1.txt...


In [5]:
braker_merged, braker_split = suspicious_orthologue_pipeline(hog_df, wbps_col, braker_col, species_list, seq_id_map)
anno_merged, anno_split = suspicious_orthologue_pipeline(hog_df, wbps_col, anno_col, species_list, seq_id_map)
helixer_merged, helixer_split = suspicious_orthologue_pipeline(hog_df, wbps_col, helixer_col, species_list, seq_id_map)

100%|██████████| 9600/9600 [01:40<00:00, 95.43it/s]  
100%|██████████| 9600/9600 [07:44<00:00, 20.69it/s] 
100%|██████████| 9600/9600 [02:45<00:00, 58.09it/s] 


In [6]:
# TODO Tabulate totals for each, and compare commonly merged/split transcripts between tools.
from reannotation.utils import extract_accessions_from_transcript
wbps_species = species_list.get_species_with_data_label(wbps_col)
acc_product = {}
acc_list = []
for k in braker_merged.keys():
    tran = wbps_species.db["transcript:" + k]
    for acc, prod in extract_accessions_from_transcript(tran):
        acc_product[acc] = prod
        acc_list.append(acc)

method:InterPro accession:IPR011333 description:SKP1/BTB/POZ domain superfamily 
method:InterPro accession:IPR021777 description:SANT and BTB domain regulator of CSR, BTB domain 
method:InterPro accession:IPR045902 description:SANBR-like
method:InterPro accession:IPR007497 description:Interleukin-1 receptor-associated kinase 1-binding protein 1/DUF541 
method:InterPro accession:IPR030312 description:Interleukin-1 receptor-associated kinase 1-binding protein 1
method:InterPro accession:IPR009060 description:UBA-like superfamily
method:InterPro accession:IPR040191 description:U3 small nucleolar RNA-associated protein 10
method:InterPro accession:IPR002082 description:Aspartate carbamoyltransferase 
method:InterPro accession:IPR002195 description:Dihydroorotase, conserved site 
method:InterPro accession:IPR002474 description:Carbamoyl-phosphate synthase small subunit, N-terminal domain 
method:InterPro accession:IPR005479 description:Carbamoyl-phosphate synthetase large subunit-like, ATP-

# InterPro accession investigation

In [7]:
# S mansoni transcripts without an orthologue with any tool
for _, row in hog_df[~(hog_df[wbps_col].isna()) & (hog_df[braker_col].isna()) & (hog_df[helixer_col].isna()) & (hog_df[anno_col].isna())].iterrows():
    for tid in (p.split("transcript_")[1].strip() for p in row[wbps_col].split(",")):
        tran = db["transcript:" + tid]
        print(f"{tran.seqid}\t{tid}")

SM_V10_4	Smp_349670.1
SM_V10_6	Smp_349150.1
SM_V10_1	Smp_350240.1
SM_V10_2	Smp_349580.1
SM_V10_1	Smp_179970.1
SM_V10_1	Smp_201180.1
SM_V10_Z	Smp_203990.1
SM_V10_WSR	Smp_349550.1
SM_V10_Z	Smp_350410.1
SM_V10_1	Smp_319440.1
SM_V10_1	Smp_319430.1
SM_V10_1	Smp_319420.1
SM_V10_1	Smp_319450.1
SM_V10_1	Smp_319460.1
SM_V10_6	Smp_349480.1
SM_V10_6	Smp_349470.1
SM_V10_1	Smp_348100.1
SM_V10_1	Smp_348200.1
SM_V10_1	Smp_200770.1
SM_V10_Z	Smp_204940.1
SM_V10_1	Smp_349510.1
SM_V10_2	Smp_202620.1
SM_V10_Z	Smp_323200.1
SM_V10_Z	Smp_323180.1
SM_V10_Z	Smp_323170.1
SM_V10_5	Smp_329960.1
SM_V10_Z	Smp_321750.1
SM_V10_3	Smp_325560.1
SM_V10_3	Smp_325550.1
SM_V10_3	Smp_325580.1
SM_V10_1	Smp_200850.1
SM_V10_1	Smp_200820.1
SM_V10_1	Smp_200840.1
SM_V10_6	Smp_205700.1
SM_V10_6	Smp_331330.1
SM_V10_6	Smp_331340.1
SM_V10_6	Smp_331320.1
SM_V10_Z	Smp_323190.1
SM_V10_Z	Smp_323290.1
SM_V10_Z	Smp_323280.1
SM_V10_Z	Smp_323240.1
SM_V10_Z	Smp_324150.1
SM_V10_Z	Smp_324160.1
SM_V10_5	Smp_306680.1
SM_V10_5	Smp_330270.1
SM_V10_5

In [8]:
braker3_results = interpro_accession_pipeline(db, hog_df, wbps_col, braker_col)
helixer_results = interpro_accession_pipeline(db, hog_df, wbps_col, helixer_col)
anno_results = interpro_accession_pipeline(db, hog_df, wbps_col, anno_col)

method:InterPro accession:IPR011496 description:Beta-N-acetylglucosaminidase, catalytic domain 
method:InterPro accession:IPR017853 description:Glycoside hydrolase superfamily
method:InterPro accession:IPR002015 description:Proteasome/cyclosome repeat 
method:InterPro accession:IPR011989 description:Armadillo-like helical 
method:InterPro accession:IPR016024 description:Armadillo-type fold 
method:InterPro accession:IPR016643 description:26S proteasome regulatory complex, non-ATPase subcomplex, Rpn1 subunit 
method:InterPro accession:IPR040892 description:RPN1, N-terminal 
method:InterPro accession:IPR041433 description:26S proteasome non-ATPase regulatory subunit RPN1, C-terminal
method:InterPro accession:IPR002151 description:Kinesin light chain 
method:InterPro accession:IPR011990 description:Tetratricopeptide-like helical domain superfamily 
method:InterPro accession:IPR019734 description:Tetratricopeptide repeat
method:InterPro accession:IPR002110 description:Ankyrin repeat 
metho

Find accessions that are significantly commonly missed by each tool.

In [9]:
for acc in set(anno_results["l3_more_expressed"].keys()).intersection(braker3_results["l3_more_expressed"].keys()).intersection(helixer_results["l3_more_expressed"].keys()):
    print(acc)
    acc_list = braker3_results["acc_list3"]
    freq = Counter(acc_list)[acc]
    l3_more_expressed = braker3_results["l3_more_expressed"]
    acc_product = braker3_results["acc_product"]
    print(f"More expressed than BRAKER3: {acc}: {freq} - significantly more expressed ({l3_more_expressed[acc]}) - {acc_product[acc]}")

    acc_list = helixer_results["acc_list3"]
    freq = Counter(acc_list)[acc]
    l3_more_expressed = helixer_results["l3_more_expressed"]
    acc_product = helixer_results["acc_product"]
    print(f"More expressed than Helixer: {acc}: {freq} - significantly more expressed ({l3_more_expressed[acc]}) - {acc_product[acc]}")

    acc_list = anno_results["acc_list3"]
    freq = Counter(acc_list)[acc]
    l3_more_expressed = anno_results["l3_more_expressed"]
    acc_product = anno_results["acc_product"]
    print(f"More expressed than Anno: {acc}: {freq} - significantly more expressed ({l3_more_expressed[acc]}) - {acc_product[acc]}")
    print("~~~~~~~~~~")

IPR017853
More expressed than BRAKER3: IPR017853: 14 - significantly more expressed (0.10364298636135343) - Glycoside hydrolase superfamily 
More expressed than Helixer: IPR017853: 13 - significantly more expressed (0.0851562196023501) - Glycoside hydrolase superfamily 
More expressed than Anno: IPR017853: 6 - significantly more expressed (0.19206082969920607) - Glycoside hydrolase superfamily 
~~~~~~~~~~
IPR001254
More expressed than BRAKER3: IPR001254: 20 - significantly more expressed (0.03128709831479839) - Serine proteases, trypsin domain 
More expressed than Helixer: IPR001254: 16 - significantly more expressed (0.03781725032862584) - Serine proteases, trypsin domain 
More expressed than Anno: IPR001254: 15 - significantly more expressed (0.03600402279584311) - Serine proteases, trypsin domain 
~~~~~~~~~~
IPR013780
More expressed than BRAKER3: IPR013780: 13 - significantly more expressed (0.052095552249067756) - Glycosyl hydrolase, all-beta 
More expressed than Helixer: IPR013780