In [1]:
import os.path
import statistics
from collections import defaultdict

import matplotlib
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from orthologue_analysis.orthogroups import init_orthogroup_df
from orthologue_analysis.species import PristionchusFromTool, SpeciesList
from orthologue_analysis.utils import SequenceIDMapping, orthofinder_paths
from ppac_merged_split_run_utils import pickle_cache_suspicious_orthologue_pipeline
from reannotation.analysis import (
    interpro_accessions_frequently_missed_by_all_tools,
    interpro_accessions_in_novel_transcripts,
    interpro_accessions_in_missed_transcripts,
    missed_transcripts_with_significantly_more_frequent_accessions
)
from reannotation.pipelines import interpro_accession_pipeline, suspicious_orthologue_pipeline, novel_orthologue_pipeline
from reannotation.statistics import fisher_exact_for_two_lists_of_accessions
from reannotation.utils import extract_accessions_from_transcript
from utils.esm import extract_esm_means
from utils.gffutils import init_db

matplotlib.use("TkAgg")

results_label = "Results_Aug21"
wbps_ann_path = "data/from_WBPS/pristionchus_pacificus.PRJNA12644.WBPS19.annotations.gff3"
braker_path = "data/from_MARS/Pristionchus_pacificus_braker3_full.gff3"
helixer_path = "data/from_MARS/Pristionchus_pacificus_helixer_full.gff3"
anno_path = "data/from_EBI/pristionchus_pacificus_gca000180635v4.gff3"
db = init_db(wbps_ann_path, "db/Ppac_wbps.db")
of = orthofinder_paths(results_label, subdir="Orthogroups")

wbps_col = "Ppac_LT"
braker_col = "Ppac_braker3_LT"
helixer_col = "Ppac_helixer_LT"
anno_col = "Ppac_anno_LT"

og_df = init_orthogroup_df(of["orthogroups"])
no_og_df = init_orthogroup_df(of["orthogroups_unassigned_genes"])
seq_id_map = SequenceIDMapping(of["wd"])
mars_data_dir = os.path.join("data", "from_MARS", "")
ebi_data_dir = os.path.join("data", "from_EBI", "")

species_list = SpeciesList([
    PristionchusFromTool("pacificus", data_dir=mars_data_dir, data_label="Ppac_LT", prot_filename_suffix=".fa"),
    PristionchusFromTool("pacificus_braker3_reann", data_dir=mars_data_dir, data_label="Ppac_braker3_LT", prot_filename_suffix=".fa"),
    PristionchusFromTool("pacificus_helixer_reann", data_dir=mars_data_dir, data_label="Ppac_helixer_LT", prot_filename_suffix=".fa"),
    PristionchusFromTool("pacificus_anno_reann", data_dir=ebi_data_dir, data_label="Ppac_anno_LT", prot_filename_suffix=".fa")],
    wd_path=of["wd"],
    load_blast=True
)

wbps_species = species_list.get_species_with_data_label(wbps_col)
braker_species = species_list.get_species_with_data_label(braker_col)
helixer_species = species_list.get_species_with_data_label(helixer_col)
anno_species = species_list.get_species_with_data_label(anno_col)

min_freq = 10

interproscan_dir = "data/from_MARS/interproscan/ppac"


loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug21/Blast0_1.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug21/Blast0_2.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug21/Blast0_3.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug21/Blast0_2.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug21/Blast0_3.txt...
loading /home/will/wbp_scratch/data/from_MARS/OrthoFinder/WorkingDirectory/Results_Aug21/Blast0_1.txt...


# General Statistics

In [2]:
print("Shared orthologues with WBPS:")
print("WBPS: {}".format(len(og_df[~og_df[wbps_col].isna()])))
print("BRAKER3: {}".format(len(og_df[~og_df[wbps_col].isna() & ~og_df[braker_col].isna()])))
print("Helixer: {}".format(len(og_df[~og_df[wbps_col].isna() & ~og_df[helixer_col].isna()])))
print("Anno: {}".format(len(og_df[~og_df[wbps_col].isna() & ~og_df[anno_col].isna()])))

Shared orthologues with WBPS:
WBPS: 18931
BRAKER3: 15968
Helixer: 14372
Anno: 16484


# Assessing merged/split genes

In [3]:
braker_merged, braker_split = pickle_cache_suspicious_orthologue_pipeline("braker", og_df, wbps_col, braker_col, species_list, seq_id_map, wbps_prefix="Transcript")
anno_merged, anno_split = pickle_cache_suspicious_orthologue_pipeline("anno", og_df, wbps_col, anno_col, species_list, seq_id_map, wbps_prefix="Transcript")
helixer_merged, helixer_split = pickle_cache_suspicious_orthologue_pipeline("helixer", og_df, wbps_col, helixer_col, species_list, seq_id_map, wbps_prefix="Transcript")

In [5]:
num_genes = len(list(braker_species.db.all_features(featuretype="gene")))
print(f"BRAKER3: merged={len(braker_merged)}, split={len(braker_split)}, total={round(100*(len(braker_split) + len(braker_merged)*2)/num_genes, 2)}")
num_genes = len(list(helixer_species.db.all_features(featuretype="gene")))
print(f"Helixer: merged={len(helixer_merged)}, split={len(helixer_split)}, total={round(100*(len(helixer_split) + len(helixer_merged)*2)/num_genes, 2)}")
num_genes = len(list(anno_species.db.all_features(featuretype="gene")))
print(f"Anno: merged={len(anno_merged)}, split={len(anno_split)}, total={round(100*(len(anno_split) + len(anno_merged)*2)/num_genes, 2)}")

BRAKER3: merged=19, split=96, total=0.56
Helixer: merged=349, split=591, total=4.0
Anno: merged=958, split=100, total=7.13


# InterPro accession investigation

### There are no InterPro (IPR) accessions in the annotation, so new pipelines will have to be written to work with the Pfam domains, which are present.

# Novel orthologues

In [6]:
print("Novel transcripts (% of total predicted by tool)")

shared_braker_genes = len(og_df[~og_df[wbps_col].isna() & ~og_df[braker_col].isna()][braker_col].str.split(",").explode().unique())
novel_braker_orths = len(og_df[og_df[wbps_col].isna() & ~og_df[braker_col].isna()][braker_col].str.split(",").explode().unique())
novel_braker_ungs = len(no_og_df[no_og_df[wbps_col].isna() & ~no_og_df[braker_col].isna()][braker_col].str.split(",").explode().unique())
assert shared_braker_genes + novel_braker_orths + novel_braker_ungs == len(braker_species.all_transcript_ids)
print(f"BRAKER3: {novel_braker_orths + novel_braker_ungs} ({round(100*(novel_braker_orths + novel_braker_ungs)/len(braker_species.all_transcript_ids), 2)}%)")


shared_helixer_genes = len(og_df[~og_df[wbps_col].isna() & ~og_df[helixer_col].isna()][helixer_col].str.split(",").explode().unique())
novel_helixer_orths = len(og_df[og_df[wbps_col].isna() & ~og_df[helixer_col].isna()][helixer_col].str.split(",").explode().unique())
novel_helixer_ungs = len(no_og_df[no_og_df[wbps_col].isna() & ~no_og_df[helixer_col].isna()][helixer_col].str.split(",").explode().unique())
assert shared_helixer_genes + novel_helixer_orths + novel_helixer_ungs == len(helixer_species.all_transcript_ids)
print(f"Helixer: {novel_helixer_orths + novel_helixer_ungs} ({round(100*(novel_helixer_orths + novel_helixer_ungs)/len(helixer_species.all_transcript_ids), 2)}%)")

shared_anno_genes = len(og_df[~og_df[wbps_col].isna() & ~og_df[anno_col].isna()][anno_col].str.split(",").explode().unique())
novel_anno_orths = len(og_df[og_df[wbps_col].isna() & ~og_df[anno_col].isna()][anno_col].str.split(",").explode().unique())
novel_anno_ungs = len(no_og_df[no_og_df[wbps_col].isna() & ~no_og_df[anno_col].isna()][anno_col].str.split(",").explode().unique())
assert shared_anno_genes + novel_anno_orths + novel_anno_ungs == len(anno_species.all_transcript_ids)
print(f"Anno: {novel_anno_orths + novel_anno_ungs} ({round(100*(novel_anno_orths + novel_anno_ungs)/len(anno_species.all_transcript_ids), 2)}%)")


Novel transcripts (% of total predicted by tool)
BRAKER3: 4799 (19.93%)
Helixer: 14519 (45.06%)
Anno: 7057 (24.95%)


In [9]:
# novel_orthologue_pipeline(hog_df, wbps_col, anno_col, species_list, out_dir="data/novel_orthologue_sequences/ppac/anno/")
# novel_orthologue_pipeline(hog_df, wbps_col, braker_col, species_list, out_dir="data/novel_orthologue_sequences/ppac/braker3/")
# novel_orthologue_pipeline(hog_df, wbps_col, helixer_col, species_list, out_dir="data/novel_orthologue_sequences/ppac/helixer/")
anno_esm_means = extract_esm_means("data/from_MARS/Ppac_esm_pLDDTs_anno.txt").values()
braker3_esm_means = extract_esm_means("data/from_MARS/Ppac_esm_pLDDTs_braker3.txt").values()
helixer_esm_means = extract_esm_means("data/from_MARS/Ppac_esm_pLDDTs_helixer.txt").values()

print(statistics.mean(map(float, anno_esm_means)))
print(statistics.mean(map(float, braker3_esm_means)))
print(statistics.mean(map(float, helixer_esm_means)))


62.12942808365344
64.89034238488784
53.90892296967584


In [10]:
import pandas as pd
cols = (
    "fn",
    "mean",
    "median",
    "stdev",
    "var",
    "max",
    "min",
    "perc_confident"
)
df = pd.read_csv("data/from_MARS/pLDDT_ppac.csv", names=cols)
print(f"Mean of means: {df['mean'].mean()}")
print(f"% that are \"Confident\": {100*df[df['mean'] >= 70].shape[0]/df.shape[0]}")


Mean of means: 60.59425048046124
% that are "Confident": 31.262011531069827


In [11]:
df_anno = pd.read_csv("data/from_MARS/pLDDT_ppac_anno.csv", names=cols)
df_braker3 = pd.read_csv("data/from_MARS/pLDDT_ppac_braker3.csv", names=cols)
df_helixer = pd.read_csv("data/from_MARS/pLDDT_ppac_helixer.csv", names=cols)

In [12]:
print(f"Mean of WBPS % that are \"Confident\" residues: {df['perc_confident'].mean()}")
print(f"Mean of BRAKER3 % that are \"Confident\" residues: {df_braker3['perc_confident'].mean()}")
print(f"Mean of Anno % that are \"Confident\" residues: {df_anno['perc_confident'].mean()}")
print(f"Mean of Helixer % that are \"Confident\" residues: {df_helixer['perc_confident'].mean()}")

Mean of WBPS % that are "Confident" residues: 36.404548366431776
Mean of BRAKER3 % that are "Confident" residues: 47.615584415584415
Mean of Anno % that are "Confident" residues: 43.181391378574475
Mean of Helixer % that are "Confident" residues: 29.80027884280237
