In [None]:
import os.path
from collections import defaultdict
import statistics

import matplotlib
from matplotlib import pyplot as plt
import numpy as np

from orthologue_analysis.orthogroups import init_orthogroup_df
from orthologue_analysis.species import AltSourceMixin, Species, SpeciesList
from orthologue_analysis.utils import SequenceIDMapping, orthofinder_paths
from reannotation.analysis import (
    interpro_accessions_frequently_missed_by_all_tools,
    interpro_accessions_in_novel_transcripts,
    interpro_accessions_in_missed_transcripts,
    missed_transcripts_with_significantly_more_frequent_accessions
)
from reannotation.pipelines import interpro_accession_pipeline, suspicious_orthologue_pipeline, novel_orthologue_pipeline
from reannotation.statistics import fisher_exact_for_two_lists_of_accessions
from reannotation.utils import extract_accessions_from_transcript
from utils.esm import extract_esm_means
from utils.gffutils import init_db

# Select matplotlib's "TkAgg" backend for any figures drawn later in the notebook.
# NOTE(review): TkAgg is a GUI backend, so this presumably needs a Tk installation and a
# display, and figures would open in separate windows rather than inline — confirm that
# this is intended for a notebook run.
matplotlib.use("TkAgg")


class Pristionchus(Species):
    """``Species`` subclass describing the genus *Pristionchus*.

    The class only overrides class-level identifiers; it defines no methods of
    its own.
    """

    # Genus name and its abbreviation.
    genus = "pristionchus"
    abbr = "P"

    # NOTE(review): the meaning of clade 0 is defined by the Species base class
    # or its consumers, which are not visible here — confirm.
    clade = 0


class PristionchusFromTool(AltSourceMixin, Pristionchus):
    """``Pristionchus`` combined with ``AltSourceMixin``; adds no members of its own.

    Used below for every entry of the species list, including the WBPS
    reference annotation.
    """


# --- Input locations ---------------------------------------------------------
results_label = "Results_Aug21"

# Annotation GFF3 files. Only wbps_ann_path is used in the visible cells; the
# other three are not referenced elsewhere in them.
wbps_ann_path = "data/from_WBPS/pristionchus_pacificus.PRJNA12644.WBPS19.annotations.gff3"
braker_path = "data/from_MARS/Pristionchus_pacificus_braker3_full.gff3"
helixer_path = "data/from_MARS/Pristionchus_pacificus_helixer_full.gff3"
anno_path = "data/from_EBI/pristionchus_pacificus_gca000180635v4.gff3"

# Data directories; the trailing "" keeps a trailing path separator.
mars_data_dir = os.path.join("data", "from_MARS", "")
ebi_data_dir = os.path.join("data", "from_EBI", "")
interproscan_dir = "data/from_MARS/interproscan/ppac"

# --- Orthogroup table column names (also used as species data labels) --------
wbps_col = "Ppac_LT"
braker_col = "Ppac_braker3_LT"
helixer_col = "Ppac_helixer_LT"
anno_col = "Ppac_anno_LT"

# --- Load shared objects ------------------------------------------------------
# Feature database for the WBPS annotation and the OrthoFinder path mapping
# (its "orthogroups" and "wd" entries are read below).
db = init_db(wbps_ann_path, "db/Ppac_wbps.db")
of = orthofinder_paths(results_label)

hog_df = init_orthogroup_df(of["orthogroups"])
seq_id_map = SequenceIDMapping(of["wd"])

# One species entry per annotation source: (name, data directory, data label).
species_list = SpeciesList(
    [
        PristionchusFromTool(
            species_name,
            data_dir=source_dir,
            data_label=source_label,
            prot_filename_suffix=".fa",
        )
        for species_name, source_dir, source_label in (
            ("pacificus", mars_data_dir, wbps_col),
            ("pacificus_braker3_reann", mars_data_dir, braker_col),
            ("pacificus_helixer_reann", mars_data_dir, helixer_col),
            ("pacificus_anno_reann", ebi_data_dir, anno_col),
        )
    ],
    wd_path=of["wd"],
    load_blast=True,
)

wbps_species = species_list.get_species_with_data_label(wbps_col)

# --- Analysis parameters ------------------------------------------------------
# NOTE(review): min_freq is not read by any visible cell (the InterPro cell
# passes a literal min_freq=75) — confirm which value is intended.
min_freq = 10

# Accumulator passed into, and returned from, interpro_accession_pipeline.
acc_product = {}

# General Statistics

In [None]:
# Count the rows of hog_df that have a WBPS entry, and how many of those also
# have an entry in each re-annotation tool's column.
in_wbps = hog_df[wbps_col].notna()

print("Shared orthologues with WBPS:")
print("WBPS: {}".format(int(in_wbps.sum())))
for tool_label, tool_col in (
    ("BRAKER3", braker_col),
    ("Helixer", helixer_col),
    ("Anno", anno_col),
):
    shared_with_tool = in_wbps & hog_df[tool_col].notna()
    print("{}: {}".format(tool_label, int(shared_with_tool.sum())))

# Assessing merged/split genes

In [None]:
import pickle

def pickle_cache_suspicious_orthologue_pipeline(tool, *args, **kwargs):
    """Run ``suspicious_orthologue_pipeline`` with an on-disk pickle cache.

    The two results are stored as ``data/tmp/ppac_<tool>_merged.pickle`` and
    ``data/tmp/ppac_<tool>_split.pickle``. When both files exist they are
    loaded and the pipeline is not run; otherwise the pipeline runs and both
    files are (re)written.

    Args:
        tool: Label used only to build the cache file names.
        *args: Positional arguments forwarded to ``suspicious_orthologue_pipeline``.
        **kwargs: Keyword arguments forwarded to ``suspicious_orthologue_pipeline``.

    Returns:
        The ``(merged, split)`` pair, from the cache or freshly computed.

    Note:
        The cache is keyed on ``tool`` alone. Changing ``args``/``kwargs``
        does not invalidate it; delete the pickle files to force a recompute.
    """
    cache_dir = os.path.join("data", "tmp")
    merged_path = os.path.join(cache_dir, "ppac_{}_merged.pickle".format(tool))
    split_path = os.path.join(cache_dir, "ppac_{}_split.pickle".format(tool))

    if os.path.isfile(merged_path) and os.path.isfile(split_path):
        # SECURITY: pickle.load can execute arbitrary code; only load cache
        # files that this notebook wrote itself.
        with open(merged_path, "rb") as f:
            merged = pickle.load(f)
        with open(split_path, "rb") as f:
            split = pickle.load(f)
        return merged, split

    # Create the cache directory before the expensive pipeline runs, so a
    # missing or unwritable directory cannot discard a finished computation.
    os.makedirs(cache_dir, exist_ok=True)

    merged, split = suspicious_orthologue_pipeline(*args, **kwargs)
    for cache_path, payload in ((merged_path, merged), (split_path, split)):
        with open(cache_path, "wb") as f:
            pickle.dump(payload, f, protocol=pickle.HIGHEST_PROTOCOL)
    return merged, split

In [None]:
# Load (or compute and cache) the merged/split results for each tool, in the
# order BRAKER3, Anno, Helixer.
(
    (braker_merged, braker_split),
    (anno_merged, anno_split),
    (helixer_merged, helixer_split),
) = (
    pickle_cache_suspicious_orthologue_pipeline(
        cache_tag, hog_df, wbps_col, tool_col, species_list, seq_id_map, wbps_prefix="Transcript"
    )
    for cache_tag, tool_col in (
        ("braker", braker_col),
        ("anno", anno_col),
        ("helixer", helixer_col),
    )
)

In [None]:
# For each tool, print the merged and split counts plus a "total" percentage of
# that tool's gene features. Each merged entry is weighted twice
# (presumably one merge involves two genes — TODO confirm).
for tool_name, tool_data_label, merged_hits, split_hits in (
    ("BRAKER3", "Ppac_braker3_LT", braker_merged, braker_split),
    ("Helixer", "Ppac_helixer_LT", helixer_merged, helixer_split),
    ("Anno", "Ppac_anno_LT", anno_merged, anno_split),
):
    tool_species = species_list.get_species_with_data_label(tool_data_label)
    num_genes = sum(1 for gene in tool_species.db.all_features(featuretype="gene"))
    affected_pct = round(100 * (len(split_hits) + 2 * len(merged_hits)) / num_genes, 2)
    print(f"{tool_name}: merged={len(merged_hits)}, split={len(split_hits)}, total={affected_pct}")

In [None]:
# Recompute the Anno merged/split results directly, bypassing the pickle cache.
# NOTE(review): anno_merged/anno_split were already produced above through
# pickle_cache_suspicious_orthologue_pipeline with the same arguments — confirm
# whether this uncached re-run is still needed.
anno_merged, anno_split = suspicious_orthologue_pipeline(
    hog_df,
    wbps_col,
    anno_col,
    species_list,
    seq_id_map,
    wbps_prefix="Transcript",
)

In [None]:
# Recompute the Helixer merged/split results directly, bypassing the pickle cache.
# NOTE(review): helixer_merged/helixer_split were already produced above through
# pickle_cache_suspicious_orthologue_pipeline with the same arguments — confirm
# whether this uncached re-run is still needed.
helixer_merged, helixer_split = suspicious_orthologue_pipeline(
    hog_df,
    wbps_col,
    helixer_col,
    species_list,
    seq_id_map,
    wbps_prefix="Transcript",
)

# InterPro accession investigation

### BRAKER3

In [None]:
# Run the InterPro accession pipeline for WBPS vs BRAKER3. acc_product is passed
# in and rebound to the returned value.
(
    acc_product,
    acc_tally_shared,
    acc_tally_missed_braker3,
    acc_tally_novel_braker3,
    missed_transcripts,
) = interpro_accession_pipeline(
    db, hog_df, wbps_col, braker_col, interproscan_dir, acc_product, prefix="Transcript"
)

# Compare the novel and missed accession tallies against the shared tally
# (the control) to find accessions whose frequency differs significantly.
braker3_novel_results, braker3_missed_results = (
    fisher_exact_for_two_lists_of_accessions(tally, acc_tally_shared)
    for tally in (acc_tally_novel_braker3, acc_tally_missed_braker3)
)

In [None]:
# Summarise InterPro accessions in the transcripts BRAKER3 missed.
# NOTE(review): min_freq is passed as the literal 75 here, not the module-level
# min_freq (10) — confirm which threshold is intended.
interpro_accessions_in_missed_transcripts(
    acc_product,
    acc_tally_missed_braker3,
    acc_tally_novel_braker3,
    braker3_missed_results,
    braker3_novel_results,
    min_freq=75,
)

### Note: there are no IPR accessions in the annotation, so brand-new pipelines will have to be written to handle the Pfam domains (which are present).

# Novel orthologues

In [None]:
# A row counts as "novel" for a tool when the WBPS column is empty but the
# tool's column is populated; percentages are relative to all rows of hog_df.
print("Novel orthogroups")
total_orthogroups = len(hog_df)

absent_from_wbps = hog_df[wbps_col].isna()
braker3_novel_ogs, helixer_novel_ogs, anno_novel_ogs = (
    int((absent_from_wbps & hog_df[tool_col].notna()).sum())
    for tool_col in (braker_col, helixer_col, anno_col)
)

for tool_label, novel_count in (
    ("BRAKER3", braker3_novel_ogs),
    ("Helixer", helixer_novel_ogs),
    ("Anno", anno_novel_ogs),
):
    print(f"{tool_label}: {novel_count} ({round(100*novel_count/total_orthogroups, 2)}%)")

In [None]:
# Disabled calls that export the novel orthologue sequences for each tool:
# novel_orthologue_pipeline(hog_df, wbps_col, anno_col, species_list, out_dir="data/novel_orthologue_sequences/ppac/anno/")
# novel_orthologue_pipeline(hog_df, wbps_col, braker_col, species_list, out_dir="data/novel_orthologue_sequences/ppac/braker3/")
# novel_orthologue_pipeline(hog_df, wbps_col, helixer_col, species_list, out_dir="data/novel_orthologue_sequences/ppac/helixer/")

# Load the per-entry values from each tool's ESM pLDDT file (Anno, BRAKER3, Helixer).
anno_esm_means, braker3_esm_means, helixer_esm_means = (
    extract_esm_means(plddt_path).values()
    for plddt_path in (
        "data/from_MARS/Ppac_esm_pLDDTs_anno.txt",
        "data/from_MARS/Ppac_esm_pLDDTs_braker3.txt",
        "data/from_MARS/Ppac_esm_pLDDTs_helixer.txt",
    )
)

# Print the overall mean for each tool, in the same order.
for tool_means in (anno_esm_means, braker3_esm_means, helixer_esm_means):
    print(statistics.mean(float(value) for value in tool_means))


In [None]:
import pandas as pd
# Column names applied to the pLDDT summary CSVs via read_csv(names=...).
cols = ("fn", "mean", "median", "stdev", "var", "max", "min", "perc_confident")

df = pd.read_csv("data/from_MARS/pLDDT_ppac.csv", names=cols)

# Average of the "mean" column, and the share of rows whose mean is at least 70.
mean_of_means = df["mean"].mean()
confident_row_count = int((df["mean"] >= 70).sum())
confident_row_pct = 100 * confident_row_count / len(df)

print(f"Mean of means: {mean_of_means}")
print(f"% that are \"Confident\": {confident_row_pct}")


In [None]:
# Load the per-tool pLDDT summary CSVs with the same column names as the WBPS table.
df_anno, df_braker3, df_helixer = (
    pd.read_csv(plddt_csv, names=cols)
    for plddt_csv in (
        "data/from_MARS/pLDDT_ppac_anno.csv",
        "data/from_MARS/pLDDT_ppac_braker3.csv",
        "data/from_MARS/pLDDT_ppac_helixer.csv",
    )
)

In [None]:
# Print the mean of the "perc_confident" column for each annotation source.
for source_label, plddt_frame in (
    ("WBPS", df),
    ("BRAKER3", df_braker3),
    ("Anno", df_anno),
    ("Helixer", df_helixer),
):
    print(f"Mean of {source_label} % that are \"Confident\" residues: {plddt_frame['perc_confident'].mean()}")