In [1]:
import os.path
import pandas as pd

# Input table of BUSCO statistics and the directory the species lists are written to.
busco_stats_path = "data/WBPS19_busco_stats.tsv"
outdir = "data/helixer_training_species/"

# Later cells write their species lists here; create it up front so a fresh
# checkout survives Restart & Run All.
os.makedirs(outdir, exist_ok=True)

full_df = pd.read_csv(busco_stats_path, sep="\t")

# Take genus to be first part of species name (text before the first "_").
# The vectorised .str accessor propagates missing names as NaN instead of raising.
full_df["genus"] = full_df["genome"].str.split("_", n=1).str[0]

Determine candidates for re-running Helixer/BRAKER once a good model has been trained. These have the biggest difference between quality of assembly (generally good) and quality of existing annotation (generally poor). If we were to take a species with a poor annotation but an equally poor assembly, we would understandably not gain much in BUSCO scores regardless of the annotation tool we choose.

In [2]:
# Number of reannotation candidates to keep.
N_REANNOTATION = 5

# Gap between assembly completeness (C_as) and annotation completeness (C_an);
# the largest gaps are the species whose annotation lags furthest behind the assembly.
full_df["C_diff"] = full_df["C_as"] - full_df["C_an"]
reann_set = full_df.sort_values("C_diff", ascending=False).head(N_REANNOTATION)

# One genome name per line. A single-column Series needs no field separator, and
# a newline delimiter would collide with the line terminator, so `sep` is omitted.
reann_set["genome"].to_csv(
    os.path.join(outdir, "reannotation_set.txt"), index=False, header=False
)
reann_set[["genome", "C_as", "C_an", "C_diff"]]

Unnamed: 0,genome,C_as,C_an,C_diff
106,heterorhabditis_bacteriophora_prjna13977,87.1,31.4,55.7
231,teladorsagia_circumcincta_prjna72569,71.8,38.4,33.4
5,ancylostoma_ceylanicum_prjna72583,87.2,58.9,28.3
86,globodera_pallida_prjna702104,59.5,38.6,20.9
8,angiostrongylus_cantonensis_prjna350391,83.6,67.4,16.2


For the training and validation sets, we should ensure a minimum annotation BUSCO Completeness (`C_an`) of **60** (as a broad threshold), and that the species isn't already earmarked for reannotation.

In [3]:
# Candidate pool for the training/validation sets: annotation completeness of
# at least 60, and not already selected for reannotation.
meets_threshold = full_df["C_an"] >= 60
is_reannotation_target = full_df["genome"].isin(reann_set["genome"])
df = full_df[meets_threshold & ~is_reannotation_target]

For the training set, we'll pick one species from each genus, prioritising the one with the highest annotation BUSCO Completeness.

In [4]:
# Best-annotated species per genus: after sorting by annotation completeness
# (descending), drop_duplicates keeps the first row it meets for each genus.
train_set = df.sort_values("C_an", ascending=False).drop_duplicates("genus")

# One genome name per line, alphabetically. A single-column Series needs no
# field separator, and a newline delimiter would collide with the line
# terminator, so `sep` is omitted.
train_set["genome"].sort_values().to_csv(
    os.path.join(outdir, "training_set.txt"), index=False, header=False
)
train_set.head(5)

Unnamed: 0,genome,C_an,D_an,F_an,M_an,N_an,C_as,D_as,F_as,M_as,N_as,genus,C_diff
268,caenorhabditis_elegans_prjna13758,99.7,0.3,0.3,0.0,982,98.6,0.6,0.8,0.6,982,caenorhabditis,-1.1
271,onchocerca_volvulus_prjeb513,99.2,0.4,0.8,0.0,982,97.6,0.3,1.7,0.7,982,onchocerca,-1.6
265,brugia_malayi_prjna10729,98.9,0.6,0.9,0.2,982,96.8,1.1,1.8,1.4,982,brugia,-2.1
264,wuchereria_bancrofti_prjna275548,98.2,2.3,0.9,0.9,982,97.5,1.9,1.7,0.8,982,wuchereria,-0.7
117,loa_loa_prjna37757,96.3,0.7,3.7,0.0,982,96.0,0.7,3.7,0.3,982,loa,-0.3


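For the validation set, we'll pick the second-best species from each genus, using the same criteria as for the training set. Genera with only one qualifying species contribute nothing to the validation set.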

In [5]:
# Rank the candidate pool by annotation completeness, best first.
ranked = df.sort_values("C_an", ascending=False)

# Build the mask from the *sorted* frame so its index lines up with the frame
# being filtered; a mask taken from unsorted `df` makes pandas reindex it and
# emit a UserWarning. The selected rows and their order are unchanged.
not_in_training = ~ranked["genome"].isin(train_set["genome"])

# With the training picks removed, the first remaining row per genus is that
# genus's next-best species.
valid_set = ranked[not_in_training].drop_duplicates("genus")

# One genome name per line, alphabetically. A single-column Series needs no
# field separator, and a newline delimiter would collide with the line
# terminator, so `sep` is omitted.
valid_set["genome"].sort_values().to_csv(
    os.path.join(outdir, "validation_set.txt"), index=False, header=False
)
valid_set.head(5)

  valid_set = df.sort_values("C_an", ascending=False)[~df["genome"].isin(train_set["genome"])].drop_duplicates("genus")


Unnamed: 0,genome,C_an,D_an,F_an,M_an,N_an,C_as,D_as,F_as,M_as,N_as,genus,C_diff
267,caenorhabditis_briggsae_prjna10731,98.7,0.5,0.9,0.4,982,97.7,0.7,1.5,0.8,982,caenorhabditis,-1.0
116,loa_loa_prjna246086,94.7,1.3,3.5,1.8,982,97.5,1.3,2.0,0.5,982,loa,2.8
176,pristionchus_maxplancki_prjeb27334,93.5,13.4,4.2,2.3,982,90.6,9.4,5.5,3.9,982,pristionchus,-2.9
151,oscheius_tipulae_prjeb15512,89.8,3.4,5.7,4.5,982,89.4,3.0,3.2,7.4,982,oscheius,-0.4
24,brugia_pahangi_prjeb497,89.8,1.1,6.7,3.5,982,89.7,0.7,6.6,3.7,982,brugia,-0.1


In [6]:
# Report how many species there are overall and how many landed in each set.
print("Total species:\t\t\t{}".format(len(full_df)))
print("Species in...")
for template, members in (
    ("\treannotation set:\t{}", reann_set),
    ("\ttraining set:\t\t{}", train_set),
    ("\tvalidation set:\t\t{}", valid_set),
):
    print(template.format(len(members)))

Total species:			275
Species in...
	reannotation set:	5
	training set:		85
	validation set:		43
