In [None]:
import os.path
import pandas as pd

busco_stats_path = "data/WBPS19_busco_stats.tsv"
outdir = "data/tiberius_training_species/"

# Will be building a model for roundworms and flatworms separately
phylum = {}
with open(os.path.join(outdir, "nematoda_genera_clade_v.txt"), "r") as f:
    nematodes = f.read().splitlines()
    phylum.update({k: "nematoda" for k in nematodes})
with open(os.path.join(outdir, "platyhelminthes_genera.txt"), "r") as f:
    platyhelminthes = f.read().splitlines()
    phylum.update({k: "platyhelminthes" for k in platyhelminthes})

full_df = pd.read_csv(busco_stats_path, delimiter="\t")

# Take genus to be first part of species name
full_df["genus"] = full_df["genome"].str.split("_").apply(lambda x: x[0])
full_df["phylum"] = full_df["genus"].apply(lambda x: phylum.get(x, None))
full_df = full_df.dropna()

A number of species (and their close relatives) have been selected for reannotation, so sould be removed from contention so as not to bias training.

In [18]:
with open(os.path.join(outdir, "reannotation_set.txt")) as f:
    reann_set = f.read().splitlines()
full_df = full_df[~full_df["genome"].isin(reann_set)]

For the training set and validation sets, we should ensure a minimum BUSCO Completeness of **60** (as a broad threshold).

In [19]:
df = full_df[(full_df["C_an"] >= 60)]

For training sets, we'll pick one from each genus, prioritising the one with the highest annotation BUSCO Completeness

In [23]:
train_set = df.sort_values("C_an", ascending=False).drop_duplicates("genus")
train_set[train_set["phylum"] == "nematoda"]["genome"].sort_values().to_csv(os.path.join(outdir, "training_set_nematoda.txt"), sep="\n", index=False, header=False)
train_set[train_set["phylum"] == "platyhelminthes"]["genome"].sort_values().to_csv(os.path.join(outdir, "training_set_platyhelminthes.txt"), sep="\n", index=False, header=False)
train_set.head(5)

Unnamed: 0,genome,C_an,D_an,F_an,M_an,N_an,C_as,D_as,F_as,M_as,N_as,genus,phylum
143,nippostrongylus_brasiliensis_prjna994163,93.9,2.6,3.5,2.6,982,90.0,1.8,7.1,2.9,982,nippostrongylus,nematoda
4,ancylostoma_ceylanicum_prjna231479,92.6,2.0,5.0,2.4,982,90.1,1.7,6.1,3.8,982,ancylostoma,nematoda
140,necator_americanus_prjna1007425,92.1,2.1,4.7,3.2,982,87.9,1.3,6.8,5.3,982,necator,nematoda
119,macrostomum_lignano_prjna371498,92.1,85.4,2.5,5.4,978,88.4,76.9,3.5,8.1,978,macrostomum,platyhelminthes
138,micoletzkya_japonica_prjeb27334,90.1,2.3,5.3,4.6,982,86.6,1.9,6.8,6.6,982,micoletzkya,nematoda


For validation set, we'll pick the second best species from each criteria, based on previous criteria.

In [24]:
valid_set = df.sort_values("C_an", ascending=False)[~df["genome"].isin(train_set["genome"])].drop_duplicates("genus")
valid_set[valid_set["phylum"] == "nematoda"]["genome"].sort_values().to_csv(os.path.join(outdir, "validation_set_nematoda.txt"), sep="\n", index=False, header=False)
valid_set[valid_set["phylum"] == "platyhelminthes"]["genome"].sort_values().to_csv(os.path.join(outdir, "validation_set_platyhelminthes.txt"), sep="\n", index=False, header=False)
valid_set.head(5)

  valid_set = df.sort_values("C_an", ascending=False)[~df["genome"].isin(train_set["genome"])].drop_duplicates("genus")


Unnamed: 0,genome,C_an,D_an,F_an,M_an,N_an,C_as,D_as,F_as,M_as,N_as,genus,phylum
207,schmidtea_mediterranea_s2f19h1prjna885486,86.2,6.9,1.6,12.2,978,81.6,3.2,3.8,14.6,978,schmidtea,platyhelminthes
254,trichobilharzia_regenti_prjeb44434,83.4,4.7,4.4,12.2,978,74.6,3.4,3.8,21.6,978,trichobilharzia,platyhelminthes
71,echinococcus_granulosus_prjeb121,81.3,2.4,5.3,13.4,978,69.3,1.2,6.9,23.8,978,echinococcus,platyhelminthes
108,hymenolepis_diminuta_prjeb30942,80.5,2.0,4.2,15.3,978,77.3,1.2,3.6,19.1,978,hymenolepis,platyhelminthes
118,macrostomum_lignano_prjna284736,79.6,57.1,4.6,15.8,978,87.9,69.3,3.8,8.3,978,macrostomum,platyhelminthes


In [25]:
print("Total species:\t\t\t{}".format(len(full_df)))
print("Species in...")
print("\treannotation set:\t{}".format(len(reann_set)))
print("\ttraining set:\t\t{}".format(len(train_set)))
print("\tvalidation set:\t\t{}".format(len(valid_set)))

Total species:			81
Species in...
	reannotation set:	66
	training set:		34
	validation set:		17
