# Quality Assurance of Genomes

Using CheckM output data.

In [None]:
# Load the tab-separated CheckM marker table, using the column named below
# as the row index.
checkm_marker_path = "../data/02_sar86_quality/checkm_sar86_marker_data.txt"
checkm_marker_data = pd.read_csv(
    checkm_marker_path,
    index_col="Node Id: 2; Marker lineage: Gammaproteobacteria",
    sep="\t",
)

In [None]:
# Pull out the marker row for the HIMB1674 genome (all columns).
himb_marker_genes = checkm_marker_data.loc["HIMB1674", :]

In [None]:
# Markers whose value for HIMB1674 is 0.
absent_markers = himb_marker_genes.loc[himb_marker_genes.eq(0)]

In [None]:
# Save absent marker genes in HIMB1674 genome.
# `absent_markers` is a Series whose index holds the marker names and whose
# values are the 0 entries, so the names must be read from the index;
# iterating the Series itself would yield the zeros and fail on `+ "\n"`.
with open("../data/02_sar86_quality/absent_markers.txt", "w") as absent_marker_file:
    absent_marker_file.writelines(f"{gene}\n" for gene in absent_markers.index)

# Save present marker genes in HIMB1674 genome (one marker name per line).
present_markers = himb_marker_genes[himb_marker_genes == 1]
present_markers = present_markers.index.tolist()

with open("../data/02_sar86_quality/present_markers.txt", "w") as present_marker_file:
    present_marker_file.writelines(f"{gene}\n" for gene in present_markers)

After rerunning CheckM while excluding the marker genes that are absent from HIMB1674

In [None]:
# Read the corrected CheckM quality table (tab-separated), indexed by "Bin Id".
quality_results_path = (
    "../data/02_sar86_quality/checkm_sar86_quality_data_corrected.txt"
)
quality_results = pd.read_csv(quality_results_path, index_col="Bin Id", sep="\t")

In [None]:
# Keep genomes of passable quality: completeness strictly above 80 and
# contamination strictly below 5.
is_complete_enough = quality_results["Completeness"] > 80
is_clean_enough = quality_results["Contamination"] < 5
good_genomes = quality_results[is_complete_enough & is_clean_enough]

## Cluster genomes

In [None]:
# Names for the columns of the header-less FastANI table, keyed by position.
ani_column_corection = dict(
    enumerate(
        [
            "Query Genome",
            "Reference Genome",
            "ANI Value",
            "Count of Bidirectional fragment mappings",
            "Total Query Fragments",
        ]
    )
)

# Read the tab-separated table and label its positional columns in one step.
ani_data = pd.read_csv(
    "../data/03_species_clusters/fastani/sar86_fastani.txt", sep="\t", header=None
).rename(columns=ani_column_corection)


In [None]:
# Cut the path part out of the genome names, then strip a trailing ".fa"
# extension together with an optional single-digit version suffix.
for genome_column in ("Query Genome", "Reference Genome"):
    ani_data[genome_column] = (
        ani_data[genome_column]
        .str.replace(r"^.*/", "", regex=True)
        .str.replace(r"(\.[0-9])?\.fa$", "", regex=True)
    )

In [None]:
# Alignment fraction: bidirectional fragment mappings as a percentage of the
# total number of query fragments.
mapped_fragments = ani_data["Count of Bidirectional fragment mappings"]
query_fragments = ani_data["Total Query Fragments"]
ani_data["Alignment Fraction"] = mapped_fragments.div(query_fragments).mul(100)

In [None]:
# Remove self-comparisons, i.e. rows whose query and reference genome match.
is_self_hit = ani_data["Query Genome"].eq(ani_data["Reference Genome"])
ani_data.drop(index=ani_data.index[is_self_hit], inplace=True)

In [None]:
# Keep only comparisons with an alignment fraction above 65 and an ANI value
# above 80.
well_aligned = ani_data["Alignment Fraction"] > 65
high_identity = ani_data["ANI Value"] > 80
ani_data_filtered = ani_data[well_aligned & high_identity]

In [None]:
# Function to help group species
def group_species(
    df_ani: pd.DataFrame, df_quality: pd.DataFrame, ani: int, prior_reps: list
):
    """Greedily group genomes into species clusters at an ANI threshold.

    Clusters are seeded first by every genome in ``prior_reps`` and then by
    the remaining genomes of ``df_quality`` in order of decreasing
    ``"Completeness"``. A genome joins a seed's cluster when the value stored
    for the pair in ``df_ani`` is strictly greater than ``ani`` in either
    orientation (seed as row label, or seed as column label).

    Args:
        df_ani: Pairwise ANI matrix labelled by genome name on both axes.
            The two axes do not have to carry the same labels, and missing
            (NaN) entries never exceed the threshold.
        df_quality: Quality table indexed by genome name with a
            ``"Completeness"`` column. NOTE: a trailing ``.<digit>`` version
            suffix is stripped from its index *in place*; this side effect is
            kept so code that runs after this call sees the same index as
            before.
        ani: Threshold; only values strictly above it link two genomes.
        prior_reps: Genomes that must act as cluster representatives. Their
            clusters are built first and are not filtered against each other.

    Returns:
        dict mapping each representative genome to the list of other genomes
        in its cluster. Genomes with no match above the threshold (or absent
        from ``df_ani``) map to an empty list.
    """

    def _relatives(genome):
        """List genomes whose ANI with *genome* exceeds ``ani`` on either axis."""
        hits = []
        # Check each axis on its own: a genome may label only rows or only
        # columns, and looking it up on the missing axis raises KeyError.
        if genome in df_ani.index:
            row_values = df_ani.loc[genome]
            hits.extend(row_values[row_values > ani].index.tolist())
        if genome in df_ani.columns:
            column_values = df_ani[genome]
            hits.extend(column_values[column_values > ani].index.tolist())
        # Drop duplicates (pair above the threshold in both orientations)
        # while keeping a reproducible order, unlike a round trip via set().
        return list(dict.fromkeys(hits))

    species_clusters = {}
    clustered = set()
    # Remove the genome version number in case it is still there
    df_quality.index = df_quality.index.str.replace(r"\.[0-9]$", "", regex=True)

    # Cluster based on prior reps
    for rep in prior_reps:
        members = _relatives(rep)
        species_clusters[rep] = members
        # update() adds each genome individually rather than the list as one entry
        clustered.update(members, [rep])

    # Cluster remaining genomes de novo, most complete genomes first
    sorted_genomes = df_quality.sort_values(
        by=["Completeness"], ascending=False
    ).index.tolist()

    for denovo in sorted_genomes:
        # If the genome is already in a cluster ignore it and move on
        if denovo in clustered:
            continue
        # A genome missing from the ANI matrix has no relatives and therefore
        # ends up as the sole member of its own cluster. Genomes that were
        # already clustered are not claimed a second time.
        members = [g for g in _relatives(denovo) if g not in clustered]
        species_clusters[denovo] = members
        clustered.update(members, [denovo])

    print(f"Genomes clustered: {len(clustered)}")
    # Representative genome of each species cluster -> its associated genomes
    return species_clusters

In [None]:
# --- Count the species clusters produced at each candidate ANI value --- #
lower_bound = 80
upper_bound = 99
# range() includes its start and excludes its stop, so the thresholds tried
# run from 80 to 98. group_species returns a dictionary, so its length is the
# number of species groups at that threshold.
species_num = [
    len(group_species(ani_matrix, good_genomes, threshold, ["HIMB1674"]))
    for threshold in range(lower_bound, upper_bound)
]

Identified 93% ANI as the optimal cutoff.

In [None]:
# Build the final species clusters at the chosen 93% ANI cutoff, keeping
# HIMB1674 as a fixed representative.
selected_species_groups = group_species(
    ani_matrix, good_genomes, ani=93, prior_reps=["HIMB1674"]
)