# Compare the taxa found using different sequencing technologies.

We have data from three different sequencing technologies: MGI short reads (300 bp paired-end reads), ONT MinION, and ONT PromethION. Here, we compare the technologies to see which taxa each of them detects.

You can find the original version of this [notebook on Rob's GitHub]()

In [1]:
# load the libraries
import os
import sys

import re
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
from matplotlib.colors import ListedColormap
import pandas as pd
import seaborn as sns
import json

In [2]:
def read_taxonomy(seqmethod, taxonomy, minreads = 0, data_dir = "../.."):
    """
    Read the normalised taxonomy table for one sequencing method and return a data frame.

    The file that is read is
    `{data_dir}/{seqmethod}/Taxonomy/{seqmethod}_reads_{taxonomy}.normalised.tsv.gz`,
    a gzipped, tab-separated table with a `taxonomy` column that holds the
    semicolon-separated lineage. All remaining columns are handled as numeric
    per-sample values (they are compared with `minreads` and averaged).

    :param seqmethod: name of the sequencing method; used for both the directory and the file prefix
    :param taxonomy: taxonomic level (e.g. "genus"); its first letter is the rank prefix looked for in the lineage (e.g. "g__")
    :param minreads: if non-zero, drop every row in which all samples are below this value
    :param data_dir: directory that contains the per-method directories. Defaults to "../..", the location the notebook has always used
    :return: data frame indexed by the last element of the lineage, with one row per name and the columns sorted by name
    :raises FileNotFoundError: (from pandas) if the taxonomy file does not exist
    """
    rank_prefix = f"{taxonomy[0]}__"
    tax_file = f"{data_dir}/{seqmethod}/Taxonomy/{seqmethod}_reads_{taxonomy}.normalised.tsv.gz"

    df = pd.read_csv(tax_file, sep='\t', compression='gzip')

    # Literal (non-regex) matching; na=False makes a missing lineage count as
    # "no match" instead of putting NaN into the boolean mask.
    lineages = df['taxonomy']
    is_bacterial = lineages.str.contains('k__Bacteria', regex=False, na=False)
    has_rank = lineages.str.contains(rank_prefix, regex=False, na=False)
    # a lineage that ends with the bare prefix has no name at the requested rank
    is_unnamed = lineages.str.endswith(rank_prefix, na=False)
    df = df[is_bacterial & has_rank & ~is_unnamed]

    # keep only the last element of the lineage as the row label
    df = df.set_index('taxonomy')
    df.index = df.index.str.split(';').str[-1]

    if minreads:
        # drop rows where no sample reaches minreads
        df = df[~(df.lt(minreads).all(axis=1))]

    # Different lineages can end in the same name (e.g. the same genus name
    # under different parents); those rows are averaged into a single row.
    df = df.groupby(df.index).mean()

    return df.sort_index(axis=1)

def sorted_presence_absence(df1, df2, df3, minrowsum=0, asc_sort=False):
    """
    Merge three tables into a single bit-coded presence/absence matrix and sort it.

    A cell counts as "present" when its value is greater than zero. Presence
    in `df1` contributes 1, in `df2` contributes 2 and in `df3` contributes 4,
    so every cell of the result is a code between 0 and 7 that tells which
    of the three tables had the row in that column.

    :param df1: first table (weight 1)
    :param df2: second table (weight 2)
    :param df3: third table (weight 4)
    :param minrowsum: if greater than zero, each table only keeps the rows that are present in more than `minrowsum` of the shared columns
    :param asc_sort: sort the rows by ascending (True) or descending (False) row sum
    :return: the combined matrix, rows ordered by row sum and columns ordered by name
    """
    # only the columns that all three tables have in common are compared
    shared_columns = df1.columns.intersection(df2.columns).intersection(df3.columns)

    weighted = []
    for weight, table in ((1, df1), (2, df2), (4, df3)):
        flags = (table[shared_columns] > 0).astype(int) * weight
        if minrowsum > 0:
            # the flags are scaled by `weight`, so the threshold is scaled too
            enough = flags.sum(axis=1) > (weight * minrowsum)
            flags = flags.loc[flags[enough].index]
        weighted.append(flags)

    # rows that were filtered out of one table count as absent (0) there
    first, second, third = weighted
    combined = first.add(second, fill_value=0).add(third, fill_value=0)

    row_order = combined.sum(axis=1).sort_values(ascending=asc_sort).index
    ordered = combined.loc[row_order]

    return ordered.sort_index(axis=1)

def convert_df_xyz(mgi_df, sboth):
    """
    Build the x, y and z grids for a surface plot.

    The x and y grids are the column and row positions of `sboth`; the z grid
    holds the values of `mgi_df` after it has been put in the same row and
    column order as `sboth`.

    :param mgi_df: table that provides the heights
    :param sboth: table whose row and column order defines the grid
    :return: tuple (x, y, z) of arrays with one entry per cell of `sboth`
    """
    n_rows, n_cols = sboth.shape

    # x runs along the columns, y along the rows
    x, y = np.meshgrid(np.arange(n_cols), np.arange(n_rows))

    # the heights must follow exactly the same ordering as sboth
    heights = mgi_df.reindex(index=sboth.index, columns=sboth.columns)
    z = heights.values

    return (x, y, z)

In [3]:
taxonomies = ['phylum', 'class', 'order', 'family', 'genus', 'species']

# One colour per detection code. The code in each cell of `sboth` is the sum of
# 1 (seen by MGI), 2 (seen by MinION) and 4 (seen by Promethion), so it runs
# from 0 to 7 and is used directly as an index into this list.
colors = ['#e41a1c', '#377eb8', '#4daf4a', '#984ea3', '#ff7f00', '#ffff33', '#a65628', '#f781bf']
cmap = ListedColormap(colors)
detection_labels = ["Not seen in this sample", "MGI Only", 'MinION Only', 'MGI + MinION', 'Promethion Only', 'MGI + Promethion', 'MinION + Promethion', 'All']

minreads = 10  # only applied to the MGI table
subsc = "abcdefgh"  # panel letters used in the output file names
output_dir = "final_images"
os.makedirs(output_dir, exist_ok=True)  # savefig does not create the directory

for i, tax in enumerate(taxonomies):
    mgi_df = read_taxonomy("MGI", tax, minreads)
    min_df = read_taxonomy("MinION", tax)
    pro_df = read_taxonomy("Promethion", tax)

    # Create subsets of the DataFrames containing only the common taxonomies
    common_taxonomies = mgi_df.index.intersection(min_df.index).intersection(pro_df.index)
    mgi_df_l = mgi_df.loc[common_taxonomies]
    min_df_l = min_df.loc[common_taxonomies]
    pro_df_l = pro_df.loc[common_taxonomies]

    sboth = sorted_presence_absence(mgi_df_l, min_df_l, pro_df_l, minrowsum=0, asc_sort=True)
    x, y, z = convert_df_xyz(mgi_df_l, sboth)

    # Colour every cell by its detection code. Integer input indexes the
    # colormap directly, so code k always gets colour k. Dividing by the
    # largest code present instead would shift the colours whenever code 7
    # ("All") does not occur, and would divide by zero if nothing is detected.
    facecolors = cmap(sboth.values.astype(int))

    # Create a 3D plot. No vmin/vmax: the surface is coloured through
    # `facecolors`, not through a colormap applied to z.
    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_subplot(111, projection='3d')
    ax.plot_surface(x, y, z, facecolors=facecolors, edgecolor='None', antialiased=True)

    # Set labels; the panels are deliberately left without a title
    ax.set_title("")
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_zticks([])
    ax.set_xlabel(f"{sboth.shape[1]} pwCF samples")
    ax.set_ylabel(f"{sboth.shape[0]} {tax} taxonomies")
    ax.set_zlabel("Normalized MGI read depth")

    # Colour bar: the mappable spans 0-8, so each of the 8 colours covers one
    # unit and the ticks at k + 0.5 sit in the middle of colour k.
    mappable = plt.cm.ScalarMappable(cmap=cmap)
    mappable.set_array(np.linspace(0, 8, 256))
    cbar = plt.colorbar(mappable, ax=ax)
    cbar.set_label('How the taxa were detected')
    cbar.set_ticks([0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5])
    cbar.ax.set_yticklabels(detection_labels)

    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, f"FigS1{subsc[i]}.png"), dpi=300)
    plt.savefig(os.path.join(output_dir, f"FigS1{subsc[i]}.svg"), dpi=300)

    plt.show()
    plt.close(fig)  # release the figure; one is created per taxonomic level

FileNotFoundError: [Errno 2] No such file or directory: '../../MGI/Taxonomy/MGI_reads_phylum.normalised.tsv.gz'