In [None]:
# ------------------------------------------------------
# Inputs and parameters handed over by the Snakemake rule
# (the `snakemake` object is injected when the notebook is run by Snakemake).
SAMPLES = list(snakemake.params.samples)
mapping_dir = snakemake.params.mapping_dir
sampling = snakemake.params.sampling
qc_read_counts = snakemake.input.df_counts_paired

# ------------------------------------------------------
# Output paths declared by the Snakemake rule
output_mapping_stats_html = snakemake.output.mapping_stats_html
output_filtered_viral_png = snakemake.output.filtered_viral_png
output_filtered_viral_svg = snakemake.output.filtered_viral_svg
output_filtered_unfiltered_png = snakemake.output.filtered_unfiltered_png
output_filtered_unfiltered_svg = snakemake.output.filtered_unfiltered_svg

In [None]:
# import necessary modules
import pandas as pd
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import numpy as np

# ------------------------------------------
# Colour definitions
# One entry of the "colorblind" palette per sample.
colors_rarefaction = sns.color_palette(
    "colorblind",
    n_colors=len(list(snakemake.params.samples)),
)
# Continuous colormap interpolated between five "colorblind" palette colours.
cmap1 = LinearSegmentedColormap.from_list(
    "my_colormap",
    sns.color_palette("colorblind", n_colors=5),
)

# Seaborn theme first; the explicit rcParams values below are written afterwards.
sns.set_style("ticks", {"axes.grid": True})
sns.set_palette("colorblind")

# Matplotlib defaults shared by every figure in this notebook.
plt.rcParams.update(
    {
        # Axes frame and tick geometry
        "axes.linewidth": 1.5,
        "xtick.major.width": 1.5,
        "ytick.major.width": 1.5,
        "xtick.major.size": 8,
        "ytick.major.size": 8,
        "axes.titlepad": 20,
        # Font sizes
        "axes.titlesize": 30,
        "axes.labelsize": 23.5,
        "xtick.labelsize": 18,
        "ytick.labelsize": 18,
        "legend.fontsize": 18,
        # Font selection
        "font.family": "sans-serif",
        "font.sans-serif": ["Liberation Sans"],
        "text.usetex": False,
        # Export settings: 'none' keeps SVG text as text instead of paths.
        "svg.fonttype": "none",
        "savefig.dpi": 300,
    }
)

# Calculate mapping statistics

In [None]:
# Upper bound applied to the per-sample read count before it is used as the
# denominator of the assembled / viral / unfiltered percentages.
# NOTE(review): presumably the subsampling depth used for those mappings — confirm.
READ_COUNT_CAP = 2000000


def _count_mapped_pairs(flagstat_path):
    """Return half of the leading integer on the second line of a flagstat file.

    The file is opened in a context manager so the handle is always closed.
    NOTE(review): which flagstat category sits on the second line depends on the
    samtools version that produced the file — confirm it is the intended one.

    Raises:
        OSError: if the file cannot be opened.
        IndexError / ValueError: if the second line is missing or malformed.
    """
    with open(flagstat_path) as handle:
        lines = handle.readlines()
    # Counts are per read; dividing by two converts them to read pairs.
    return int(lines[1].split()[0]) / 2


# Read the QC read counts table.
df_counts_paired = pd.read_csv(qc_read_counts, index_col=0)

# Build reads_df as a new frame (no assignment into a slice of df_counts_paired,
# which would trigger SettingWithCopyWarning): the raw read count plus the
# capped count "bbduk2".
reads_df = (
    df_counts_paired[["sample", "bbduk"]]
    .rename(columns={"bbduk": "read_count"})
    .assign(bbduk2=lambda df: df["read_count"].clip(upper=READ_COUNT_CAP))
)

# Mapped read pairs per sample, one list per flagstat file family.
stats_dir = mapping_dir + "/STATS_FILES"
mapped_pair_f = [
    _count_mapped_pairs(f"{mapping_dir}/bowtie2_flagstats_filtered_{sample}.{sampling}.txt")
    for sample in SAMPLES
]
mapped_pair_u = [
    _count_mapped_pairs(f"{stats_dir}/bowtie2_flagstats_filtered_{sample}_unfiltered_contigs.{sampling}.txt")
    for sample in SAMPLES
]
mapped_pair_v = [
    _count_mapped_pairs(f"{stats_dir}/bowtie2_flagstats_filtered_{sample}_viral_contigs.{sampling}.txt")
    for sample in SAMPLES
]
mapped_pair_a = [
    _count_mapped_pairs(f"{stats_dir}/bowtie2_flagstats_filtered_{sample}_assembled_contigs.{sampling}.txt")
    for sample in SAMPLES
]

# Assemble the per-sample table in one go and attach the read counts.
# The merge is an inner join: samples absent from the counts table are dropped.
df_mapped = pd.DataFrame(
    {
        "sample": SAMPLES,
        "mapped_f": mapped_pair_f,
        "mapped_u": mapped_pair_u,
        "mapped_v": mapped_pair_v,
        "mapped_a": mapped_pair_a,
    }
).merge(reads_df, on="sample")

# Percentages of mapped read pairs.
df_mapped["% assembled"] = df_mapped["mapped_a"] * 100 / df_mapped["bbduk2"]
df_mapped["% viral"] = df_mapped["mapped_v"] * 100 / df_mapped["bbduk2"]
df_mapped["% unfiltered"] = df_mapped["mapped_u"] * 100 / df_mapped["bbduk2"]
# NOTE(review): unlike the three columns above, this one is normalised by the
# uncapped read count — confirm that this is intended.
df_mapped["% filtered"] = df_mapped["mapped_f"] * 100 / df_mapped["read_count"]

# Differences between the filtered percentage and the viral / unfiltered ones.
df_mapped["filtered-viral"] = df_mapped["% filtered"] - df_mapped["% viral"]
df_mapped["filtered-unfiltered"] = df_mapped["% filtered"] - df_mapped["% unfiltered"]

# Append a summary row labelled 'mean' (numeric columns only; "sample" is NaN there).
df_mapped.loc["mean"] = df_mapped.mean(numeric_only=True)

# Keep only the reporting columns and round to one decimal place.
df_mapped = df_mapped.drop(
    ["mapped_f", "mapped_u", "mapped_v", "mapped_a", "bbduk2"], axis=1
).round(1)

# Style the table. Styler.set_precision() and Styler.render() were removed in
# pandas 2.0, so use their replacements when available and fall back otherwise.
stats_styler = df_mapped.style.background_gradient(cmap="RdYlGn")
try:
    stats_styler = stats_styler.format(precision=2)
except TypeError:
    # Older pandas: Styler.format() has no `precision` keyword.
    stats_styler = stats_styler.set_precision(2)

if hasattr(stats_styler, "to_html"):
    stats_df_out = stats_styler.to_html()
else:
    stats_df_out = stats_styler.render()

# Write the HTML table to a file.
with open(output_mapping_stats_html, "w") as fp:
    fp.write(stats_df_out)

# Display the styled table as the cell output.
stats_styler


# Plot percentage of mapped reads to viral contigs vs filtered vOTUs

In [None]:
# The 'mean' summary row appended to df_mapped is not a sample; leave it out so
# it is neither drawn as a data point nor used in the regression fit.
# errors="ignore" keeps this cell working if that row is absent.
samples_only = df_mapped.drop(index="mean", errors="ignore")

# 12x12 inch figure, using the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(12, 12))

# Scatter plus linear regression of '% filtered' (y) against '% viral' (x).
sns.regplot(data=samples_only, x="% viral", y="% filtered", ax=ax)

# Red dashed 1:1 reference line from (0, 0) to (100, 100).
ax.plot([0, 100], [0, 100], "r--")

ax.set_xlabel("% Reads mapping to viral contigs")
ax.set_ylabel("% Reads mapping to filtered vOTUs")

# Save as PNG and SVG with a tight bounding box and transparent background.
fig.savefig(output_filtered_viral_png, format="png", bbox_inches="tight", transparent=True)
fig.savefig(output_filtered_viral_svg, format="svg", bbox_inches="tight", transparent=True)

# Display the plot
plt.show()

# Plot percentage of mapped reads to unfiltered vOTUs vs filtered vOTUs

In [None]:
# The 'mean' summary row appended to df_mapped is not a sample; leave it out so
# it is neither drawn as a data point nor used in the regression fit.
# errors="ignore" keeps this cell working if that row is absent.
samples_only = df_mapped.drop(index="mean", errors="ignore")

# 12x12 inch figure, using the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(12, 12))

# Scatter plus linear regression of '% filtered' (y) against '% unfiltered' (x).
sns.regplot(data=samples_only, x="% unfiltered", y="% filtered", ax=ax)

# Red dashed 1:1 reference line from (0, 0) to (100, 100).
ax.plot([0, 100], [0, 100], "r--")

ax.set_xlabel("% Reads mapping to unfiltered vOTUs")
ax.set_ylabel("% Reads mapping to filtered vOTUs")

# Save as PNG and SVG with a tight bounding box and transparent background.
fig.savefig(output_filtered_unfiltered_png, format="png", bbox_inches="tight", transparent=True)
fig.savefig(output_filtered_unfiltered_svg, format="svg", bbox_inches="tight", transparent=True)

# Display the plot
plt.show()