In [None]:
# Values handed to this notebook by the Snakemake rule that executes it.
_params = snakemake.params
_inputs = snakemake.input

# Rule parameters: working directories, sampling label and sample names.
mapping_dir = _params.mapping_dir
clean_dir = _params.clean_dir
sampling = _params.sampling
SAMPLES = list(_params.samples)

# Rule inputs: the read-count table and the file listing the key samples.
qc_read_counts = _inputs.df_counts_paired
SAMPLES_sub_file = _inputs.key_samples

In [None]:
# import necessary modules
import pandas as pd
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import numpy as np

#------------------------------------------
# One colour per sample, drawn from seaborn's "colorblind" palette
n_sample_colors = len(list(snakemake.params.samples))
colors_rarefaction = sns.color_palette("colorblind", n_colors=n_sample_colors)
# Continuous colormap interpolated across five "colorblind" palette colours
cmap1 = LinearSegmentedColormap.from_list(
    "my_colormap",
    sns.color_palette("colorblind", n_colors=5),
)

# Seaborn style first: the rcParams below override some of what it sets
# (for example the font family).
sns.set_style("ticks", {'axes.grid': True})
sns.set_palette("colorblind")

# Matplotlib defaults shared by every figure in this notebook.
plt.rcParams.update({
    # Axis frame and tick geometry
    "axes.linewidth": 1.5,
    "xtick.major.width": 1.5,
    "ytick.major.width": 1.5,
    "xtick.major.size": 8,
    "ytick.major.size": 8,
    "axes.titlepad": 20,
    # Text sizes and fonts
    "axes.titlesize": 30,
    "axes.labelsize": 23.5,
    "xtick.labelsize": 18,
    "ytick.labelsize": 18,
    "legend.fontsize": 18,
    "font.family": "sans-serif",
    "font.sans-serif": ["Liberation Sans"],
    "text.usetex": False,
    # Export settings: 'none' keeps SVG text as text instead of paths
    "svg.fonttype": "none",
    "savefig.dpi": 300,
})

# Calculate mapping statistics

In [None]:
# Read the per-sample paired read counts into a DataFrame.
df_counts_paired = pd.read_csv(qc_read_counts, index_col=0)

# Keep the sample name and its "bbduk" read count. The explicit .copy() makes
# reads_df an independent frame, so adding a column below is not a chained
# assignment on a slice of df_counts_paired (SettingWithCopyWarning).
reads_df = df_counts_paired[["sample", "bbduk"]].copy()

# "bbduk2" is the "bbduk" count capped at 2,000,000.
# NOTE(review): presumably the cap matches the number of read pairs used for
# the subsampled mappings -- confirm against the mapping rules.
reads_df["bbduk2"] = reads_df["bbduk"].apply(lambda x: min(2000000, x))

# Rename "bbduk" to "read_count"; "sample" and "bbduk2" keep their names.
reads_df.columns = ["sample", "read_count", "bbduk2"]


def _flagstat_pairs(path):
    """Return half of the leading integer on the second line of ``path``.

    The file is read inside a ``with`` block so its handle is closed as soon
    as the content has been loaded.

    NOTE(review): assumes the second line of the flagstat report holds the
    read count of interest -- verify against the samtools version in use.
    """
    with open(path) as handle:
        lines = handle.readlines()
    return int(lines[1].split()[0]) / 2


# Per-sample pair counts, in the same order as SAMPLES.
mapped_pair_f = []
mapped_pair_u = []
mapped_pair_v = []
mapped_pair_a = []

for sample in SAMPLES:
    mapped_pair_f.append(_flagstat_pairs(
        f"{mapping_dir}/bowtie2_flagstats_filtered_{sample}.{sampling}.txt"))
    mapped_pair_u.append(_flagstat_pairs(
        f"{mapping_dir}/STATS_FILES/bowtie2_flagstats_filtered_{sample}_unfiltered_contigs.{sampling}.txt"))
    mapped_pair_v.append(_flagstat_pairs(
        f"{mapping_dir}/STATS_FILES/bowtie2_flagstats_filtered_{sample}_viral_contigs.{sampling}.txt"))
    mapped_pair_a.append(_flagstat_pairs(
        f"{mapping_dir}/STATS_FILES/bowtie2_flagstats_filtered_{sample}_assembled_contigs.{sampling}.txt"))

# Assemble the counts into one table, one row per sample.
df_mapped = pd.DataFrame()
df_mapped["sample"] = SAMPLES
df_mapped["mapped_f"] = mapped_pair_f
df_mapped["mapped_u"] = mapped_pair_u
df_mapped["mapped_v"] = mapped_pair_v
df_mapped["mapped_a"] = mapped_pair_a

# Attach read_count / bbduk2. This is an inner join, so samples missing from
# the read-count table are dropped from df_mapped.
df_mapped = df_mapped.merge(reads_df, left_on="sample", right_on="sample")

# Percentages of reads mapped to each reference.
# NOTE(review): "% filtered" is normalised by the uncapped read_count while
# the other three use the capped bbduk2 -- confirm this is intended.
df_mapped["% assembled"] = df_mapped["mapped_a"] * 100 / df_mapped["bbduk2"]
df_mapped["% viral"] = df_mapped["mapped_v"] * 100 / df_mapped["bbduk2"]
df_mapped["% unfiltered"] = df_mapped["mapped_u"] * 100 / df_mapped["bbduk2"]
df_mapped["% filtered"] = df_mapped["mapped_f"] * 100 / df_mapped["read_count"]

# Differences between the filtered percentage and the viral / unfiltered ones.
df_mapped["filtered-viral"] = df_mapped["% filtered"] - df_mapped["% viral"]
df_mapped["filtered-unfiltered"] = df_mapped["% filtered"] - df_mapped["% unfiltered"]

# Drop the raw counts; only the sample, read_count and percentage columns remain.
df_mapped = df_mapped.drop(["mapped_f", "mapped_u", "mapped_v", "mapped_a", "bbduk2"], axis=1)

# Round values in df_mapped to 1 decimal place.
df_mapped = df_mapped.round(1)

# Display the styled table (nothing is written to disk).
# Styler.set_precision was removed in pandas 2.0 in favour of
# Styler.format(precision=...); keep the original call where it still exists
# and fall back to the replacement otherwise.
mapped_styler = df_mapped.style
mapped_styler = (
    mapped_styler.set_precision(2)
    if hasattr(mapped_styler, "set_precision")
    else mapped_styler.format(precision=2)
)
mapped_styler.background_gradient(cmap="RdYlGn")


# Subsample reads

In [None]:
# Load the key sample names: one entry per line, surrounding whitespace removed.
with open(SAMPLES_sub_file) as handle:
    SAMPLES_sub = [entry.strip() for entry in handle]

In [None]:
# Work on an independent copy of the three columns needed here.
df_subsample = df_mapped[["sample", "read_count", "% filtered"]].copy()

# Reads counted as viral per sample: read_count scaled by "% filtered".
df_subsample["viral_reads"] = df_subsample["read_count"] * df_subsample["% filtered"] / 100

# The subsampling target is the smallest viral read count among the key samples.
filtered_df_subsample = df_subsample[df_subsample["sample"].isin(SAMPLES_sub)]
if filtered_df_subsample["viral_reads"].dropna().empty:
    # Without this guard the failure surfaces as an opaque
    # "cannot convert float NaN to integer" from int() below.
    raise ValueError(
        "No key sample from SAMPLES_sub has a usable 'viral_reads' value; "
        "cannot derive a subsampling target."
    )
viral_reads = int(filtered_df_subsample["viral_reads"].min())
print(viral_reads)

# Total reads each sample must keep so that it yields `viral_reads` viral reads.
df_subsample["subsample"] = (viral_reads * 100 / df_subsample["% filtered"]).round()

# A "% filtered" of 0 (or NaN) produces an infinite / undefined target. Fail
# before writing anything rather than part-way through the loop below.
invalid_target = ~np.isfinite(df_subsample["subsample"])
if invalid_target.any():
    raise ValueError(
        "Cannot compute a subsampling depth for: "
        + ", ".join(map(str, df_subsample.loc[invalid_target, "sample"]))
    )

# Write one "<sample>_sub_sampling_reads.txt" file per sample into clean_dir,
# containing the integer subsampling depth.
for sample_value, sampling_data in zip(df_subsample["sample"], df_subsample["subsample"]):
    filename = f"{clean_dir}/{sample_value}_sub_sampling_reads.txt"
    with open(filename, "w") as file:
        file.write(str(int(sampling_data)))

# Display the table sorted by viral read count.
# Styler.set_precision was removed in pandas 2.0 in favour of
# Styler.format(precision=...); keep the original call where it still exists
# and fall back to the replacement otherwise.
subsample_styler = df_subsample.sort_values(by="viral_reads").style
subsample_styler = (
    subsample_styler.set_precision(2)
    if hasattr(subsample_styler, "set_precision")
    else subsample_styler.format(precision=2)
)
subsample_styler.background_gradient(cmap="RdYlGn")

In [None]:
# Samples whose computed subsampling depth exceeds the reads they actually have.
exceeds_available = df_subsample["read_count"] < df_subsample["subsample"]
df_subsample.loc[exceeds_available, "sample"].to_list()