In [None]:
# import necessary modules
import pandas as pd
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import numpy as np
import math
from matplotlib import colors

# ------------------------------------------
# Global plot styling shared by every figure in this notebook.

# Continuous colormap interpolated from five colours of seaborn's "colorblind" palette.
cmap1 = LinearSegmentedColormap.from_list(
    "my_colormap",
    sns.color_palette("colorblind", n_colors=5),
)

# seaborn theme first: the rcParams below are applied on top of it.
sns.set_style("ticks", {"axes.grid": True})
sns.set_palette("colorblind")

plt.rcParams.update({
    # Axes frame and tick geometry
    "axes.linewidth": 1.5,
    "xtick.major.width": 1.5,
    "ytick.major.width": 1.5,
    "xtick.major.size": 8,
    "ytick.major.size": 8,
    "axes.titlepad": 20,
    # Text sizes and fonts ('svg.fonttype' = 'none' keeps text as text in SVG output)
    "svg.fonttype": "none",
    "axes.titlesize": 30,
    "axes.labelsize": 23.5,
    "xtick.labelsize": 18,
    "ytick.labelsize": 18,
    "legend.fontsize": 18,
    "font.family": "sans-serif",
    "font.sans-serif": ["Liberation Sans"],
    "text.usetex": False,
    # Saved-figure resolution
    "savefig.dpi": 300,
})

In [None]:
# --- Inputs and parameters injected by Snakemake -----------------------------
illumina_postqc = snakemake.input.postqc_txt

SAMPLING = snakemake.params.sampling
REFERENCE = snakemake.params.reference
mapping_dir = snakemake.params.mapping_dir
clean_dir = snakemake.params.clean_dir
threshold_bases = snakemake.params.threshold_bases
# Only referenced by commented-out code in filter_abundances.
THRESHOLD_RPKM = snakemake.params.threshold_RPKM

# --- Output paths: unfiltered abundance tables -------------------------------
out_raw_RPKM_file = snakemake.output.raw_RPKM_file
out_norm_RPKM_file = snakemake.output.norm_RPKM_file
out_raw_count_file = snakemake.output.raw_count_file
out_norm_count_file = snakemake.output.norm_count_file
out_coverage_RPKM_file = snakemake.output.coverage_RPKM_file
out_coverage_bases_RPKM_file = snakemake.output.coverage_bases_RPKM_file

# --- Output paths: coverage-filtered abundance tables ------------------------
out_filtered_raw_RPKM_file = snakemake.output.filtered_raw_RPKM_file
out_filtered_norm_RPKM_file = snakemake.output.filtered_norm_RPKM_file
out_filtered_raw_count_file = snakemake.output.filtered_raw_count_file
out_filtered_norm_count_file = snakemake.output.filtered_norm_count_file
out_filtered_75_raw_RPKM_file = snakemake.output.filtered_75_raw_RPKM_file
out_filtered_75_norm_RPKM_file = snakemake.output.filtered_75_norm_RPKM_file
    


In [None]:
# Load the post-QC read statistics (tab-separated table).
postqc = pd.read_csv(illumina_postqc, sep="\t")

# Keep only the file name and average read length. The explicit .copy() makes
# this an independent frame, so the column edits below do not write into a
# slice of `postqc` (which triggers pandas' SettingWithCopyWarning).
read_stats_df2 = postqc[["Filename", "avg_sequence_length"]].copy()

# Normalise the file names to "<sample>_<R1|R2|U>":
#   _forward -> _R1, _reverse -> _R2, _unpaired -> _U,
#   then drop everything from "_paired" / "_clean" onwards.
read_stats_df2["Filename"] = (
    read_stats_df2["Filename"]
    .str.replace("_forward", "_R1", regex=False)
    .str.replace("_reverse", "_R2", regex=False)
    .str.replace("_unpaired", "_U", regex=False)
    .str.split("_paired").str[0]
    .str.split("_clean").str[0]
)

# Split on the LAST underscore: the left part is the sample name, the right part the read type.
name_parts = read_stats_df2["Filename"].str.rsplit("_", n=1)
read_stats_df2["Sample"] = name_parts.str[0]
read_stats_df2["Read_type"] = name_parts.str[1]

# Index by file name and rename columns by label (not by position).
read_stats_df2 = (
    read_stats_df2
    .set_index("Filename")
    .rename(columns={"avg_sequence_length": "length_post", "Sample": "sample"})
)

# Discard unpaired reads, then average the remaining read lengths per sample.
read_stats_df2 = read_stats_df2[read_stats_df2["Read_type"] != "U"]
read_length_paired = read_stats_df2.groupby("sample").mean(numeric_only=True)
read_length_paired["length_post"] = read_length_paired["length_post"].round(2)
read_length_paired

In [None]:
# Sample names are the index labels of the per-sample read-length table.
SAMPLES = read_length_paired.index.tolist()
SAMPLES

In [None]:
#!/usr/bin/env python3

# Written by Alejandro Reyes
# Reformatted & Edited Laura Forero

def _read_covstats(path, sample, suffix):
    """Read one tab-separated coverage table and tag every column with ``suffix``.

    The table must have exactly eight columns; they are renamed by position.
    A ``Covered_percent`` column (covered bases as a percentage of the length)
    is added, and the columns not used downstream are dropped.
    """
    stats = pd.read_csv(path, sep="\t")
    stats.columns = ["ID", "Avg_fold", "Length", "Covered_bases", "Read_Count",
                     "Variance", "Trimmed Mean", "RPKM_" + sample]
    stats["Covered_percent"] = stats["Covered_bases"] * 100 / stats["Length"]
    stats = stats.drop(columns=["Avg_fold", "Variance", "Trimmed Mean"])
    return stats.add_suffix(suffix)


def normalise_rpkm_bt2(sample, all_file, toss_file, paired_len, threshold=None):
    """Combine the "all" and "toss" coverage tables of one sample into normalised counts and RPKM.

    Parameters
    ----------
    sample : str
        Sample name; used to build the output column names.
    all_file, toss_file : str
        Paths to tab-separated, eight-column coverage tables. Rows are matched
        on the first column (inner join).
    paired_len : int
        Currently unused. It only fed a diagnostic column that was discarded
        before returning; the parameter is kept so existing callers keep working.
    threshold : number, optional
        Minimum number of covered bases in the "toss" table for a contig to keep
        a non-zero normalised count. Defaults to the notebook-level
        ``threshold_bases`` setting.

    Returns
    -------
    pandas.DataFrame
        One row per contig with the columns ``Contig``, ``Length_<sample>``,
        ``Count_<sample>``, ``Read_Count_toss_<sample>``, ``RPKM_<sample>_all``,
        ``Covered_percent_all_<sample>``, ``RPKM_<sample>_toss``,
        ``Covered_percent_toss_<sample>``, ``Count_norm_<sample>``,
        ``RPKM_norm_<sample>``, ``Covered_bases_all_<sample>`` and
        ``Covered_bases_toss_<sample>``.
    """
    if threshold is None:
        # Backward-compatible default: the value configured for the whole notebook.
        threshold = threshold_bases

    all_df = _read_covstats(all_file, sample, "_all")
    toss_df = _read_covstats(toss_file, sample, "_toss")

    combined_df = all_df.merge(toss_df, left_on="ID_all", right_on="ID_toss")
    combined_df = combined_df.drop(columns=["ID_toss", "Length_toss"])

    reads_all = combined_df["Read_Count_all"]
    reads_toss = combined_df["Read_Count_toss"]
    bases_all = combined_df["Covered_bases_all"]
    bases_toss = combined_df["Covered_bases_toss"]

    # Reads present in "all" but not in "toss".
    extra_reads = (reads_all - reads_toss).abs()
    # "toss" reads per covered base; 0 where no base is covered (avoids dividing by zero).
    toss_cov = reads_toss.div(bases_toss.where(bases_toss > 0)).fillna(0)
    # Bases covered in "all" but not in "toss".
    delta_map_base = (bases_all - bases_toss).abs()
    # Reads needed to cover those bases at the "toss" density, scaled by 0.9.
    needed_reads = (toss_cov * delta_map_base * 0.9).round()
    # Add back no more reads than are actually available.
    extra = np.minimum(extra_reads, needed_reads)

    norm_col = "Read_count_norm_" + sample
    rpkm_norm_col = "RPKM_norm_" + sample
    combined_df[norm_col] = (reads_toss + extra).astype("int")

    # Contigs with too few covered bases in "toss" get a normalised count of zero.
    too_few_bases = (bases_toss < threshold) & (combined_df[norm_col] > 0)
    combined_df.loc[too_few_bases, norm_col] = 0

    # The denominator is taken before the threshold zeroing above (unchanged behaviour).
    total_reads_map = reads_toss.sum() + extra.sum()
    if total_reads_map > 0:
        combined_df[rpkm_norm_col] = (combined_df[norm_col] * 1E9) / (combined_df["Length_all"] * total_reads_map)
    else:
        # No mapped reads at all: report 0 instead of the NaN that 0/0 would produce.
        combined_df[rpkm_norm_col] = 0.0

    combined_df = combined_df[['ID_all', 'Length_all', "Read_Count_all", "Read_Count_toss",
                               'RPKM_' + sample + '_all', 'Covered_percent_all',
                               'RPKM_' + sample + '_toss', 'Covered_percent_toss',
                               norm_col, rpkm_norm_col,
                               "Covered_bases_all", "Covered_bases_toss"]]
    combined_df.columns = ['Contig', 'Length_' + sample, "Count_" + sample, "Read_Count_toss_" + sample,
                           'RPKM_' + sample + '_all', 'Covered_percent_all_' + sample,
                           'RPKM_' + sample + '_toss', 'Covered_percent_toss_' + sample,
                           "Count_norm_" + sample, 'RPKM_norm_' + sample,
                           "Covered_bases_all_" + sample, "Covered_bases_toss_" + sample]
    return combined_df


In [None]:
def filter_abundances(RPKM, counts, coverage_bases, coverage_percent, contig_length,
                      min_long_length=6667, min_covered_bases=5000, min_covered_percent=75):
    """Zero out abundances of (contig, sample) cells that are not sufficiently covered.

    A cell is kept when either
      * the contig length is >= ``min_long_length`` and more than
        ``min_covered_bases`` bases are covered, or
      * the contig is shorter than that and more than ``min_covered_percent``
        percent of it is covered.
    With the defaults, 5000 / 6667 is ~75 %, so the two rules meet at the length cut-off.

    Parameters
    ----------
    RPKM, counts : pandas.DataFrame
        Abundance tables to filter (contigs x samples).
    coverage_bases, coverage_percent, contig_length : pandas.DataFrame
        Per-cell covered bases, covered percentage and contig length, labelled
        like ``RPKM``.
    min_long_length, min_covered_bases, min_covered_percent : number, optional
        Thresholds described above; the defaults reproduce the previous
        hard-coded values.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Filtered RPKM and filtered counts, both labelled like ``RPKM``; cells
        that fail the rule are 0. The inputs are not modified.
    """
    is_long = contig_length >= min_long_length
    keep = (is_long & (coverage_bases > min_covered_bases)) | (~is_long & (coverage_percent > min_covered_percent))

    # DataFrame.where keeps each input's dtype (float RPKM, integer counts)
    # instead of assigning values into an all-integer zero frame.
    filtered_RPKM_df = RPKM.where(keep, 0)
    # Counts are reported on RPKM's labels; labels missing from `counts` are 0.
    filtered_counts_df = counts.where(keep, 0).reindex(index=RPKM.index, columns=RPKM.columns, fill_value=0)

    return filtered_RPKM_df, filtered_counts_df

def filter_abundances_75(RPKM, coverage_bases, coverage_percent, contig_length, min_covered_percent=75):
    """Zero out RPKM values of cells whose covered percentage is not above the threshold.

    Parameters
    ----------
    RPKM : pandas.DataFrame
        Abundance table to filter (contigs x samples).
    coverage_bases, contig_length
        Unused; accepted so the call signature parallels ``filter_abundances``.
    coverage_percent : pandas.DataFrame
        Per-cell covered percentage, labelled like ``RPKM``.
    min_covered_percent : number, optional
        A cell is kept only when its covered percentage is strictly greater
        than this value (default 75, the previous hard-coded value).

    Returns
    -------
    pandas.DataFrame
        Copy of ``RPKM`` with failing cells set to 0; ``RPKM`` is not modified.
    """
    # DataFrame.where keeps RPKM's float dtype instead of assigning floats into
    # an all-integer zero frame.
    filtered_RPKM_df = RPKM.where(coverage_percent > min_covered_percent, 0)

    return filtered_RPKM_df


In [None]:
print("NUMBER OF SAMPLES =", len(SAMPLES))

if not SAMPLES:
    raise ValueError("No paired-end samples found in the post-QC table; nothing to quantify.")

# Output table -> (prefix, suffix) that normalise_rpkm_bt2 puts around the
# sample name in the corresponding column. Each column is renamed to the bare
# sample name as it is extracted. This replaces str.strip()/str.rstrip() on the
# merged headers: those strip *character sets*, not a prefix/suffix, and mangle
# sample names that start or end with any of those characters.
per_sample_columns = {
    "RPKM_norm": ("RPKM_norm_", ""),
    "RPKM_raw": ("RPKM_", "_all"),
    "count_raw": ("Count_", ""),
    "count_norm": ("Count_norm_", ""),
    "cov": ("Covered_percent_all_", ""),
    "cov_bases": ("Covered_bases_all_", ""),
    "len": ("Length_", ""),
}
tables = {}

for n, sample in enumerate(SAMPLES):
    print(round(((n + 1) * 100 / len(SAMPLES)), 1), "%", end='\r')

    # Coverage tables for this sample (all alignments vs. the "unique" set).
    if len(REFERENCE) > 0:
        covstats_prefix = mapping_dir + "/REFERENCES/bowtie2_" + REFERENCE + "_" + sample + "_" + SAMPLING
    else:
        covstats_prefix = mapping_dir + "/bowtie2_" + sample + "_" + SAMPLING
    all_file = covstats_prefix + "_covstats.txt"
    toss_file = covstats_prefix + "_unique_covstats.txt"

    read_len = int(read_length_paired.loc[sample, "length_post"])
    count_df = normalise_rpkm_bt2(sample, all_file, toss_file, read_len)

    # Append this sample's column to each output table (inner join on contig).
    for key, (col_prefix, col_suffix) in per_sample_columns.items():
        source_col = col_prefix + sample + col_suffix
        piece = count_df[["Contig", source_col]].rename(columns={source_col: sample})
        tables[key] = piece if key not in tables else tables[key].merge(piece, on="Contig")

# Index every table by contig ("vOTU"); columns are already the bare sample names.
df_RPKM_raw = tables["RPKM_raw"].rename(columns={"Contig": "vOTU"}).set_index("vOTU")
df_RPKM_norm = tables["RPKM_norm"].rename(columns={"Contig": "vOTU"}).set_index("vOTU")
df_count_raw = tables["count_raw"].rename(columns={"Contig": "vOTU"}).set_index("vOTU")
df_count_norm = tables["count_norm"].rename(columns={"Contig": "vOTU"}).set_index("vOTU")
df_RPKM_cov = tables["cov"].rename(columns={"Contig": "vOTU"}).set_index("vOTU")
df_RPKM_cov_bases = tables["cov_bases"].rename(columns={"Contig": "vOTU"}).set_index("vOTU")
df_RPKM_len = tables["len"].rename(columns={"Contig": "vOTU"}).set_index("vOTU")

# Read count divided by contig length, per contig and sample.
df_mean_coverage_raw = df_count_raw.div(df_RPKM_len)
df_mean_coverage_norm = df_count_norm.div(df_RPKM_len)

df_RPKM_raw.to_csv(out_raw_RPKM_file)
df_RPKM_norm.to_csv(out_norm_RPKM_file)
df_RPKM_cov.to_csv(out_coverage_RPKM_file)
df_RPKM_cov_bases.to_csv(out_coverage_bases_RPKM_file)
df_count_raw.to_csv(out_raw_count_file)
df_count_norm.to_csv(out_norm_count_file)



## Subsampled

In [None]:
# Per-sample mapping summary; stays empty unless the run uses sub-sampled reads.
df_mapped_sub = pd.DataFrame()

if SAMPLING == "sub":
    mapped_pair = []
    sub_reads = []
    for sample in SAMPLES:
        # Context managers close each file (the handles were previously leaked).
        with open(mapping_dir + "/bowtie2_flagstats_filtered_" + sample + "." + "sub.txt") as flagstats_handle:
            content = flagstats_handle.readlines()
        # First field of the second line, halved.
        # NOTE(review): presumably a read count converted to read pairs; confirm against the flagstat layout.
        mapped_pair.append(int(content[1].split()[0]) / 2)

        with open(clean_dir + "/" + sample + "_sub_sampling_reads_final.txt") as sub_reads_handle:
            content2 = sub_reads_handle.readline()
        sub_reads.append(int(content2))

    df_mapped_sub["sample"] = SAMPLES
    df_mapped_sub["mapped"] = mapped_pair
    df_mapped_sub["sub_reads"] = sub_reads
    df_mapped_sub["viral_reads"] = df_mapped_sub["mapped"]

    df_mapped_sub["%mapped"] = df_mapped_sub["mapped"] * 100 / df_mapped_sub["sub_reads"]
    # Extra row labelled 'mean' holding the column means of the numeric columns.
    df_mapped_sub.loc['mean'] = df_mapped_sub.mean(numeric_only=True)
    df_mapped_sub = df_mapped_sub.sort_values(by="viral_reads")

df_mapped_sub.style.background_gradient(cmap="RdYlGn", vmin=0)


## RPKM normalised vs raw
The percentage shown in each panel indicates the share of contigs that lost coverage (their RPKM is now 0).

In [None]:
# Suffix the per-sample columns so the three tables can be joined side by side.
df_RPKM_norm2 = df_RPKM_norm.add_suffix("_norm")
df_RPKM_raw2 = df_RPKM_raw.add_suffix("_raw")
df_RPKM_cov2 = df_RPKM_cov.add_suffix("_cov")

merged_df = (
    df_RPKM_norm2
    .merge(df_RPKM_raw2, left_index=True, right_index=True)
    .merge(df_RPKM_cov2, left_index=True, right_index=True)
)

# Common limits for both (log-scaled) axes: largest value and smallest
# non-zero value over the raw and normalised tables. nanmin/nanmax ignore a
# table that has no non-zero values instead of letting NaN win the comparison.
float_norm = df_RPKM_norm.select_dtypes(include=[float])
float_raw = df_RPKM_raw.select_dtypes(include=[float])
max_val = np.nanmax([float_norm.max().max(), float_raw.max().max()])
min_val = np.nanmin([float_norm.replace(0, np.nan).min().min(),
                     float_raw.replace(0, np.nan).min().min()])
axis_limits = [min_val, max_val]

# One tick per power of ten inside the limits.
decade_ticks = [10**exponent for exponent in range(int(np.log10(min_val)), int(np.log10(max_val)) + 1)]

n_columns = 4
n_rows = max(1, (len(SAMPLES) + n_columns - 1) // n_columns)

# squeeze=False keeps `axes` two-dimensional even for a single row, so
# axes[row, col] is valid for any number of samples.
fig, axes = plt.subplots(nrows=n_rows, ncols=n_columns,
                         figsize=(n_columns * 3.74, n_rows * 4.5),
                         sharey=True, sharex=False, squeeze=False)

for i, sample in enumerate(SAMPLES):
    ax = axes[i // n_columns, i % n_columns]
    print(round(((i + 1) * 100 / len(SAMPLES)), 1), "%", end='\r')

    # (fraction of contigs at 0 in raw) - (fraction at 0 after normalisation), in percent.
    # NOTE(review): negative when normalisation zeroed additional contigs; confirm the intended sign.
    percent_0 = round(((merged_df[sample + "_raw"] == 0).mean() - (merged_df[sample + "_norm"] == 0).mean()) * 100, 2)

    # Set the scale first so ticks and labels are generated for the log axes.
    ax.set_xscale('log')
    ax.set_yscale('log')

    # Identity line over the plotted range (only positive values exist on a log axis).
    ax.plot(axis_limits, axis_limits, 'r--')
    ax.scatter(x=merged_df[sample + "_raw"], y=merged_df[sample + "_norm"], s=2, alpha=0.3)

    ax.set_xlim(axis_limits)
    ax.set_ylim(axis_limits)
    ax.set_xticks(decade_ticks)
    ax.set_yticks(decade_ticks)
    ax.tick_params(axis="x", labelrotation=90)

    ax.set_title(sample)
    ax.set_xlabel("RPKM raw")
    ax.set_ylabel("RPKM normalised")
    ax.text(0.95, 0.05, str(percent_0) + '%', transform=ax.transAxes, ha='right', va='bottom', fontsize=20, color="firebrick")

# Remove the unused panels of the last row.
if len(SAMPLES) % n_columns != 0:
    for i in range(len(SAMPLES) % n_columns, n_columns):
        fig.delaxes(axes[-1, i])

plt.tight_layout()
plt.show()


## Length and raw RPKM of removed contigs

In [None]:
# Contig lengths, taken from the first sample's length column.
len_df = df_RPKM_len[SAMPLES[0]].to_frame()
len_df.columns = ["Length"]

# Sort both tables by contig so the boolean masks below line up with len_df.
merged_df = merged_df.sort_index()
len_df = len_df.sort_index()

n_plot_rows = max(1, (len(SAMPLES) + 3) // 4)

# squeeze=False keeps `axes` two-dimensional even for a single row, so
# axes[row, col] is valid for any number of samples.
fig, axes = plt.subplots(nrows=n_plot_rows, ncols=4, figsize=(16, n_plot_rows * 4.5),
                         sharey=False, sharex=True, squeeze=False)

for i, sample in enumerate(SAMPLES):
    ax = axes[i // 4, i % 4]
    print(round(((i + 1) * 100 / len(SAMPLES)), 1), "%", end='\r')

    # Contigs with a raw RPKM above 0 whose normalised RPKM is 0.
    lost_coverage = (merged_df[sample + "_raw"] > 0) & (merged_df[sample + "_norm"] == 0)
    zero_df_len = len_df[lost_coverage]
    zero_df = merged_df[lost_coverage]

    ax.scatter(x=zero_df_len["Length"], y=zero_df[sample + "_raw"])
    ax.set_title(sample)
    ax.set_xlabel("Contig length")
    ax.set_ylabel("RPKM")
    ax.text(0.95, 0.05, 'n=' + str(len(zero_df)), transform=ax.transAxes, ha='right', va='bottom', fontsize=20, color="firebrick")

# Remove the unused panels of the last row.
if len(SAMPLES) % 4 != 0:
    for i in range(len(SAMPLES) % 4, 4):
        fig.delaxes(axes[-1, i])

plt.tight_layout()
plt.show()


# Filter by Coverage + RPKM

In [None]:
# Length-dependent coverage filter, applied to the raw and to the normalised tables.
filtered_RPKM_raw_df, filtered_counts_raw_df = filter_abundances(
    df_RPKM_raw, df_count_raw, df_RPKM_cov_bases, df_RPKM_cov, df_RPKM_len
)
filtered_RPKM_norm_df, filtered_counts_norm_df = filter_abundances(
    df_RPKM_norm, df_count_norm, df_RPKM_cov_bases, df_RPKM_cov, df_RPKM_len
)

# Percent-coverage-only filter, applied to the same two RPKM tables.
filtered_75_RPKM_raw_df = filter_abundances_75(
    df_RPKM_raw, df_RPKM_cov_bases, df_RPKM_cov, df_RPKM_len
)
filtered_75_RPKM_norm_df = filter_abundances_75(
    df_RPKM_norm, df_RPKM_cov_bases, df_RPKM_cov, df_RPKM_len
)

# Write every filtered table to its Snakemake output path.
for filtered_table, destination in (
    (filtered_RPKM_raw_df, out_filtered_raw_RPKM_file),
    (filtered_counts_raw_df, out_filtered_raw_count_file),
    (filtered_RPKM_norm_df, out_filtered_norm_RPKM_file),
    (filtered_counts_norm_df, out_filtered_norm_count_file),
    (filtered_75_RPKM_raw_df, out_filtered_75_raw_RPKM_file),
    (filtered_75_RPKM_norm_df, out_filtered_75_norm_RPKM_file),
):
    filtered_table.to_csv(destination)