# LONG READ ORF SIZES

In [None]:
# import necessary modules
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt

# Seaborn theme: "ticks" style with the axes grid switched on, and the
# "colorblind" colour palette.
sns.set_style("ticks", {"axes.grid": True})
sns.set_palette("colorblind")

# Matplotlib settings, applied after the seaborn calls above.
plt.rcParams.update(
    {
        # Axes frame and tick geometry.
        "axes.linewidth": 1.5,
        "xtick.major.width": 1.5,
        "ytick.major.width": 1.5,
        "xtick.major.size": 8,
        "ytick.major.size": 8,
        "axes.titlepad": 20,
        # Font sizes and font selection.
        "axes.labelsize": 23.5,
        "xtick.labelsize": 18,
        "ytick.labelsize": 18,
        "legend.fontsize": 18,
        "font.family": "sans-serif",
        "font.sans-serif": ["Liberation Sans"],
        "text.usetex": False,
        # Export settings: text in SVG output is not converted to paths,
        # and saved figures use 300 dpi.
        "svg.fonttype": "none",
        "savefig.dpi": 300,
    }
)

In [None]:
# NOTE(review): `snakemake` is not defined in this notebook; presumably it is
# injected by Snakemake when the notebook is run as part of a rule.

# Input paths, one per assembly / polishing stage (each is read as a
# tab-separated table further down).
input_hybrid = snakemake.input.hybrid
input_canu = snakemake.input.canu
input_medaka = snakemake.input.medaka
input_racon1 = snakemake.input.racon1
input_racon2 = snakemake.input.racon2
input_pilon1 = snakemake.input.scaffolds_pilon1_final
input_pilon2 = snakemake.input.scaffolds_pilon2_final
input_pilon3 = snakemake.input.scaffolds_pilon3_final
input_pilon4 = snakemake.input.scaffolds_pilon4_final
input_caudovirales = snakemake.input.caudovirales

# Sample list taken from the rule parameters (not referenced again in the
# visible part of the notebook).
SAMPLES_NANOPORE = list(snakemake.params.samples_nanopore)

# ----------------------------------------------------------------------
# Output paths for the figure (PNG and SVG).
output_orf_length_png = snakemake.output.orf_length_png
output_orf_length_svg = snakemake.output.orf_length_svg


In [None]:
def _load_lengths(path, label, columns=("name", "len", "aa")):
    """Read a header-less tab-separated table and tag it with a label.

    The file is read with the given column names, rows containing missing
    values are dropped, and a constant ``Name`` column holding ``label`` is
    appended.
    """
    table = pd.read_csv(path, sep="\t", names=list(columns))
    return table.dropna().assign(Name=label)


# One labelled table per assembly / polishing stage.
hybrid = _load_lengths(input_hybrid, "Hybrid")
canu = _load_lengths(input_canu, "Canu")
medaka = _load_lengths(input_medaka, "Medaka")
racon1 = _load_lengths(input_racon1, "Racon 1")
racon2 = _load_lengths(input_racon2, "Racon 2")
pilon1 = _load_lengths(input_pilon1, "Pilon 1")
pilon2 = _load_lengths(input_pilon2, "Pilon 2")
pilon3 = _load_lengths(input_pilon3, "Pilon 3")
pilon4 = _load_lengths(input_pilon4, "Pilon 4")

# The Caudovirales table is read with a single "len" column.
caudovirales = _load_lengths(
    input_caudovirales, "Caudovirales \n (09/05/23)", columns=("len",)
)

# Full set: every stage, used for the summary statistics.
aa_len_df_full = pd.concat(
    [canu, medaka, racon1, racon2, pilon1, pilon2, pilon3, pilon4, hybrid, caudovirales]
)
# Reduced set: only the last Racon and Pilon rounds, used for the plot.
aa_len_df = pd.concat([canu, medaka, racon2, pilon4, hybrid, caudovirales])



In [None]:
# Summary statistics (mean, median, standard deviation, sum) per assembly
# type, shown as a colour-graded table.

# Row order for the table. The labels must be spelled exactly like the
# ``Name`` values assigned when the tables were loaded ("Racon 1",
# "Pilon 1", ...); labels that match no group make reindex() emit all-NaN rows.
assembly_order = [
    "Hybrid",
    "Canu",
    "Medaka",
    "Racon 1",
    "Racon 2",
    "Pilon 1",
    "Pilon 2",
    "Pilon 3",
    "Pilon 4",
    "Caudovirales \n (09/05/23)",
]

# Aggregate numeric columns only: mean/median/std are undefined for the
# string column "name" and raise on current pandas versions.
numeric_columns = aa_len_df_full.select_dtypes("number").columns.tolist()

orf_stats = (
    aa_len_df_full.groupby("Name")[numeric_columns]
    .agg(["mean", "median", "std", "sum"])
    .reindex(assembly_order)
)

# Last expression of the cell, so the styled table is displayed.
orf_stats.style.background_gradient(cmap="RdYlGn")

In [None]:
# Box plot of the "len" column per assembly ("Name"); outlier points are
# not drawn.
ax = sns.boxplot(data=aa_len_df, x="Name", y="len", showfliers=False)

# Axis titles, with the x tick labels rotated to vertical.
ax.set_xlabel("Assembly")
ax.set_ylabel("ORF length (aa)")
tick_labels = ax.get_xticklabels()
ax.set_xticklabels(tick_labels, rotation=90)

# Start the y-axis at zero while keeping the current upper limit.
_, y_upper = ax.get_ylim()
ax.set_ylim(0, y_upper)

# Write the figure to both output paths (PNG and SVG).
fig = ax.figure
for out_path, fmt in (
    (output_orf_length_png, "png"),
    (output_orf_length_svg, "svg"),
):
    fig.savefig(out_path, format=fmt, bbox_inches="tight", transparent=True)

# Display the plot.
plt.show()