In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from Bio import SeqIO
import os
import sys

In [None]:
# Extend the module search path so the `stat_func` imports that follow can be resolved.
# This variant relies on the `snakemake` object and therefore only works when the
# notebook is executed by Snakemake; the helper sources are looked up relative to
# the WORKFLOW_PATH entry of the workflow config.
sys.path.append(snakemake.config["WORKFLOW_PATH"]+'/snakemodules/notebooks/src/')
# When running the notebook by hand with jupyter (no `snakemake` object available),
# comment out the line above and use the working-directory-relative path instead:
# sys.path.append(os.getcwd()+'/src/')

from stat_func import calculator
from stat_func import visualization
%matplotlib inline

### Input files

In [None]:
# `all_samples` holds the paths of the contig FASTA files to analyse (one per sample).
#
# Manual run with jupyter from the command line: collect the files with glob instead.
# The path below is a machine-specific example and must be adapted.
# import glob
# all_samples = glob.glob('/home/kedic/popinSnake/workdir/**/**.contigs.fa')
#
# Run through Snakemake: take the files from the rule's `contigs` input.
all_samples = snakemake.input.contigs

In [None]:
# Read every input FASTA completely into memory: `sample_seq` ends up with one
# inner list of parsed records per file, in the same order as `all_samples`.
sample_seq = [list(SeqIO.parse(sample, 'fasta')) for sample in all_samples]

In [None]:
# Derive one label per input file: the name of the directory that directly
# contains it (".../<sample>/<file>.contigs.fa" -> "<sample>").
#
# The labels are intentionally kept in the order of `all_samples` and are NOT
# sorted. `sample_seq` is built by iterating `all_samples` in that same order and
# both lists are handed side by side to calculator.process_sequences, which
# presumably pairs them by position (verify in stat_func). Sorting only the
# names, as was done before, attaches labels to the wrong sequences whenever the
# inputs are not already in alphabetical order.
sample_names = [os.path.basename(os.path.dirname(fp)) for fp in all_samples]

### Apply functions

In [None]:
# Compute the per-sample statistics with the project helper.
# Inputs: `sample_seq` (one list of parsed records per input file) and
# `sample_names` (one label per input file).
# NOTE(review): the helper presumably matches the two lists by position, in which
# case both must be in the same order - confirm against stat_func.calculator.
# Outputs: `sample_dict` is the structure used for plotting; it is iterated by
# sample key and provides 'Seq_LEN' and 'GC_Content' for each sample.
sample_list, sample_dict = calculator.process_sequences(sample_seq, sample_names)


### Visualization

In [None]:
import matplotlib.pyplot as plt
import scipy.stats as ss
import numpy as np

In [None]:
# Contig-length distribution: one histogram per sample, stacked vertically, with
# a fitted normal PDF and the sample's N50 marked. All panels share the same bin
# edges so they can be compared directly. The figure is written to
# snakemake.output.len.

# Pool the lengths of all samples to derive the common bin edges.
all_lengths = np.concatenate([sample_dict[df]['Seq_LEN'] for df in sample_dict])
bin_min = np.min(all_lengths)
bin_max = np.max(all_lengths)
bin_size = 50  # bin width, in the units of 'Seq_LEN'; adjust as needed
bins = np.arange(bin_min, bin_max + bin_size, bin_size)  # Consistent bin edges

# squeeze=False makes subplots() always return a 2-D array of Axes. Without it a
# run with a single sample gets back a bare Axes object and indexing it fails.
f, axs = plt.subplots(len(sample_dict), 1, figsize=(10, 20), squeeze=False)
axs = axs[:, 0]  # only one column -> flatten to one Axes per sample

# Iterating the dict yields the sample keys, in the same order as the axes.
for ax, sample_name in zip(axs, sample_dict):
    len_dist = sample_dict[sample_name]['Seq_LEN']

    # Fit a normal distribution to the data
    mu0, sigma0 = ss.norm.fit(len_dist)

    # density=True normalises the histogram so it shares a y-scale with the PDF.
    values, bins0, _ = ax.hist(len_dist, bins=bins, density=True)
    bin_centers0 = 0.5 * (bins0[1:] + bins0[:-1])

    # Evaluate the fitted PDF at the bin centres and overlay it.
    pdf = ss.norm.pdf(x=bin_centers0, loc=mu0, scale=sigma0)
    ax.plot(bin_centers0, pdf, label="PDF", color='orange', linewidth=3)

    # axvline spans the full height of the axes whatever the density scale is.
    # The previous vlines(..., 0, 0.002) used a fixed upper end in data units,
    # which was either too short or stretched the y-axis.
    N50_value = calculator.calculate_N50(list(len_dist))
    ax.axvline(N50_value, color='red', linestyle='dashed', label='N50')

    # Set titles and labels
    ax.set_title(sample_name + '_Contig-Length Distribution')
    ax.set(xlabel="contig length")
    ax.legend()

# tight_layout() recomputes the subplot margins and spacing itself, so a manual
# subplots_adjust() before it has no effect and is not needed.
plt.tight_layout()
plt.savefig(snakemake.output.len, dpi=300)
plt.show()

In [None]:
# GC-content distribution: one histogram per sample, stacked vertically, with a
# fitted normal PDF overlaid. All panels share the same bin edges so they can be
# compared directly. The figure is written to snakemake.output.gc.

# Pool the GC values of all samples to derive the common bin edges.
all_gc_contents = np.concatenate([sample_dict[df]['GC_Content'] for df in sample_dict])
gc_min = np.min(all_gc_contents)
gc_max = np.max(all_gc_contents)
# linspace takes the number of points, i.e. the number of bin EDGES:
# 20 edges give 19 equal-width bins between gc_min and gc_max.
n_bin_edges = 20
bins = np.linspace(gc_min, gc_max, n_bin_edges)  # Consistent bin edges

# squeeze=False makes subplots() always return a 2-D array of Axes. Without it a
# run with a single sample gets back a bare Axes object and indexing it fails.
f, axs = plt.subplots(len(sample_dict), 1, figsize=(10, 20), squeeze=False)
axs = axs[:, 0]  # only one column -> flatten to one Axes per sample

# Iterating the dict yields the sample keys, in the same order as the axes.
for ax, sample_name in zip(axs, sample_dict):
    gc_dist = sample_dict[sample_name]['GC_Content']

    # Fit a normal distribution to the data
    mu1, sigma1 = ss.norm.fit(gc_dist)

    # density=True normalises the histogram so it shares a y-scale with the PDF.
    values, bins1, _ = ax.hist(gc_dist, bins=bins, density=True)
    bin_centers1 = 0.5 * (bins1[1:] + bins1[:-1])

    # Evaluate the fitted PDF at the bin centres and overlay it.
    pdf = ss.norm.pdf(x=bin_centers1, loc=mu1, scale=sigma1)
    ax.plot(bin_centers1, pdf, label="PDF", color='orange', linewidth=3)

    # Set titles and labels
    ax.set_title(sample_name + '_GC-content distribution')
    ax.set(xlabel="GC percentage")
    ax.legend()

# tight_layout() recomputes the subplot margins and spacing itself, so a manual
# subplots_adjust() before it has no effect and is not needed.
plt.tight_layout()
plt.savefig(snakemake.output.gc, dpi=300, bbox_inches='tight')
plt.show()