In [2]:
#!/usr/bin/env python

"""
Demultiplex Stats Plotter

This script reads a Demultiplex_Stats.csv file and plots the number of reads for each sample ID.

Dependencies:
- pandas
- matplotlib
- argparse
- itertools
- io

Usage:
./plot_reads_pf.py --file_path /path/to/Demultiplex_Stats.csv

Inputs:
- --file_path: Path to the Demultiplex_Stats.csv file.

Outputs:
- Displays a horizontal bar chart showing the number of reads for each sample ID.
- Saves the plot as reads_pf_labeled.png in the script's directory.

The script performs the following steps:
1. Reads the Demultiplex_Stats.csv file until the line containing "[Top Unknown Barcodes]" is reached.
2. Creates a pandas DataFrame from the collected data.
3. Summarizes the number of reads for each sample ID.
4. Excludes the "Undetermined" sample from the analysis.
5. Sorts the data by the number of reads in descending order.
6. Plots a horizontal bar chart displaying the number of reads for each sample ID.
7. Annotates the chart with the total number of reads.
8. Saves the plot as reads_pf_labeled.png.

Note: Ensure that matplotlib is properly configured for your environment, such as running the script in a Jupyter Notebook environment.

Author: [Your Name]
Date: [Current Date]
"""

import argparse
import matplotlib.pyplot as plt
import pandas as pd
from itertools import takewhile
from io import StringIO  # Use the standard StringIO module

def plot_demultiplex_stats(file_path):
    # Convert the takewhile generator into a list to keep the content in memory
    with open(file_path, 'r') as file:
        lines_before_string = list(takewhile(lambda line: "[Top Unknown Barcodes]" not in line, file))

    # Create a pandas DataFrame from the collected lines
    df = pd.read_csv(StringIO(''.join(lines_before_string)))

    # Summing up the "# Reads" for each "SampleID"
    summed_df = df.groupby('SampleID')['# Reads'].sum().reset_index()

    # Exclude the row with SampleID "Undetermined"
    filtered_summed_df = summed_df[summed_df['SampleID'] != 'Undetermined']

    # Sort the DataFrame by '# Reads' in descending order
    sorted_summed_df = filtered_summed_df.sort_values(by='# Reads', ascending=False)

    # Sort the DataFrame by '# Reads' in descending order and reverse the order
    # to properly display the bars in descending order
    sorted_summed_df = sorted_summed_df[::-1]

    # Sum up the "# Reads" column to get total_reads
    total_reads = sorted_summed_df['# Reads'].sum()

    # Print the total_reads
    print("Total Reads:", total_reads)

    # Create a horizontal bar chart with # Reads on the Y-axis
    plt.barh(sorted_summed_df['SampleID'], sorted_summed_df['# Reads'])
    plt.xlabel('# Reads')
    plt.ylabel('SampleID')
    plt.title('# Reads for Each SampleID')

    # Format total_reads with commas
    total_reads = '{:,}'.format(total_reads)

    # Add text annotation for total reads with a box around it
    plt.text(-0.3, -0.05, f'Total Reads: {total_reads}', ha='center', va='center', transform=plt.gca().transAxes,
             bbox=dict(facecolor='white', edgecolor='black', boxstyle='round,pad=0.5'))

    # Save the plot to a file
    plt.savefig('reads_pf_labeled.png')

    plt.show()

if __name__ == "__main__":
    # Create argument parser
    parser = argparse.ArgumentParser(description="Demultiplex Stats Plotter")
    parser.add_argument("--file_path", type=str, help="Path to Demultiplex_Stats.csv file", required=True)

    # Parse command-line arguments
    args = parser.parse_args()

    # Call the function with the provided file path
    plot_demultiplex_stats(args.file_path)


usage: ipykernel_launcher.py [-h] --file_path FILE_PATH
ipykernel_launcher.py: error: the following arguments are required: --file_path


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [4]:
!python plot_reads_pf.ipynb --file /gale/netapp/seq2/illumina_runs/231109_M00412_0744_000000000-DM39L_231110112525154373554/Reports/Demultiplex_Stats.csv

Traceback (most recent call last):
  File "plot_reads_pf.ipynb", line 159, in <module>
    "execution_count": null,
NameError: name 'null' is not defined
