# Quality control of 10x transcriptomic FASTQ files
This Python Jupyter notebook performs quality control analysis on the FASTQ files created by [cellranger mkfastq](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/mkfastq).

Import Python modules:

In [None]:
import os

import mizani

from IPython.display import display, HTML

import pandas as pd

from plotnine import *

Get `snakemake` variables [as described here](https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#jupyter-notebook-integration):

In [None]:
# Values supplied by the `snakemake` object of Snakemake's notebook integration.
expt = snakemake.params.expt  # experiment name; used in messages and the plot title
qc_stats = snakemake.input.qc_stats  # QC stats CSV files (iterated over when reading)
qc_plot = snakemake.output.qc_plot  # path the QC plot is saved to

# The separator is "\n  " (newline, then two spaces) so that every file is
# listed on its own line with the same indent as the first one, which gets its
# indent from the end of the header string.
print(f"Analyzing experiment {expt} using QC stats in the following files:\n  " +
      '\n  '.join(qc_stats))

Read the QC stats for each 10x run in the experiment and display them as a table:

In [None]:
# Build one long-format data frame holding the stats of every run.
per_run_frames = []
for statfile in qc_stats:
    # Run name = file base name with the stats suffix and the
    # "<expt>_" text stripped out.
    run_name = os.path.basename(statfile)
    run_name = run_name.replace('_qc_stats.csv', '')
    run_name = run_name.replace(f"{expt}_", '')

    # `names=` supplies the column labels for the CSV being read.
    run_df = pd.read_csv(statfile, names=['statistic', 'value'])
    per_run_frames.append(run_df.assign(run10x=run_name))

stats_df = pd.concat(per_run_frames)

print(f"Statistics for {expt}")

# Wide view for display only: one row per statistic, one column per run.
stats_wide = stats_df.pivot_table(index='statistic',
                                  values='value',
                                  columns='run10x')
display(HTML(stats_wide.to_html()))

Plot the QC stats, one facet per statistic, and save the plot:

In [None]:
# Figure width starts at 11 and grows by 0.5 for every distinct run.
n_runs = stats_df['run10x'].nunique()
plot_theme = theme(
    axis_text_x=element_text(angle=90),
    figure_size=(11 + 0.5 * n_runs, 4),
    subplots_adjust={'wspace': 0.3},
)

# Format y-axis tick labels with two significant digits.
y_label_format = mizani.formatters.custom_format('{:.2g}')

# One facet per statistic, each with its own y scale; every facet's y range is
# expanded to include 0 and 1.
p = (
    ggplot(stats_df, aes('run10x', 'value'))
    + geom_point(size=2)
    + facet_wrap('~ statistic', ncol=4, scales='free_y')
    + plot_theme
    + expand_limits(y=(0, 1))
    + scale_y_continuous(labels=y_label_format)
    + ggtitle(f"{expt} 10x Illumina FASTQ quality control stats")
    + xlab('Illumina sequencing run')
)

# Draw the plot; the return value is bound to `_` so it is not echoed as cell output.
_ = p.draw()

print(f"Saving plot to {qc_plot}")
p.save(qc_plot, verbose=False)