# Summarize and QC `STARsolo` alignments of the transcriptomic data
This Python Jupyter notebook summarizes the `STARsolo` alignments of the transcriptomic data.

Import Python modules:

In [None]:
from IPython.display import HTML, display

import matplotlib.pyplot as plt

import pandas as pd

Get `snakemake` variables [as described here](https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#jupyter-notebook-integration):

In [None]:
# `snakemake` is not defined in this notebook; it is expected to be provided
# by Snakemake's Jupyter notebook integration.

# wildcard naming the experiment being analyzed
expt = snakemake.wildcards.expt

# input files
summary, umi_per_cell = (snakemake.input.summary,
                         snakemake.input.umi_per_cell)

# output file
qc_plot = snakemake.output.qc_plot

print(f"Analyzing transcriptomic stats for experiment {expt}")

Read in the statistics from the `STARsolo` output files:

In [None]:
print(f"Reading STARsolo stats from {summary}")
# the summary file is parsed as two header-less columns: name and value
stats = pd.read_csv(summary, names=['statistic', 'value'])

# render the whole table as HTML, without the row index
stats_html = stats.to_html(float_format='%.2g', index=False)
display(HTML(stats_html))

# look up the called number of cells by its statistic name
value_by_statistic = stats.set_index('statistic')['value']
n_cells = int(value_by_statistic.at['Estimated Number of Cells'])

Read the number of UMIs per cell barcode:

In [None]:
print(f"Reading UMIs per cell barcode from: {umi_per_cell}")
# Build the table one derived column at a time:
#  - cell_barcode_rank: 1-based position of the row in the file
#    (presumably the file is sorted by decreasing UMI count; verify)
#  - is_cell: whether the rank falls within the first `n_cells` barcodes
#  - is_cell_color: plotting color derived from `is_cell`
umis = (
    pd.read_csv(umi_per_cell, names=['number of UMIs'])
    .assign(cell_barcode_rank=lambda df: df.index + 1)
    .assign(is_cell=lambda df: df['cell_barcode_rank'].le(n_cells))
    .assign(
        is_cell_color=lambda df: df['is_cell'].map(
            {True: 'orange', False: 'gray'}
        )
    )
)

Now plot the statistics. In particular:
 - Make a [knee plot](https://liorpachter.wordpress.com/tag/knee-plot) showing how the number of cells was called from the number of UMIs per cell barcode; this is supposed to distinguish true cells from empty droplets.
 - Plot the average number of genes, reads, and UMIs per cell.
 - Plot the read-level statistics.

In [None]:
# single row of three equally wide panels sharing one figure title
fig, axes = plt.subplots(
    nrows=1,
    ncols=3,
    figsize=(11, 7.25),
    gridspec_kw=dict(width_ratios=[1, 1, 1]),
)
fig.suptitle(f"transcriptomics alignment summary for experiment {expt}")

# knee plot in the first panel: UMI count versus cell-barcode rank on
# log-log axes, with points colored by the `is_cell_color` column
knee_ax = axes[0]
_ = umis.plot.scatter(
    x='cell_barcode_rank',
    y='number of UMIs',
    c='is_cell_color',
    s=1.5,
    logx=True,
    logy=True,
    ax=knee_ax,
    rasterized=True,
)
# dashed vertical line at x = n_cells
knee_ax.axvline(
    x=n_cells,
    color='green',
    linestyle='dashed',
    linewidth=1.5,
    alpha=0.5,
)
knee_ax.title.set_text(f"knee plot: {n_cells} cells")

# second panel: every statistic whose name contains 'per Cell', drawn as a
# bar chart with that suffix removed from the tick labels
is_per_cell = stats['statistic'].str.contains('per Cell')
stats_per_cell = stats[is_per_cell].assign(
    statistic=lambda df: df['statistic'].str.replace(' per Cell', '')
)

cell_ax = axes[1]
_ = stats_per_cell.plot.bar(
    x='statistic',
    y='value',
    ax=cell_ax,
    legend=False,
)
cell_ax.set_ylabel('counts per cell')
cell_ax.set_xlabel('')

# leave headroom above the tallest bar; the value labels are drawn above
# the bars
ymax = stats_per_cell['value'].max()
cell_ax.set_ylim(0, 1.5 * ymax)

# write each bar's value vertically, slightly above the top of the bar
for bar in cell_ax.patches:
    bar_height = bar.get_height()
    label_xy = (bar.get_x() + 0.1, bar_height + 0.05 * ymax)
    cell_ax.annotate(f"{bar_height:.2g}", label_xy, rotation=90)

cell_ax.title.set_text('per-cell statistics')

# third panel: statistics whose name matches any alternative in the pattern
read_stat_pattern = 'Reads Mapped to|Reads With Valid|Fraction of Reads'
stats_mapping = stats[stats['statistic'].str.contains(read_stat_pattern)]

reads_ax = axes[2]
_ = stats_mapping.plot.bar(
    x='statistic',
    y='value',
    ax=reads_ax,
    legend=False,
)
reads_ax.set_ylabel('fraction of reads')
reads_ax.set_xlabel('')
reads_ax.set_ylim(0, 1.2 * stats_mapping['value'].max())

# write each bar's value just above the top of the bar
for bar in reads_ax.patches:
    bar_height = bar.get_height()
    reads_ax.annotate(f"{bar_height:.2f}", (bar.get_x(), 1.05 * bar_height))

reads_ax.title.set_text('read-level statistics')

# finalize the layout, then write the figure to the output path
fig.tight_layout()

print(f"Saving plot to {qc_plot}")
fig.savefig(qc_plot)