# Coverage of viral genes by 10x transcriptomic reads
This Python Jupyter notebook examines where the aligned 10x transcriptomics reads fall on the viral genes (coverage), including with respect to viral barcodes and viral tags.

Import Python modules:

In [None]:
import collections

import Bio.SeqIO

from IPython.display import display, HTML

import pandas as pd

from pymodules.plot_viral_genes import plot_genes_and_coverage

import pysam

Get `snakemake` variables [as described here](https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#jupyter-notebook-integration):

In [None]:
# Inputs: the alignment (BAM), its index, and the GenBank file of viral genes.
smk_input = snakemake.input
viral_genbank = smk_input.viral_genbank
bam = smk_input.bam
bai = smk_input.bai

# Wildcard naming the experiment, and the path the figure is written to.
expt = snakemake.wildcards.expt
plot = snakemake.output.plot

Get the viral gene names:

In [None]:
# Parse every record in the GenBank file; each record is treated as one viral gene.
print(f"Reading viral genes from {viral_genbank}")
viral_genes = [record for record in Bio.SeqIO.parse(viral_genbank, 'genbank')]
viral_gene_names = [record.id for record in viral_genes]

gene_listing = '\n\t'.join(viral_gene_names)
print(f"There are {len(viral_genes)} viral genes:\n\t{gene_listing}")

# Gene names are used as identifiers, so duplicates are not allowed.
n_unique_names = len(set(viral_gene_names))
assert n_unique_names == len(viral_gene_names), 'viral gene names not unique'

Now get alignment statistics for each gene.
Specifically, use [pysam](https://pysam.readthedocs.io/) to get a data frame giving the coverage at each site for each viral gene.

In [None]:
# Build a table with one row per site of every viral gene, giving the number
# of reads supporting each nucleotide at that site and the total coverage.
#
# `count_coverage` returns four per-position count arrays, ordered A, C, G, T.
# NOTE(review): the counts use pysam's default base-quality threshold and read
# filter for `count_coverage` -- confirm these defaults are what is wanted.
nucleotides = list('ACGT')
coverage_list = []

# Open the BAM a single time for all genes instead of re-opening it per gene.
with pysam.AlignmentFile(bam, mode='rb', index_filename=bai) as bamfile:
    for gene, gene_name in zip(viral_genes, viral_gene_names):
        assert gene.id == gene_name
        print(f"Getting statistics for {gene_name} from {bam}")

        # The reference in the BAM must match the GenBank record in length.
        if len(gene) != bamfile.get_reference_length(gene_name):
            raise ValueError(f"length of {gene_name} not as expected in {bam}")

        coverage_list.append(
                pd.DataFrame(dict(zip(nucleotides,
                                      bamfile.count_coverage(contig=gene_name))))
                # Sum only the nucleotide columns, so the total does not depend
                # on the order in which `assign` adds the other columns.
                .assign(coverage=lambda x: x[nucleotides].sum(axis=1),
                        site=lambda x: x.index + 1,  # 1-based site numbering
                        gene=gene_name)
                )

# Stack the per-gene tables; `gene` becomes an ordered categorical that keeps
# the genes in the same order as `viral_gene_names`.
coverage_df = (pd.concat(coverage_list, sort=False, ignore_index=True)
               .assign(gene=lambda x: pd.Categorical(x['gene'],
                                                     viral_gene_names,
                                                     ordered=True))
               )
print('\nFirst few lines of `coverage_df`:')
display(HTML(coverage_df.head().to_html(index=False)))

Sanity check to make sure we got the right number of sites for each gene in `coverage_df`:

In [None]:
# Each gene must contribute exactly one row to `coverage_df` per site of its
# sequence; anything else means the coverage table is malformed.
for gene in viral_genes:
    gene_name = gene.id
    n_sites = int((coverage_df['gene'] == gene_name).sum())
    if n_sites == len(gene):
        continue
    raise ValueError(f"coverage not for expected number sites for {gene_name}")

Now plot coverage per site alongside gene structure.
In this plot, we indicate the viral tags (blue) and viral barcodes (orange).
We also indicate mutations that are observed at each position with colors: for instance, an enrichment of `A` mutations before a peak could indicate premature polyadenylation or oligo-dT mis-priming:

In [None]:
# Label every row with the experiment name in a `sample` column before
# handing the coverage table to the plotting helper.
sample_coverage = coverage_df.assign(sample=expt)
coverage_title = f"coverage of viral genes in 10x transcriptomics for {expt}"

# Only the figure (first returned item) is needed here.
fig, _ = plot_genes_and_coverage(
    viral_genes,
    sample_coverage,
    color_mutations=True,
    figtitle=coverage_title,
)

print(f"Saving figure to {plot}")
fig.savefig(plot)