# Analyze consensus from deep sequencing pileup versus Genbank accessions

Import Python modules:

In [24]:
import Bio.AlignIO

import pandas as pd

Get the pileup-consensus / Genbank alignments along with their descriptors from `snakemake`:

In [27]:
# Under the workflow these two values come from `snakemake`:
#alignments = snakemake.input.alignments
#descriptors = snakemake.params.descriptors

# Hard-coded stand-ins: one descriptor per (aligner, accession) pair, listing
# every accession for the first aligner and then every accession for the second.
descriptors = [
    {'aligner': aligner, 'genome': 'SARS-CoV-2', 'accession': accession}
    for aligner in ('bbmap', 'bwa-mem2')
    for accession in (
        'SRR10971381',
        'SRR11092056',
        'SRR11092057',
        'SRR11092058',
        'SRR11092059',
        'SRR11092060',
        'SRR11092061',
        'SRR11092062',
        'SRR11092063',
        'SRR11092064',
        'SRR10903401',
        'SRR10903402',
        'SRR11177792',
    )
]

# Alignment FASTA path for each descriptor, in the same order as `descriptors`.
# The leading '../' points the `results` directory one level above the
# current working directory.
alignments = [
    '../results/consensus_to_genbank_alignments/{aligner}/{genome}/{accession}.fa'.format(**entry)
    for entry in descriptors
]

Create a data frame with the alignments, and then process it to get:
 - total sites in alignment
 - sites that are identical
 - sites that are ambiguous (an `N` in one sequence)
 - sites that are gapped in either sequence but have a called identity in the other
 - sites that are mismatched called nucleotides

In [43]:
# The two inputs are parallel lists: entry i of `descriptors` describes entry i
# of `alignments`, so they must have the same length.
assert len(alignments) == len(descriptors)

# One row per alignment file: the descriptor fields become the leading columns,
# followed by the path of the matching alignment.
alignments_df = pd.DataFrame({
    **{field: [entry[field] for entry in descriptors]
       for field in ('accession', 'aligner', 'genome')},
    'alignment_file': alignments,
})

def alignment_stats(aln_file):
    """Classify every column of a two-sequence FASTA alignment.

    The alignment is read with `Bio.AlignIO.read`, both sequences are
    upper-cased, and each column falls into exactly one category, checked
    in this order:

    1. identical: both characters are equal (whatever the character is)
    2. ambiguous: the characters differ and at least one is `N`
    3. gapped: the characters differ, neither is `N`, and at least one is `-`
    4. mismatched: everything else

    Parameters
    ----------
    aln_file
        Path of a FASTA alignment holding exactly two sequences.

    Returns
    -------
    tuple
        `(total, identical, ambiguous, mismatched, gapped)` column counts.

    """
    records = Bio.AlignIO.read(aln_file, 'fasta')
    assert len(records) == 2
    first, second = (str(records[i].seq).upper() for i in (0, 1))
    assert len(first) == len(second)

    tallies = {'identical': 0, 'ambiguous': 0, 'mismatched': 0, 'gapped': 0}
    for left, right in zip(first, second):
        pair = (left, right)
        if left == right:
            category = 'identical'
        elif 'N' in pair:
            category = 'ambiguous'
        elif '-' in pair:
            category = 'gapped'
        else:
            category = 'mismatched'
        tallies[category] += 1

    n_sites = len(first)
    # Sanity check: the four categories partition the alignment columns.
    assert n_sites == sum(tallies.values())
    return (
        n_sites,
        tallies['identical'],
        tallies['ambiguous'],
        tallies['mismatched'],
        tallies['gapped'],
    )

# Names of the per-alignment statistics, in the order `alignment_stats`
# returns them.
stat_columns = ['total', 'identical', 'ambiguous', 'mismatched', 'gapped']

# Compute the statistics once per alignment file (one tuple per row).
per_file_stats = [alignment_stats(path) for path in alignments_df['alignment_file']]

# Add one column per statistic. Building each column explicitly (rather than
# transposing with `zip(*...)`) still creates the columns when the frame has
# no rows.
for position, column in enumerate(stat_columns):
    alignments_df[column] = [stats[position] for stats in per_file_stats]

# Display the resulting table.
alignments_df

Unnamed: 0,accession,aligner,genome,alignment_file,total,identical,ambiguous,mismatched,gapped
0,SRR10971381,bbmap,SARS-CoV-2,../results/consensus_to_genbank_alignments/bbm...,29903,29870,0,0,33
1,SRR11092056,bbmap,SARS-CoV-2,../results/consensus_to_genbank_alignments/bbm...,29870,1871,27999,0,0
2,SRR11092057,bbmap,SARS-CoV-2,../results/consensus_to_genbank_alignments/bbm...,29891,29630,240,0,21
3,SRR11092058,bbmap,SARS-CoV-2,../results/consensus_to_genbank_alignments/bbm...,29870,9581,20288,1,0
4,SRR11092059,bbmap,SARS-CoV-2,../results/consensus_to_genbank_alignments/bbm...,29870,29746,124,0,0
5,SRR11092060,bbmap,SARS-CoV-2,../results/consensus_to_genbank_alignments/bbm...,29870,27033,2837,0,0
6,SRR11092061,bbmap,SARS-CoV-2,../results/consensus_to_genbank_alignments/bbm...,29870,24589,5280,1,0
7,SRR11092062,bbmap,SARS-CoV-2,../results/consensus_to_genbank_alignments/bbm...,29891,29857,13,0,21
8,SRR11092063,bbmap,SARS-CoV-2,../results/consensus_to_genbank_alignments/bbm...,29870,25106,4752,1,11
9,SRR11092064,bbmap,SARS-CoV-2,../results/consensus_to_genbank_alignments/bbm...,29870,16522,13347,1,0


In [39]:
# Scratch demonstration: passing row tuples to zip() regroups their elements
# column-wise (a transpose).
list(zip(('a', 'b', 'e'), ('c', 'd', 'f')))

[('a', 'c'), ('b', 'd'), ('e', 'f')]