In [1]:
import pysam
from Bio import SeqIO
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from tqdm import tqdm

In [24]:
# Cross-tabulate every raw aligned base against the base the same read carries
# in the corrected alignment ("ecoli_10K & BayesHammer", per the label below).
#   rows    (k1): raw base       -> 0 = differs from the reference, 1 = matches
#   columns (k2): corrected base -> 0 = differs, 1 = matches, 2 = absent
contig = 'gi|49175990|ref|NC_000913.2|'
genome = next(SeqIO.parse("MG1655-K12.first10K.fasta", "fasta")).seq

SNP = np.zeros((2, 3), dtype=np.int64)

# The context managers close both BAM handles even if the loop raises.
with pysam.AlignmentFile("ecoli_10K_sorted.bam", "rb") as samfile, \
        pysam.AlignmentFile("ecoli_10K_corr_sorted.bam", "rb") as samfile_corr:
    p = samfile.pileup(contig)
    p_corr = samfile_corr.pileup(contig)

    # Walk the raw pileup and keep the corrected pileup aligned to it by
    # reference position instead of zipping the two blindly: a position that
    # is missing from the corrected pileup then counts its raw bases as
    # 'absent' rather than desynchronising the iterators or being dropped.
    # Assumes both pileups yield columns in increasing reference_pos
    # (coordinate-sorted BAMs) - TODO confirm.
    y = next(p_corr, None)
    for x in tqdm(p):
        pos_x = x.reference_pos
        while y is not None and y.reference_pos < pos_x:
            y = next(p_corr, None)

        gen = genome[pos_x].upper()

        # NOTE(review): keyed by query name only. Reads sharing a name within
        # one column (presumably overlapping mates of a pair) collapse to a
        # single entry here - confirm whether that matters for these data.
        vals = dict(zip(x.get_query_names(), x.get_query_sequences()))
        if y is not None and y.reference_pos == pos_x:
            corrs = dict(zip(y.get_query_names(), y.get_query_sequences()))
        else:
            corrs = {}

        for k, v in vals.items():
            k1 = int(v.upper() == gen)
            w = corrs.get(k)
            if w is None or w.upper() == 'N':
                # Read not present at this position after correction, or its
                # base was reported as 'N': counted as absent.
                k2 = 2
            elif w.upper() == gen:
                k2 = 1
            else:
                k2 = 0
            SNP[k1, k2] += 1

SNP = pd.DataFrame(SNP, index=['error', 'correct'], columns=['error', 'correct', 'absent'])
print("===== ecoli_10K & BayesHammer =====")
print(SNP)

10000it [00:06, 1449.45it/s]

===== ecoli_10K & BayesHammer =====
         error  correct  absent
error     1562     6175     706
correct     11  5168796   58838





spades.py -1 ecoli_400K_err_1.fastq.gz -2 ecoli_400K_err_2.fastq.gz --only-error-correction -o ecoli_400K -t 6

bwa index MG1655-K12.first400K.fasta

bwa mem MG1655-K12.first400K.fasta ecoli_400K_err_1.fastq.gz ecoli_400K_err_2.fastq.gz > ecoli_400K.sam

samtools view -b ecoli_400K.sam > ecoli_400K.bam && samtools sort ecoli_400K.bam > ecoli_400K_sorted.bam && samtools index ecoli_400K_sorted.bam

bwa mem MG1655-K12.first400K.fasta ecoli_400K/corrected/ecoli_400K_err_1.fastq.00.0_0.cor.fastq.gz ecoli_400K/corrected/ecoli_400K_err_2.fastq.00.0_0.cor.fastq.gz > ecoli_400K_corr.sam

samtools view -b ecoli_400K_corr.sam > ecoli_400K_corr.bam && samtools sort ecoli_400K_corr.bam > ecoli_400K_corr_sorted.bam && samtools index ecoli_400K_corr_sorted.bam

In [25]:
# Cross-tabulate every raw aligned base against the base the same read carries
# in the corrected alignment ("ecoli_400K & BayesHammer", per the label below).
#   rows    (k1): raw base       -> 0 = differs from the reference, 1 = matches
#   columns (k2): corrected base -> 0 = differs, 1 = matches, 2 = absent
contig = 'gi|49175990|ref|NC_000913.2|'
genome = next(SeqIO.parse("MG1655-K12.first400K.fasta", "fasta")).seq

SNP = np.zeros((2, 3), dtype=np.int64)

# The context managers close both BAM handles even if the loop raises.
with pysam.AlignmentFile("ecoli_400K_sorted.bam", "rb") as samfile, \
        pysam.AlignmentFile("ecoli_400K_corr_sorted.bam", "rb") as samfile_corr:
    p = samfile.pileup(contig)
    p_corr = samfile_corr.pileup(contig)

    # Walk the raw pileup and keep the corrected pileup aligned to it by
    # reference position instead of zipping the two blindly: a position that
    # is missing from the corrected pileup then counts its raw bases as
    # 'absent' rather than desynchronising the iterators or being dropped.
    # Assumes both pileups yield columns in increasing reference_pos
    # (coordinate-sorted BAMs) - TODO confirm.
    y = next(p_corr, None)
    for x in tqdm(p):
        pos_x = x.reference_pos
        while y is not None and y.reference_pos < pos_x:
            y = next(p_corr, None)

        gen = genome[pos_x].upper()

        # NOTE(review): keyed by query name only. Reads sharing a name within
        # one column (presumably overlapping mates of a pair) collapse to a
        # single entry here - confirm whether that matters for these data.
        vals = dict(zip(x.get_query_names(), x.get_query_sequences()))
        if y is not None and y.reference_pos == pos_x:
            corrs = dict(zip(y.get_query_names(), y.get_query_sequences()))
        else:
            corrs = {}

        for k, v in vals.items():
            k1 = int(v.upper() == gen)
            w = corrs.get(k)
            if w is None or w.upper() == 'N':
                # Read not present at this position after correction, or its
                # base was reported as 'N': counted as absent.
                k2 = 2
            elif w.upper() == gen:
                k2 = 1
            else:
                k2 = 0
            SNP[k1, k2] += 1

SNP = pd.DataFrame(SNP, index=['error', 'correct'], columns=['error', 'correct', 'absent'])
print("===== ecoli_400K & BayesHammer =====")
print(SNP)

400000it [05:38, 1182.51it/s]

===== ecoli_400K & BayesHammer =====
          error    correct   absent
error    146044     267495    35743
correct     331  244513618  4042921





TrimmomaticPE -threads 6 ecoli_400K_err_1.fastq.gz ecoli_400K_err_2.fastq.gz ecoli_400K_trim_1.fastq ecoli_400K_trim_1u.fastq ecoli_400K_trim_2.fastq ecoli_400K_trim_2u.fastq LEADING:5 TRAILING:5 SLIDINGWINDOW:4:20 MINLEN:50

Input Read Pairs: 1381602 Both Surviving: 1173279 (84.92%) Forward Only Surviving: 88372 (6.40%) Reverse Only Surviving: 87991 (6.37%) Dropped: 31960 (2.31%)

In [3]:
# Cross-tabulate every raw aligned base against the base the same read carries
# in the trimmed alignment ("ecoli_400K & Trimmomatic", per the label below).
#   rows    (k1): raw base     -> 0 = differs from the reference, 1 = matches
#   columns (k2): trimmed base -> 0 = differs, 1 = matches, 2 = absent
contig = 'gi|49175990|ref|NC_000913.2|'
genome = next(SeqIO.parse("MG1655-K12.first400K.fasta", "fasta")).seq

SNP = np.zeros((2, 3), dtype=np.int64)

# The context managers close both BAM handles even if the loop raises.
with pysam.AlignmentFile("ecoli_400K_sorted.bam", "rb") as samfile, \
        pysam.AlignmentFile("ecoli_400K_trim_corr_sorted.bam", "rb") as samfile_corr:
    p = samfile.pileup(contig)
    p_corr = samfile_corr.pileup(contig)

    # Walk the raw pileup and keep the trimmed pileup aligned to it by
    # reference position instead of zipping the two blindly: a position that
    # is missing from the trimmed pileup then counts its raw bases as
    # 'absent' rather than desynchronising the iterators or being dropped.
    # Assumes both pileups yield columns in increasing reference_pos
    # (coordinate-sorted BAMs) - TODO confirm.
    y = next(p_corr, None)
    for x in tqdm(p):
        pos_x = x.reference_pos
        while y is not None and y.reference_pos < pos_x:
            y = next(p_corr, None)

        gen = genome[pos_x].upper()

        # NOTE(review): keyed by query name only. Reads sharing a name within
        # one column (presumably overlapping mates of a pair) collapse to a
        # single entry here - confirm whether that matters for these data.
        vals = dict(zip(x.get_query_names(), x.get_query_sequences()))
        if y is not None and y.reference_pos == pos_x:
            corrs = dict(zip(y.get_query_names(), y.get_query_sequences()))
        else:
            corrs = {}

        for k, v in vals.items():
            k1 = int(v.upper() == gen)
            w = corrs.get(k)
            # NOTE(review): an 'N' base is not special-cased in this cell, so
            # it is compared against the reference like any other base -
            # confirm this is intended.
            if w is None:
                # Read not present at this position in the trimmed pileup.
                k2 = 2
            elif w.upper() == gen:
                k2 = 1
            else:
                k2 = 0
            SNP[k1, k2] += 1

SNP = pd.DataFrame(SNP, index=['error', 'correct'], columns=['error', 'correct', 'absent'])
print("===== ecoli_400K & Trimmomatic =====")
print(SNP)

400000it [05:31, 1207.41it/s]

===== ecoli_400K & Trimmomatic =====
          error    correct    absent
error    267356        222    181704
correct     181  211212238  37344451



