Метод:
1. Получил скорректированные риды. 
2. Выровнял исходные риды на референс, скорректированные парные риды в PE режиме и утратившие парность в single режиме.
3. По всем SAM файлам по CIGAR восстановил выравнивания с вставкой _ на месте инделов.
4. Смёрджил выравнивания исходных ридов и скорректированных по QNAME (предварительно восстановив суффиксы /1, /2 как в исходных парных ридах).
5. Прошёлся по всем параллельным четвёркам нуклеотидов из исходного рида, части референса, куда он выровнялся, скорректированного рида, части референса, куда выровнялся скорректированный рид. Посчитал количество каждых случаев в требумой матрице.

In [1]:
import pandas as pd
import numpy as np
from itertools import zip_longest
import re
from IPython.display import display 
from csv import QUOTE_NONE 
from multiprocessing import Pool

In [2]:
SAM_COL_NAMES = ['QNAME', 'FLAG', 'RNAME', 'POS', 'MAPQ', 'CIGAR', 'RNEXT', 'PNEXT', 'TLEN', 'SEQ', 'QUAL', 'TAGS']

In [3]:
def read_ref(ref_path):
    ref = ''
    with open(ref_path, 'r') as fi:
        for line in fi:
            if line[0] != '>':
                ref += line.strip()
    return ref

def read_sam(sam_path, skiprows=2):
    sam = pd.read_csv(sam_path, sep='\t', skiprows=skiprows, quoting=QUOTE_NONE, names=SAM_COL_NAMES)
    sam.drop(columns=['FLAG', 'RNAME', 'MAPQ', 'RNEXT', 'PNEXT', 'TLEN', 'QUAL', 'TAGS'], inplace=True)
    return sam

In [4]:
def parse_cigar(x):
    i, row = x
    pos = row['POS']
    
    read_seq = row['SEQ']
    cigar = re.findall(r'(\d+)([MIDNSHP=X])', row['CIGAR'])
    seq_aligned = []
    ref_aligned = []
    i = 0
    j = 0
    for cig_len, cig_ch in cigar:
        for _ in range(int(cig_len)):
            if cig_ch == 'M':
                # match or mismatch
                seq_aligned.append(read_seq[i])
                ref_aligned.append(ref[pos + j - 1])
            elif cig_ch == 'D':
                # deletion
                seq_aligned.append('_')
                ref_aligned.append(ref[pos + j - 1])
            elif cig_ch == 'I':
                # insertion
                seq_aligned.append(read_seq[i])
                ref_aligned.append('_')

            if cig_ch not in 'DNHP':
                i += 1
            if cig_ch not in 'ISHP':
                j += 1               
    
    res = row[['read', 'POS']]
    res['seq_aligned'] = ''.join(seq_aligned).replace('N', '_')
    res['ref_aligned'] = ''.join(ref_aligned)
    
    return res

In [5]:
def zero_trim(x):
    return x if x > 0 else 0

In [6]:
def read_raw_sam(filename_raw):
    sam_raw = read_sam(filename_raw)
    sam_raw['read_num'] = '1'
    sam_raw.loc[sam_raw.index[1::2], 'read_num'] = '2'
    sam_raw['read'] = sam_raw[['QNAME', 'read_num']].apply(lambda x: '/'.join(x), axis=1)
    sam_raw.drop(index=sam_raw[sam_raw['CIGAR'] == '*'].index, inplace=True)
    with Pool() as p:
        res = p.map(parse_cigar, sam_raw.iterrows())
    new_sam_raw = pd.DataFrame(res)
    new_sam_raw.set_index('read', inplace=True)
    return new_sam_raw

In [7]:
def read_spades_sam(filename_paired, filename_unpaired):
    sam_spades_paired = read_sam(filename_paired)
    sam_spades_paired['read_num'] = '1'
    sam_spades_paired.loc[sam_spades_paired.index[1::2], 'read_num'] = '2'
    sam_spades_paired['read'] = sam_spades_paired[['QNAME', 'read_num']].apply(lambda x: '/'.join(x), axis=1)
    sam_spades_paired.drop(index=sam_spades_paired[sam_spades_paired['CIGAR'] == '*'].index, inplace=True)
    with Pool() as p:
        res = p.map(parse_cigar, sam_spades_paired.iterrows())
    new_sam_spades_paired = pd.DataFrame(res)
    
    sam_spades_unpaired = read_sam(filename_unpaired)
    sam_spades_unpaired['read'] = sam_spades_unpaired['QNAME'].apply(lambda x: '/'.join(x.rsplit('_', 1)))
    with Pool() as p:
        res = p.map(parse_cigar, sam_spades_unpaired.iterrows())
    new_sam_spades_unpaired = pd.DataFrame(res)
    
    new_sam_spades = new_sam_spades_paired.append(new_sam_spades_unpaired, ignore_index=True)
    new_sam_spades.set_index('read', inplace=True)
    
    return new_sam_spades

In [8]:
def read_trimmomatic_sam(filename_paired, filename_unpaired1, filename_unpaired2):
    sam_trimmomatic_paired = read_sam(filename_paired)
    sam_trimmomatic_paired['read_num'] = '1'
    sam_trimmomatic_paired.loc[sam_trimmomatic_paired.index[1::2], 'read_num'] = '2'
    sam_trimmomatic_paired['read'] = sam_trimmomatic_paired[['QNAME', 'read_num']].apply(lambda x: '/'.join(x), axis=1)
    sam_trimmomatic_paired.drop(index=sam_trimmomatic_paired[sam_trimmomatic_paired['CIGAR'] == '*'].index, inplace=True)
    with Pool() as p:
        res = p.map(parse_cigar, sam_trimmomatic_paired.iterrows())
    new_sam_trimmomatic_paired = pd.DataFrame(res)
    
    sam_trimmomatic_unpaired1 = read_sam(filename_unpaired1)
    sam_trimmomatic_unpaired1['read'] = sam_trimmomatic_unpaired1['QNAME'].apply(lambda x: '/'.join(x.rsplit('_', 1)))
    with Pool() as p:
        res = p.map(parse_cigar, sam_trimmomatic_unpaired1.iterrows())
    new_sam_trimmomatic_unpaired1 = pd.DataFrame(res)
    
    sam_trimmomatic_unpaired2 = read_sam(filename_unpaired2)
    sam_trimmomatic_unpaired2['read'] = sam_trimmomatic_unpaired2['QNAME'].apply(lambda x: '/'.join(x.rsplit('_', 1)))
    with Pool() as p:
        res = p.map(parse_cigar, sam_trimmomatic_unpaired2.iterrows())
    new_sam_trimmomatic_unpaired2 = pd.DataFrame(res)
    
    new_sam_trimmomatic_unpaired = new_sam_trimmomatic_unpaired1.append(new_sam_trimmomatic_unpaired2, ignore_index=True)
    new_sam_trimmomatic = new_sam_trimmomatic_paired.append(new_sam_trimmomatic_unpaired, ignore_index=True)
    new_sam_trimmomatic.set_index('read', inplace=True)
    
    return new_sam_trimmomatic

In [9]:
def count_row(x):
    i, row = x
    count = np.zeros((2, 3), dtype=int)

    raw_begin = zero_trim(row['POS_raw'] - row['POS_corr'])
    corr_begin = zero_trim(row['POS_corr'] - row['POS_raw'])
    
    for raw_seq, raw_ref, corr_seq, corr_ref in zip_longest('_' * raw_begin + row['seq_aligned_raw'], 
                                                    '_' * raw_begin + row['ref_aligned_raw'],
                                                    '_' * corr_begin + row['seq_aligned_corr'], 
                                                    '_' * corr_begin + row['ref_aligned_corr'],
                                                            fillvalue='_'):

        if raw_seq != raw_ref and corr_seq != corr_ref and raw_seq != '_' and corr_seq != '_':
            count[0, 0] += 1
        elif raw_seq != raw_ref and corr_seq == corr_ref and raw_seq != '_' and corr_seq != '_':
            count[0, 1] += 1
        elif raw_seq != raw_ref and corr_seq == '_':
            count[0, 2] += 1
        elif raw_seq == raw_ref and corr_seq != corr_ref and raw_seq != '_' and corr_seq != '_':
            count[1, 0] += 1
        elif raw_seq == raw_ref and corr_seq == corr_ref and raw_seq != '_' and corr_seq != '_':
            count[1, 1] += 1
        elif raw_seq == raw_ref and corr_seq == '_':
            count[1, 2] += 1
        
    return count

# Тестовые данные

In [230]:
ref = read_ref('MG1655-K12.first10K.fasta')
raw_sam = read_raw_sam('ecoli_10K_err.sam')

## Spades

In [236]:
spades_sam = read_spades_sam('ecoli_10K_spades_paired.sam', 'ecoli_10K_spades_unpaired.sam')

In [237]:
new_sam = raw_sam.merge(spades_sam, left_index=True, right_index=True, how='inner', suffixes=('_raw', '_corr'))

In [249]:
with Pool() as p:
    counts = p.map(count_row, new_sam.iterrows())

In [250]:
total_counts = sum(counts)

In [251]:
total_counts_df = pd.DataFrame(total_counts, 
            columns=['Error in corrected reads', 'Correct base in corrected reads', 'Base is absent in corrected reads'],
            index=['Error in raw data', 'Correct base in raw data'])

**Абсолютные значения**

In [252]:
total_counts_df

Unnamed: 0,Error in corrected reads,Correct base in corrected reads,Base is absent in corrected reads
Error in raw data,2739,16320,30744
Correct base in raw data,20,5272548,460621


**Проценты**

In [253]:
total_counts_df / total_counts.sum() * 100

Unnamed: 0,Error in corrected reads,Correct base in corrected reads,Base is absent in corrected reads
Error in raw data,0.047363,0.282207,0.531628
Correct base in raw data,0.000346,91.173358,7.965098


## Trimmomatic

In [255]:
trimmomatic_sam = read_trimmomatic_sam('ecoli_10K_trimmomatic_paired.sam', 
                                       'ecoli_10K_trimmomatic_forward_unpaired.sam', 
                                       'ecoli_10K_trimmomatic_reverse_unpaired.sam')

In [257]:
new_sam = raw_sam.merge(trimmomatic_sam, left_index=True, right_index=True, how='inner', suffixes=('_raw', '_corr'))

In [259]:
with Pool() as p:
    counts = p.map(count_row, new_sam.iterrows())

In [260]:
total_counts = sum(counts)
total_counts_df = pd.DataFrame(total_counts, 
            columns=['Error in corrected reads', 'Correct base in corrected reads', 'Base is absent in corrected reads'],
            index=['Error in raw data', 'Correct base in raw data'])

**Абсолютные значения**

In [261]:
total_counts_df

Unnamed: 0,Error in corrected reads,Correct base in corrected reads,Base is absent in corrected reads
Error in raw data,18754,65,29252
Correct base in raw data,49,5126565,311502


**Проценты**

In [262]:
total_counts_df / total_counts.sum() * 100

Unnamed: 0,Error in corrected reads,Correct base in corrected reads,Base is absent in corrected reads
Error in raw data,0.34184,0.001185,0.533194
Correct base in raw data,0.000893,93.444955,5.677933


# Целые данные

In [7]:
ref = read_ref('MG1655-K12.first400K.fasta')
raw_sam = read_raw_sam('ecoli_400K_err.sam')

Process ForkPoolWorker-2:
Process ForkPoolWorker-4:
Process ForkPoolWorker-3:
Process ForkPoolWorker-1:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/usr/lib/python3

KeyboardInterrupt: 

**На целых данных пока не получилось, потому что слишком долго работает**