# QC 步骤
查看测序和 Mapping 质量

## load packages

In [1]:
import os
import subprocess
from glob import glob

import numpy as np
import pandas as pd

## 搜集 MultiQC 信息

In [2]:
# Build one QC row per sample from the first table found in the MultiQC report.
multiqc_table = pd.read_html('../qc/multiqc/multiqc_report.html')[0]

# Columns whose cells carry a trailing unit that must be stripped before the
# values can be treated as numbers: column -> (characters to drop, target dtype).
# NOTE(review): presumably '%' for the percentage columns and ' bp' for the read
# length -- confirm against the report.
suffixed_columns = {
    '% Dups': (1, float),
    '% GC': (1, float),
    'Read Length': (3, int),
    '% Failed': (1, float),
}

# Drop the last three characters of every sample name so that rows belonging to
# the same sample share one key.
# NOTE(review): presumably a read-mate tag such as `_R1` / `_R2` -- confirm.
per_file = multiqc_table.assign(
    **{'Sample Name': multiqc_table['Sample Name'].str[:-3]}
)
for column, (suffix_len, dtype) in suffixed_columns.items():
    per_file[column] = per_file[column].str[:-suffix_len].astype(dtype)

df_qc = (
    per_file
    .groupby('Sample Name')
    # The string name is used instead of `np.mean`: passing NumPy callables to
    # `agg` is deprecated in recent pandas releases.
    .agg('mean')
    # The per-sample means of read length and read count are doubled.
    # NOTE(review): presumably turns per-mate averages into per-pair totals -- confirm.
    .assign(**{
        'Read Length': lambda d: d['Read Length'] * 2,
        'M Seqs': lambda d: d['M Seqs'] * 2,
    })
    .reset_index()
    .rename(columns={'M Seqs': 'M Seqs <raw fq>'})
)
df_qc

Unnamed: 0,Sample Name,% Dups,% GC,Read Length,% Failed,M Seqs <raw fq>
0,DetectSeq_ATP8-DddA11_REP-1,62.8,42.0,300.0,10.0,568.4
1,DetectSeq_ATP8-DddA6_REP-1,66.2,41.0,300.0,10.0,612.8
2,DetectSeq_ATP8-DddAwt_REP-1,55.95,40.0,300.0,10.0,583.0
3,DetectSeq_JAK2-DddA11_REP-1,60.15,43.0,300.0,10.0,521.4
4,DetectSeq_JAK2-DddA11_REP-2,49.7,43.0,300.0,15.0,355.8
5,DetectSeq_SIRT6-DddA11_REP-1,57.4,41.5,300.0,10.0,528.0
6,DetectSeq_SIRT6-DddA11_REP-2,40.75,42.0,300.0,5.0,427.0
7,test,8.75,42.0,300.0,0.0,0.0


## 搜集 Hisat-3N Mapping 信息

In [3]:
def _last_line_percent(path):
    """Return the number that precedes the first '%' on the last line of ``path``."""
    with open(path, 'rt') as handle:
        final_line = handle.readlines()[-1]
    return float(final_line.split('%')[0])


# One [sample name, percentage] record per log file; the sample name is the
# file name with the log suffix removed.
ls = sorted(glob('../bam/*_hisat3n.hisat3n.log'))
tmpls = [
    [
        log_path.split('/')[-1].replace('_hisat3n.hisat3n.log', ''),
        _last_line_percent(log_path),
    ]
    for log_path in ls
]
df_3n = pd.DataFrame(tmpls, columns=['Sample Name', '% Hisat-3n'])
df_3n

Unnamed: 0,Sample Name,% Hisat-3n
0,DetectSeq_ATP8-DddA11_REP-1,66.9
1,DetectSeq_ATP8-DddA6_REP-1,71.75
2,DetectSeq_ATP8-DddAwt_REP-1,79.26
3,DetectSeq_JAK2-DddA11_REP-1,61.5
4,DetectSeq_JAK2-DddA11_REP-2,61.79
5,DetectSeq_SIRT6-DddA11_REP-1,69.31
6,DetectSeq_SIRT6-DddA11_REP-2,74.18
7,test,66.62


## 搜集 Final Mapping 信息（Hisat-3N mapping + BWA remapping）

In [4]:
# Count the reads in every `*_rmdup.bam` under ../bam.
#
# Method 1 (disabled, kept only as a note): read the first tab-separated field
# of the 7th line of each `*_final_rmdup.bam.flagstats.tsv`.
#
# Method 2 (used): for each BAM, sum the third column of `samtools idxstats`,
# divide by 1e6 and append "<bam path> <value>M reads" to a text file.
# NOTE(review): the third idxstats column is presumably the mapped-read count
# per reference -- confirm against the samtools documentation.
#
# The script is a raw string so that Python does not interpret any backslashes,
# and the loop uses explicit `do`/`done` so it does not depend on zsh's
# short-loop syntax. `set -e` stops the script when a pipeline fails (the
# pipeline status is that of its last command).
text = r"""set -e
rm -f ../bam/all_final_mapped_reads.txt

for bam in ../bam/*_rmdup.bam; do
    samtools idxstats "$bam" |
        awk '{sum += $3} END {print sum/1000000 "M reads"}' |
        xargs echo "$bam" >> ../bam/all_final_mapped_reads.txt
done
"""
with open('../bam/all_final_mapped_reads.sh', 'wt') as f:
    f.write(text)

# Run the script outside of an `assert`: an assert statement (and the call
# inside it) is removed when Python runs with -O. `check=True` raises
# CalledProcessError on a non-zero exit status.
subprocess.run(['zsh', '../bam/all_final_mapped_reads.sh'], check=True)

# Each line is "<bam path> <value>M reads"; the trailing "reads" token is
# parsed as a third column and discarded through `usecols`.
df_final_map = pd.read_csv(
    '../bam/all_final_mapped_reads.txt',
    sep=' ',
    header=None,
    names=['Sample Name', 'M Seqs', '_'],
    usecols=[0, 1],
)
df_final_map

Unnamed: 0,Sample Name,M Seqs
0,../bam/DetectSeq_ATP8-DddA11_REP-1_final_rmdup...,176.744M
1,../bam/DetectSeq_ATP8-DddA6_REP-1_final_rmdup.bam,163.108M
2,../bam/DetectSeq_ATP8-DddAwt_REP-1_final_rmdup...,219.36M
3,../bam/DetectSeq_JAK2-DddA11_REP-1_final_rmdup...,163.946M
4,../bam/DetectSeq_JAK2-DddA11_REP-2_final_rmdup...,143.23M
5,../bam/DetectSeq_SIRT6-DddA11_REP-1_final_rmdu...,180.379M
6,../bam/DetectSeq_SIRT6-DddA11_REP-2_final_rmdu...,215.754M
7,../bam/test_final_rmdup.bam,0.030129M


In [5]:
# Reduce each BAM path to the bare sample name by removing the directory prefix
# and the file suffix. `regex=False` makes both replacements literal: the
# patterns contain '.', which would match any character if they were treated as
# regular expressions, and the default of `regex` differs between pandas versions.
df_final_map['Sample Name'] = (
    df_final_map['Sample Name']
    .str.replace('../bam/', '', regex=False)
    .str.replace('_final_rmdup.bam', '', regex=False)
)
# Label the count column with the processing stage it describes.
df_final_map = df_final_map.rename(columns={'M Seqs': 'M Seqs <final mapped rm dup>'})
df_final_map

  df_final_map['Sample Name'] = df_final_map['Sample Name'].str.replace('../bam/', '').str.replace('_final_rmdup.bam', '')


Unnamed: 0,Sample Name,M Seqs <final mapped rm dup>
0,DetectSeq_ATP8-DddA11_REP-1,176.744M
1,DetectSeq_ATP8-DddA6_REP-1,163.108M
2,DetectSeq_ATP8-DddAwt_REP-1,219.36M
3,DetectSeq_JAK2-DddA11_REP-1,163.946M
4,DetectSeq_JAK2-DddA11_REP-2,143.23M
5,DetectSeq_SIRT6-DddA11_REP-1,180.379M
6,DetectSeq_SIRT6-DddA11_REP-2,215.754M
7,test,0.030129M


## 汇总 QC 信息

In [6]:
# Combine the three per-sample tables with pandas' default merge, which joins
# on the columns the frames have in common and keeps only rows present in both.
qc_with_hisat3n = pd.merge(df_qc, df_3n)
df_qc_all = pd.merge(qc_with_hisat3n, df_final_map)
df_qc_all

Unnamed: 0,Sample Name,% Dups,% GC,Read Length,% Failed,M Seqs <raw fq>,% Hisat-3n,M Seqs <final mapped rm dup>
0,DetectSeq_ATP8-DddA11_REP-1,62.8,42.0,300.0,10.0,568.4,66.9,176.744M
1,DetectSeq_ATP8-DddA6_REP-1,66.2,41.0,300.0,10.0,612.8,71.75,163.108M
2,DetectSeq_ATP8-DddAwt_REP-1,55.95,40.0,300.0,10.0,583.0,79.26,219.36M
3,DetectSeq_JAK2-DddA11_REP-1,60.15,43.0,300.0,10.0,521.4,61.5,163.946M
4,DetectSeq_JAK2-DddA11_REP-2,49.7,43.0,300.0,15.0,355.8,61.79,143.23M
5,DetectSeq_SIRT6-DddA11_REP-1,57.4,41.5,300.0,10.0,528.0,69.31,180.379M
6,DetectSeq_SIRT6-DddA11_REP-2,40.75,42.0,300.0,5.0,427.0,74.18,215.754M
7,test,8.75,42.0,300.0,0.0,0.0,66.62,0.030129M


In [7]:
# Derive the final per-sample QC summary from the merged table.
#
# NOTE: this cell rebinds `df_qc_all`, so the merge cell that creates it has to
# be re-run before this cell can be executed a second time.

# Columns of the summary, in display order. Selecting by name (rather than by
# position) keeps the result correct if upstream columns are added or reordered.
summary_columns = [
    'Sample Name',
    '% Dups',
    '% GC',
    'Read Length',
    'M Seqs <raw fq>',
    'M Seqs <Hisat-3n mapped>',
    'M Seqs <final mapped rm dup>',
]

df_qc_all = (
    df_qc_all
    .assign(**{
        'Read Length': lambda d: d['Read Length'].astype(int),
        # Raw read count scaled by the Hisat-3n percentage.
        'M Seqs <Hisat-3n mapped>': lambda d: (
            d['M Seqs <raw fq>'] * d['% Hisat-3n'] / 100
        ).round(2),
        # Values arrive as strings with a one-character suffix (e.g. "176.744M");
        # drop it and convert to float.
        'M Seqs <final mapped rm dup>': lambda d: (
            d['M Seqs <final mapped rm dup>'].str[:-1].astype(float).round(2)
        ),
    })
    # Drop the 'test' sample and keep only the summary columns
    # ('% Failed' and '% Hisat-3n' are not part of the summary).
    .loc[lambda d: d['Sample Name'] != 'test', summary_columns]
    # Share of raw reads that remain in the final BAM, in percent; computed
    # from the already rounded final count.
    .assign(**{
        '% Effective Seqs': lambda d: (
            d['M Seqs <final mapped rm dup>'] / d['M Seqs <raw fq>'] * 100
        ).round(2),
    })
)
df_qc_all

Unnamed: 0,Sample Name,% Dups,% GC,Read Length,M Seqs <raw fq>,M Seqs <Hisat-3n mapped>,M Seqs <final mapped rm dup>,% Effective Seqs
0,DetectSeq_ATP8-DddA11_REP-1,62.8,42.0,300,568.4,380.26,176.74,31.09
1,DetectSeq_ATP8-DddA6_REP-1,66.2,41.0,300,612.8,439.68,163.11,26.62
2,DetectSeq_ATP8-DddAwt_REP-1,55.95,40.0,300,583.0,462.09,219.36,37.63
3,DetectSeq_JAK2-DddA11_REP-1,60.15,43.0,300,521.4,320.66,163.95,31.44
4,DetectSeq_JAK2-DddA11_REP-2,49.7,43.0,300,355.8,219.85,143.23,40.26
5,DetectSeq_SIRT6-DddA11_REP-1,57.4,41.5,300,528.0,365.96,180.38,34.16
6,DetectSeq_SIRT6-DddA11_REP-2,40.75,42.0,300,427.0,316.75,215.75,50.53
