# 标准质控流程

In [42]:
import glob
import json
import platform
import pprint

In [43]:
# Shared pretty-printer used by the display cells below (4-space indent).
pp = pprint.PrettyPrinter(indent=4)

In [44]:
# Pick machine-specific settings (data root, thread count, control BAM)
# based on the operating system reported by platform.system().
host_os = platform.system()
if host_os == 'Linux':
    root_path = '/lustre1/chengqiyi_pkuhpc/zhaohn'
    thread = 20
    control = {
        "bam":
            f"{root_path}/3.project/2022_DdCBE-3D-Genome_topic/2022-09-30_Detect-seq_batch-1/bam/Vector-merge_hg38_merge_sort_rmdup.MAPQ20.bam",
        "name":
            "Vector-merge_hg38"
    }
elif host_os == 'Darwin':
    root_path = '/Volumes/Data-a/Bio'
    thread = 8
    control = {
        "bam":
            "/Users/zhaohuanan/PycharmProjects/snakepipes_detect-seq/bam/test_ctrl.bam",
        "name":
            "test_ctrl"
    }
else:
    # Fail here with a clear message instead of a NameError in a later cell.
    raise RuntimeError(
        f"Unsupported platform {host_os!r}: "
        "define root_path, thread and control for this system"
    )

In [45]:
# Echo the detected OS, the thread count and the control sample settings.
for shown_value in (platform.system(), thread, control):
    pp.pprint(shown_value)

'Linux'
20
{   'bam': '/lustre1/chengqiyi_pkuhpc/zhaohn/3.project/2022_DdCBE-3D-Genome_topic/2022-09-30_Detect-seq_batch-1/bam/Vector-merge_hg38_merge_sort_rmdup.MAPQ20.bam',
    'name': 'Vector-merge_hg38'}


## 参数设置

## Detect-seq
```
hisat-3n-build \
-p 24 \
genome_ucsc_hg38.fa \
genome_ucsc_hg38.fa.hisat3n-foryilab_bcCT_standard_mode.snp_hap_exon_ss \
--exon genome_ucsc_hg38.fa.hisat2.exon \
--haplotype genome_ucsc_hg38.fa.hisat2.snp151Common.haplotype \
--snp genome_ucsc_hg38.fa.hisat2.snp151Common.snp \
--ss genome_ucsc_hg38.fa.hisat2.ss \
--base-change C,T
```

## Direct-seq
```
hisat-3n-build \
-p 24 \
genome_ucsc_hg38.fa \
genome_ucsc_hg38.fa.hisat3n-foryilab_bcAG_standard_mode.snp_hap_exon_ss \
--exon genome_ucsc_hg38.fa.hisat2.exon \
--haplotype genome_ucsc_hg38.fa.hisat2.snp151Common.haplotype \
--snp genome_ucsc_hg38.fa.hisat2.snp151Common.snp \
--ss genome_ucsc_hg38.fa.hisat2.ss \
--base-change A,G
```
## 【TODO】注意到 CT 和 AG 两套 index 的 md5 值一致，或许 CT index 可以直接用于 AG（尚未验证）

In [46]:
# Assay whose settings are exported: "Detect-seq" uses the C,T index,
# "Direct-seq" uses the A,G index.
assay = "Detect-seq"

# (index tag, base change, query mutation types) per assay. Keeping the three
# values in one table means they are always switched together, instead of
# commenting/uncommenting three separate lines.
assay_settings = {
    "Detect-seq": ("bcCT", "C,T", "CT,GA"),
    "Direct-seq": ("bcAG", "A,G", "AG,TC"),
}
if assay not in assay_settings:
    raise ValueError(
        f"Unknown assay {assay!r}; expected one of {sorted(assay_settings)}"
    )
index_tag, base_change, query_mutation_type = assay_settings[assay]

# HISAT-3N index prefix under the machine-specific root_path.
genome_hisat3n_index = (
    f"{root_path}/1.database/db_genomes/genome_fa/genome_ucsc_hg38/"
    f"genome_ucsc_hg38.fa.hisat3n-foryilab_{index_tag}_standard_mode.snp_hap_exon_ss"
)

# Comma-joined paths of the two 293T files (one .bed, one .vcf) passed on as snp_list.
snp_list = ",".join([
    f"{root_path}/1.database/db_genomes/cell_line_mutations/293T/293T_BE_INPUT_VCF/293T-EMX1-Mock-Input.site_index.rmdup.bed",
    f"{root_path}/1.database/db_genomes/cell_line_mutations/293T/293T_BE_INPUT_VCF/293T-Mock-Input-covaris_bwa_hg38_sort_rmdup.recall.merge.Genotype.filter.rmdup_signal.vcf",
])

## 生成samples.json

In [47]:
# Collect every gzipped FASTQ under ../fastq, in sorted order.
ls = glob.glob("../fastq/*.fastq.gz")
ls.sort()
assert len(ls) > 0  # must be non-empty

# Split by file-name suffix: single-end (SE) files vs. paired-end read-1 (R1) files.
ls_se = list(filter(lambda path: path.endswith("SE.fastq.gz"), ls))
ls_pe = list(filter(lambda path: path.endswith("R1.fastq.gz"), ls))

In [48]:
# Derive sample names and the sequencing mode from the FASTQ file names.
# Paired-end files take precedence when both kinds are present, which matches
# the previous behaviour where the PE branch overwrote the SE result.
if ls_pe:
    ls_sample = [i.split("/")[-1].split("_R1.fastq")[0] for i in ls_pe]
    end_type = "PE"
elif ls_se:
    ls_sample = [i.split("/")[-1].split("_SE.fastq")[0] for i in ls_se]
    end_type = "SE"
else:
    # Without this branch ls_sample/end_type would stay undefined and a
    # later cell would fail with a NameError.
    raise ValueError(
        "No *SE.fastq.gz or *R1.fastq.gz files found; "
        "cannot determine samples or sequencing mode"
    )

In [49]:
# Show the sample names derived from the FASTQ file names.
pp.pprint(ls_sample)

[   'DetectSeq_ATP8-DddA11_REP-1',
    'DetectSeq_ATP8-DddA6_REP-1',
    'DetectSeq_ATP8-DddAwt_REP-1',
    'DetectSeq_JAK2-DddA11_REP-1',
    'DetectSeq_JAK2-DddA11_REP-2',
    'DetectSeq_SIRT6-DddA11_REP-1',
    'DetectSeq_SIRT6-DddA11_REP-2',
    'test']


In [50]:
# Manually chosen samples to process. This rebinds ls_sample, replacing the
# list derived from the FASTQ file names; compared with the derived list
# printed above, the two REP-2 samples are left out.
ls_sample = [
    "DetectSeq_ATP8-DddA11_REP-1",
    "DetectSeq_ATP8-DddA6_REP-1",
    "DetectSeq_ATP8-DddAwt_REP-1",
    "DetectSeq_JAK2-DddA11_REP-1",
    "DetectSeq_SIRT6-DddA11_REP-1",
    "test",
]

In [51]:
platform = "MGI"  # Illumina MGI ...

In [52]:
dt = {
    "seq_mode": end_type,
    "samples": ls_sample,
    "thread": thread,
    "genome_hisat3n_index": genome_hisat3n_index,
    "platform": platform,
    "base_change": base_change,
    "snp_list": snp_list,
    "control": control,
    "query_mutation_type": query_mutation_type,
}

In [53]:
# Preview the configuration dict before it is written to samples.json.
pp.pprint(dt)

{   'base_change': 'C,T',
    'control': {   'bam': '/lustre1/chengqiyi_pkuhpc/zhaohn/3.project/2022_DdCBE-3D-Genome_topic/2022-09-30_Detect-seq_batch-1/bam/Vector-merge_hg38_merge_sort_rmdup.MAPQ20.bam',
                   'name': 'Vector-merge_hg38'},
    'genome_hisat3n_index': '/lustre1/chengqiyi_pkuhpc/zhaohn/1.database/db_genomes/genome_fa/genome_ucsc_hg38/genome_ucsc_hg38.fa.hisat3n-foryilab_bcCT_standard_mode.snp_hap_exon_ss',
    'platform': 'MGI',
    'query_mutation_type': 'CT,GA',
    'samples': [   'DetectSeq_ATP8-DddA11_REP-1',
                   'DetectSeq_ATP8-DddA6_REP-1',
                   'DetectSeq_ATP8-DddAwt_REP-1',
                   'DetectSeq_JAK2-DddA11_REP-1',
                   'DetectSeq_SIRT6-DddA11_REP-1',
                   'test'],
    'seq_mode': 'PE',
    'snp_list': '/lustre1/chengqiyi_pkuhpc/zhaohn/1.database/db_genomes/cell_line_mutations/293T/293T_BE_INPUT_VCF/293T-EMX1-Mock-Input.site_index.rmdup.bed,/lustre1/chengqiyi_pkuhpc/zhaohn/1.database/d

In [54]:
# Serialize the configuration dict to samples.json in the working directory.
with open("./samples.json", "wt") as f:
    json.dump(dt, f)