# Execute QC pipeline on synthetic data

The aim of this notebook is to run the quality control pipeline on the CAMI communities.

The CAMI high-complexity communities can be found [here](https://data.cami-challenge.org/participate), and were downloaded using the Java client from the URL `https://openstack.cebitec.uni-bielefeld.de:8080/swift/v1/CAMI_I_TOY_HIGH`. The original files are interleaved `fastq` files; to deinterleave them and obtain separate forward and reverse read files, we used [the following script](https://gist.github.com/nathanhaigh/3521724).

# Init

In [2]:
import os
import pandas as pd

  return f(*args, **kwds)


# Var

In [3]:
# Project root; every other path in this cell is derived from it.
work_dir = "/ebio/abt3_projects/small_projects/jdelacuesta/DBs_benchmark"

# Directories: sample data folder and the folder holding the QC pipeline (llmgqc).
sample_folder, pipeline_folder = (
    os.path.join(work_dir, "data", "cami"),
    os.path.join(work_dir, "bin/llmgqc"),
)

# Files: samples table stored inside the sample data folder.
samples_file_combined = os.path.join(sample_folder, "samples.txt")

# Prepare config file

## Samples file

In [4]:
# Build the samples table for the QC pipeline: one row per CAMI
# high-complexity sample, pointing at its forward (R1) and reverse (R2) reads.
# Rows are generated from the sample IDs so the reads directory is written once
# instead of being copy-pasted into every path.
cami_reads_dir = "/ebio/abt3_projects2/databases_no-backup/CAMI/High_Complexity"
cami_sample_ids = ["H_S{:03d}".format(i) for i in range(1, 6)]

# Row layout matches the table columns: Sample, Run, Lane, Read1, Read2.
# Run and Lane are kept as the strings "1", exactly as before.
cami_files = [
    [
        sample_id,
        "1",
        "1",
        "{}/{}_R1.fq.gz".format(cami_reads_dir, sample_id),
        "{}/{}_R2.fq.gz".format(cami_reads_dir, sample_id),
    ]
    for sample_id in cami_sample_ids
]

# Keep the per-sample row names defined for any cell that refers to them.
H_S001, H_S002, H_S003, H_S004, H_S005 = cami_files

samples_table = pd.DataFrame(
    cami_files, columns=["Sample", "Run", "Lane", "Read1", "Read2"]
)
samples_table

Unnamed: 0,Sample,Run,Lane,Read1,Read2
0,H_S001,1,1,/ebio/abt3_projects2/databases_no-backup/CAMI/H...,/ebio/abt3_projects2/databases_no-backup/CAMI/H...
1,H_S002,1,1,/ebio/abt3_projects2/databases_no-backup/CAMI/H...,/ebio/abt3_projects2/databases_no-backup/CAMI/H...
2,H_S003,1,1,/ebio/abt3_projects2/databases_no-backup/CAMI/H...,/ebio/abt3_projects2/databases_no-backup/CAMI/H...
3,H_S004,1,1,/ebio/abt3_projects2/databases_no-backup/CAMI/H...,/ebio/abt3_projects2/databases_no-backup/CAMI/H...
4,H_S005,1,1,/ebio/abt3_projects2/databases_no-backup/CAMI/H...,/ebio/abt3_projects2/databases_no-backup/CAMI/H...


In [5]:
# Write samples file for QC pipeline
# The table is written tab-separated and without the DataFrame index.
# Make sure the target folder exists first: `to_csv` does not create missing
# directories and would otherwise fail on a fresh checkout.
os.makedirs(sample_folder, exist_ok=True)
samples_file = os.path.join(sample_folder, "samples.txt")
samples_table.to_csv(samples_file, sep="\t", index=False)

In [6]:
config_default = os.path.join(pipeline_folder, 'config.yaml')
!cat $config_default

#-- I/O --#
# table with sample --> read_file information
samples_file: tests/samples/samples_amy_n6.txt

# output location
output_dir: tests/output_amy_n6/

# read file path
# use "None" if full file path is included in the samples_file
read_file_path: None

#-- Software parameters --#
# Use "Skip" to skip any of these steps. If no params for rule, use ""
# `clumpify`: change dupedist if not HiSeq3000/4000 (dupedist=40 for NextSeq, HiSeq2500, and MiSeq)
params:
  # read download
  remote: just_single=False      # If True, just download read1 (if remote file)
  # validation, conversion, subsampling
  validate_reads: ""
  convert_fastq_to_1.8: ""
  seqtk_sample: Skip    # Use number to subsample reads (eg., 1000000)
  fastqc_on_raw: ""
  # de-duplication
  clumpify: dedupe=t dupedist=2500 optical=t    # this will likely fail for remote (SRA) samples
  fastqc_on_dedup: ""
  # adapter removal & quality trimming/filtering
  bbduk: ref=./adapters/bbmap_adapters.fa 

In [7]:
# Template for the pipeline config file used for the CAMI samples.
# It has two `str.format` placeholders, `{samples_file}` and `{output_dir}`,
# which are filled in when the config file is written.
# In this template validate_reads, seqtk_sample, clumpify, centrifuge and krona
# are set to "Skip".
# NOTE: because the text goes through `str.format`, any literal brace added to
# it later must be doubled (`{{` / `}}`).
config_text = """#-- I/O --#
# table with sample --> read_file information
samples_file: {samples_file}

# output location
output_dir: {output_dir}

# read file path
# use "None" if full file path is included in the samples_file
read_file_path: None

#-- Software parameters --#
# Use "Skip" to skip any of these steps. If no params for rule, use ""
# `clumpify`: change dupedist if not HiSeq3000/4000 (dupedist=40 for NextSeq, HiSeq2500, and MiSeq)
params:
  # read download
  remote: just_single=False      # If True, just download read1 (if remote file)
  # validation, conversion, subsampling
  validate_reads: Skip # ""
  convert_fastq_to_1.8: ""
  seqtk_sample: Skip    # Use number to subsample reads (eg., 1000000)
  fastqc_on_raw: ""
  # de-duplication
  clumpify: Skip # dedupe=t dupedist=40 optical=t    # this will likely fail for remote (SRA) samples
  fastqc_on_dedup: ""
  # adapter removal & quality trimming/filtering
  bbduk: ref=./adapters/bbmap_adapters.fa fastawrap=300 k=23
  skewer: -x ./adapters/PE_all.fa -n -l 100 -q 25
  fastqc_on_qual: ""
  # removal of 'contaminant' reads
  bbmap: minratio=0.9 maxindel=1 bwr=0.16 bw=12 fast minhits=2 qtrim=r trimq=10 untrim idtag printunmappedcount kfilter=25 maxsites=1 k=14 pairlen=1000 rescuedist=1000
  fastqc_on_filter: ""
  fastqc_on_final: ""
  # taxonomy
  centrifuge: Skip
  krona: Skip
  # coverage
  nonpareil: -T kmer
  nonpareil_summary: 1e9   # this is target seq. depth
  # master "Skip": reads combined then called "final" reads (skips all QC steps)
  skip_all_QC: False

#-- Databases --#
## hg19 = human genome database for filtering out human reads
filter_db: /ebio/abt3_projects2/databases_no-backup/hg19/hg19
# centrifuge db
centrifuge_db: /ebio/abt3_projects2/databases_no-backup/centrifuge/p+h+v
# krona taxonomy db
krona_tax_db: /ebio/abt3_projects2/databases_no-backup/krona/taxonomy


#-- Snakemake pipeline --#
## To use /tmp/global2/, see http://ilm.eb.local/user-guide/#Scratch-space-on-_002ftmp_002fglobal2
pipeline:
  snakemake_folder: ./
  script_folder: ./bin/scripts/
  temp_folder: /tmp/global/    # your username will be added automatically to this path
  run_skip_locally: True        # trivial "skip" steps run locally (not qsub)
"""

In [11]:
# Create output directory
# `makedirs(..., exist_ok=True)` replaces the exists-then-mkdir check: it is a
# single idempotent call, has no gap between the check and the creation, and
# also creates the parent `data` folder if it is missing.
cami_qc_output = os.path.join(work_dir, "data", "qc_cami")
os.makedirs(cami_qc_output, exist_ok=True)

In [12]:
config = config_text.format(samples_file=samples_file, 
                       output_dir=cami_qc_output)
config_file_new = os.path.join(pipeline_folder, 'config_cami.yaml')
with open(config_file_new, 'w') as outF:
    outF.write(config)

!cat $config_file_new 

#-- I/O --#
# table with sample --> read_file information
samples_file: /ebio/abt3_projects/small_projects/jdelacuesta/DBs_benchmark/data/cami/samples.txt

# output location
output_dir: /ebio/abt3_projects/small_projects/jdelacuesta/DBs_benchmark/data/qc_cami

# read file path
# use "None" if full file path is included in the samples_file
read_file_path: None

#-- Software parameters --#
# Use "Skip" to skip any of these steps. If no params for rule, use ""
# `clumpify`: change dupedist if not HiSeq3000/4000 (dupedist=40 for NextSeq, HiSeq2500, and MiSeq)
params:
  # read download
  remote: just_single=False      # If True, just download read1 (if remote file)
  # validation, conversion, subsampling
  validate_reads: Skip # ""
  convert_fastq_to_1.8: ""
  seqtk_sample: Skip    # Use number to subsample reads (eg., 1000000)
  fastqc_on_raw: ""
  # de-duplication
  clumpify: Skip # dedupe=t dupedist=40 optical=t    # this will likely fail for remote (SRA) samples


In [13]:
# Assemble the shell command that launches the QC pipeline.
SGE_out_dir = os.path.join(work_dir, "tmp/SGE_out/llmgqc")
conda_env = 'source activate snakemake'

# Command layout: enter the pipeline folder, activate the conda environment,
# then start the launcher script inside a `screen` session named llmgqc.
# The trailing flags include --dryrun; presumably the launcher forwards them to
# snakemake, so the printed command would only list the jobs (TODO confirm).
QC_cmd_template = (
    "cd {llmgqc}; {conda_env}; "
    "screen -L -S llmgqc {exe} config_cami.yaml cluster.json {SGE_out} {jobs} "
    "    --keep-going --rerun-incomplete --dryrun"
)

QC_cmd = QC_cmd_template.format(
    llmgqc=pipeline_folder,
    conda_env=conda_env,
    exe='./snakemake_sge.sh',
    SGE_out=SGE_out_dir,
    jobs=10,
)
print(QC_cmd)

cd /ebio/abt3_projects/small_projects/jdelacuesta/DBs_benchmark/bin/llmgqc; source activate snakemake; screen -L -S llmgqc ./snakemake_sge.sh config_cami.yaml cluster.json /ebio/abt3_projects/small_projects/jdelacuesta/DBs_benchmark/tmp/SGE_out/llmgqc 10     --keep-going --rerun-incomplete --dryrun


# Session Info

In [4]:
sessionInfo = "find {0} -name '*.yaml' | xargs head -n 1000".format(os.path.join(pipeline_folder, 'bin', 'envs'))
!$sessionInfo

==> /ebio/abt3_projects/small_projects/jdelacuesta/DBs_benchmark/bin/llmgqc/bin/envs/bbmap.yaml <==
channels:
- conda-forge
- bioconda
dependencies:
- pigz
- bioconda::bbmap=37.78

==> /ebio/abt3_projects/small_projects/jdelacuesta/DBs_benchmark/bin/llmgqc/bin/envs/centrifuge.yaml <==
channels:
- conda-forge
- bioconda
dependencies:
- pigz
- bioconda::centrifuge

==> /ebio/abt3_projects/small_projects/jdelacuesta/DBs_benchmark/bin/llmgqc/bin/envs/fastqc.yaml <==
channels: !!python/tuple
- bioconda
dependencies:
- bioconda::fastqc=0.11.7

==> /ebio/abt3_projects/small_projects/jdelacuesta/DBs_benchmark/bin/llmgqc/bin/envs/fqtools.yaml <==
channels:
- conda-forge
- bioconda
dependencies:
- pigz
- python=3
- bioconda::seqtk
- bioconda::biopython=1.70
- bioconda::fqtools=2.0
==> /ebio/abt3_projects/small_projects/jdelacuesta/DBs_benchmark/bin/llmgqc/bin/envs/krona.yaml <==
channels:
- conda-forge
- bioconda
dependencies:
- pigz
- bioconda::krona
==> /