In [1]:
import sk8s

## Set up the environment

In [None]:
# Build the image that the pipeline jobs run in: command-line genomics tools
# from conda, the scientific Python stack from pip, and a reinstalled crcmod.
image = sk8s.docker_build(
    image_name="ngs-1",
    conda=["bwa", "gatk4", "samtools", "google-cloud-sdk", "bedtools"],
    channels=["conda-forge", "bioconda"],
    pip=["numpy", "scipy", "matplotlib", "pandas"],
    # `apt-get update` refreshes the package index before installing, and
    # `pip3 uninstall -y` skips the confirmation prompt, which cannot be
    # answered during a non-interactive image build.
    # crcmod is reinstalled after gcc/python3-dev are available (presumably
    # so that its C extension gets compiled -- TODO confirm).
    additional_config=(
        "RUN apt-get update"
        " && apt-get install -y gcc python3-dev python3-setuptools"
        " && pip3 uninstall -y crcmod"
        " && pip3 install --no-cache-dir -U crcmod"
    ),
)

image

In [2]:
# Name of the image that every job below runs in.
# Alternative, registry-qualified name kept for reference:
#image = "gcr.io/jared-genome-analysis/ngs-1"
image = "ngs-1"

In [3]:
# Name of the volume that stores the reference genome.
# The volume is referred to by name here; the commented-out call below would
# create it (100Gi) instead.
#reference_volume = sk8s.create_volume("100Gi", name="reference-volume")
reference_volume = "reference-volume"
reference_volume  # displayed as the cell output

'reference-volume'

In [4]:
# Volumes passed to every job, keyed by volume name. The values look like
# mount paths inside the job container (the tasks below read the reference
# from /mnt/<volume name>/...) -- TODO confirm against the sk8s API.
default_volumes = {
    reference_volume: "/mnt/" + reference_volume,
    "gcs-creds": "/root/.config/gcloud",
}

In [None]:
def populate_reference_volume(volume):
    """Copy the GRCh38 reference files from GCS onto a mounted volume.

    The files are synced into ``/mnt/{volume}/GRCh38/``, which is the
    directory the other cells of this notebook read the reference from
    (``/mnt/<volume>/GRCh38/Homo_sapiens_assembly38.fasta``). The previous
    destination, ``hg38/``, did not match those paths.

    Args:
        volume: Name of the volume; it is expected to be mounted at
            ``/mnt/{volume}`` in the job that runs this function.

    Returns:
        The string ``"OK"`` once both commands have succeeded.

    Raises:
        subprocess.CalledProcessError: If ``mkdir`` or ``gsutil`` exits with
            a non-zero status.
    """
    import subprocess

    def run_silent(cmd):
        # Capture stdout/stderr instead of echoing them; check=True turns a
        # non-zero exit status into CalledProcessError.
        return subprocess.run(cmd, check=True, shell=True,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)

    # An earlier version synced the public bucket instead:
    #   gs://genomics-public-data/references/hg38/v0/
    destination = f"/mnt/{volume}/GRCh38/"
    run_silent(f"mkdir -p {destination}")
    run_silent(f"gsutil -m rsync -r  gs://jared-genome/ref/GRCh38/ {destination}")
    return "OK"

#populate_reference_job = sk8s.run(populate_reference_volume, reference_volume,
#                                  volumes=default_volumes,
#                                  image=image,
#                                  asynchro=True)
#populate_reference_job

In [5]:
def run(cmd):
    """Run a shell command and return everything it printed.

    stderr is merged into stdout, and the combined text is decoded as UTF-8.

    Args:
        cmd: Command line, executed through the shell.

    Returns:
        The combined stdout/stderr text when the command exits with status 0.

    Raises:
        subprocess.CalledProcessError: When the exit status is non-zero. The
            command and its captured log are printed first, and the log is
            attached to the exception as ``output``.
    """
    import subprocess

    completed = subprocess.run(
        cmd,
        shell=True,
        check=False,  # the exit status is inspected explicitly below
        encoding="utf-8",
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    )
    log = completed.stdout

    if completed.returncode != 0:
        print("Error Running Command.")
        print("Command:", cmd)
        print("Log:")
        print(log)
        raise subprocess.CalledProcessError(completed.returncode, cmd=cmd, output=log)

    return log


## Prepare input data (not really part of the pipeline)

In [None]:
def generate_fq_chunk(bam, region, output_fq):
    """Extract the reads of one region from a BAM and upload them as FASTQ.

    Args:
        bam: BAM location handed to ``samtools view``.
        region: Region string handed to ``samtools view``.
        output_fq: Destination passed to ``gsutil cp`` for the FASTQ file.

    Returns:
        ``output_fq``, unchanged.

    Raises:
        subprocess.CalledProcessError: If any command fails (raised by ``run``).
    """
    import os

    # Publish an access token through GCS_OAUTH_TOKEN before samtools is
    # given the BAM location (presumably so it can read gs:// URLs directly
    # -- TODO confirm).
    os.environ["GCS_OAUTH_TOKEN"] = run(
        "gcloud auth application-default print-access-token"
    ).strip()

    steps = (
        f"samtools view -b {bam} {region} > ./in.bam",  # subset to the region
        "samtools index in.bam",
        "samtools collate in.bam out",                   # writes out.bam
        "samtools fastq -o out.fq out.bam",
        f"gsutil cp out.fq {output_fq}",
    )
    for step in steps:
        run(step)

    return output_fq


def concatenate(input_files, output_file):
    """Download several files, concatenate them in order, and upload the result.

    Args:
        input_files: List of source locations passed to ``gsutil cp``. The
            local copies are addressed by base name under ``./data/``.
        output_file: Destination passed to ``gsutil cp`` for the combined file.

    Returns:
        ``output_file``, unchanged.

    Raises:
        subprocess.CalledProcessError: If any command fails (raised by ``run``).
    """
    import os

    run("mkdir ./data/")
    run("gsutil -m cp " + " ".join(input_files) + " ./data/")

    # Concatenate in the order the inputs were given.
    staged = " ".join("./data/" + os.path.basename(path) for path in input_files)
    run("cat " + staged + " > out")

    run(f"gsutil cp out {output_file}")
    return output_file


def generate_fq(bam, region_size, output_file):
    """Convert a BAM into one FASTQ by processing genomic windows in parallel.

    The reference is tiled into windows of ``region_size`` bases with
    ``bedtools makewindows``; lines containing ``_`` or ``HLA`` are dropped.
    Each selected window is converted to FASTQ by its own ``sk8s`` job
    (``generate_fq_chunk``), and the chunks are then joined by ``concatenate``.

    Args:
        bam: BAM location forwarded to ``generate_fq_chunk``.
        region_size: Window size in bases, passed to ``bedtools makewindows -w``.
        output_file: Destination of the concatenated FASTQ.

    Returns:
        Whatever the ``sk8s.run`` call for ``concatenate`` returns.

    Raises:
        subprocess.CalledProcessError: If the window listing fails (raised by ``run``).
    """
    cmd = f'bedtools makewindows -g /mnt/{reference_volume}/GRCh38/Homo_sapiens_assembly38.fasta.genome -w {region_size} | grep -vP "_|HLA"'

    regions = []
    for line in run(cmd).splitlines():
        if not line.strip():
            continue
        chrom, start, end = line.split("\t")[:3]
        # bedtools emits 0-based, half-open BED intervals, whereas samtools
        # region strings are 1-based and inclusive: shift the start by one so
        # that adjacent windows do not share a base.
        regions.append(f"{chrom}:{int(start) + 1}-{end}")

    # NOTE: only the first three windows are processed.
    jobs = [sk8s.run(generate_fq_chunk, bam, region, f"gs://jared-genome/sk8s/pipeline_test_1/fastq/chunk_{idx}.fq",
                     image=image, volumes=default_volumes, asynchro=True)
            for idx, region in enumerate(regions[0:3])]

    # Collect the chunk locations in submission order.
    fastqs = [sk8s.wait(job) for job in jobs]

    return sk8s.run(concatenate, fastqs, output_file,
                    image=image, volumes=default_volumes,
                    asynchro=False)


# Generate the interleaved FASTQ from the full BAM using 1 Mb windows.
results = sk8s.run(
    generate_fq,
    "gs://jared-genome/jared.bam",
    1_000_000,
    "gs://jared-genome/sk8s/pipeline_test_1/jared_interleaved.fq",
    image=image,
    volumes=default_volumes,
    asynchro=False,
)
results


In [None]:
# Wait for whichever background setup jobs were actually launched in this
# session. The code that creates `populate_reference_job` is commented out
# and `collate_job` is not created anywhere in this notebook, so referring to
# them unconditionally raises NameError on a fresh kernel.
for _job_name in ("populate_reference_job", "collate_job"):
    _job = globals().get(_job_name)
    if _job is not None:
        sk8s.wait(_job)

## Define pipeline tasks

In [17]:
def split_fastq(fastq, n_chunks, output_prefix):
    """Split a FASTQ file into ``n_chunks`` files and upload each of them.

    Reads are dealt out round-robin, one template at a time: consecutive
    records that share a read name (ignoring a trailing ``/1`` or ``/2``)
    are written to the same chunk. For an interleaved paired-end file this
    keeps both mates of a pair together, so every chunk is itself
    interleaved; for a file where every record has a distinct name the
    result is a plain record-by-record round-robin.

    Args:
        fastq: Source location passed to ``gsutil cp``. Records are read as
            four lines each.
        n_chunks: Number of output files; must be at least 1.
        output_prefix: Prefix of the uploaded files; chunk ``i`` is uploaded
            to ``f"{output_prefix}{i}.fq"``.

    Returns:
        The list of uploaded locations, in chunk order.

    Raises:
        ValueError: If ``n_chunks`` is smaller than 1.
        subprocess.CalledProcessError: If a ``gsutil`` command fails
            (raised by ``run``).
    """
    import os
    from contextlib import ExitStack

    if n_chunks < 1:
        raise ValueError(f"n_chunks must be at least 1, got {n_chunks}")

    def template_name(header):
        # "@name/1 optional comment" -> "name"
        fields = header[1:].split()
        name = fields[0] if fields else ""
        if name.endswith(("/1", "/2")):
            name = name[:-2]
        return name

    run(f"gsutil cp {fastq} ./input.fq")

    os.makedirs("./output/", exist_ok=True)
    output_files = [f"./output/chunk_{i}.fq" for i in range(n_chunks)]

    # ExitStack closes every chunk file even if reading or writing fails.
    with ExitStack() as stack:
        output_fds = [stack.enter_context(open(f, "w")) for f in output_files]
        current_output = -1        # advanced to 0 by the first record
        previous_template = None

        with open("input.fq") as fp:
            while True:
                name = fp.readline()
                seq = fp.readline()
                strand = fp.readline()
                qual = fp.readline()
                if name == "":
                    break

                # Move to the next chunk only when a new template starts.
                template = template_name(name)
                if template != previous_template:
                    current_output = (current_output + 1) % n_chunks
                    previous_template = template

                # Each line still carries its own newline, so the four lines
                # are written back-to-back without any extra separator.
                output_fds[current_output].write(name + seq + strand + qual)

    fastqs = []
    for idx, file in enumerate(output_files):
        destination = f"{output_prefix}{idx}.fq"
        run(f"gsutil cp {file} {destination}")
        fastqs.append(destination)

    return fastqs

In [16]:
def align(fq, output_bam, reference, read_group):
    """Align one interleaved FASTQ with bwa mem and upload the sorted, indexed BAM.

    Args:
        fq: FASTQ location passed to ``gsutil cp``.
        output_bam: Destination of the BAM; its index is uploaded to
            ``output_bam + ".bai"``.
        reference: Reference path handed to ``bwa mem``.
        read_group: Read-group header line handed to ``bwa mem -R``.

    Returns:
        ``output_bam``, unchanged.

    Raises:
        subprocess.CalledProcessError: If any command fails (raised by ``run``).
    """
    steps = [
        f"gsutil -m cp {fq} ./fq.fq",
        # NOTE(review): a shell pipeline normally reports only the status of
        # its last command, so a bwa failure may go unnoticed here -- confirm.
        f'bwa mem -p -R "{read_group}" {reference} fq.fq | samtools sort > out.bam',
        "samtools index out.bam",
        f"gsutil -m cp out.bam {output_bam}",
        f"gsutil -m cp out.bam.bai {output_bam}.bai",
    ]
    for step in steps:
        run(step)
    return output_bam

In [None]:
def merge_bams(bams, output_bam):
    """Download several indexed BAMs, merge them, and upload the result.

    Args:
        bams: List of BAM locations; each is expected to have an index at
            ``<bam>.bai``. The files are downloaded into ``./bams/`` under
            their base names, so the base names must be distinct.
        output_bam: Destination of the merged BAM; its index is uploaded to
            ``output_bam + ".bai"``.

    Returns:
        ``output_bam``, unchanged.

    Raises:
        ValueError: If ``bams`` is empty.
        subprocess.CalledProcessError: If any command fails (raised by ``run``).
    """
    import os
    from multiprocessing import Pool

    if not bams:
        raise ValueError("merge_bams needs at least one input BAM")

    # os.path has no mkdir(); makedirs also tolerates an existing directory.
    os.makedirs("./bams/", exist_ok=True)

    # Download the indexes and the BAMs in parallel; the context manager
    # shuts the worker pool down once both batches have finished.
    with Pool() as pool:
        pool.map(run, [f"gsutil -m cp {bam}.bai ./bams/" for bam in bams])
        pool.map(run, [f"gsutil -m cp {bam} ./bams/" for bam in bams])

    # samtools merge takes the output file first, followed by the inputs.
    run("samtools merge ./out.bam ./bams/*.bam")
    run("samtools index out.bam")
    run(f"gsutil cp out.bam {output_bam}")
    run(f"gsutil cp out.bam.bai {output_bam}.bai")
    return output_bam

In [8]:
def call_snps(reference, bam, roi, output_vcf):
    """Run GATK HaplotypeCaller on one region of a BAM and upload the VCF.

    Args:
        reference: Reference path handed to ``gatk HaplotypeCaller -R``.
        bam: BAM location; its index is fetched from ``bam + ".bai"``.
        roi: Tuple ``(contig, start, end)`` formatted as ``contig:start-end``
            and passed to ``-L``.
        output_vcf: Destination passed to ``gsutil cp`` for the VCF.

    Returns:
        A list with one dict per executed command, holding its ``cmd``,
        ``returncode``, ``stdout`` and ``stderr``.

    Raises:
        subprocess.CalledProcessError: If any command exits with a non-zero status.
    """
    import subprocess

    def run_and_log(cmd):
        # check=True: a failing command raises instead of being logged.
        completed = subprocess.run(
            cmd,
            shell=True,
            check=True,
            encoding="utf-8",
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        return {
            "cmd": cmd,
            "returncode": completed.returncode,
            "stdout": completed.stdout,
            "stderr": completed.stderr,
        }

    region = "%s:%d-%d" % roi

    steps = (
        f"gsutil cp {bam} ./in.bam",
        f"gsutil cp {bam}.bai ./in.bam.bai",
        f"gatk HaplotypeCaller -R {reference} -I in.bam -O out.vcf -L {region}",
        f"gsutil cp out.vcf {output_vcf}",
    )
    return [run_and_log(step) for step in steps]

## Execute pipeline

In [14]:

# Sample being processed and the region of interest for variant calling.
sample_name = "jared"
ROI = ("chr1", 14674463, 14697776)

# Reference FASTA on the reference volume.
reference = "/mnt/" + reference_volume + "/GRCh38/Homo_sapiens_assembly38.fasta"

# Read-group header line; the fields are joined by a literal backslash + "t"
# (two characters), not by a tab character.
read_group = "\\t".join(["@RG", "ID:" + sample_name, "SM:" + sample_name, "PL:ILLUMINA"])

# Input FASTQ (the full-size alternative is kept for reference) and outputs.
#fq = "gs://jared-genome/jared_interleaved.fq"
fq = "gs://jared-genome/tiny_interleaved.fq"
output_prefix = "gs://jared-genome/sk8s/pipeline_test_1/"
output_bam = output_prefix + "tiny.bam"
output_vcf = output_prefix + "tiny.vcf"

In [18]:
# Split the input FASTQ into three chunks under <output_prefix>fastq_chunks/.
fastqs = sk8s.run(
    split_fastq,
    fq,
    3,
    output_prefix + "fastq_chunks/chunk_",
    image=image,
    volumes=default_volumes,
    asynchro=False,
)
fastqs

KeyboardInterrupt: 

In [15]:
# Align every FASTQ chunk in its own job. Each chunk is written to its own
# BAM, named after the chunk file (".../chunk_0.fq" -> "bam_chunks/chunk_0.bam").
# Passing the single `output_bam` path to every job would make all jobs
# write to the same object and `bams` would list that one path repeatedly.
bams = sk8s.map(
    lambda fq: align(
        fq,
        f"{output_prefix}bam_chunks/{fq.rsplit('/', 1)[-1].rsplit('.', 1)[0]}.bam",
        reference,
        read_group,
    ),
    fastqs,
    image=image, volumes=default_volumes,
    requests={"memory": "8Gi", "ephemeral-storage": "10Gi", "cpu": "1"},
    limits={"memory": "8Gi", "ephemeral-storage": "10Gi"},
    asynchro=False,
)

bams

[[{'cmd': 'gsutil -m cp gs://jared-genome/sk8s/pipeline_test_1/fastq_chunks/chunk_0.fq ./fq.fq',
   'returncode': 0,
   'stdout': '',
   'stderr': 'Copying gs://jared-genome/sk8s/pipeline_test_1/fastq_chunks/chunk_0.fq...\n/ [0/1 files][    0.0 B/ 19.2 MiB]   0% Done                                    \n-\n- [0/1 files][  7.7 MiB/ 19.2 MiB]  40% Done                                    \n\\\n|\n| [0/1 files][ 19.1 MiB/ 19.2 MiB]  99% Done                                    \n| [1/1 files][ 19.2 MiB/ 19.2 MiB] 100% Done                                    \nOperation completed over 1 objects/19.2 MiB.                                     \n'},
  {'cmd': 'bwa mem -p -R "@RG\\tID:jared\\tSM:jared\\tPL:ILLUMINA" /mnt/reference-volume/GRCh38/Homo_sapiens_assembly38.fasta fq.fq | samtools sort > out.bam',
   'returncode': 0,
   'stdout': '',
   'stderr': '[M::bwa_idx_load_from_disk] read 3171 ALT contigs\n[M::process] read 81823 sequences (8182300 bp)...\n[M::process] 81823 single-end sequences

In [None]:
# Merge the per-chunk BAMs into <output_prefix>bams/<sample_name>.bam.
bam = sk8s.run(
    merge_bams,
    bams,
    output_prefix + "bams/" + sample_name + ".bam",
    image=image,
    volumes=default_volumes,
    requests={"memory": "8Gi", "ephemeral-storage": "10Gi", "cpu": "1"},
    limits={"memory": "8Gi", "ephemeral-storage": "10Gi"},
    asynchro=False,
)
bam

In [None]:
# Call variants on the merged BAM (`bam`, produced by the merge step) rather
# than on `output_bam`, which is not the output of the merge.
# The job uses `default_volumes` like every other job in this notebook:
# call_snps runs gsutil, and only `default_volumes` includes the "gcs-creds"
# volume next to the reference volume.
snp_result = sk8s.run(
    call_snps, reference, bam, ROI, output_vcf,
    image=image, asynchro=False, volumes=default_volumes,
    requests={"memory": "8Gi", "ephemeral-storage": "10Gi", "cpu": "2"},
    limits={"memory": "8Gi", "ephemeral-storage": "10Gi"},
)
snp_result

In [None]:
import subprocess

# Print the first ten non-header lines of the uploaded VCF.
print(
    subprocess.run(
        "gsutil cat " + output_vcf + " | grep -v '^#' | head -n10",
        shell=True,
        encoding="utf-8",
        stdout=subprocess.PIPE,
    ).stdout
)