In [1]:
import sk8s

## Set up the environment

In [2]:
# Build the container image shared by every job in this notebook: the
# alignment / variant-calling tools come from conda, the scientific Python
# stack from pip. The extra RUN step installs a C toolchain and then
# reinstalls crcmod (presumably so gsutil picks up the compiled crcmod
# extension -- TODO confirm against the base image).
image = sk8s.docker_build(image_name="ngs-1",
                          conda=["bwa", "gatk4", "samtools", "google-cloud-sdk"],
                          channels=["conda-forge", "bioconda"],
                          pip=["numpy", "scipy", "matplotlib", "pandas"],
                          # `pip3 uninstall` asks for confirmation unless -y is given; an image
                          # build has no interactive stdin, so the prompt would abort the build
                          # whenever crcmod is already installed.
                          additional_config="RUN apt-get install -y gcc python3-dev python3-setuptools && pip3 uninstall -y crcmod && pip3 install --no-cache-dir -U crcmod")

# Display the value returned by docker_build (used as `image=` by later cells).
image

'gcr.io/jared-genome-analysis/ngs-1'

In [3]:
# Name of the volume that stores the reference genome. Later cells pass it to
# sk8s.run via `volumes=[...]` and address its contents under /mnt/<name>/.
# The volume is referred to by name only here; the commented-out call below
# is presumably how it was provisioned originally (one-off step).
#reference_volume = sk8s.create_volume("100Gi", name="reference-volume")
reference_volume = "reference-volume"

In [4]:
def populate_reference_volume(volume):
    """Copy the GRCh38 reference files into ``/mnt/<volume>/hg38/``.

    Creates the target directory and then rsyncs
    ``gs://jared-genome/ref/GRCh38/`` into it with ``gsutil``. The output of
    both commands is captured and discarded.
    (``gs://genomics-public-data/references/hg38/v0/`` was used as the rsync
    source in an earlier, now removed, variant of this step.)

    Args:
        volume: Name of the volume; the files land under ``/mnt/<volume>/hg38/``.

    Returns:
        The string ``"OK"`` once both commands have exited with status 0.

    Raises:
        subprocess.CalledProcessError: If either command exits non-zero.
    """
    import subprocess

    shell_steps = (
        f"mkdir -p /mnt/{volume}/hg38/",
        f"gsutil -m rsync -r  gs://jared-genome/ref/GRCh38/ /mnt/{volume}/hg38/",
    )
    for step in shell_steps:
        # check=True aborts on the first failing step; both streams are
        # captured so nothing is echoed to the job's own stdout/stderr.
        subprocess.run(step, check=True, shell=True,
                       stdout=subprocess.PIPE,
                       stderr=subprocess.PIPE)
    return "OK"

# Run populate_reference_volume through sk8s using the image built above,
# with the reference volume attached so the function can write to
# /mnt/<reference_volume>/. asynchro=False presumably makes the call block
# and hand back the function's return value -- TODO confirm in sk8s docs.
result = sk8s.run(populate_reference_volume, reference_volume,
                  volumes=[reference_volume],
                  image=image,
                  asynchro=False)
# Show the status string returned by the job.
print(result)

## Define pipeline tasks

In [6]:

def align(fq1, fq2, output_bam, reference, read_group):
    """Align a FASTQ pair with ``bwa mem`` and upload the sorted, indexed BAM.

    Steps, each run as a shell command in the current working directory:
    copy both FASTQ files locally with ``gsutil``, pipe ``bwa mem`` into
    ``samtools sort`` to produce ``out.bam``, index it, then copy the BAM and
    its ``.bai`` index to the destination.

    Args:
        fq1: Source of the first FASTQ file, as accepted by ``gsutil cp``.
        fq2: Source of the second FASTQ file, as accepted by ``gsutil cp``.
        output_bam: Destination for the BAM; the index is written to
            ``f"{output_bam}.bai"``.
        reference: Reference path handed to ``bwa mem``.
        read_group: Read-group line handed to ``bwa mem -R`` (interpolated
            inside double quotes on the command line).

    Returns:
        A list with one dict per command, in execution order, holding the
        keys ``cmd``, ``returncode``, ``stdout`` and ``stderr``.

    Raises:
        subprocess.CalledProcessError: If any command -- including any stage
            of the bwa | samtools pipeline -- exits with a non-zero status.
    """
    # Imported inside the function, presumably so the function is
    # self-contained when sk8s executes it elsewhere -- TODO confirm.
    import subprocess

    def run_and_log(cmd):
        """Run ``cmd`` through bash and return its command line, status and output."""
        # A pipeline's exit status is normally that of its LAST stage, so a
        # crashing `bwa mem` would be hidden by a succeeding `samtools sort`
        # and check=True would never fire. `set -o pipefail` makes any failing
        # stage fail the whole pipeline; it needs bash, hence the explicit
        # executable instead of the default /bin/sh.
        proc = subprocess.run(f"set -o pipefail; {cmd}", check=True, shell=True,
                              executable="/bin/bash", encoding="utf-8",
                              stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # Log the command as the caller wrote it, without the pipefail prefix.
        return dict(cmd=cmd,
                    returncode=proc.returncode,
                    stdout=proc.stdout,
                    stderr=proc.stderr)

    commands = [
        f"gsutil -m cp {fq1} ./fq1.fq",
        f"gsutil -m cp {fq2} ./fq2.fq",
        f'bwa mem -R "{read_group}" {reference} fq1.fq fq2.fq | samtools sort > out.bam',
        "samtools index out.bam",
        f"gsutil -m cp out.bam {output_bam}",
        f"gsutil -m cp out.bam.bai {output_bam}.bai",
    ]
    # Sequential on purpose: every step consumes the files of the previous one.
    return [run_and_log(cmd) for cmd in commands]


In [7]:

def call_snps(reference, bam, roi, output_vcf):
    """Run GATK HaplotypeCaller on one region of a BAM and upload the VCF.

    Steps, each run as a shell command in the current working directory:
    copy the BAM and its ``.bai`` index locally with ``gsutil``, call
    variants restricted to the region of interest, then copy ``out.vcf`` to
    the destination.

    Args:
        reference: Reference path handed to ``gatk HaplotypeCaller -R``.
        bam: Source of the BAM, as accepted by ``gsutil cp``; the index is
            read from ``f"{bam}.bai"``.
        roi: Three-item sequence ``(contig, start, end)``; rendered as
            ``contig:start-end`` for the ``-L`` option. Tuples and lists are
            both accepted.
        output_vcf: Destination for the resulting VCF.

    Returns:
        A list with one dict per command, in execution order, holding the
        keys ``cmd``, ``returncode``, ``stdout`` and ``stderr``.

    Raises:
        subprocess.CalledProcessError: If any command exits non-zero.
        TypeError: If ``roi`` does not supply exactly three items matching
            the ``%s:%d-%d`` format.
    """
    # Imported inside the function, presumably so the function is
    # self-contained when sk8s executes it elsewhere -- TODO confirm.
    import subprocess

    def run_and_log(cmd):
        """Run ``cmd`` through the shell and return its command line, status and output."""
        proc = subprocess.run(cmd, check=True, shell=True, encoding="utf-8",
                              stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        return dict(cmd=cmd,
                    returncode=proc.returncode,
                    stdout=proc.stdout,
                    stderr=proc.stderr)

    # %-formatting only unpacks real tuples; coercing first lets a list (for
    # instance an ROI that went through JSON) work the same way.
    region = "%s:%d-%d" % tuple(roi)

    commands = [
        f"gsutil cp {bam} ./in.bam",
        f"gsutil cp {bam}.bai ./in.bam.bai",
        f"gatk HaplotypeCaller -R {reference} -I in.bam -O out.vcf -L {region}",
        f"gsutil cp out.vcf {output_vcf}",
    ]
    # Sequential on purpose: every step consumes the files of the previous one.
    return [run_and_log(cmd) for cmd in commands]

## Execute pipeline

In [17]:

# Pipeline parameters shared by the alignment and SNP-calling cells below.

# Reference FASTA on the reference volume (under the hg38/ directory that
# populate_reference_volume fills).
reference=f"/mnt/{reference_volume}/hg38/Homo_sapiens_assembly38.fasta"
sample_name="jared"
# Read-group line for `bwa mem -R`. The doubled backslashes keep "\t" as a
# literal backslash + "t" in the string, which is how it appears on the
# command line that align() runs.
read_group=f'@RG\\tID:{sample_name}\\tSM:{sample_name}\\tPL:ILLUMINA'
# Region of interest as (contig, start, end); call_snps renders it as
# "contig:start-end" for HaplotypeCaller's -L option.
ROI = ("chr1", 14674463, 14697776)
# Input FASTQ pair.
fq1 = "gs://jared-genome/tiny_r1.fq"
fq2 = "gs://jared-genome/tiny_r2.fq"
# Outputs are written under this prefix (note the trailing slash: the file
# names are appended directly).
output_prefix = "gs://jared-genome/sk8s/"
output_bam = f"{output_prefix}tiny.bam"
output_vcf = f"{output_prefix}tiny.vcf"

In [11]:

# Run the alignment step through sk8s with the reference volume attached
# (align reads the reference from /mnt/<reference_volume>/). The job asks for
# 16Gi memory, 10Gi ephemeral storage and 1 CPU; limits mirror the memory and
# storage requests. asynchro=False presumably blocks until the job is done.
alignment_result = sk8s.run(align, fq1, fq2, output_bam, reference, read_group,
                            image=image, volumes=[reference_volume],
                            requests={"memory": "16Gi", "ephemeral-storage": "10Gi", "cpu": "1"},
                            limits={"memory": "16Gi", "ephemeral-storage": "10Gi"},
                            asynchro=False)
# Display the per-command log (cmd / returncode / stdout / stderr) from align.
alignment_result

[{'cmd': 'gsutil -m cp gs://jared-genome/tiny_r1.fq ./fq1.fq',
  'returncode': 0,
  'stdout': '',
  'stderr': 'Copying gs://jared-genome/tiny_r1.fq...\n/ [0/1 files][    0.0 B/ 28.7 MiB]   0% Done                                    \n-\n- [1/1 files][ 28.7 MiB/ 28.7 MiB] 100% Done                                    \nOperation completed over 1 objects/28.7 MiB.                                     \n'},
 {'cmd': 'gsutil -m cp gs://jared-genome/tiny_r2.fq ./fq2.fq',
  'returncode': 0,
  'stdout': '',
  'stderr': 'Copying gs://jared-genome/tiny_r2.fq...\n/ [0/1 files][    0.0 B/ 28.7 MiB]   0% Done                                    \n-\n- [0/1 files][ 27.1 MiB/ 28.7 MiB]  94% Done                                    \n\\\n\\ [1/1 files][ 28.7 MiB/ 28.7 MiB] 100% Done                                    \nOperation completed over 1 objects/28.7 MiB.                                     \n'},
 {'cmd': 'bwa mem -R "@RG\\tID:jared\\tSM:jared\\tPL:ILLUMINA" /mnt/reference-volume/hg38/Homo_sapien

In [18]:
# Run SNP calling through sk8s on the BAM written by the alignment step,
# restricted to ROI, with the reference volume attached. The job asks for
# 8Gi memory, 10Gi ephemeral storage and 2 CPUs; limits mirror the memory and
# storage requests. asynchro=False presumably blocks until the job is done.
snp_result = sk8s.run(call_snps, reference, output_bam, ROI, output_vcf,
                       image=image, asynchro=False, volumes=[reference_volume],
                       requests={"memory": "8Gi", "ephemeral-storage": "10Gi", "cpu": "2"},
                       limits={"memory": "8Gi", "ephemeral-storage": "10Gi"})
# Display the per-command log (cmd / returncode / stdout / stderr) from call_snps.
snp_result

[{'cmd': 'gsutil cp gs://jared-genome/sk8s/tiny.bam ./in.bam',
  'returncode': 0,
  'stdout': '',
  'stderr': 'Copying gs://jared-genome/sk8s/tiny.bam...\n/ [0 files][    0.0 B/ 14.9 MiB]                                                \n-\n- [1 files][ 14.9 MiB/ 14.9 MiB]                                                \nOperation completed over 1 objects/14.9 MiB.                                     \n'},
 {'cmd': 'gsutil cp gs://jared-genome/sk8s/tiny.bam.bai ./in.bam.bai',
  'returncode': 0,
  'stdout': '',
  'stderr': 'Copying gs://jared-genome/sk8s/tiny.bam.bai...\n/ [0 files][    0.0 B/  1.4 MiB]                                                \n/ [1 files][  1.4 MiB/  1.4 MiB]                                                \nOperation completed over 1 objects/1.4 MiB.                                      \n'},
 {'cmd': 'gatk HaplotypeCaller -R /mnt/reference-volume/hg38/Homo_sapiens_assembly38.fasta -I in.bam -O out.vcf -L chr1:14674463-14697776',
  'returncode': 0,
  'stdout': ''

In [19]:
import subprocess
# Preview the uploaded VCF from the notebook host: stream it with gsutil,
# drop the header lines (those starting with '#') and keep the first 10
# records. check= is not set, so a failing command does not raise here; the
# cell simply prints whatever reached stdout (stderr is not captured).
print(subprocess.run(f"gsutil cat {output_vcf} | grep -v '^#' | head -n10",
               shell=True, stdout=subprocess.PIPE, encoding="utf-8").stdout)

chr1	14674841	.	T	C	306.64	.	AC=1;AF=0.500;AN=2;BaseQRankSum=0.421;DP=26;ExcessHet=0.0000;FS=10.037;MLEAC=1;MLEAF=0.500;MQ=60.00;MQRankSum=0.000;QD=12.27;ReadPosRankSum=-0.849;SOR=2.461	GT:AD:DP:GQ:PL	0/1:14,11:25:99:314,0,393
chr1	14674953	.	T	TTTTTA	553.60	.	AC=1;AF=0.500;AN=2;BaseQRankSum=1.183;DP=23;ExcessHet=0.0000;FS=3.796;MLEAC=1;MLEAF=0.500;MQ=60.00;MQRankSum=0.000;QD=24.07;ReadPosRankSum=-0.601;SOR=1.245	GT:AD:DP:GQ:PL	0/1:9,14:23:99:561,0,336
chr1	14674974	.	A	G	673.64	.	AC=1;AF=0.500;AN=2;BaseQRankSum=0.058;DP=28;ExcessHet=0.0000;FS=0.000;MLEAC=1;MLEAF=0.500;MQ=60.00;MQRankSum=0.000;QD=24.06;ReadPosRankSum=0.000;SOR=0.646	GT:AD:DP:GQ:PL	0/1:11,17:28:99:681,0,411
chr1	14674984	.	T	C	631.64	.	AC=1;AF=0.500;AN=2;BaseQRankSum=-0.679;DP=28;ExcessHet=0.0000;FS=1.540;MLEAC=1;MLEAF=0.500;MQ=60.00;MQRankSum=0.000;QD=24.29;ReadPosRankSum=0.053;SOR=0.412	GT:AD:DP:GQ:PL	0/1:10,16:26:99:639,0,372
chr1	14675235	.	G	T	913.06	.	AC=2;AF=1.00;AN=2;DP=30;ExcessHet=0.0000;FS=0.000;MLEAC=2;MLEAF