In [1]:
%load_ext lab_black
import subprocess, os
import pandas as pd
import glob, random

In [2]:
# Collect every BAM under bam/ and key it by its sample label: the file name
# with everything from ".trim" onwards stripped off.
bam_paths = glob.glob("bam/*bam")
sample_labels = [path.split("/")[1].split(".trim")[0] for path in bam_paths]
all_bam = dict(zip(sample_labels, bam_paths))
# Show the first three (label, path) pairs as a quick sanity check.
list(all_bam.items())[:3]

[('ATAC188_CB_FR062.1_ND_sB',
  'bam/ATAC188_CB_FR062.1_ND_sB.trimmed.bowtie2.filtered.bam'),
 ('ATAC088_CB_FR028.1_CVID_sB',
  'bam/ATAC088_CB_FR028.1_CVID_sB.trimmed.bowtie2.filtered.bam'),
 ('ATAC111_CB_FR036.1_NFKB1_sB',
  'bam/ATAC111_CB_FR036.1_NFKB1_sB.trimmed.bowtie2.filtered.bam')]

Index BAM files with samtools

In [14]:
# Index every BAM file with samtools, using 30 threads per invocation.
for bam in all_bam.values():
    # Pass an argument list instead of a shell string so paths containing
    # spaces or shell metacharacters reach samtools unchanged.
    bam_index_proc = subprocess.run(["samtools", "index", "-@", "30", bam])
    if bam_index_proc.returncode != 0:
        # The exception has to be raised: merely constructing it lets a
        # failed indexing step go unnoticed.
        raise RuntimeError(f"{bam} index failed")

Create bigWig coverage tracks with deepTools

In [None]:
# Convert each BAM into a CPM-normalised bigWig coverage track with deepTools
# bamCoverage (bin size 10, 30 processors, minimum mapping quality 30).
bws = {}  # sample label -> bigWig file name, written to the working directory
for sample_label, bam in all_bam.items():
    bw = f"{sample_label}.bigWig"
    # Argument list (no shell) so unusual characters in paths are passed
    # through verbatim; stderr is folded into stdout for the error report.
    deeptool_proc = subprocess.run(
        [
            "bamCoverage",
            "-b",
            bam,
            "-o",
            bw,
            "-p",
            "30",
            "-bs",
            "10",
            "--normalizeUsing",
            "CPM",
            "--minMappingQuality",
            "30",
        ],
        stderr=subprocess.STDOUT,
        stdout=subprocess.PIPE,
    )
    if deeptool_proc.returncode != 0:
        # Raise (not just construct) the error so a failed conversion stops
        # the run instead of registering a missing/partial bigWig in `bws`.
        raise RuntimeError(f"{sample_label} failed:\n{deeptool_proc.stdout.decode()}")
    bws[sample_label] = bw

Create track hub for UCSC genome browser visualization

In [4]:
# Create the hub directory and its hg38 sub-directory; does nothing if they
# already exist.
os.makedirs("Bodo_ATAC_Hub/hg38", exist_ok=True)

In [5]:
# hub.txt: top-level description of the track hub; it points at genomes.txt.
hub_lines = [
    "hub Bodo_ATAC_Hub\n",
    "shortLabel Bodo_ATAC\n",
    "longLabel Collection of patient ATAC data from Bodo\n",
    "genomesFile genomes.txt\n",
    "email hal213@ucsd.edu\n",
]
with open("Bodo_ATAC_Hub/hub.txt", "w") as hub_file:
    hub_file.writelines(hub_lines)

# genomes.txt: names the assembly and where its trackDb file lives.
genome_lines = [
    "genome hg38\n",
    "trackDb hg38/trackDb.txt\n",
]
with open("Bodo_ATAC_Hub/genomes.txt", "w") as genomes_file:
    genomes_file.writelines(genome_lines)

In [26]:
# Group sample labels by the token embedded in the label: the second
# "_"-separated field after the first ".", e.g.
# "ATAC188_CB_FR062.1_ND_sB" -> "ND".
grouped = {}
for sample_label in all_bam.keys():
    genometype = sample_label.split(".")[1].split("_")[1]
    grouped.setdefault(genometype, []).append(sample_label)

# NOTE(review): bigDataUrl below is a bare file name. Per the UCSC hub docs a
# relative bigDataUrl is resolved against the location of trackDb.txt, so the
# bigWig files are expected in Bodo_ATAC_Hub/hg38 -- this cell does not move
# them there; confirm they are in place before publishing the hub.

# Dedicated, seeded generator: composite-track colours stay the same on every
# re-run and the global `random` state is left untouched.
color_rng = random.Random(0)

# Write one composite (parent) track per group, followed by one bigWig
# sub-track stanza per sample in that group.
with open("Bodo_ATAC_Hub/hg38/trackDb.txt", "w") as o:
    for genometype, group_labels in grouped.items():
        # Random "R,G,B" colour shared by the whole composite track.
        color = ",".join(str(color_rng.randrange(0, 256)) for _ in range(3))
        o.write(
            "\n".join(
                [
                    f"track composition_{genometype}",
                    "type bigWig",
                    "compositeTrack on",
                    f"shortLabel {genometype} ATAC tracks",
                    f"longLabel {genometype} ATAC tracks",
                    "visibility full",
                    "yLineOnOff on",
                    "autoScale on",
                    'yLineMark "0.0"',
                    "alwaysZero on",
                    "graphType bar",
                    "maxHeightPixels 128:75:11",
                    "smoothingWindow off",
                    "windowingFunction maximum",
                    f"color {color}",
                ]
            )
        )
        # Blank line terminates the stanza.
        o.write("\n\n")
        for sample_label in group_labels:
            # Sub-track stanza, tab-indented beneath its parent composite.
            o.write(
                "\n".join(
                    [
                        f"\ttrack {sample_label}",
                        f"\tbigDataUrl {bws[sample_label]}",
                        f"\tshortLabel {sample_label}",
                        f"\tlongLabel {sample_label}",
                        f"\tparent composition_{genometype} on",
                        "\ttype bigWig",
                    ]
                )
            )
            o.write("\n\n")