In [1]:
%load_ext lab_black
import subprocess, os

In [2]:
# MODIFY THIS CELL
# Output prefix (directory + sample name) from which the output paths in the
# cells below are derived. Make the sample name descriptive: cell type,
# experiment, target, replicate, etc.
out_prefix = "/Extension_HDD1/Xen_PP/data/cXen_Gata4_CR_rep1/cXen_Gata4_CR_rep1"
# Directory holding the trimmomatic / bowtie2 / macs2 executables. It is
# concatenated directly with the tool name, so it must end with "/".
env_bin = "/home/coco/miniconda3/envs/cut_run/bin/"

# Trim Adapters (Paired-End Mode)

In [3]:
# MODIFY THIS CELL
# Raw paired-end read files, passed to Trimmomatic as its two inputs (R1 first, then R2).
R1 = "/Extension_HDD1/Xen_PP/data/cXen_Gata4_CR_rep1/nR155-L3-G5-P155-CTGAAGCT-ACGTCCTG-READ1-Sequences.txt.gz"
R2 = "/Extension_HDD1/Xen_PP/data/cXen_Gata4_CR_rep1/nR155-L3-G5-P155-CTGAAGCT-ACGTCCTG-READ2-Sequences.txt.gz"
# Thread count handed to trimmomatic (-threads), bowtie2 (-p) and samtools sort (-@).
num_cpus = 30
# Adapter FASTA referenced by the ILLUMINACLIP step of the Trimmomatic command.
adaptor_seq_fa = "/Extension_HDD1/Xen_PP/TruSeqAdapters.fa"

In [4]:
# Trimmed FASTQ paths: "<out_prefix>_<mate>.<pair|unpair>.fastq", with ".gz"
# appended whenever the R1 input name ends in "gz".
R1_paired_trimmed, R2_paired_trimmed, R1_unpaired_trimmed, R2_unpaired_trimmed = (
    f"{out_prefix}_{mate}.{kind}.fastq" + (".gz" if R1.endswith("gz") else "")
    for kind in ("pair", "unpair")
    for mate in (1, 2)
)

In [5]:
# Paired-end Trimmomatic run: adapter clipping followed by quality trimming.
# The four outputs are passed as R1 paired, R1 unpaired, R2 paired, R2 unpaired.
trim_adaptor = subprocess.run(
    [
        f"{env_bin}trimmomatic",
        "PE",
        *("-threads", f"{num_cpus}"),
        "-phred33",
        *(R1, R2),
        *(R1_paired_trimmed, R1_unpaired_trimmed),
        *(R2_paired_trimmed, R2_unpaired_trimmed),
        *(
            f"ILLUMINACLIP:{adaptor_seq_fa}:2:15:4:4:true",
            "LEADING:20",
            "TRAILING:20",
            "SLIDINGWINDOW:4:15",
            "MINLEN:25",
        ),
    ],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
)
# Stay quiet on success; show Trimmomatic's stderr only when it exits non-zero.
if trim_adaptor.returncode:
    print(trim_adaptor.stderr.decode())

# Bowtie2 Mapping

In [6]:
# MODIFY THIS CELL
# Bowtie2 index basename passed to `-x` (an mm10 index, going by the path).
genome_index = "/home/software/bowtie2-2.2.9/genome/mm10/mm10"
# SAM file that receives bowtie2's stdout (all alignments, before size filtering).
sam_all_out = f"{out_prefix}.sam"

In [7]:
# Align the trimmed read pairs with bowtie2. Alignments (stdout) go straight to
# `sam_all_out`; the alignment summary bowtie2 prints on stderr is collected in
# `mapping_stats`.
mapping_stats = ""
with open(sam_all_out, "w") as o:
    # Using Popen as a context manager closes the stderr pipe and waits for
    # bowtie2 on exit, so no file descriptor or child process is left behind.
    with subprocess.Popen(
        [
            env_bin + "bowtie2",
            "-x",
            genome_index,
            "-p",
            str(num_cpus),
            "-I",
            "10",
            "-X",
            "700",
            "--dovetail",
            "--phred33",
            "-1",
            R1_paired_trimmed,
            "-2",
            R2_paired_trimmed,
        ],
        stderr=subprocess.PIPE,
        stdout=o,
    ) as bwt2_mapping:
        # capture mapping stats, skipping warning lines. The match is
        # case-insensitive because bowtie2 prefixes its warnings with
        # "Warning:", which a literal b"WARN" test does not catch.
        mapping_stats = "".join(
            line.decode()
            for line in bwt2_mapping.stderr
            if not line.upper().startswith(b"WARN")
        )

print(mapping_stats)

# Same convention as the trimming cell: make a failed run visible instead of
# letting later cells silently consume a truncated SAM file.
if bwt2_mapping.returncode != 0:
    print(f"bowtie2 exited with status {bwt2_mapping.returncode}")

20186279 reads; of these:
  20186279 (100.00%) were paired; of these:
    504343 (2.50%) aligned concordantly 0 times
    11653788 (57.73%) aligned concordantly exactly 1 time
    8028148 (39.77%) aligned concordantly >1 times
    ----
    504343 pairs aligned concordantly 0 times; of these:
      14903 (2.95%) aligned discordantly 1 time
    ----
    489440 pairs aligned 0 times concordantly or discordantly; of these:
      978880 mates make up the pairs; of these:
        891582 (91.08%) aligned 0 times
        53307 (5.45%) aligned exactly 1 time
        33991 (3.47%) aligned >1 times
97.79% overall alignment rate



# Keep Fragments by Size

In [8]:
# MODIFY THIS CELL
# Inclusive bounds on the absolute value of SAM column 9 (TLEN) used by the awk
# filter in the next cell.
# NOTE(review): with low = 0, records whose column 9 is exactly 0 also pass the
# filter — confirm that is intended.
low, high = 0, 120
# Name-sorted BAM written by `samtools sort` after the size filter.
filter_sorted_bam = f"{out_prefix}.{low}_{high}.sorted.bam"

In [9]:
# get mapped frags within the size range
# The awk program keeps header lines (first character "@") and records whose
# column 9 (TLEN) lies in [low, high] or [-high, -low].
# NOTE(review): with low = 0 this also keeps records whose column 9 is 0 —
# confirm that is intended.
# awk is started directly (no shell), so neither the program text nor the SAM
# path needs shell quoting.
filter_by_size = subprocess.Popen(
    [
        "awk",
        f'(substr($0,1,1)=="@") || ($9>={low} && $9<={high}) || ($9<=-{low} && $9>=-{high})',
        sam_all_out,
    ],
    stdout=subprocess.PIPE,
)
# sort by query name and convert to bam
sort_bam = subprocess.Popen(
    [
        "samtools",
        "sort",
        "-n",
        "-O",
        "bam",
        "-o",
        filter_sorted_bam,
        "-@",
        str(num_cpus),
        "-",  # pipe from filter output stream
    ],
    stderr=subprocess.DEVNULL,
    stdin=filter_by_size.stdout,
)
# Close the parent's copy of the pipe so awk gets SIGPIPE (instead of blocking
# forever) if samtools exits early.
filter_by_size.stdout.close()
# Reap awk so it does not linger as a zombie process.
filter_by_size.wait()
# Last expression of the cell: the samtools exit status (0 on success).
sort_bam.wait()

0

# MACS2 Call Peaks

In [10]:
# MODIFY THIS CELL
# Passed to MACS2 as `--keep-dup` and to HOMER makeTagDirectory as `-tbp`.
keep_dup = 3
# Passed to MACS2 as `-g`.
macs2_genome = "mm"
# Passed to MACS2 as `-n`; also the stem of the log file and of the
# "_peaks.narrowPeak" file counted after peak calling.
macs2_out_prefix = f"{out_prefix}_{low}_{high}.macs2"

In [14]:
# Call peaks on the size-filtered BAM in paired-end (BAMPE) mode; MACS2's
# stderr is redirected into "<macs2_out_prefix>.log".
with open(macs2_out_prefix + ".log", "w") as log:
    macs2_callpeaks = subprocess.Popen(
        [env_bin + "macs2", "callpeak"]
        + ["-t", filter_sorted_bam]
        + ["-g", macs2_genome]
        + ["-f", "BAMPE"]
        + ["-n", macs2_out_prefix]
        + ["-q", "0.01"]
        + ["-B", "--SPMR"]
        + ["--keep-dup", f"{keep_dup}"],
        stderr=log,
    )
    # Block until MACS2 finishes, while the log file is still open.
    macs2_callpeaks.wait()

# peak number (line count of the narrowPeak file)
subprocess.run(["wc", "-l", macs2_out_prefix + "_peaks.narrowPeak"])

59506 /Extension_HDD1/Xen_PP/data/cXen_Gata4_CR_rep1/cXen_Gata4_CR_rep1_0_120.macs2_callpeak_peaks.narrowPeak


CompletedProcess(args=['wc', '-l', '/Extension_HDD1/Xen_PP/data/cXen_Gata4_CR_rep1/cXen_Gata4_CR_rep1_0_120.macs2_callpeak_peaks.narrowPeak'], returncode=0)

# Make Bigwig Using Homer

In [20]:
# MODIFY THIS CELL
# HOMER tag directory created by makeTagDirectory; makeBigWig.pl reads from it.
tagdir = f"{out_prefix}_{low}_{high}.tagdir"
# Genome argument passed to makeBigWig.pl.
bigwig_genome = "mm10"
# Base URL used to build the track's bigDataUrl; presumably the address under
# which `webdir` is served — verify against the web server configuration.
url = "http://unzip.4d-genome.com:8080/paula"
# Directory that receives the bigWig symlink and a copy of the track-control file.
webdir = "/usr/local/apache2/htdocs/paula/"

In [16]:
# Build the HOMER tag directory from the size-filtered BAM, passing keep_dup as
# the `-tbp` value. Arguments are given as a list (no shell), so paths that
# contain spaces or shell metacharacters are passed through intact.
subprocess.run(["makeTagDirectory", tagdir, filter_sorted_bam, "-tbp", str(keep_dup)])

	Will parse file: /Extension_HDD1/Xen_PP/data/cXen_Gata4_CR_rep1/cXen_Gata4_CR_rep1.0_120.sorted.bam

	Creating directory: /Extension_HDD1/Xen_PP/data/cXen_Gata4_CR_rep1/cXen_Gata4_CR_rep1_0_120.tagdir and removing existing *.tags.tsv

	Treating /Extension_HDD1/Xen_PP/data/cXen_Gata4_CR_rep1/cXen_Gata4_CR_rep1.0_120.sorted.bam as a bam file
	Reading alignment file /Extension_HDD1/Xen_PP/data/cXen_Gata4_CR_rep1/cXen_Gata4_CR_rep1.0_120.sorted.bam

	Optimizing tag files...
	Estimated genome size = 2725486875
	Estimated average read density = 0.004589 per bp
	Total Tags = 12505935.5
	Total Positions = 10831164
	Average tag length = 63.5
	Median tags per position = 0 (ideal: 1)
	Average tags per position = 0.723
	Restricting tags per bp...
	Fragment Length Estimate: 300
	Peak Width Estimate: 0
		!!! No reliable estimate for peak size
		Setting Peak width estimate to be equal to fragment length estimate
	Autocorrelation quality control metrics:
		Same strand fold enrichment: 4.5
		Diff stra

CompletedProcess(args='makeTagDirectory /Extension_HDD1/Xen_PP/data/cXen_Gata4_CR_rep1/cXen_Gata4_CR_rep1_0_120.tagdir /Extension_HDD1/Xen_PP/data/cXen_Gata4_CR_rep1/cXen_Gata4_CR_rep1.0_120.sorted.bam -tbp 3', returncode=0)

In [19]:
# Generate the bigWig for the tag directory. `-url` and `-webdir` are set to
# "NA" here; linking into the web directory is handled separately by
# create_bw_track_controller. Arguments are given as a list (no shell), so
# paths that contain spaces or shell metacharacters are passed through intact.
subprocess.run(
    ["makeBigWig.pl", tagdir, bigwig_genome, "-url", "NA", "-webdir", "NA", "-update"],
    stderr=subprocess.DEVNULL,
)

CompletedProcess(args='makeBigWig.pl /Extension_HDD1/Xen_PP/data/cXen_Gata4_CR_rep1/cXen_Gata4_CR_rep1_0_120.tagdir mm10 -url NA -webdir NA -update', returncode=0)

In [24]:
# setup the link for bigwig
import random


def create_bw_track_controller(bw_path, webdir, host, track_name=None, color=None):
    """Link a bigWig into ``webdir`` and write its UCSC custom-track line.

    Parameters
    ----------
    bw_path : str
        Path to the bigWig file.
    webdir : str
        Existing directory that receives a symlink to the bigWig (named after
        the bigWig file) and a copy of the track-control file.
    host : str
        Base URL; ``bigDataUrl`` is built as ``{host}/{bigWig file name}``.
    track_name : str, optional
        Display name of the track. Defaults to the bigWig file name.
    color : str, optional
        ``"R,G,B"`` string. A random colour is drawn when omitted.

    Side effects
    ------------
    Writes ``{bw_path}.track_control.txt`` and
    ``{webdir}/{bigWig file name}.track_control.txt``; the latter starts with a
    ``#ln -s ...`` comment line recording how the link can be recreated.
    """
    bw = bw_path.split("/")[-1]

    if track_name is None:
        track_name = bw

    if color is None:
        color = f"{random.randrange(0, 256)},{random.randrange(0, 256)},{random.randrange(0, 256)}"

    # Publish the file: same result as `ln -s <bw_path> <webdir>`, but re-running
    # the cell is a no-op when the link already exists, and real failures
    # (missing directory, no permission) raise instead of being ignored.
    link_path = os.path.join(os.path.abspath(webdir), bw)
    if not os.path.lexists(link_path):
        os.symlink(os.path.abspath(bw_path), link_path)

    # The URL must point at the linked FILE name, not at the display name:
    # with a custom `track_name` the two differ and the URL would be dead.
    track_ctl = (
        "track type=bigWig "
        + f"name={track_name} "
        + f"bigDataUrl={host}/{bw} "
        + f"color={color} "
        + 'visibility=full yLineOnOff=on autoScale=on yLineMark="0.0" alwaysZero=on graphType=bar maxHeightPixels=128:75:11 windowingFunction=maximum smoothingWindow=off'
    )

    # Copy next to the bigWig itself.
    with open(f"{bw_path}.track_control.txt", "w") as o:
        o.write(track_ctl)

    # Copy in the web directory, prefixed with the equivalent link command.
    with open(f"{webdir}/{bw}.track_control.txt", "w") as o:
        o.write(
            "#"
            + " ".join(["ln", "-s", os.path.abspath(bw_path), os.path.abspath(webdir)])
            + "\n"
        )
        o.write(track_ctl)


# The bigWig is looked up inside the tag directory under the directory's own
# name: <tagdir>/<tagdir name>.ucsc.bigWig
prefix = tagdir.rsplit("/", 1)[-1]
create_bw_track_controller("/".join((tagdir, prefix + ".ucsc.bigWig")), webdir, url)

In [9]:
# (Optional) Remove Duplicates
# final_bam = f"{out_prefix}.{low}_{high}.sorted.rmdup.bam"
# 
# subprocess.run(
# 	[
# 		"picard",
# 		"MarkDuplicates",
# 		"--INPUT",
# 		filter_sorted_bam,
# 		"--OUTPUT",
#   		final_bam,
# 		"--M",
# 		f"{final_bam}.picard_rmdup.metrics",
# 		"--ASSUME_SORT_ORDER",
# 		"queryname"
# 	],
# 	stderr = subprocess.DEVNULL
# )