In [1]:
%load_ext lab_black
import subprocess, os

# Input and Parameters

Tip for calling histone-mark peaks: adjust the fragment size range (`low`, `high`) and set `broad=True`.

In [2]:
# MODIFY THIS CELL
# Every path, resource limit and tool parameter read by the cells below is set here.

# System control.
# env_bin: directory holding the trimmomatic / bowtie2 / macs2 executables. The tool
#          name is appended by plain string concatenation, so keep the trailing slash.
# num_cpus: thread/processor count handed to trimmomatic, bowtie2, samtools and bamCoverage.
env_bin = "/home/coco/miniconda3/envs/cut_run/bin/"
num_cpus = 30

# Output prefix: every generated file is named `{out_prefix}<suffix>`.
# Use a descriptive sample name (cell type, experiment, target, replicate, ...).
out_prefix = "/Extension_HDD1/Xen_PP/data/cXen_Sox17_CR_rep1/cXen_Sox17_CR_rep1"

# Paired-end FASTQ inputs (R1 = first mates, R2 = second mates).
# If R1 ends with "gz", the trimmed output names get a ".gz" suffix as well.
R1 = "/Extension_HDD1/Xen_PP/data/cXen_Sox17_CR_rep1/nR161-L3-G7-P053-GAATTCGT-GCCTCTAT-READ1-Sequences.txt.gz"
R2 = "/Extension_HDD1/Xen_PP/data/cXen_Sox17_CR_rep1/nR161-L3-G7-P053-GAATTCGT-GCCTCTAT-READ2-Sequences.txt.gz"

# Adapter FASTA passed to Trimmomatic's ILLUMINACLIP step.
adaptor_seq_fa = "/Extension_HDD1/Xen_PP/TruSeqAdapters.fa"

# Index prefix passed to `bowtie2 -x`.
genome_index = "/home/software/bowtie2-2.2.9/genome/mm10/mm10"

# Fragment size window: inclusive bounds applied to the absolute value of SAM
# field 9 by the size filter; both values also appear in the output file names.
low, high = 0, 120

# bigWig publishing: `webdir` receives a symlink to the bigWig plus a track-control
# file, and `url` is used as the base of the track's bigDataUrl
# (presumably the address under which `webdir` is served -- confirm with the server setup).
url = "http://unzip.4d-genome.com:8080/paula/Test_Clean_Regularly/"
webdir = "/usr/local/apache2/htdocs/paula/Test_Clean_Regularly/"

# MACS2 callpeak parameters.
# keep_dup -> `--keep-dup`, macs2_genome -> `-g`, broad=True appends `--broad`.
keep_dup = 3
macs2_genome = "mm"
broad = False

# Trim Adaptor PE Mode

In [4]:
# Output names for the paired / unpaired reads of each mate. Inputs whose name
# ends in "gz" get ".gz" appended so the outputs carry the same kind of suffix.
gz_suffix = ".gz" if R1.endswith("gz") else ""
R1_paired_trimmed = f"{out_prefix}_1.pair.fastq{gz_suffix}"
R2_paired_trimmed = f"{out_prefix}_2.pair.fastq{gz_suffix}"
R1_unpaired_trimmed = f"{out_prefix}_1.unpair.fastq{gz_suffix}"
R2_unpaired_trimmed = f"{out_prefix}_2.unpair.fastq{gz_suffix}"

# Assemble the Trimmomatic paired-end command piece by piece.
trimmomatic_cmd = [env_bin + "trimmomatic", "PE", "-threads", str(num_cpus), "-phred33"]
# inputs
trimmomatic_cmd += [R1, R2]
# outputs: paired then unpaired for R1, followed by the same for R2
trimmomatic_cmd += [
    R1_paired_trimmed,
    R1_unpaired_trimmed,
    R2_paired_trimmed,
    R2_unpaired_trimmed,
]
# trimming steps: adapter clipping, end/window quality trimming, length filter
trimmomatic_cmd += [
    f"ILLUMINACLIP:{adaptor_seq_fa}:2:15:4:4:true",
    "LEADING:20",
    "TRAILING:20",
    "SLIDINGWINDOW:4:15",
    "MINLEN:25",
]

print(f"Trimming {R1} and {R2}")
trim_adaptor = subprocess.run(
    trimmomatic_cmd,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
)
# Stay quiet on success; surface Trimmomatic's stderr only when it exits non-zero.
if trim_adaptor.returncode:
    print(trim_adaptor.stderr.decode())

Trimming /Extension_HDD1/Xen_PP/data/cXen_Sox17_CR_rep1/nR161-L3-G7-P053-GAATTCGT-GCCTCTAT-READ1-Sequences.txt.gz and /Extension_HDD1/Xen_PP/data/cXen_Sox17_CR_rep1/nR161-L3-G7-P053-GAATTCGT-GCCTCTAT-READ2-Sequences.txt.gz


# Bowtie2 Mapping

In [5]:
# Align the trimmed pairs with bowtie2: the SAM goes to `sam_all_out`, and the
# alignment summary that bowtie2 prints on stderr is collected in `mapping_stats`.
sam_all_out = f"{out_prefix}.sam"

bowtie2_cmd = [
    env_bin + "bowtie2",
    "-x",
    genome_index,
    "-p",
    str(num_cpus),
    # minimum / maximum fragment length for a valid paired-end alignment
    "-I",
    "10",
    "-X",
    "700",
    # mates that extend past each other still count as concordant
    "--dovetail",
    "--phred33",
    "-1",
    R1_paired_trimmed,
    "-2",
    R2_paired_trimmed,
]

stats_lines = []
with open(sam_all_out, "w") as o:
    bwt2_mapping = subprocess.Popen(
        bowtie2_cmd,
        stderr=subprocess.PIPE,
        stdout=o,
    )
    # Used as a context manager, Popen closes the stderr pipe and waits for the
    # process on exit, so no file descriptor or zombie is left behind.
    with bwt2_mapping:
        # capture mapping stats
        for line in bwt2_mapping.stderr:
            # bowtie2 prefixes its warnings with "Warning:", which the former
            # case-sensitive b"WARN" test never matched; compare case-insensitively
            # so both spellings are dropped.
            if line.upper().startswith(b"WARN"):
                continue
            # read names can end up on stderr; never let odd bytes abort the run
            stats_lines.append(line.decode(errors="replace"))

# join once instead of growing a string inside the loop
mapping_stats = "".join(stats_lines)

# The exit status used to be ignored; report a failed alignment explicitly.
if bwt2_mapping.returncode != 0:
    print(f"bowtie2 fail (exit code {bwt2_mapping.returncode})")

In [6]:
# Show where the SAM was written, followed by bowtie2's alignment summary.
print(f"Mapped {sam_all_out}", mapping_stats, sep="\n")

Mapped /Extension_HDD1/Xen_PP/data/cXen_Sox17_CR_rep1/cXen_Sox17_CR_rep1.sam
19476563 reads; of these:
  19476563 (100.00%) were paired; of these:
    936325 (4.81%) aligned concordantly 0 times
    10911764 (56.03%) aligned concordantly exactly 1 time
    7628474 (39.17%) aligned concordantly >1 times
    ----
    936325 pairs aligned concordantly 0 times; of these:
      55442 (5.92%) aligned discordantly 1 time
    ----
    880883 pairs aligned 0 times concordantly or discordantly; of these:
      1761766 mates make up the pairs; of these:
        1548432 (87.89%) aligned 0 times
        123410 (7.00%) aligned exactly 1 time
        89924 (5.10%) aligned >1 times
96.02% overall alignment rate



# Keep Fragments by Size

In [7]:
# Stream the SAM through an awk size filter straight into `samtools sort`,
# producing a sorted BAM that holds only fragments inside [low, high].
filter_sorted_bam = f"{out_prefix}.{low}_{high}.sorted.bam"
print(f"Making {filter_sorted_bam}")

# Keep header lines ("@...") and records whose SAM field 9 has an absolute
# value within [low, high] (the field is signed, hence the mirrored test).
size_filter_awk = (
    '(substr($0,1,1)=="@")'
    f" || ($9>={low} && $9<={high})"
    f" || ($9<=-{low} && $9>=-{high})"
)
# Argument-list form: no shell is involved, so a path containing spaces or
# shell metacharacters cannot break (or inject into) the command.
filter_by_size = subprocess.Popen(
    ["awk", size_filter_awk, sam_all_out],
    stdout=subprocess.PIPE,
)
# Sort and convert to BAM. Without `-n`, samtools sort orders by coordinate,
# which is what the later `samtools index` step needs.
sort_bam = subprocess.Popen(
    [
        "samtools",
        "sort",
        "-O",
        "bam",
        "-o",
        filter_sorted_bam,
        "-@",
        str(num_cpus),
        "-",  # pipe from filter output stream
    ],
    stdin=filter_by_size.stdout,
)
# Drop the parent's copy of the pipe so awk receives SIGPIPE if samtools
# exits early instead of blocking forever on a full pipe.
filter_by_size.stdout.close()
# Reap awk and report a failed filter instead of silently ignoring it.
if filter_by_size.wait() != 0:
    print("fragment size filter fail")
sort_bam.wait()

Making /Extension_HDD1/Xen_PP/data/cXen_Sox17_CR_rep1/cXen_Sox17_CR_rep1.0_120.sorted.bam


[bam_sort_core] merging from 0 files and 30 in-memory blocks...


0

In [8]:
# Anything other than a zero exit status from samtools sort counts as a failure.
print("bam successfully sorted!" if sort_bam.returncode == 0 else "sort bam fail")

bam successfully sorted!


# Deeptools Create BigWig

In [9]:
# Build a coverage bigWig (10 bp bins) from the size-filtered, sorted BAM.
bw = f"{out_prefix}.{low}_{high}.bigWig"
print(f"Generating {bw}")

# index bam
# Argument lists instead of shell strings: paths are passed verbatim, and the
# thread option precedes the file name rather than relying on the tool to
# accept options after positional arguments.
index_bam = subprocess.run(
    ["samtools", "index", "-@", str(num_cpus), filter_sorted_bam]
)
if index_bam.returncode != 0:
    print("samtools index fail")

# make bigWig
make_bw = subprocess.run(
    [
        "bamCoverage",
        "-b",
        filter_sorted_bam,
        "-o",
        bw,
        "-p",
        str(num_cpus),
        "-bs",
        "10",
    ]
)
if make_bw.returncode != 0:
    print("bamCoverage fail")

# setup the link for bigWig
import random


def create_bw_track_controller(bw_path, webdir, host, track_name=None, color=None):
    """Link a bigWig into a web directory and write its genome-browser track line.

    A symlink named after the bigWig file is created inside ``webdir``, and the
    same track line is written to two files: ``{bw_path}.track_control.txt`` and
    ``{webdir}/{file name}.track_control.txt``. The copy in ``webdir`` starts
    with a ``#ln -s ...`` comment recording how the link was made.

    Parameters
    ----------
    bw_path : str
        Path to the bigWig file.
    webdir : str
        Existing directory that receives the symlink and a control file.
    host : str
        Base URL that maps to ``webdir``; a trailing "/" is tolerated.
    track_name : str, optional
        Track label. Defaults to the bigWig file name.
    color : str, optional
        "R,G,B" string. Defaults to a random colour.

    Returns
    -------
    None
    """
    bw_abs = os.path.abspath(bw_path)
    webdir_abs = os.path.abspath(webdir)
    bw_name = os.path.basename(bw_abs)

    if track_name is None:
        track_name = bw_name

    if color is None:
        color = ",".join(str(random.randrange(0, 256)) for _ in range(3))

    # Expose the file under the web directory. os.symlink replaces the former
    # `ln -s` subprocess so that re-running the cell is quiet and idempotent.
    try:
        os.symlink(bw_abs, os.path.join(webdir_abs, bw_name))
    except FileExistsError:
        pass  # entry left over from an earlier run; keep it, as `ln -s` would
    except OSError as err:
        print(f"could not link {bw_abs} into {webdir_abs}: {err}")

    # The URL must point at the linked *file name* (not the display name, which
    # may differ), and the host may or may not end with a slash.
    base_url = host.rstrip("/")
    track_ctl = (
        "track type=bigWig "
        + f"name={track_name} "
        + f"bigDataUrl={base_url}/{bw_name} "
        + f"color={color} "
        + 'visibility=full yLineOnOff=on autoScale=on yLineMark="0.0" alwaysZero=on graphType=bar maxHeightPixels=128:75:11 windowingFunction=maximum smoothingWindow=off'
    )

    # copy kept next to the data
    with open(f"{bw_path}.track_control.txt", "w") as o:
        o.write(track_ctl)

    # copy kept in the web directory, prefixed with the equivalent link command
    with open(f"{webdir}/{bw_name}.track_control.txt", "w") as o:
        o.write("#" + " ".join(["ln", "-s", bw_abs, webdir_abs]) + "\n")
        o.write(track_ctl)


# Publish the bigWig made above, using the server settings from the config cell.
create_bw_track_controller(bw_path=bw, webdir=webdir, host=url)

Generating /Extension_HDD1/Xen_PP/data/cXen_Sox17_CR_rep1/cXen_Sox17_CR_rep1.0_120.bigWig


bamFilesList: ['/Extension_HDD1/Xen_PP/data/cXen_Sox17_CR_rep1/cXen_Sox17_CR_rep1.0_120.sorted.bam']
binLength: 10
numberOfSamples: None
blackListFileName: None
defaultFragmentLength: read length
numberOfProcessors: 30
verbose: False
region: None
bedFile: None
minMappingQuality: None
ignoreDuplicates: False
chrsToSkip: []
stepSize: 10
center_read: False
samFlag_include: None
samFlag_exclude: None
minFragmentLength: 0
maxFragmentLength: 0
zerosToNans: False
smoothLength: None
save_data: False
out_file_for_raw_data: None
maxPairedFragmentLength: 1000


# MACS2 Call Peaks

In [10]:
# Call peaks on the size-filtered BAM with MACS2 in paired-end (BAMPE) mode.
# MACS2's stderr goes to `{macs2_out_prefix}.log`.
macs2_out_prefix = f"{out_prefix}_{low}_{high}.macs2"

macs2_cmd = [
    f"{env_bin}macs2",
    "callpeak",
    "-t",
    filter_sorted_bam,
    "-g",
    macs2_genome,
    "-f",
    "BAMPE",
    "-n",
    macs2_out_prefix,
    "-q",
    str(0.01),
    # also write bedGraph tracks, scaled to signal per million reads
    "-B",
    "--SPMR",
    "--keep-dup",
    str(keep_dup),
]
if broad:
    macs2_cmd.append("--broad")

with open(f"{macs2_out_prefix}.log", "w") as log:
    macs2_callpeaks = subprocess.Popen(
        macs2_cmd,
        stderr=log,
    )
    macs2_callpeaks.wait()

# The exit status used to be ignored; point the reader at the log on failure.
if macs2_callpeaks.returncode != 0:
    print(f"macs2 callpeak fail, see {macs2_out_prefix}.log")

# peak number
# With --broad MACS2 writes `_peaks.broadPeak` instead of `_peaks.narrowPeak`,
# so count whichever file this run actually produced.
peak_file = f"{macs2_out_prefix}_peaks.{'broadPeak' if broad else 'narrowPeak'}"
subprocess.run(["wc", "-l", peak_file])

26454 /Extension_HDD1/Xen_PP/data/cXen_Sox17_CR_rep1/cXen_Sox17_CR_rep1_0_120.macs2_peaks.narrowPeak


CompletedProcess(args=['wc', '-l', '/Extension_HDD1/Xen_PP/data/cXen_Sox17_CR_rep1/cXen_Sox17_CR_rep1_0_120.macs2_peaks.narrowPeak'], returncode=0)

In [16]:
# Make Bigwig Using Homer

# MODIFY THIS CELL
# tagdir = f"{out_prefix}_{low}_{high}.tagdir"
# bigwig_genome = "mm10"
# url = "http://unzip.4d-genome.com:8080/paula"
# webdir = "/usr/local/apache2/htdocs/paula/"
#
# subprocess.run(
#     f"makeTagDirectory {tagdir} {filter_sorted_bam} -tbp {keep_dup}", shell=True
# )
#
# subprocess.run(
#     f"makeBigWig.pl {tagdir} {bigwig_genome} -url NA -webdir NA -update",
#     shell=True,
#     stderr=subprocess.DEVNULL,
# )
#
# # setup the link for bigwig
# import random
#
#
# def create_bw_track_controller(bw_path, webdir, host, track_name=None, color=None):
#     if track_name is None:
#         track_name = bw_path.split("/")[-1]
#
#     if color is None:
#         color = f"{random.randrange(0, 256)},{random.randrange(0, 256)},{random.randrange(0, 256)}"
#
#     # host url for the webdir
#     subprocess.run(["ln", "-s", os.path.abspath(bw_path), os.path.abspath(webdir)])
#     track_ctl = (
#         "track type=bigWig "
#         + f"name={track_name} "
#         + f"bigDataUrl={host}/{track_name} "
#         + f"color={color} "
#         + 'visibility=full yLineOnOff=on autoScale=on yLineMark="0.0" alwaysZero=on graphType=bar maxHeightPixels=128:75:11 windowingFunction=maximum smoothingWindow=off'
#     )
#
#     with open(f"{bw_path}.track_control.txt", "w") as o:
#         o.write(track_ctl)
#
#     bw = bw_path.split("/")[-1]
#     with open(f"{webdir}/{bw}.track_control.txt", "w") as o:
#         o.write(
#             "#"
#             + " ".join(["ln", "-s", os.path.abspath(bw_path), os.path.abspath(webdir)])
#             + "\n"
#         )
#         o.write(track_ctl)
#
#
# prefix = tagdir.split("/")[-1]
# create_bw_track_controller(f"{tagdir}/{prefix}.ucsc.bigWig", webdir, url)

# (Optional) Remove Duplicates
# final_bam = f"{out_prefix}.{low}_{high}.sorted.rmdup.bam"
#
# subprocess.run(
# 	[
# 		"picard",
# 		"MarkDuplicates",
# 		"--INPUT",
# 		filter_sorted_bam,
# 		"--OUTPUT",
#   		final_bam,
# 		"--M",
# 		f"{final_bam}.picard_rmdup.metrics",
# 		"--ASSUME_SORT_ORDER",
# 		"queryname"
# 	],
# 	stderr = subprocess.DEVNULL
# )