# Preprocessing ChIP-Seq data

## Convert tagAlign to BAM

In [1]:
%%bash
# Convert every data/*.tagAlign file to BAM with `bedtools bedtobam`, using the
# chromosome sizes listed in utils/hg19.genome.
# Output: data/<base>.bam, where <base> is the input file name with a trailing
# "_norm_sorted.tagAlign" removed.
conda run -n chip_seq bash -c '
shopt -s nullglob  # with no matching inputs the loop body never runs
for f in data/*.tagAlign; do
    base=$(basename "$f" _norm_sorted.tagAlign)
    bedtools bedtobam \
        -i "$f" \
        -g utils/hg19.genome \
        > "data/${base}.bam"
done'

## Samtools sort and index

In [3]:
%%bash
# Coordinate-sort every unsorted BAM in data/ into data/<base>_sorted.bam.
# Files already named *_sorted.bam are skipped so that re-running this cell does
# not produce *_sorted_sorted.bam from the outputs of a previous run.
conda run -n chip_seq bash -c '
shopt -s nullglob
for f in data/*.bam; do
    [[ "$f" == *_sorted.bam ]] && continue
    base=$(basename "$f" .bam)
    samtools sort "$f" -o "data/${base}_sorted.bam"
done'

# Index each sorted BAM; the index is written next to it as data/<base>_sorted.bai.
conda run -n chip_seq bash -c '
shopt -s nullglob
for f in data/*_sorted.bam; do
    base=$(basename "$f" .bam)
    samtools index "$f" "data/${base}.bai"
done'

[bam_sort_core] merging from 2 files and 1 in-memory blocks...
[bam_sort_core] merging from 3 files and 1 in-memory blocks...
[bam_sort_core] merging from 4 files and 1 in-memory blocks...
[bam_sort_core] merging from 4 files and 1 in-memory blocks...
[bam_sort_core] merging from 3 files and 1 in-memory blocks...
[bam_sort_core] merging from 2 files and 1 in-memory blocks...
[bam_sort_core] merging from 18 files and 1 in-memory blocks...
[bam_sort_core] merging from 3 files and 1 in-memory blocks...
[bam_sort_core] merging from 3 files and 1 in-memory blocks...
[bam_sort_core] merging from 4 files and 1 in-memory blocks...
[bam_sort_core] merging from 3 files and 1 in-memory blocks...
[bam_sort_core] merging from 3 files and 1 in-memory blocks...
[bam_sort_core] merging from 2 files and 1 in-memory blocks...
[bam_sort_core] merging from 2 files and 1 in-memory blocks...
[bam_sort_core] merging from 2 files and 1 in-memory blocks...
[bam_sort_core] merging from 4 files and 1 in-memory b

## Call peaks with MACS2

In [4]:
%%bash
# Call peaks with MACS2 for every sample listed below.
# Each entry is "<GSM accession> <sample id> <histone mark>". For each entry:
#   treatment: data/<GSM>_<sample>-<mark>_sorted.bam
#   control:   data/<sample>-Input_sorted.bam
#   outputs:   written to data/ with the name prefix <sample>-<mark>
# Options: BAM input, human effective genome size (-g hs), bedGraph output (-B),
# q-value cutoff 0.01.
# NOTE(review): H3K27me3 and H3K36me3 are usually treated as broad marks; consider
# whether --broad is appropriate for them. Not changed here because the later
# cells read *_peaks.narrowPeak.
conda run -n chip_seq bash -c '
samples=(
    "GSM2576895 CS13-12383 H3K4me1"
    "GSM2576896 CS13-12690 H3K4me1"
    "GSM2576897 CS13-12829 H3K4me1"
    "GSM2576898 CS13-12830 H3K4me1"
    "GSM2576899 CS13-12877 H3K4me1"
    "GSM2576900 CS14-12408 H3K4me1"
    "GSM2576901 CS14-12709 H3K4me1"
    "GSM2576902 CS14-12913 H3K4me1"
    "GSM2576903 CS15-13000 H3K4me1"
    "GSM2576904 CS15-13019 H3K4me1"
    "GSM2576905 CS15-13128 H3K4me1"
    "GSM2576906 CS17-12331 H3K4me1"
    "GSM2576907 CS17-12341 H3K4me1"
    "GSM2576908 CS17-12611 H3K4me1"
    "GSM2576909 CS13-12383 H3K4me2"
    "GSM2576910 CS13-12690 H3K4me2"
    "GSM2576911 CS13-12829 H3K4me2"
    "GSM2576912 CS13-12830 H3K4me2"
    "GSM2576913 CS13-12877 H3K4me2"
    "GSM2576914 CS14-12408 H3K4me2"
    "GSM2576915 CS14-12709 H3K4me2"
    "GSM2576916 CS14-12913 H3K4me2"
    "GSM2576917 CS15-13000 H3K4me2"
    "GSM2576918 CS15-13019 H3K4me2"
    "GSM2576919 CS15-13128 H3K4me2"
    "GSM2576920 CS17-12191 H3K4me2"
    "GSM2576921 CS17-12331 H3K4me2"
    "GSM2576922 CS17-12341 H3K4me2"
    "GSM2576923 CS17-12611 H3K4me2"
    "GSM2576924 CS20-12104 H3K4me2"
    "GSM2576925 CS13-12383 H3K4me3"
    "GSM2576926 CS13-12690 H3K4me3"
    "GSM2576927 CS13-12829 H3K4me3"
    "GSM2576928 CS13-12830 H3K4me3"
    "GSM2576929 CS13-12877 H3K4me3"
    "GSM2576930 CS14-12408 H3K4me3"
    "GSM2576931 CS14-12709 H3K4me3"
    "GSM2576932 CS14-12913 H3K4me3"
    "GSM2576933 CS15-13000 H3K4me3"
    "GSM2576934 CS15-13019 H3K4me3"
    "GSM2576935 CS15-13128 H3K4me3"
    "GSM2576936 CS17-12331 H3K4me3"
    "GSM2576937 CS17-12341 H3K4me3"
    "GSM2576938 CS17-12611 H3K4me3"
    "GSM2576939 CS13-12383 H3K27ac"
    "GSM2576940 CS13-12690 H3K27ac"
    "GSM2576941 CS13-12829 H3K27ac"
    "GSM2576942 CS13-12830 H3K27ac"
    "GSM2576943 CS13-12877 H3K27ac"
    "GSM2576944 CS14-12408 H3K27ac"
    "GSM2576945 CS14-12709 H3K27ac"
    "GSM2576946 CS14-12913 H3K27ac"
    "GSM2576947 CS15-13000 H3K27ac"
    "GSM2576948 CS15-13019 H3K27ac"
    "GSM2576949 CS15-13128 H3K27ac"
    "GSM2576950 CS17-12191 H3K27ac"
    "GSM2576951 CS17-12331 H3K27ac"
    "GSM2576952 CS17-12341 H3K27ac"
    "GSM2576953 CS17-12611 H3K27ac"
    "GSM2576954 CS20-12104 H3K27ac"
    "GSM2576956 CS13-12383 H3K27me3"
    "GSM2576957 CS13-12690 H3K27me3"
    "GSM2576958 CS13-12829 H3K27me3"
    "GSM2576959 CS13-12830 H3K27me3"
    "GSM2576960 CS13-12877 H3K27me3"
    "GSM2576961 CS14-12408 H3K27me3"
    "GSM2576962 CS14-12709 H3K27me3"
    "GSM2576963 CS14-12913 H3K27me3"
    "GSM2576964 CS15-13000 H3K27me3"
    "GSM2576965 CS15-13019 H3K27me3"
    "GSM2576966 CS15-13128 H3K27me3"
    "GSM2576967 CS17-12331 H3K27me3"
    "GSM2576968 CS17-12341 H3K27me3"
    "GSM2576969 CS17-12611 H3K27me3"
    "GSM2576970 CS13-12383 H3K36me3"
    "GSM2576971 CS13-12690 H3K36me3"
    "GSM2576972 CS13-12829 H3K36me3"
    "GSM2576973 CS13-12830 H3K36me3"
    "GSM2576974 CS13-12877 H3K36me3"
    "GSM2576975 CS14-12408 H3K36me3"
    "GSM2576976 CS14-12709 H3K36me3"
    "GSM2576977 CS14-12913 H3K36me3"
    "GSM2576978 CS15-13000 H3K36me3"
    "GSM2576979 CS15-13019 H3K36me3"
    "GSM2576980 CS15-13128 H3K36me3"
    "GSM2576981 CS17-12331 H3K36me3"
    "GSM2576982 CS17-12341 H3K36me3"
    "GSM2576983 CS17-12611 H3K36me3"
)

for entry in "${samples[@]}"; do
    read -r gsm sample mark <<< "$entry"
    macs2 callpeak \
        -t "data/${gsm}_${sample}-${mark}_sorted.bam" \
        -c "data/${sample}-Input_sorted.bam" \
        -f BAM -g hs -B -n "${sample}-${mark}" \
        -q 0.01 \
        --outdir data/
done'

INFO  @ Tue, 17 Feb 2026 13:04:53: 
# Command line: callpeak -t data/GSM2576895_CS13-12383-H3K4me1_sorted.bam -c data/CS13-12383-Input_sorted.bam -f BAM -g hs -B -n CS13-12383-H3K4me1 -q 0.01 --outdir data/
# ARGUMENTS LIST:
# name = CS13-12383-H3K4me1
# format = BAM
# ChIP-seq file = ['data/GSM2576895_CS13-12383-H3K4me1_sorted.bam']
# control file = ['data/CS13-12383-Input_sorted.bam']
# effective genome size = 2.70e+09
# band width = 300
# model fold = [5, 50]
# qvalue cutoff = 1.00e-02
# The maximum gap between significant sites is assigned as the read length/tag size.
# The minimum length of peaks is assigned as the predicted fragment length "d".
# Larger dataset will be scaled towards smaller dataset.
# Range for calculating regional lambda is: 1000 bps and 10000 bps
# Broad region calling is off
# Paired-End mode is off
 
INFO  @ Tue, 17 Feb 2026 13:04:53: #1 read tag files... 
INFO  @ Tue, 17 Feb 2026 13:04:53: #1 read treatment tags... 
INFO  @ Tue, 17 Feb 2026 13:04:54:  10000

## Edit first column for bedtools intersect

In [5]:
%%bash
# Remove the leading "chr" from column 1 of every MACS2 narrowPeak file for the
# later bedtools intersect step (presumably so chromosome names match the GWAS
# summary BED; verify against that file).
#
# A single awk pass does the edit and keeps the file tab-delimited:
#   - FS = OFS = tab, so rewriting $1 does not turn the separators into spaces
#     and no separate space-to-tab sed step (GNU-only "\t") is needed;
#   - sub(/^chr/, ...) is anchored, so only the prefix is removed, not any
#     "chr" that appears later in the name.
# Output: data/<name>_peaks_col1fixed_tabdelim.bed (no intermediate file is left behind).
conda run -n chip_seq bash -c '
shopt -s nullglob
for f in data/*_peaks.narrowPeak; do
    base=$(basename "$f" .narrowPeak)
    awk "BEGIN { FS = OFS = \"\\t\" } { sub(/^chr/, \"\", \$1); print }" "$f" \
        > "data/${base}_col1fixed_tabdelim.bed"
done'

## Intersect GWAS summary with all ChIP-Seq peaks

In [6]:
%%bash
# Overlap the GWAS summary with each ChIP-Seq peak set.
# For every data/<name>_peaks_col1fixed_tabdelim.bed, writes
# outputs/gwas_summary_<name>_intersect.bed.
# -wao: report each -a record together with the overlapping -b record and the
# overlap size; -a records without any overlap are still reported.
conda run -n chip_seq bash -c '
shopt -s nullglob
mkdir -p outputs  # the redirect below fails if the directory is missing
for f in data/*_peaks_col1fixed_tabdelim.bed; do
    base=$(basename "$f" _peaks_col1fixed_tabdelim.bed)
    bedtools intersect -wao \
        -a ../snp_array/outputs/GWAS_Summary.bed \
        -b "$f" \
        > "outputs/gwas_summary_${base}_intersect.bed"
done'