# 配置环境
使用HiC-Pro中的yml文件创建conda环境
```
mamba env create -f environment.yml
```

接下来按照 GitHub README 的说明依次执行 `make configure` 和 `make install`

激活环境
```
conda activate HiC-Pro_v3.1.0
```

# 酶切图谱
```
# Name the output after GENOME_FRAGMENT in config-hicpro.txt
# (HindIII_resfrag_hg38.bed) so HiC-Pro finds it once the file is placed in
# the annotation folder of the installation.
python /lustre1/chengqiyi_pkuhpc/zhaohn/0.apps/HiC-Pro_installed/HiC-Pro_3.1.0/bin/utils/digest_genome.py \
    -r hindiii \
    -o HindIII_resfrag_hg38.bed \
    /lustre1/chengqiyi_pkuhpc/zhaohn/1.database/db_genomes/genome_fa/genome_ucsc_hg38/genome_ucsc_hg38.fa
```

# 染色体长度
fasta的fai文件中的前两列信息就是长度
```
samtools faidx /lustre1/chengqiyi_pkuhpc/zhaohn/1.database/db_genomes/genome_fa/genome_ucsc_hg38/genome_ucsc_hg38.fa

cut -f1-2 /lustre1/chengqiyi_pkuhpc/zhaohn/1.database/db_genomes/genome_fa/genome_ucsc_hg38/genome_ucsc_hg38.fa.fai > chrom_hg38.sizes
```

# 下载测试文件
```
wget  https://zerkalo.curie.fr/partage/HiC-Pro/HiCPro_testdata.tar.gz && tar -zxvf HiCPro_testdata.tar.gz
```

# 集群提交
## 生成提交任务脚本
```
HiC-Pro -i test_data -o out_dir -c config-hicpro.txt -p
```
## 提交在北极星
```
sbatch HiCPro_step1_ZHN-HiC.sh
```

# 软件目录 scripts 下的 make_slurm_script.sh 需要修改

【直接复制】

```shell
#!/bin/bash

## HiC-Pro
## Copyright (c) 2015 Institut Curie                               
## Author(s): Nicolas Servant
## Contact: nicolas.servant@curie.fr
## This software is distributed without any guarantee under the terms of the BSD-3 licence.
## See the LICENCE file for details

##
## Create SLURM Torque files
##

# Directory holding this script; hic.inc.sh is sourced from it below.
# "$0" is quoted so an install path containing spaces is not word-split.
dir=$(dirname -- "$0")

# Print a one-line synopsis of the accepted options on stdout.
usage() {
    printf 'usage: %s -c CONFIG [-s STEP]\n' "$0"
}

# Steps requested with -s (a comma-separated list); empty means "run everything".
MAKE_OPTS=""

# Walk the command line: -c CONFIG and -s STEP take a value, "--" or the
# first non-option word stops the parsing.
while (( $# > 0 )); do
    case "$1" in
        -c)
            conf_file=$2
            shift
            ;;
        -s)
            MAKE_OPTS=$2
            shift
            ;;
        --)
            shift
            break
            ;;
        -*)
            echo "$0: error - unrecognized option $1" >&2
            exit 1
            ;;
        *)
            suffix=$1
            break
            ;;
    esac
    shift
done

# A configuration file (-c) is mandatory.
if [ -z "$conf_file" ]; then usage; exit 1; fi

# Source hic.inc.sh with CONF set to the user's configuration file.
# The path is quoted so an install directory containing spaces still works.
# NOTE(review): hic.inc.sh presumably defines get_hic_files and the
# configuration variables used below - confirm against that file.
CONF=$conf_file . "$dir/hic.inc.sh"
unset FASTQFILE

## Define input files
## Writes the first-mate input files (paths made relative to RAW_DIR) to
## inputfiles_<JOB_NAME>.txt and stores the number of listed files in $count.
## Expansions are quoted and the grep pattern is protected with "--" so that
## unusual directory names or mate keywords are passed through unchanged.
if [[ $MAKE_OPTS == "" || $MAKE_OPTS == *"mapping"* ]]
then
    ## Mapping starts from raw reads: look for .fq and .fastq inputs
    inputfile=inputfiles_${JOB_NAME}.txt
    ifq=$(get_hic_files "$RAW_DIR" .fq)
    ifastq=$(get_hic_files "$RAW_DIR" .fastq)
    echo -e "$ifq\n$ifastq" | grep -- "$PAIR1_EXT" | sed -e "s|$RAW_DIR||" -e "s|^/||" > "$inputfile"
    count=$(wc -l < "$inputfile")
elif [[ $MAKE_OPTS == *"proc_hic"* ]]
then
    ## proc_hic without mapping starts from existing .bam files
    inputfile=inputfiles_${JOB_NAME}.txt
    get_hic_files "$RAW_DIR" .bam | grep -- "$PAIR1_EXT" | sed -e "s|$RAW_DIR||" -e "s|^/||" > "$inputfile"
    count=$(wc -l < "$inputfile")
fi

## Parallel implementation (step 1)
## Generates HiCPro_step1_<JOB_NAME>.sh, a SLURM batch script that runs the
## per-input-file make targets; a job array is requested when $count > 1.
if [[ $MAKE_OPTS == "" || $MAKE_OPTS == *"mapping"* || $MAKE_OPTS == *"proc_hic"* ]]
then
    make_target="all_sub"
    ## Remove per sample steps from the requested step list
    if [[ $MAKE_OPTS != "" ]]; then
        make_target=$(echo $MAKE_OPTS | sed -e 's/,/ /g')
        for persample_step in merge_persample build_contact_maps ice_norm quality_checks; do
            make_target=$(echo $make_target | sed -e "s/${persample_step}//g")
        done
    fi

    ## step 1 - parallel
    torque_script=HiCPro_step1_${JOB_NAME}.sh

    cat > "${torque_script}" <<EOF
#!/bin/bash
#SBATCH -N 1
#SBATCH -c ${N_CPU}
#SBATCH -p ${JOB_QUEUE}

#SBATCH --job-name=s1_${JOB_NAME}_HiCpro
#SBATCH --export=ALL
#SBATCH --no-requeue
EOF

    ## Account and QOS are optional settings: only emit the directive when a
    ## value is configured, so the script never contains an option without
    ## its argument.
    if [[ -n ${JOB_ACCOUNT} ]]; then
        echo "#SBATCH -A ${JOB_ACCOUNT}" >> "${torque_script}"
    fi
    if [[ -n ${JOB_QOS} ]]; then
        echo "#SBATCH --qos=${JOB_QOS}" >> "${torque_script}"
    fi

    ## More than one input: request one array task per listed input
    if [[ $count -gt 1 ]]; then
        echo "#SBATCH --array=1-$count" >> "${torque_script}"
    fi
    cat >> "${torque_script}" <<EOF
FASTQFILE=\$SLURM_SUBMIT_DIR/$inputfile; export FASTQFILE
make --file ${SCRIPTS}/Makefile CONFIG_FILE=${conf_file} CONFIG_SYS=${INSTALL_PATH}/config-system.txt $make_target 2>&1
EOF

    chmod +x "${torque_script}"

    ## User message
    echo "The following command will launch the parallel workflow through $count torque jobs:"
    echo sbatch "${torque_script}"
fi


## Per sample implementation (step 2)
## Generates HiCPro_step2_<JOB_NAME>.sh, a single-CPU SLURM batch script that
## runs the per-sample make targets from the submission directory.
if [[ $MAKE_OPTS == "" || $MAKE_OPTS == *"build_contact_maps"* || $MAKE_OPTS == *"ice_norm"* || $MAKE_OPTS == *"quality_checks"* ]]
then
    make_target="all_persample"
    ## Remove the steps that belong to the parallel mode
    if [[ $MAKE_OPTS != "" ]]; then
        make_target=$(echo $MAKE_OPTS | sed -e 's/,/ /g')
        make_target=$(echo $make_target | sed -e 's/mapping//g')
        make_target=$(echo $make_target | sed -e 's/proc_hic//g')
    fi

    torque_script_s2=HiCPro_step2_${JOB_NAME}.sh
    cat > "${torque_script_s2}" <<EOF
#!/bin/bash

#SBATCH -N 1
#SBATCH -c 1
#SBATCH -p ${JOB_QUEUE}

#SBATCH --job-name=s2_${JOB_NAME}_HiCpro
#SBATCH --export=ALL
#SBATCH --no-requeue
EOF

    ## Account and QOS are optional settings: only emit the directive when a
    ## value is configured, so the script never contains an option without
    ## its argument.
    if [[ -n ${JOB_ACCOUNT} ]]; then
        echo "#SBATCH -A ${JOB_ACCOUNT}" >> "${torque_script_s2}"
    fi
    if [[ -n ${JOB_QOS} ]]; then
        echo "#SBATCH --qos=${JOB_QOS}" >> "${torque_script_s2}"
    fi

    cat >> "${torque_script_s2}" <<EOF

cd \$SLURM_SUBMIT_DIR

make --file ${SCRIPTS}/Makefile CONFIG_FILE=${conf_file} CONFIG_SYS=${INSTALL_PATH}/config-system.txt $make_target 2>&1
EOF

    chmod +x "${torque_script_s2}"

    ## User message
    echo "The following command will merge the processed data and run the remaining steps per sample:"
    echo sbatch "${torque_script_s2}"
fi
```

# config file
```shell
# Copy and edit the configuration file ‘config-hicpro.txt’ in your local folder. 
# 
# The ‘[]’ options are optional and can be undefined.
# 带有[]中括号的参数可以不定义，为可选参数
# Please change the variable settings below if necessary
#########################################################################
## Paths and Settings  - Do not edit !
## 输入输出文件路径，尽量不动
#########################################################################
TMP_DIR = tmp

LOGS_DIR = logs

BOWTIE2_OUTPUT_DIR = bowtie_results

MAPC_OUTPUT = hic_results

# Link to rawdata folder. The user usually not need to change this option
# 尽量不改rawdata路径
RAW_DIR = rawdata

#######################################################################
## SYSTEM AND SCHEDULER - Start Editing Here !!
## 从这里开始编辑
#######################################################################
# ?文档中没有说明
SORT_RAM = 1000M

# name of the main log file
LOGFILE = hicpro.log

# 【可选参数】name of the job on the cluster
JOB_NAME = ZHN-HiC 

# 【可选参数】队列指定
JOB_ACCOUNT = chengqiyi_g1
# 【cnlong】
# N_CPU = 20
# JOB_QUEUE = cn-long
# JOB_QOS = chengqiyicnl

# 【cn-short】
# N_CPU = 20
# JOB_QUEUE = cn-short
# JOB_QOS = chengqiyicns

# 【cn_nl】
N_CPU = 20
JOB_QUEUE = cn_nl
JOB_QOS = chengqiyicnnl

# 【fat4way】
# N_CPU = 24
# JOB_QUEUE = fat4way
# JOB_QOS = chengqiyif4w

# 【fat8way】
# N_CPU = 64
# JOB_QUEUE = fat8way
# JOB_QOS = chengqiyif8w
#########################################################################
## Data
#########################################################################
# Keyword for first mate detection. Default:_R1
PAIR1_EXT = _R1

# Keyword for second mate detection. Default:_R2
PAIR2_EXT = _R2

#######################################################################
## Alignment options
#######################################################################
# Minimum mapping quality. 
# Reads with lower quality are discarded. 
# Default: 0
MIN_MAPQ = 10

# Path to bowtie2 indexes
BOWTIE2_IDX_PATH = /lustre1/chengqiyi_pkuhpc/zhaohn/1.database/db_genomes/genome_fa/genome_ucsc_hg38/

# bowtie2 options for mapping step1. 
# Default: --very-sensitive -L 30 --score-min L,-0.6,-0.2 --end-to-end --reorder
BOWTIE2_GLOBAL_OPTIONS = --very-sensitive -L 30 --score-min L,-0.6,-0.2 --end-to-end --reorder

# bowtie2 options for mapping step2. 
# Default: --very-sensitive -L 20 --score-min L,-0.6,-0.2 --end-to-end --reorder
BOWTIE2_LOCAL_OPTIONS =  --very-sensitive -L 20 --score-min L,-0.6,-0.2 --end-to-end --reorder

#######################################################################
## Annotation files
#######################################################################
# Reference genome prefix used for genome indexes. 
# Default: hg19
# 【需要升级为hg38，查阅文档】
REFERENCE_GENOME = genome_ucsc_hg38.fa.bowtie2_index

# Chromosome size file. 
# Loaded from the ANNOTATION folder in the HiC-Pro installation directory. 
# Default: chrom_hg19.sizes
# 这个要放到安装文件夹下头！
# 【需要升级为hg38，查阅文档】
GENOME_SIZE = chrom_hg38.sizes

#######################################################################
## Allele specific analysis
## http://nservant.github.io/HiC-Pro/AS.html#as
#######################################################################
# VCF file to SNPs which can be used to distinguish parental origin. See the allele specific section for more details
# 【可选参数】
# ALLELE_SPECIFIC_SNP = 

#######################################################################
## Capture Hi-C analysis
#######################################################################
# BED file of target regions to focus on (mainly used for capture Hi-C data)
# 【可选参数】
# CAPTURE_TARGET =
REPORT_CAPTURE_REPORTER = 1

#######################################################################
## Digestion Hi-C
#######################################################################
# BED file with restriction fragments. 
# Full path or name of file available in the ANNOTATION folder. 
# Default: HindIII_resfrag_hg19.bed
# 含有限制性片段的BED文件
# 这个要放到安装文件夹下头！
# 【需要升级为hg38，查阅文档】
GENOME_FRAGMENT = HindIII_resfrag_hg38.bed

# Ligation site sequence used for reads trimming. 
# Depends on the fill in strategy. 
# Example: AAGCTAGCTT
# 酶的序列，例如：
# HindIII，为AAGCTAGCTT
# MboI，为GATCGATC
LIGATION_SITE = AAGCTAGCTT

# Minimum size of restriction fragments to consider for the Hi-C processing.
# Example: 100
# 为Hi-C处理考虑的限制片段的【最小大小】
MIN_FRAG_SIZE = 100

# Maximum size of restriction fragments to consider for the Hi-C processing.
# Example: 100000
# 为Hi-C处理考虑的限制片段的【最大大小】
MAX_FRAG_SIZE = 100000

# Minimum sequenced insert size. 
# Shorter 3C products are discarded. 
# Example: 100
# 测得【最小】插入大小。
# 较短的3C产物会被丢弃
MIN_INSERT_SIZE = 100

# Maximum sequenced insert size. 
# Larger 3C products are discarded. 
# Example: 600
# 测得【最大】插入大小。
# 较长的3C产物会被丢弃
MAX_INSERT_SIZE = 600

#######################################################################
## Hi-C processing
#######################################################################
# Filter short range contact below the specified distance. 
# Mainly useful for DNase Hi-C. 
# Example: 1000
MIN_CIS_DIST =

# Create output files with all classes of 3C products. 
# Default: 0
GET_ALL_INTERACTION_CLASSES = 1

# Create a BAM file with all aligned reads flagged according to 
# their classification and mapping category. 
# 是否保留BAM文件，默认不保留
# Default: 0
GET_PROCESS_SAM = 1

# Remove singleton reads. 
# Default: 1
RM_SINGLETON = 1

# Remove multi-mapped reads. 
# Default: 1
RM_MULTI = 1

# Remove duplicated reads’ pairs. 
# Default: 1
RM_DUP = 1

#######################################################################
## Contact Maps
#######################################################################
# Resolution of contact maps to generate (space separated). 
# Default: 20000 40000 150000 500000 1000000
BIN_SIZE = 20000 40000 150000 500000 1000000

# Binning step size in ‘n’ coverage _i.e._ window step. 
# Default: 1
# BIN_STEP

# Output matrix format. 
# Must be complete, asis, upper or lower. 
# Default: upper
MATRIX_FORMAT = upper

#######################################################################
## Normalization
#######################################################################
# Maximum number of iteration for ICE normalization. 
# Default: 100
MAX_ITER = 100

# Define which percentage of bins with low counts should be forced to zero. 
# Default: 0.02. 
# Replace SPARSE_FILTERING
FILTER_LOW_COUNT_PERC = 0.02

# Define which percentage of bins with high counts should be discarded 
# before normalization. 
# Default: 0
FILTER_HIGH_COUNT_PERC = 0

# The relative increment in the results before declaring convergence. 
# Default: 0.1
EPS = 0.1

```

# Convert ValidPairs to Juicer .hic
~/0.apps/HiC-Pro_installed/HiC-Pro_3.1.0/bin/utils/hicpro2juicebox.sh -h
提示需要 juicer_tools 的 jar 文件（`-j` 参数）。以下为相关问答中的回复：

Hi Doug,

Please take a look at: 

https://github.com/theaidenlab/juicer/tree/master/AWS/scripts

juicebox_tools.jar is located there.

juicebox_tools.jar and juicebox_CLT.jar are different names for the same thing.
If your script is using CLT, just rename that jar to the correct name.

```shell
~/0.apps/HiC-Pro_installed/HiC-Pro_3.1.0/bin/utils/hicpro2juicebox.sh -i dixon_2M.allValidPairs -g ~/0.apps/HiC-Pro_installed/HiC-Pro_3.1.0/annotation/chrom_hg38.sizes -j ~/0.apps/juicerbox/juicer_tools.jar -r ~/0.apps/HiC-Pro_installed/HiC-Pro_3.1.0/annotation/HindIII_resfrag_hg38.bed
```

# call loop [juicer]
https://github.com/aidenlab/juicer

http://www.360doc.com/content/19/1224/14/68068867_881786243.shtml

juicer采用Arrowhead算法对原始的交互矩阵进行转化，并预测TAD拓扑关联结构域，采用HiCCUPS算法识别染色质环chromatin loops。和其他Hi-C数据处理软件相比，juicer的功能更为齐全

```shell
Command Line Tools Usage
Detailed documentation about the command line tools can be found on the wiki:

Annotating features with Arrowhead, HiCCUPS, MotifFinder, APA, Eigenvector, and Pearsons
Creating .hic with Pre
Extracting data from .hic files with dump
To launch the command line tools, use the shell script “juicer_tools” on Unix/MacOS or type

java -jar juicer_tools.jar (command...) [flags...] <parameters...>
There are different flavors of juicer_tools that depend on the CUDA version. If you do not use GPUs, these versions are equivalent. Otherwise, juicer_tools.X.X.jar uses CUDA version X.X

For HiCCUPS loop calling without the shell or bat script, you will need to call: java -Xms512m -Xmx2048m -Djava.library.path=path/to/natives/ -jar juicer_tools.jar hiccups [flags...] <parameters...> where path/to/natives is the path to the native libraries used for Jcuda By default, these are located in the lib/jcuda folder.

In the command line tools, there are several analysis functions:

apa for conducting aggregate peak analysis
hiccups for annotating loops
motifs for finding CTCF motifs
arrowhead for annotating contact domains
eigenvector for calculating the eigenvector (first PC) of the Pearson's
pearsons for calculating the Pearson's
The juicer_tools (Unix/MacOS) script can be used in place of the unwieldy java -Djava.library.path=path/to/natives/ -jar juicer_tools.jar
```

# call TAD [HiC Explorer / TopDom]
/lustre1/chengqiyi_pkuhpc/zhaohn/0.apps/miniconda3/bin/hicexplorer

这个链接【重点参考！！！】
https://blog.csdn.net/hzau_yang/article/details/100031590
```shell
# Every option line except the last must end with a backslash; otherwise the
# shell ends the command early and runs the following lines as separate commands.
hicBuildMatrix --samFiles mate_R1.bam mate_R2.bam \
                 --binSize 10000 \
                 --restrictionSequence GATC \
                 --threads 4 \
                 --inputBufferSize 100000 \
                 --outBam hic.bam \
                 -o hic_matrix.h5 \
                 --QCfolder ./hicQC
```

该步骤最终生成一个bam文件，一个h5文件，以及一个hicQC文件夹报告，该报告写得比较详细，懂Hi-C的一般都能看懂，里面包含了比对率，数据有效利用率等信息；h5这个文件格式比较复杂，操作起来需要hicmatrix包的HiCMatrix函数；


3.校正Hi-C交互矩阵
默认使用KR标准化方法，也可以使用ICE，通过 `--correctionMethod` 参数控制；
`--chromosomes` 控制输出的染色体，通过空格隔开，如chr1 chr2 chr3;

```shell
# KR 校正
hicCorrectMatrix correct -m hic_matrix.h5 --filterThreshold -1.5 5 -o hic_corrected.h5
# 画热图
hicPlotMatrix -m hic_corrected.h5 -o hic_plot.png --region 1:20000000-80000000 --log1p
```
该命令可以根据输出文件后缀画出不同的格式图，也可以通过 `--perChr` 参数控制每条染色体单独画；


找TAD
找TAD的软件有很多，hicexplorer的方法与Topdom有点相似，总的来说算比较简单粗暴的，实现命令如下：
hicFindTADs -m hic_corrected.h5 --outPrefix hic_corrected --numberOfProcessors 16

# call compartments

5.找compartment


hicPCA -m hic_corrected.h5 --outFileName pca1.bw pca2.bw --format bigwig --pearsonMatrix pearson.h5

通常可以根据基因密度来调整第一主成分的符号，获得最终的compartment；统计完每个bin的基因数量，得到bigwig文件，然后通过 `--extraTrack` 可以直接对PCA结果调整符号；也可以通过 `--pearsonMatrix` 和 `--obsexpMatrix` 生成计算compartment的中间处理中的矩阵，如pearson矩阵；

画图

hicPlotMatrix -m pearson.h5 --outFileName pca1.png --perChr --bigwig pca1.bw

```shell
###############################################################################################
# 2021-04-06
# Hi-C data analysis with Hi-C pro
###############################################################################################
# Hi-C in HEK293
# Zuin J, Dixon JR, van der Reijden MI, Ye Z et al. Cohesin and CTCF differentially affect chromatin architecture and gene expression in human cells. Proc Natl Acad Sci U S A 2014 Jan 21;111(3):996-1001. PMID: 24335803
# https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE44267

GSM1081526	Hi-C, RAD21cv HEK293, TEV treated, replicate one
GSM1081527	Hi-C, RAD21cv HEK293, TEV treated, replicate two
GSM1081528	Hi-C, RAD21cv HEK293, HRV treated, replicate one
GSM1081529	Hi-C, RAD21cv HEK293, HRV treated, replicate two
GSM1081530	Hi-C, HEK293 siRNA Control, replicate one
GSM1081531	Hi-C, HEK293 siRNA Control, replicate two
GSM1081532	Hi-C, HEK293 siRNA CTCF, replicate one
GSM1081533	Hi-C, HEK293 siRNA CTCF, replicate two
GSM1081534	mRNA-seq, RAD21cv HEK293, TEV treated, replicate one
GSM1081535	mRNA-seq, RAD21cv HEK293, TEV treated, replicate two
GSM1081536	mRNA-seq, RAD21cv HEK293, HRV treated, replicate one
GSM1081537	mRNA-seq, RAD21cv HEK293, HRV treated, replicate two
GSM1081538	mRNA-seq, HEK293 siRNA Control, replicate one
GSM1081539	mRNA-seq, HEK293 siRNA Control, replicate two
GSM1081540	mRNA-seq, HEK293 siRNA CTCF, replicate one
GSM1081541	mRNA-seq, HEK293 siRNA CTCF, replicate two
GSM1081542	Smc3 ChIP-seq, RAD21cv HEK293, TEV Treated
GSM1081543	Input ChIP-seq, RAD21cv HEK293, TEV Treated


# fix filename 
SRR710071_GSM1081530_Hi-C_HEK293_siRNA_Control_replicate_one_Homo_sapiens_OTHER_1.fastq.gz
SRR710071_GSM1081530_Hi-C_HEK293_siRNA_Control_replicate_one_Homo_sapiens_OTHER_2.fastq.gz
SRR710072_GSM1081530_Hi-C_HEK293_siRNA_Control_replicate_one_Homo_sapiens_OTHER_1.fastq.gz
SRR710072_GSM1081530_Hi-C_HEK293_siRNA_Control_replicate_one_Homo_sapiens_OTHER_2.fastq.gz
SRR710073_GSM1081530_Hi-C_HEK293_siRNA_Control_replicate_one_Homo_sapiens_OTHER_1.fastq.gz
SRR710073_GSM1081530_Hi-C_HEK293_siRNA_Control_replicate_one_Homo_sapiens_OTHER_2.fastq.gz
SRR710074_GSM1081530_Hi-C_HEK293_siRNA_Control_replicate_one_Homo_sapiens_OTHER_1.fastq.gz
SRR710074_GSM1081530_Hi-C_HEK293_siRNA_Control_replicate_one_Homo_sapiens_OTHER_2.fastq.gz
SRR710075_GSM1081531_Hi-C_HEK293_siRNA_Control_replicate_two_Homo_sapiens_OTHER_1.fastq.gz
SRR710075_GSM1081531_Hi-C_HEK293_siRNA_Control_replicate_two_Homo_sapiens_OTHER_2.fastq.gz
SRR710076_GSM1081531_Hi-C_HEK293_siRNA_Control_replicate_two_Homo_sapiens_OTHER_1.fastq.gz
SRR710076_GSM1081531_Hi-C_HEK293_siRNA_Control_replicate_two_Homo_sapiens_OTHER_2.fastq.gz
SRR710077_GSM1081531_Hi-C_HEK293_siRNA_Control_replicate_two_Homo_sapiens_OTHER_1.fastq.gz
SRR710077_GSM1081531_Hi-C_HEK293_siRNA_Control_replicate_two_Homo_sapiens_OTHER_2.fastq.gz


cat SRR710071_GSM1081530_Hi-C_HEK293_siRNA_Control_replicate_one_Homo_sapiens_OTHER_1.fastq.gz \
SRR710072_GSM1081530_Hi-C_HEK293_siRNA_Control_replicate_one_Homo_sapiens_OTHER_1.fastq.gz \
SRR710073_GSM1081530_Hi-C_HEK293_siRNA_Control_replicate_one_Homo_sapiens_OTHER_1.fastq.gz \
SRR710074_GSM1081530_Hi-C_HEK293_siRNA_Control_replicate_one_Homo_sapiens_OTHER_1.fastq.gz \
> 293T-HiC-WT_rep1_R1.fq.gz & 

cat SRR710071_GSM1081530_Hi-C_HEK293_siRNA_Control_replicate_one_Homo_sapiens_OTHER_2.fastq.gz \
SRR710072_GSM1081530_Hi-C_HEK293_siRNA_Control_replicate_one_Homo_sapiens_OTHER_2.fastq.gz \
SRR710073_GSM1081530_Hi-C_HEK293_siRNA_Control_replicate_one_Homo_sapiens_OTHER_2.fastq.gz \
SRR710074_GSM1081530_Hi-C_HEK293_siRNA_Control_replicate_one_Homo_sapiens_OTHER_2.fastq.gz \
> 293T-HiC-WT_rep1_R2.fq.gz & 

cat SRR710075_GSM1081531_Hi-C_HEK293_siRNA_Control_replicate_two_Homo_sapiens_OTHER_1.fastq.gz \
SRR710076_GSM1081531_Hi-C_HEK293_siRNA_Control_replicate_two_Homo_sapiens_OTHER_1.fastq.gz \
SRR710077_GSM1081531_Hi-C_HEK293_siRNA_Control_replicate_two_Homo_sapiens_OTHER_1.fastq.gz \
> 293T-HiC-WT_rep2_R1.fq.gz & 

cat SRR710075_GSM1081531_Hi-C_HEK293_siRNA_Control_replicate_two_Homo_sapiens_OTHER_2.fastq.gz \
SRR710076_GSM1081531_Hi-C_HEK293_siRNA_Control_replicate_two_Homo_sapiens_OTHER_2.fastq.gz \
SRR710077_GSM1081531_Hi-C_HEK293_siRNA_Control_replicate_two_Homo_sapiens_OTHER_2.fastq.gz \
> 293T-HiC-WT_rep2_R2.fq.gz & 


mv 293T-HiC-WT_rep1_R1.fq.gz  293T-HiC-WT_rep1_R1.fastq.gz
mv 293T-HiC-WT_rep1_R2.fq.gz  293T-HiC-WT_rep1_R2.fastq.gz
mv 293T-HiC-WT_rep2_R1.fq.gz  293T-HiC-WT_rep2_R1.fastq.gz
mv 293T-HiC-WT_rep2_R2.fq.gz  293T-HiC-WT_rep2_R2.fastq.gz

mv 293T-HiC-WT_rep1_R1.fastq.gz 293T-HiC-WT-rep1_R1.fastq.gz
mv 293T-HiC-WT_rep1_R2.fastq.gz 293T-HiC-WT-rep1_R2.fastq.gz
mv 293T-HiC-WT_rep2_R1.fastq.gz 293T-HiC-WT-rep2_R1.fastq.gz
mv 293T-HiC-WT_rep2_R2.fastq.gz 293T-HiC-WT-rep2_R2.fastq.gz



# HiC pro prepare
./digest_genome.py  -r A^AGCTT  -o /home/menghaowei/menghw_HD/reference/HiCPro_ref/hg38_hindiii.bed ~/menghw_HD/reference/bowtie2_hg38/hg38_only_chromosome.fa &

# make bash
cd /home/menghaowei/menghw_HD/DdCBE_project/10.hic_data

HiC-Pro -i ./rawdata -o 01.hicpro_WT/ -c config-hicpro.txt

# init
SLURM_SUBMIT_DIR=/home/menghaowei/menghw_HD/DdCBE_project/10.hic_data/01.hicpro_WT

# run 
cat 293T-HiC-WT-rep1_R1.fastq.gz 293T-HiC-WT-rep2_R1.fastq.gz > 293T-HiC-WT-merge_R1.fastq.gz &
cat 293T-HiC-WT-rep1_R2.fastq.gz 293T-HiC-WT-rep2_R2.fastq.gz > 293T-HiC-WT-merge_R2.fastq.gz &


# HiC pro data to .hic format 
cd /home/menghaowei/menghw_HD/DdCBE_project/10.hic_data/01.hicpro_WT/hic_results/data/293T_WT

hicpro2juicebox.sh \
-i 293T_WT.allValidPairs \
-g /home/menghaowei/menghw_HD/reference/HiCPro_ref/hg38.only_chrom.sizes \
-j /home/menghaowei/menghw_HD/software_package/juicer/juicer_tools_1.22.01.jar \
-r /home/menghaowei/menghw_HD/reference/HiCPro_ref/hg38_hindiii.bed > 293T_WT.allValidPairs.hicproTohic.log  2>&1 &


hicpro2juicebox.sh \
-i test.allValidPairs \
-g /home/menghaowei/menghw_HD/reference/HiCPro_ref/hg38.only_chrom.sizes \
-j /home/menghaowei/menghw_HD/software_package/juicer/juicer_tools_1.22.01.jar \
-r /home/menghaowei/menghw_HD/reference/HiCPro_ref/hg38_hindiii.bed > test.allValidPairs.hicproTohic.log  2>&1  &


java -Xmx64g -jar /home/menghaowei/menghw_HD/software_package/juicer/juicer_tools_1.22.01.jar pre \
--threads 20 -j 20 \
-f ./tmp/2404_resfrag.juicebox  \
-r 2500000,1000000,500000,250000,100000,50000,25000,10000,5000 \
./tmp/2404_allValidPairs.pre_juicebox_sorted \
293T_WT.allValidPairs.NoFrag.hic \
/home/menghaowei/menghw_HD/reference/HiCPro_ref/hg38.only_chrom.sizes > 293T_WT.allValidPairs.NoFrag.hicproTohic.log  2>&1 &


# call loops 
cd /home/menghaowei/menghw_HD/DdCBE_project/10.hic_data/01.hicpro_WT/hic_results/data/293T_WT

# java -Xmx32g -jar /home/menghaowei/menghw_HD/software_package/juicer/juicer_tools_1.22.01.jar hiccups \
# --threads 10 \
# -r 25000,10000,5000 \
# -k KR \
# -f 0.1,0.1,0.1 \
# -p 1,2,4 \
# -d 20000,20000,20000 \
# test.allValidPairs.hic \


java -Xmx64g -jar /home/menghaowei/menghw_HD/software_package/juicer/juicer_tools_1.22.01.jar hiccups \
--threads 20 \
-r 5000,10000,25000 \
-c chr1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X \
--ignore-sparsity \
293T_WT.allValidPairs.NoFrag.hic all_hiccups_loops.NoFrag.hic


java -Xmx64g -jar /home/menghaowei/menghw_HD/software_package/juicer/juicer_tools_1.22.01.jar hiccups \
--threads 20 \
-r 5000,10000,25000 \
-c chr1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X \
--ignore-sparsity \
293T_WT.allValidPairs.hic all_hiccups_loops 

# extract data 
java -Xmx32g -jar /home/menghaowei/menghw_HD/software_package/juicer/juicer_tools_1.22.01.jar dump \
observed KR 293T_WT.allValidPairs.NoFrag.hic chr1 chr1 BP 100000 test.txt 
	

# java -Xmx32g -jar /home/menghaowei/menghw_HD/software_package/juicer/juicer_tools_1.22.01.jar hiccups \
# -r 5000 -c chr1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X --ignore-sparsity \
# 293T_WT.allValidPairs.NoFrag.hic all_hiccups_loops

# # on Lab iMac
# java -Xmx32g -jar /Users/meng/juicer_tools_1.22.01.jar hiccups \
# -r 5000,10000,25000 -c chr1,2,3,4,5,6,7,8,9,10 293T_WT.allValidPairs.hic all_hiccups_loops


# cuda install 
https://developer.nvidia.com/cuda-10.2-download-archive?target_os=MacOSX&target_arch=x86_64&target_version=1013&target_type=dmglocal

####################################################################################################
# HiCExplorer run 
####################################################################################################
cd /home/menghaowei/menghw_HD/DdCBE_project/10.hic_data/01.hicpro_WT

# hicExplore make restriction sites
hicFindRestSite \
--fasta ~/menghw_HD/reference/bowtie2_hg38/hg38_only_chromosome.fa \
--searchPattern AAGCTT \
-o hicexp_result/hicexp_hindiii_hg38.bed &

# build up hic matrix 
hicBuildMatrix \
--samFiles \
./bowtie_results/bwt2/293T_WT/293T-HiC-WT-merge_R1_hg38_only_chromosome.bwt2merged.bam \
./bowtie_results/bwt2/293T_WT/293T-HiC-WT-merge_R2_hg38_only_chromosome.bwt2merged.bam \
--outFileName ./hicexp_result/293T_WT.RawMatrix.h5 \
--binSize 5000 10000 25000 40000 50000 100000 250000 500000 1000000 \
--threads 16 \
--QCfolder ./hicexp_result/QC_res \
--restrictionCutFile ./hicexp_result/hicexp_hindiii_hg38.bed \
--restrictionSequence AAGCTT \
--danglingSequence AGCT \
--minMappingQuality 20 \
--chromosomeSizes /home/menghaowei/menghw_HD/reference/hg38.only_chrom.sizes  > ./hicexp_result/293T_WT.hicBuildMatrix.RawMatrix.log 2>&1 & 

# test
hicBuildMatrix \
--samFiles \
./bowtie_results/bwt2/293T_WT/test_R1.sam \
./bowtie_results/bwt2/293T_WT/test_R2.sam \
--outFileName ./hicexp_result/test.RawMatrix.cool \
--binSize 50000 100000 250000 500000 1000000 \
--threads 4 \
--QCfolder ./hicexp_result/QC_res.test \
--restrictionCutFile ./hicexp_result/hicexp_hindiii_hg38.bed \
--restrictionSequence AAGCTT \
--danglingSequence AGCT \
--minMappingQuality 20 \
--chromosomeSizes /home/menghaowei/menghw_HD/reference/hg38.only_chrom.sizes  > ./hicexp_result/test.hicBuildMatrix.RawMatrix.log 2>&1 & 

#### correct matrix 
# hicCorrectMatrix diagnostic_plot -m test.RawMatrix.cool -o test.hic_corrected.png

# hicCorrectMatrix correct -m test.RawMatrix.cool --filterThreshold -1.5 5 -o test.KRCorrectMatrix.h5 & 

hicCorrectMatrix correct -m 293T_WT.RawMatrix.h5 --filterThreshold -1.5 5 -o 293T_WT.KRCorrectMatrix.h5 > 293T_WT.hicCorrectMatrix.log 2>&1 & 

#### call TAD
# hicFindTADs -m test.KRCorrectMatrix.h5 --outPrefix test.KRCorrectMatrix --numberOfProcessors 16 &  

hicFindTADs -m 293T_WT.KRCorrectMatrix.h5 --outPrefix 293T_WT.KR --numberOfProcessors 16 --correctForMultipleTesting fdr > 293T_WT.hicFindTADs.log 2>&1 &  

#### merge Hi-C matrix
# merge hic bins with 25Kbp binsize 
hicMergeMatrixBins --matrix 293T_WT.RawMatrix.h5 -o 293T_WT.RawMatrix.25Kbp.h5 --numBins 5 & 

# merge hic bins with 50Kbp binsize 
hicMergeMatrixBins --matrix 293T_WT.RawMatrix.h5 -o 293T_WT.RawMatrix.50Kbp.h5 --numBins 10 & 

# correct 25Kbp
hicCorrectMatrix correct -m 293T_WT.RawMatrix.25Kbp.h5 --correctionMethod KR \
--filterThreshold -1.5 5 -o 293T_WT.KR.CorrectMatrix.25Kbp.h5 > 293T_WT.hicCorrectMatrix.25Kbp.log 2>&1 & 

# correct 50Kbp
hicCorrectMatrix correct -m 293T_WT.RawMatrix.50Kbp.h5 --correctionMethod KR \
--filterThreshold -1.5 5 -o 293T_WT.KR.CorrectMatrix.50Kbp.h5 > 293T_WT.KR.hicCorrectMatrix.50Kbp.log 2>&1 & 

# correct 50Kbp ICE
hicCorrectMatrix correct -m 293T_WT.RawMatrix.50Kbp.h5 --correctionMethod ICE \
-o 293T_WT.ICE.CorrectMatrix.50Kbp.h5   \
--filterThreshold -1.5 5 --iterNum 500  \
> 293T_WT.ICE.hicCorrectMatrix.50Kbp.log 2>&1 & 

# find TAD 
# Request 24 CPUs per task with -c (--cpus-per-task) to match
# --numberOfProcessors 24; srun's -T (--threads) does not allocate CPUs.
srun -c 24 hicFindTADs -m 293T_WT.KR.CorrectMatrix.25Kbp.h5 --outPrefix 293T_WT.KR.25Kbp --numberOfProcessors 24 --correctForMultipleTesting fdr > 293T_WT.hicFindTADs.25Kbp.log 2>&1 &  

srun -c 24 hicFindTADs -m 293T_WT.KR.CorrectMatrix.50Kbp.h5 --outPrefix 293T_WT.KR.50Kbp --numberOfProcessors 24 --correctForMultipleTesting fdr > 293T_WT.KR.hicFindTADs.50Kbp.log 2>&1 &  

# The log of the ICE run carries the ICE tag, like the KR 50Kbp log above.
srun -c 24 hicFindTADs -m 293T_WT.ICE.CorrectMatrix.50Kbp.h5 --outPrefix 293T_WT.ICE.50Kbp --numberOfProcessors 24 --correctForMultipleTesting fdr > 293T_WT.ICE.hicFindTADs.50Kbp.log 2>&1 &  


# plot TAD
hicPlotTADs --tracks plot_TAD.ini -o out_image/293T_WT.KR.BinSize5Kbp.chr12_96M_126M.pdf --region chr12:96000000-126000000

hicPlotTADs --tracks plot_TAD.DdCBEOnly.ini -o out_image/293T_WT.KR.BinSize5Kbp.chr12_96M_126M.DdCBEOnly.pdf --region chr12:96000000-126000000 &

hicPlotTADs --tracks plot_TAD.DdCBEOnly.25Kbp.ini -o out_image/293T_WT.KR.BinSize25Kbp.chr12_96M_126M.DdCBEOnly.pdf --region chr12:96000000-126000000 &

hicPlotTADs --tracks plot_TAD.DdCBEOnly.25Kbp.ini -o out_image/293T_WT.KR.BinSize25Kbp.chr12_96M_116M.DdCBEOnly.pdf --region chr12:96000000-116000000 &

hicPlotTADs --tracks plot_TAD.Reds.DdCBEOnly.25Kbp.ini -o out_image/293T_WT.KR.BinSize25Kbp.chr12_96M_126M.DdCBEOnly.Reds.pdf --region chr12:96000000-126000000 &

hicPlotTADs --tracks plot_TAD.DdCBEOnly.25Kbp.ini -o out_image/293T_WT.KR.BinSize25Kbp.chr12_10M_130M.DdCBEOnly.pdf --region chr12:10000000-130000000 &


# plot TAD score 
sort -k1,1 -k2,2n 293T_WT.KR_score.bedgraph > 293T_WT.KR_score.sort.bedgraph &

sort -k1,1 -k2,2n 293T_WT.KR.25Kbp_score.bedgraph > 293T_WT.KR.25Kbp_score.sort.bedgraph &

# bedgraph to bigwig
bedGraphToBigWig 293T_WT.KR_score.sort.bedgraph ~/menghw_HD/reference/hg38.only_chrom.sizes 293T_WT.KR_score.bigwig &

bedGraphToBigWig 293T_WT.KR.25Kbp_score.sort.bedgraph ~/menghw_HD/reference/hg38.only_chrom.sizes 293T_WT.KR.25Kbp_score.bigwig &

# region in /home/menghaowei/menghw_HD/DdCBE_project/08.DdCBE_merge_all/region_cor_analysis/peak_region

# plot TAD
cd /home/menghaowei/menghw_HD/DdCBE_project/10.hic_data/01.hicpro_WT/hicexp_result

computeMatrix reference-point -S \
~/menghw_HD/DdCBE_project/10.hic_data/01.hicpro_WT/hicexp_result/01.TAD_info/293T_WT.KR.25Kbp_score.bigwig \
~/menghw_HD/DdCBE_project/10.hic_data/01.hicpro_WT/hicexp_result/01.TAD_info/293T_WT.KR_score.bigwig \
-R \
~/menghw_HD/DdCBE_project/08.DdCBE_merge_all/region_cor_analysis/peak_region/20210312-Motif-ND6-DepSite.merge.bed \
~/menghw_HD/DdCBE_project/08.DdCBE_merge_all/region_cor_analysis/peak_region/20210312-Motif-ND6-Indep.merge.bed \
~/menghw_HD/DdCBE_project/08.DdCBE_merge_all/region_cor_analysis/peak_region/20210319-hg38_random.50bp.sort.bed \
--referencePoint center \
--beforeRegionStartLength 2000000 \
--afterRegionStartLength 2000000 \
--skipZeros \
--binSize 25000 \
-o deeptools_result/20210421-TAD_score.profile.mat.gz \
--samplesLabel  TAD.25Kbp  TAD.5Kbp \
--numberOfProcessors 10 & 

# plot
plotHeatmap -m deeptools_result/20210421-TAD_score.profile.mat.gz \
--colorMap Purples \
-out deeptools_result/20210421-TAD_score.profile.heatmap.pdf &

# count site in TAD boundary
Dep 3 / 72
Indep 51 / 417 
Random 16 / 500

# count site in TAD boundary +- 1 pixel
Dep 8 / 72
Indep 134 / 417 
Random 58 / 500


####################################################################################################
# 2021-10-10
# Hi-C test data for quality control 
####################################################################################################

# HiC pro prepare DpnII
digest_genome.py  -r ^GATC  -o /home/menghaowei/menghw_HD/reference/HiCPro_ref/hg38_dpnii.bed ~/menghw_HD/reference/bowtie2_hg38/hg38_only_chromosome.fa &

ligation_site: GATCGATC

# rename hic data
mv Hic-GFP-HiC-rep1_FKDL210259733-1a-1_1.fq.gz  293T_GFP_rep1/293T-HiC-batch_test-GFP_rep1_R1.fastq.gz
mv Hic-GFP-HiC-rep1_FKDL210259733-1a-1_2.fq.gz  293T_GFP_rep1/293T-HiC-batch_test-GFP_rep1_R2.fastq.gz

mv Hic-GFP-HiC-rep2_FKDL210259733-1a-2_1.fq.gz  293T_GFP_rep2/293T-HiC-batch_test-GFP_rep2_R1.fastq.gz
mv Hic-GFP-HiC-rep2_FKDL210259733-1a-2_2.fq.gz  293T_GFP_rep2/293T-HiC-batch_test-GFP_rep2_R2.fastq.gz

mv Hic-ND6-WT-HiC-rep1_FKDL210259733-1a-3_1.fq.gz  293T_DdCBE_ND6_rep1/293T-HiC-batch_test-DdCBE_ND6_rep1_R1.fastq.gz
mv Hic-ND6-WT-HiC-rep1_FKDL210259733-1a-3_2.fq.gz  293T_DdCBE_ND6_rep1/293T-HiC-batch_test-DdCBE_ND6_rep1_R2.fastq.gz

mv Hic-ND6-WT-HiC-rep2_FKDL210259733-1a-4_1.fq.gz  293T_DdCBE_ND6_rep2/293T-HiC-batch_test-DdCBE_ND6_rep2_R1.fastq.gz
mv Hic-ND6-WT-HiC-rep2_FKDL210259733-1a-4_2.fq.gz  293T_DdCBE_ND6_rep2/293T-HiC-batch_test-DdCBE_ND6_rep2_R2.fastq.gz

# fix config txt file 
config-hicpro_hic_test_DpnII.txt

# make bash
cd /home/menghaowei/menghw_HD/DdCBE_project/10.hic_data

HiC-Pro -i /home/menghaowei/menghw_HD/Data_Backup/DdCBE_project/08.hic_test_data.20211010/rename_fq \
-o 02.hic_test_DdCBE \
-c config-hicpro_hic_test_DpnII.txt &

####################################################################################################
# 2021-10-21
# Hi-C MGI add data
####################################################################################################
# No spaces around "=": with spaces the shell runs a command named "dir"
# instead of assigning the variable.
dir=/home/menghaowei/menghw_HD/Data_Backup/DdCBE_project/09.hic_MGI_data.20211020

61G ./V350027555/L03/V350027555_L03_read_1.fq.gz
60G ./V350027555/L04/V350027555_L04_read_1.fq.gz
60G ./V350027555/L01/V350027555_L01_read_1.fq.gz
62G ./V350027555/L02/V350027555_L02_read_1.fq.gz
63G ./V350027988/L03/V350027988_L03_read_1.fq.gz
63G ./V350027988/L04/V350027988_L04_read_1.fq.gz
62G ./V350027988/L01/V350027988_L01_read_1.fq.gz
62G ./V350027988/L02/V350027988_L02_read_1.fq.gz

# test 
# Integrity-check the archives in parallel. -c (--cpus-per-task) requests the
# 24 CPUs that "pigz -p 24" uses; srun's -T (--threads) does not allocate CPUs.
srun -c 24 pigz -p 24 --test ./V350027555/L03/V350027555_L03_read_1.fq.gz &
srun -c 24 pigz -p 24 --test ./V350027555/L01/V350027555_L01_read_1.fq.gz & 
srun -c 24 pigz -p 24 --test ./V350027988/L03/V350027988_L03_read_1.fq.gz &
srun -c 24 pigz -p 24 --test ./V350027988/L01/V350027988_L01_read_1.fq.gz &
srun -c 24 pigz -p 24 --test ./V350027555/L01/V350027555_L01_read_2.fq.gz &
srun -c 24 pigz -p 24 --test ./V350027988/L01/V350027988_L01_read_2.fq.gz &


# run Hi-C pro G1
HiC-Pro -i /home/menghaowei/menghw_HD/Data_Backup/DdCBE_project/09.hic_MGI_data.20211020/rename_fq \
-o 03_1.hic_DdCBE_G1 \
-c config-hicpro_hic_MGI_G1.txt &


# run Hi-C pro G2
HiC-Pro -i /home/menghaowei/menghw_HD/Data_Backup/DdCBE_project/09.hic_MGI_data.20211020/rename_fq \
-o 03_1.hic_DdCBE_G2 \
-c config-hicpro_hic_MGI_G2.txt &


# run Hi-C pro W1
HiC-Pro -i /home/menghaowei/menghw_HD/Data_Backup/DdCBE_project/09.hic_MGI_data.20211020/rename_fq \
-o 03_1.hic_DdCBE_W1 \
-c config-hicpro_hic_MGI_W1.txt &


# run Hi-C pro W2
HiC-Pro -i /home/menghaowei/menghw_HD/Data_Backup/DdCBE_project/09.hic_MGI_data.20211020/rename_fq \
-o 03_1.hic_DdCBE_W2 \
-c config-hicpro_hic_MGI_W2.txt &

#  
03_1.hic_DdCBE_G1/bowtie_results/bwt2
293T-MGI-HiC-GFP_rep1_R1_hg38_only_chromosome.bwt2merged.bam  293T-MGI-HiC-GFP_rep1_R2_hg38_only_chromosome.bwt2merged.bam
# Strip a trailing /1 or /2 from the read name (first column) only.
# Header lines (starting with @) and all other fields are passed through
# untouched, so "/1" or "/2" inside a header path or another field is kept.
samtools view -h test.bam | awk 'BEGIN { FS = OFS = "\t" } !/^@/ { sub(/\/[12]$/, "", $1) } { print }'

# make test data for step run [HiC-Pro]
samtools view  -h 293T-MGI-HiC-GFP_rep1_R1_hg38_only_chromosome.bwt2merged.bam | head -n 1000000 | samtools view -hb -o /home/menghaowei/menghw_HD/DdCBE_project/10.hic_data/test/bowtie_results/bwt2/G1_fq/293T-MGI-HiC-GFP_rep1_R1_hg38_only_chromosome.bwt2merged.bam
samtools view  -h 293T-MGI-HiC-GFP_rep1_R2_hg38_only_chromosome.bwt2merged.bam | head -n 1000000 | samtools view -hb -o /home/menghaowei/menghw_HD/DdCBE_project/10.hic_data/test/bowtie_results/bwt2/G1_fq/293T-MGI-HiC-GFP_rep1_R2_hg38_only_chromosome.bwt2merged.bam


MY_INSTALL_PATH/bin/HiC-Pro -i FULL_PATH_TO_DATA_FOLDER -o FULL_PATH_TO_OUTPUTS -c MY_LOCAL_CONFIG_FILE
MY_INSTALL_PATH/bin/HiC-Pro -i FULL_PATH_TO_DATA_FOLDER -o FULL_PATH_TO_OUTPUTS -c MY_LOCAL_CONFIG_FILE -p

/home/menghaowei/menghw_HD/DdCBE_project/10.hic_data/test/bowtie_results/bwt2/G1_fq


HiC-Pro -i /home/menghaowei/menghw_HD/DdCBE_project/10.hic_data/test/bowtie_results/bwt2/G1_fq \
-o test \
-c test_G1.txt -s proc_hic -s quality_checks -s build_contact_maps -s ice_norm &


HiC-Pro -i /home/menghaowei/menghw_HD/DdCBE_project/10.hic_data/test/bowtie_results/bwt2/G1_fq/ \
-o test \
-c test_G1.txt -s proc_hic &


/gpfs/user/menghaowei/anaconda2/envs/hicpro/bin/python \
/home/menghaowei/menghw_HD/software_package/HiC-Pro_3.0.0/scripts/mergeSAM.py \
-q 10 -t -v \
-f 293T-MGI-HiC-GFP_rep1_R1_hg38_only_chromosome.bwt2merged.bam \
-r 293T-MGI-HiC-GFP_rep1_R2_hg38_only_chromosome.bwt2merged.bam \
-o test.bwt2pairs.bam


/gpfs/user/menghaowei/anaconda2/envs/hicpro/bin/python /home/menghaowei/menghw_HD/software_package/HiC-Pro_3.0.0/scripts/mergeSAM.py \
-q 10 -t -v \
-f 293T-HiC-batch_test-DdCBE_ND6_rep1_R1_hg38_only_chromosome.bwt2merged.bam \
-r 293T-HiC-batch_test-DdCBE_ND6_rep1_R2_hg38_only_chromosome.bwt2merged.bam \
-o test.bam

# header 
less -S L01/V350027555_L01_read_1.fq.gz | sed '/^@V35/p' -n  | head -n 10
@V350027555L1C001R0030000000/1
@V350027555L1C001R0030000001/1
@V350027555L1C001R0030000002/1
@V350027555L1C001R0030000003/1
@V350027555L1C001R0030000004/1
@V350027555L1C001R0030000005/1
@V350027555L1C001R0030000008/1
@V350027555L1C001R0030000009/1
@V350027555L1C001R0030000010/1
@V350027555L1C001R0030000011/1

less -S L01/V350027555_L01_read_2.fq.gz | sed '/^@V35/p' -n  | head -n 10
@V350027555L1C001R0030000000/2
@V350027555L1C001R0030000001/2
@V350027555L1C001R0030000002/2
@V350027555L1C001R0030000003/2
@V350027555L1C001R0030000004/2
@V350027555L1C001R0030000005/2
@V350027555L1C001R0030000008/2
@V350027555L1C001R0030000009/2
@V350027555L1C001R0030000010/2
@V350027555L1C001R0030000011/2

less -S L02/V350027555_L02_read_1.fq.gz | sed '/^@V35/p' -n  | head -n 10
@V350027555L2C001R0020000000/1
@V350027555L2C001R0020000001/1
@V350027555L2C001R0020000002/1
@V350027555L2C001R0020000003/1
@V350027555L2C001R0020000004/1
@V350027555L2C001R0020000005/1
@V350027555L2C001R0020000006/1
@V350027555L2C001R0020000007/1
@V350027555L2C001R0020000008/1
@V350027555L2C001R0020000009/1


less -S L02/V350027555_L02_read_2.fq.gz | sed '/^@V35/p' -n  | head -n 10
@V350027555L2C001R0020000000/2
@V350027555L2C001R0020000001/2
@V350027555L2C001R0020000002/2
@V350027555L2C001R0020000003/2
@V350027555L2C001R0020000004/2
@V350027555L2C001R0020000005/2
@V350027555L2C001R0020000006/2
@V350027555L2C001R0020000007/2
@V350027555L2C001R0020000008/2
@V350027555L2C001R0020000009/2


less -S V350027555_L03_read_1.fq.gz | sed '/^@V35/p' -n  | head -n 10
@V350027555L3C001R0030000001/1
@V350027555L3C001R0030000002/1
@V350027555L3C001R0030000003/1
@V350027555L3C001R0030000004/1
@V350027555L3C001R0030000005/1
@V350027555L3C001R0030000006/1
@V350027555L3C001R0030000007/1
@V350027555L3C001R0030000008/1
@V350027555L3C001R0030000009/1
@V350027555L3C001R0030000010/1

less -S V350027555_L03_read_2.fq.gz | sed '/^@V35/p' -n  | head -n 10
@V350027555L3C001R0030000001/2
@V350027555L3C001R0030000002/2
@V350027555L3C001R0030000003/2
@V350027555L3C001R0030000004/2
@V350027555L3C001R0030000005/2
@V350027555L3C001R0030000006/2
@V350027555L3C001R0030000007/2
@V350027555L3C001R0030000008/2
@V350027555L3C001R0030000009/2
@V350027555L3C001R0030000010/2


less -S L04/V350027555_L04_read_1.fq.gz | sed '/^@V35/p' -n  | head -n 10
@V350027555L4C001R0010000000/1
@V350027555L4C001R0010000003/1
@V350027555L4C001R0010000004/1
@V350027555L4C001R0010000005/1
@V350027555L4C001R0010000007/1
@V350027555L4C001R0010000008/1
@V350027555L4C001R0010000009/1
@V350027555L4C001R0010000010/1
@V350027555L4C001R0010000011/1
@V350027555L4C001R0010000012/1


less -S L04/V350027555_L04_read_2.fq.gz | sed '/^@V35/p' -n  | head -n 10
@V350027555L4C001R0010000000/2
@V350027555L4C001R0010000003/2
@V350027555L4C001R0010000004/2
@V350027555L4C001R0010000005/2
@V350027555L4C001R0010000007/2
@V350027555L4C001R0010000008/2
@V350027555L4C001R0010000009/2
@V350027555L4C001R0010000010/2
@V350027555L4C001R0010000011/2
@V350027555L4C001R0010000012/2


less -S 293T-MGI-HiC-GFP_rep1_R1.fastq.gz | sed '/^@V35/p' -n  | head -n 10
@V350027555L1C001R0030000000/1
@V350027555L1C001R0030000001/1
@V350027555L1C001R0030000002/1
@V350027555L1C001R0030000003/1
@V350027555L1C001R0030000004/1
@V350027555L1C001R0030000005/1
@V350027555L1C001R0030000008/1
@V350027555L1C001R0030000009/1
@V350027555L1C001R0030000010/1
@V350027555L1C001R0030000011/1

less -S 293T-MGI-HiC-GFP_rep1_R2.fastq.gz | sed '/^@V35/p' -n  | head -n 10
@V350027555L1C001R0030000000/2
@V350027555L1C001R0030000001/2
@V350027555L1C001R0030000002/2
@V350027555L1C001R0030000003/2
@V350027555L1C001R0030000004/2
@V350027555L1C001R0030000005/2
@V350027555L1C001R0030000008/2
@V350027555L1C001R0030000009/2
@V350027555L1C001R0030000010/2
@V350027555L1C001R0030000011/2


samtools view -h 293T-MGI-HiC-GFP_rep1_R1_hg38_only_chromosome.bwt2merged.bam | sed '/^V35/p' -n  | head -n 10 | cut -f 1
V350027555L1C001R0010000000
V350027555L1C001R0010000001
V350027555L1C001R0010000002
V350027555L1C001R0010000003
V350027555L1C001R0010000004
V350027555L1C001R0010000007
V350027555L1C001R0010000008
V350027555L1C001R0010000009
V350027555L1C001R0010000010
V350027555L1C001R0010000011

samtools view -h 293T-MGI-HiC-GFP_rep1_R2_hg38_only_chromosome.bwt2merged.bam | sed '/^V35/p' -n  | head -n 10 | cut -f 1
V350027555L1C001R0030000000
V350027555L1C001R0030000001
V350027555L1C001R0030000002
V350027555L1C001R0030000003
V350027555L1C001R0030000004
V350027555L1C001R0030000005
V350027555L1C001R0030000008
V350027555L1C001R0030000009
V350027555L1C001R0030000010
V350027555L1C001R0030000011


samtools view -h 293T-MGI-HiC-GFP_rep1_R1_hg38_only_chromosome.bwt2merged.RawName.bam | sed '/^V35/p' -n  | head -n 10 | cut -f 1
V350027555L1C001R0010000000/1
V350027555L1C001R0010000001/1
V350027555L1C001R0010000002/1
V350027555L1C001R0010000003/1
V350027555L1C001R0010000004/1
V350027555L1C001R0010000007/1
V350027555L1C001R0010000008/1
V350027555L1C001R0010000009/1
V350027555L1C001R0010000010/1
V350027555L1C001R0010000011/1

samtools view -h 293T-MGI-HiC-GFP_rep1_R2_hg38_only_chromosome.bwt2merged.RawName.bam | sed '/^V35/p' -n  | head -n 10 | cut -f 1
V350027555L1C001R0030000000/2
V350027555L1C001R0030000001/2
V350027555L1C001R0030000002/2
V350027555L1C001R0030000003/2
V350027555L1C001R0030000004/2
V350027555L1C001R0030000005/2
V350027555L1C001R0030000008/2
V350027555L1C001R0030000009/2
V350027555L1C001R0030000010/2
V350027555L1C001R0030000011/2

9400043	V350027555L1C001R0010000000/2	0	chr2	203423038	42	84M	*	0	0	AGTGGCTTGCTAGTTTAGGGTACTTAATTGTTAGGTGTGGTTGCTAATACCCTACAATATTTGGTGGTCAGTCTTTTTAGTTCA	FFFFFFEFFFFFFBFFEFFFFFFGFFFFFFEFFFFFFFFD>FFDFEFEFFEFFFFFFEGDFFFDFFFCFFFAAFEBFFBFEFFC	AS:i:0	XN:i:0	XM:i:0	XO:i:0	XG:i:0	NM:i:0	MD:Z:84	YT:Z:UU	RG:Z:BML

# test 
293T-MGI-HiC-GFP_rep1_R2_hg38_only_chromosome.bwt2merged.RawName.bam
samtools view -h 293T-MGI-HiC-GFP_rep1_R2_hg38_only_chromosome.bwt2merged.RawName.bam | head -n 15000000 | samtools view -hb > test_R2.RawName.bam &
samtools view -h 293T-MGI-HiC-GFP_rep1_R2_hg38_only_chromosome.bwt2merged.bam | head -n 15000000 | samtools view -hb > test_R2.FixName.bam &

# sort by name
# Name-sort (-n) both test BAMs in the background; each job uses 4 extra threads
# and -m 10G, with temporary files prefixed by the input's base name.
for tag in RawName FixName; do
  samtools sort -@ 4 -T "test_R2.${tag}" -m 10G -n -O BAM \
    -o "test_R2.${tag}.SortName.bam" "test_R2.${tag}.bam" &
done

# cmp reads name 
samtools view -h test_R2.RawName.bam | sed '/^V35/p' -n  | head -n 10 | cut -f 1
V350027555L1C001R0030000000/2
V350027555L1C001R0030000001/2
V350027555L1C001R0030000002/2
V350027555L1C001R0030000003/2
V350027555L1C001R0030000004/2
V350027555L1C001R0030000005/2
V350027555L1C001R0030000008/2
V350027555L1C001R0030000009/2
V350027555L1C001R0030000010/2
V350027555L1C001R0030000011/2

samtools view -h test_R2.RawName.SortName.bam | sed '/^V35/p' -n  | head -n 10 | cut -f 1
V350027555L1C001R0010000000/2
V350027555L1C001R0010000001/2
V350027555L1C001R0010000002/2
V350027555L1C001R0010000003/2
V350027555L1C001R0010000004/2
V350027555L1C001R0010000007/2
V350027555L1C001R0010000008/2
V350027555L1C001R0010000009/2
V350027555L1C001R0010000010/2
V350027555L1C001R0010000011/2


V350027555L1C001R0010000000
V350027555L1C001R0010000001
V350027555L1C001R0010000002
V350027555L1C001R0010000003
V350027555L1C001R0010000004
V350027555L1C001R0010000007
V350027555L1C001R0010000008
V350027555L1C001R0010000009
V350027555L1C001R0010000010
V350027555L1C001R0010000011
(py3env) menghaowei@cn07:G1_fq$samtools view 293T-MGI-HiC-GFP_rep1_R1_hg38_only_chromosome.bwt2merged.bam | sed '/^V35/p' -n  | head -n 100 | cut -f 1
V350027555L1C001R0010000000  V350027555L1C001R0010000000
V350027555L1C001R0010000001  V350027555L1C001R0010000001
V350027555L1C001R0010000002  V350027555L1C001R0010000002
V350027555L1C001R0010000003  V350027555L1C001R0010000003
V350027555L1C001R0010000004  V350027555L1C001R0010000004

# test merge
# Pair the R1/R2 alignments of ND6_rep1 with HiC-Pro's mergeSAM.py.
# The output name keeps the sample tag: the earlier command dropped "ND6_rep1"
# and wrote "293T-MGI-HiC-_hg38_only_chromosome.bwt2pairs.bam", which does not
# follow the <sample>_hg38_only_chromosome.bwt2pairs.bam pattern used for GFP_rep1.
bwt2_dir=bowtie_results/bwt2/W1_fq
sample=293T-MGI-HiC-ND6_rep1
/gpfs/user/menghaowei/anaconda2/envs/hicpro/bin/python \
  /home/menghaowei/menghw_HD/software_package/HiC-Pro_3.0.0/scripts/mergeSAM.py \
  -q 10 -t -v \
  -f "${bwt2_dir}/${sample}_R1_hg38_only_chromosome.bwt2merged.bam" \
  -r "${bwt2_dir}/${sample}_R2_hg38_only_chromosome.bwt2merged.bam" \
  -o "${bwt2_dir}/${sample}_hg38_only_chromosome.bwt2pairs.bam"


# test step: run only the HiC-Pro steps selected with -s, in the background
# NOTE(review): -i is given a single .bwt2pairs.bam file here, whereas the other
# HiC-Pro calls in these notes pass a directory — confirm this invocation works.
HiC-Pro -i 03_1.hic_DdCBE_G1/bowtie_results/bwt2/G1_fq/293T-MGI-HiC-GFP_rep1_hg38_only_chromosome.bwt2pairs.bam \
-o 03_1.hic_DdCBE_G1 \
-c config-hicpro_hic_MGI_G1.txt -s proc_hic -s quality_checks -s build_contact_maps -s ice_norm &

####################################################################################################
# 2021-10-26
# Hi-C MGI 
####################################################################################################
/home/menghaowei/menghw_HD/DdCBE_project/10.hic_data/03.hic_DdCBE_MGI

.
├── [4.0K]  bam.hicpro
├── [4.0K]  bam.hicpro.G1
│   └── [4.0K]  GFP_rep1
│       ├── [105G]  293T-MGI-HiC-GFP_rep1_R1_hg38_only_chromosome.bwt2merged.SortName.bam
│       └── [103G]  293T-MGI-HiC-GFP_rep1_R2_hg38_only_chromosome.bwt2merged.SortName.bam
├── [4.0K]  bam.hicpro.G2
│   └── [4.0K]  GFP_rep2
│       ├── [102G]  293T-MGI-HiC-GFP_rep2_R1_hg38_only_chromosome.bwt2merged.SortName.bam
│       └── [100G]  293T-MGI-HiC-GFP_rep2_R2_hg38_only_chromosome.bwt2merged.SortName.bam
├── [4.0K]  bam.hicpro.W1
│   └── [4.0K]  ND6_rep1
│       ├── [106G]  293T-MGI-HiC-ND6_rep1_R1_hg38_only_chromosome.bwt2merged.SortName.bam
│       └── [108G]  293T-MGI-HiC-ND6_rep1_R2_hg38_only_chromosome.bwt2merged.SortName.bam
└── [4.0K]  bam.hicpro.W2
    └── [4.0K]  ND6_rep2
        ├── [109G]  293T-MGI-HiC-ND6_rep2_R1_hg38_only_chromosome.bwt2merged.SortName.bam
        └── [108G]  293T-MGI-HiC-ND6_rep2_R2_hg38_only_chromosome.bwt2merged.SortName.bam

# run hicpro
# One background HiC-Pro run per input folder (bam.hicpro.G1/G2/W1/W2), all with
# the same BAM-start config and the same four -s steps; each run writes to its
# own 03_1.hic_DdCBE_<run> output directory.
# NOTE(review): merge_persample is not among the selected steps — confirm that
# build_contact_maps finds the files it needs without it.
for run in G1 G2 W1 W2; do
  HiC-Pro -i "/home/menghaowei/menghw_HD/DdCBE_project/10.hic_data/03.hic_DdCBE_MGI/bam.hicpro.${run}" \
    -o "03_1.hic_DdCBE_${run}" \
    -c hicpro_config/config-hicpro_hic_MGI_BAM_start.20211026.V2.txt \
    -s proc_hic -s quality_checks -s build_contact_maps -s ice_norm &
done


####################################################################################################
# 2021-10-26
# Hi-C MGI data with HiC-Pro
# 整理hicpro结果 并运行juicer
####################################################################################################
cd /home/menghaowei/menghw_HD/DdCBE_project/10.hic_data/03.hic_DdCBE_MGI/hicpro_result/valid_pairs

##### create one folder per sample in the current directory
mkdir GFP_rep1 GFP_rep2 ND6_rep1 ND6_rep2

##### HiC-Pro hic_results/data output of every run -> current directory
# Run tags: G1 = GFP_rep1, G2 = GFP_rep2, W1 / W2 = the replicates labelled
# WT_rep1 / WT_rep2 in the original notes.
hic_root=/home/menghaowei/menghw_HD/DdCBE_project/10.hic_data
for run in G1 G2 W1 W2; do
  mv "${hic_root}/03_1.hic_DdCBE_${run}/hic_results/data/${run}_fq"/* ./
done


##### logs, QC pictures and merged BAM files of every run -> per-sample folder
# Each entry is <run tag>:<destination folder>.
hic_root=/home/menghaowei/menghw_HD/DdCBE_project/10.hic_data
for entry in G1:GFP_rep1 G2:GFP_rep2 W1:ND6_rep1 W2:ND6_rep2; do
  run=${entry%%:*}
  sample_dir=${entry#*:}
  run_dir=${hic_root}/03_1.hic_DdCBE_${run}
  # Per sample the order is unchanged: logs, then QC pictures, then bowtie2 BAMs.
  for sub in logs hic_results/pic bowtie_results/bwt2; do
    mv "${run_dir}/${sub}/${run}_fq"/*  "./${sample_dir}/"
  done
done


################################################################################
# convert HiC-Pro format to juicer format
################################################################################
##### original (unmodified) script
# With input test.validPairs the output file is test.validPairs.hic.
# Arguments as used here: -i validPairs input, -g chromosome sizes file,
# -j juicer_tools jar, -r restriction-fragment BED (presumably DpnII, per the
# file name). Runs in the background; stdout and stderr both go to the .log file.
hicpro2juicebox.sh \
-i test.validPairs \
-g /home/menghaowei/menghw_HD/reference/HiCPro_ref/hg38.only_chrom.sizes \
-j /home/menghaowei/menghw_HD/software_package/juicer/juicer_tools_1.22.01.jar \
-r /home/menghaowei/menghw_HD/reference/HiCPro_ref/hg38_dpnii.bed > test.ValidPairs.hicproTojuicer.log  2>&1  &

##### 问题
1. 内含sort命令，可能需要大量内存；
2. 最后一步调用juicer pre命令，其中包含1kb resolution矩阵的构建，可能会造成巨大的内存开销

##### step1 from HiC-Pro validPairs to juicer pairs
# fix hicpro2juicebox.sh script
# 通过修改，可以分步骤运行


##### step2 from juicer pairs to .hic file
# Runs juicer_tools "pre" with a 64 GB Java heap on the sorted pairs file and
# builds the .hic file at the resolutions listed after -r (2.5 Mb down to 5 kb).
# The three trailing arguments are, in order: input pairs, output .hic and the
# chromosome sizes file (order-sensitive — confirm against this jar's usage text).
# NOTE(review): -f passes a restriction-site file although the output is named
# "NoFrag" — confirm which one was intended.
# NOTE(review): both --threads and -j are set to 20; check that this
# juicer_tools version accepts both options.
java -Xmx64g -jar /home/menghaowei/menghw_HD/software_package/juicer/juicer_tools_1.22.01.jar pre \
--threads 20 -j 20 \
-f ./tmp/2404_resfrag.juicebox  \
-r 2500000,1000000,500000,250000,100000,50000,25000,10000,5000 \
./tmp/2404_allValidPairs.pre_juicebox_sorted \
293T_WT.allValidPairs.NoFrag.hic \
/home/menghaowei/menghw_HD/reference/HiCPro_ref/hg38.only_chrom.sizes > 293T_WT.allValidPairs.NoFrag.hicproTohic.log  2>&1 &


####################################################################################################
# 2021-10-29
# Hi-C MGI data with HicExplorer
####################################################################################################
hicFindRestSite

# HiCExplorer: locate restriction sites in the hg38 chromosome-only FASTA.
# Searches for the pattern GATC and writes the sites as a BED file (named
# "dpnii"); runs in the background.
hicFindRestSite \
--fasta ~/menghw_HD/reference/bowtie2_hg38/hg38_only_chromosome.fa \
--searchPattern GATC \
-o /home/menghaowei/menghw_HD/reference/HicExplorer_ref/hicexp_dpnii_hg38.bed &


# Test: build a Hi-C contact matrix with HiCExplorer from the R1/R2 "SortName" BAMs
cd /home/menghaowei/menghw_HD/DdCBE_project/10.hic_data/03.hic_DdCBE_MGI/bam.hicpro

# Needs about 150 GB of memory.
# Uses GATC as restriction and dangling sequence together with the GATC cut-site
# BED, a minimum mapping quality of 20, 10 threads, and writes QC output to QC_res.
# NOTE(review): several --binSize values are given together with an .h5 output
# name; the HiCExplorer documentation describes multiple bin sizes as an mcool
# feature — confirm the output format that is actually produced.
# NOTE(review): --chromosomeSizes points at reference/hg38.only_chrom.sizes while
# the HiC-Pro/juicer commands use reference/HiCPro_ref/hg38.only_chrom.sizes —
# confirm both files exist and agree.
hicBuildMatrix \
--samFiles \
293T-MGI-HiC-GFP_rep1_R1_hg38_only_chromosome.bwt2merged.SortName.bam \
293T-MGI-HiC-GFP_rep1_R2_hg38_only_chromosome.bwt2merged.SortName.bam \
--outFileName test.RawMatrix.h5 \
--binSize 20000 40000 100000 200000 500000 1000000 \
--threads 10 \
--QCfolder QC_res \
--restrictionCutFile ~/menghw_HD/reference/HicExplorer_ref/hicexp_dpnii_hg38.bed \
--restrictionSequence GATC \
--danglingSequence GATC \
--minMappingQuality 20 \
--chromosomeSizes /home/menghaowei/menghw_HD/reference/hg38.only_chrom.sizes

# Matrix correction with the KR method (--correctionMethod KR).
# Corrects the ND6_rep2 raw matrix into the 20 kb KR matrix; runs in the
# background with stdout and stderr captured in the matching .log file.
hicCorrectMatrix correct \
-m 293T-MGI-HiC-ND6_rep2_hg38.RawMatrix.h5 \
--correctionMethod KR \
-o 293T-MGI-HiC-ND6_rep2_hg38.KR.20Kbp.h5 > 293T-MGI-HiC-ND6_rep2_hg38.KR.20Kbp.log 2>&1 & 

# call TADs on the KR-corrected 25 kb matrix (FDR multiple-testing correction)
# Request 24 CPUs for the task with -c (--cpus-per-task) so that
# --numberOfProcessors 24 is backed by an actual allocation. srun's -T
# (--threads) only limits the threads srun itself uses to launch the job and
# does not reserve CPUs for hicFindTADs.
# Runs in the background; stdout and stderr go to the .log file.
srun -c 24 hicFindTADs \
-m 293T_WT.KR.CorrectMatrix.25Kbp.h5 \
--outPrefix 293T_WT.KR.25Kbp \
--numberOfProcessors 24 \
--correctForMultipleTesting fdr > 293T_WT.hicFindTADs.25Kbp.log 2>&1 &

# plot
# Render the same track file and region three times, in parallel, into three PDFs.
for out_name in test test_2 test_3; do
  hicPlotTADs --tracks 20211101-DdCBE_plot_TAD.ini -o "out_image/${out_name}.pdf" --region chrX:60000000-80000000 &
done

```