## Extract region of -2:+3 for experimental and reference

BBCA+1BW (where B is C/G/T, and W is A/T)  
http://genesdev.cshlp.org/content/31/13/1289.full

In [1]:
## Parameters specific to where your folders are and your data
parameter_file = '../params/params.yaml'
import yaml
import sys

with open(parameter_file,'r') as f:
    doc = yaml.load(f)

#p = dic2obj(**doc)

data_folder = doc['data_folder']
tissues = doc['tissues'].split(',')
sys.path.append(doc['pipeline_path'])
ref_fa = doc['ref_fa']
anno_gff=doc['annotation']
mRNA_peak_file = doc["mRNA_peak_file"]

tss_annotation = doc['tss_annotation']


import os

import subprocess
import sys
import pandas as pd
import matplotlib
import seaborn as sns
import pickle
import numpy as np
import matplotlib as mpl
from matplotlib import pyplot as plt
from itertools import product
import glob
import re
from matplotlib_venn import venn2
from matplotlib import rcParams
import inspect
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
sys.setrecursionlimit(3000)
%load_ext autoreload
%autoreload 2
rcParams['figure.figsize'] = 8, 6
import tqdm

from os.path import basename
##mpl.use('Agg')
#mpl.style.use('ggplot')
#mpl.style.use('fivethirtyeight')
from Homer import *
import helper
import create_output
print('Number of tissues: ',len(tissues))

  import sys


Number of tissues:  13


In [2]:
save_dir = "Results/homer_motifs/"
if not os.path.exists(save_dir):
    os.mkdir(save_dir)

## Read experimental bed file and create its companion refseq region

In [None]:
exp_bed_f = "../Results/output/TSS1.exp.bed"
#mrna_bed_f = "../Results/Figures/Figure3/A.TSS1_mrna"


## Create RefSeq peak file for this
meta_f = "../Results/output/TSS1.exp.meta"
mrna_bed_f = os.path.join(save_dir, "TSS1.exp_refseq_centered.bed")

#mrna_filt = "Results/Figures/Figure3/A.mrna"
create_output.exp_bed_to_refseq(exp_bed_f,meta_f,refseq_f=mRNA_peak_file,save_f=mrna_bed_f,is_unique=True)

### Create narrow bed file for hg38

In [None]:
hg38 = pd.read_csv("/data/isshamie/genome/hg38/mRNA_final.peak", sep="\t")
hg38["Stat"] = 0
hg38 = hg38[["Chr", "Start", "End","Name", "Stat", "Strand"]]
hg38 = hg38.drop_duplicates(subset=("Chr","Start", "End", "Strand"))
hg38 = hg38.rename({"Name":"ID"}, axis=1)

hg38_bed_f = os.path.join(save_dir, "hg38.bed")
write_bed_file(hg38, hg38_bed_f, use_index=False)


## For both bed files, only take -2:+3

In [None]:
def bed_subset_region(bed_f, f_save=None, region=(-2,+3)):
    bed = read_bed_file(bed_f)
    center = np.ceil((bed["Start"] + bed["End"])/2)
    bed["Start"] = (center+region[0]).astype(int)
    bed["End"] = (center+region[1]).astype(int)
    print(bed.columns)
    print(bed.index.name)
    if f_save is not None:
        write_bed_file(bed,f_save)
        
    return bed

In [None]:
exp_f_save = os.path.join(save_dir, "TSS1.exp.narrow.bed")
bed_subset_region(bed_f = exp_bed_f, f_save=exp_f_save, region=(-2,+3))

In [None]:
mrna_f_save = os.path.join(save_dir, "TSS1.exp_refseq_centered.narrow.bed")
bed_subset_region(bed_f = mrna_bed_f, f_save=mrna_f_save, region=(-2,+3))

In [None]:
hg38_f_save = os.path.join(save_dir, "hg38.narrow.bed")
bed_subset_region(bed_f = hg38_bed_f, f_save=hg38_f_save, region=(-2,+3))

## Run findMotifs in Homer

In [9]:
def run_findMotifs(bed_f, out_dir, ref_fa, args=None,  bg=None):
    if not os.path.exists(curr_dir):
        os.mkdir(curr_dir)
    cmd = f"nohup findMotifsGenome.pl {bed_f} {ref_fa} {out_dir} -size given -len 3,4,5 "
    if bg is not None:
        cmd = f"{cmd} -bg {bg} " 
    if args is not None:
        cmd = cmd + " ".join(args) + " "
    cmd = f"{cmd} > {out_dir}.log"
    print(cmd)
    os.system(cmd)    
    return

In [10]:
curr_dir = os.path.join(save_dir, "eTSS_bg_rTSS_nocpg")
run_findMotifs(exp_f_save, out_dir=curr_dir,ref_fa=ref_fa, bg=mrna_f_save)


exp_dir = os.path.join(save_dir, "eTSS_motifs_nocpg")
run_findMotifs(exp_f_save, out_dir=exp_dir,ref_fa=ref_fa)

ref_dir = os.path.join(save_dir, "rTSS_motifs_nocpg")
run_findMotifs(mrna_f_save, out_dir=ref_dir,ref_fa=ref_fa)

hg38_out = os.path.join(save_dir, "hg38_nocpg")
hg38_genome = "/data/isshamie/genome/hg38/GCF_000001405.38_GRCh38.p12_genomic.fna"
run_findMotifs(hg38_f_save, out_dir=hg38_out,ref_fa=hg38_genome)



nohup findMotifsGenome.pl Results/homer_motifs/TSS1.exp.narrow.bed /data/isshamie/genome/ncbi_anno_103/GCF_003668045.1_CriGri-PICR_genomic.fna Results/homer_motifs/eTSS_bg_rTSS_nocpg -size given -len 3,4,5  -bg Results/homer_motifs/TSS1.exp_refseq_centered.narrow.bed  > Results/homer_motifs/eTSS_bg_rTSS_nocpg.log
nohup findMotifsGenome.pl Results/homer_motifs/TSS1.exp.narrow.bed /data/isshamie/genome/ncbi_anno_103/GCF_003668045.1_CriGri-PICR_genomic.fna Results/homer_motifs/eTSS_motifs_nocpg -size given -len 3,4,5  > Results/homer_motifs/eTSS_motifs_nocpg.log
nohup findMotifsGenome.pl Results/homer_motifs/TSS1.exp_refseq_centered.narrow.bed /data/isshamie/genome/ncbi_anno_103/GCF_003668045.1_CriGri-PICR_genomic.fna Results/homer_motifs/rTSS_motifs_nocpg -size given -len 3,4,5  > Results/homer_motifs/rTSS_motifs_nocpg.log
nohup findMotifsGenome.pl Results/homer_motifs/hg38.narrow.bed /data/isshamie/genome/hg38/GCF_000001405.38_GRCh38.p12_genomic.fna Results/homer_motifs/hg38_nocpg -size

In [11]:
curr_dir = os.path.join(save_dir, "eTSS_bg_rTSS")
run_findMotifs(exp_f_save, out_dir=curr_dir,ref_fa=ref_fa, bg=mrna_f_save, args=["-cpg"])

exp_dir = os.path.join(save_dir, "eTSS_motifs")
run_findMotifs(exp_f_save, out_dir=exp_dir,ref_fa=ref_fa, args=["-cpg"])

ref_dir = os.path.join(save_dir, "rTSS_motifs")
run_findMotifs(mrna_f_save, out_dir=ref_dir,ref_fa=ref_fa, args=["-cpg"])

hg38_out = os.path.join(save_dir, "hg38")
hg38_genome = "/data/isshamie/genome/hg38/GCF_000001405.38_GRCh38.p12_genomic.fna"
run_findMotifs(hg38_f_save, out_dir=hg38_out,ref_fa=hg38_genome, args=["-cpg"])


nohup findMotifsGenome.pl Results/homer_motifs/TSS1.exp.narrow.bed /data/isshamie/genome/ncbi_anno_103/GCF_003668045.1_CriGri-PICR_genomic.fna Results/homer_motifs/eTSS_bg_rTSS -size given -len 3,4,5  -bg Results/homer_motifs/TSS1.exp_refseq_centered.narrow.bed -cpg  > Results/homer_motifs/eTSS_bg_rTSS.log
nohup findMotifsGenome.pl Results/homer_motifs/TSS1.exp.narrow.bed /data/isshamie/genome/ncbi_anno_103/GCF_003668045.1_CriGri-PICR_genomic.fna Results/homer_motifs/eTSS_motifs -size given -len 3,4,5 -cpg  > Results/homer_motifs/eTSS_motifs.log
nohup findMotifsGenome.pl Results/homer_motifs/TSS1.exp_refseq_centered.narrow.bed /data/isshamie/genome/ncbi_anno_103/GCF_003668045.1_CriGri-PICR_genomic.fna Results/homer_motifs/rTSS_motifs -size given -len 3,4,5 -cpg  > Results/homer_motifs/rTSS_motifs.log
nohup findMotifsGenome.pl Results/homer_motifs/hg38.narrow.bed /data/isshamie/genome/hg38/GCF_000001405.38_GRCh38.p12_genomic.fna Results/homer_motifs/hg38 -size given -len 3,4,5 -cpg  > R

In [13]:
curr_dir = os.path.join(save_dir, "eTSS_bg_rTSS_nocpg")
if not os.path.exists(curr_dir):
    os.mkdir(curr_dir)

run_findMotifs(exp_f_save, out_dir=curr_dir,ref_fa=ref_fa, bg=mrna_f_save)

cmd = f"nohup findMotifsGenome.pl {exp_f_save} {ref_fa} {curr_dir} -size given -bg {mrna_f_save} -len 3,4,5 > {curr_dir}.log"
print(cmd)
os.system(cmd)

nohup findMotifsGenome.pl Results/homer_motifs/TSS1.exp.narrow.bed /data/isshamie/genome/ncbi_anno_103/GCF_003668045.1_CriGri-PICR_genomic.fna Results/homer_motifs/eTSS_bg_rTSS_nocpg -size given -bg Results/homer_motifs/TSS1.exp_refseq_centered.narrow.bed -len 3,4,5 > Results/homer_motifs/eTSS_bg_rTSS_nocpg.log


0

In [10]:
exp_dir = os.path.join(save_dir, "eTSS_motifs_nocpg")
if not os.path.exists(exp_dir):
    os.mkdir(exp_dir)
cmd = f"nohup findMotifsGenome.pl {exp_f_save} {ref_fa} {exp_dir} -size given -len 3,4,5 > {exp_dir}.log"
print(cmd)
os.system(cmd)

findMotifsGenome.pl Results/homer_motifs/TSS1.exp.narrow.bed /data/isshamie/genome/ncbi_anno_103/GCF_003668045.1_CriGri-PICR_genomic.fna Results/homer_motifs/eTSS_motifs_nocpg -size given -len 3,4,5 


0

In [11]:
ref_dir = os.path.join(save_dir, "rTSS_motifs_nocpg")
if not os.path.exists(ref_dir):
    os.mkdir(ref_dir)
cmd = f"nohup findMotifsGenome.pl {exp_f_save} {ref_fa} {ref_dir} -size given -len 3,4,5 > {ref_dir}.log "
print(cmd)
os.system(cmd)

findMotifsGenome.pl Results/homer_motifs/TSS1.exp.narrow.bed /data/isshamie/genome/ncbi_anno_103/GCF_003668045.1_CriGri-PICR_genomic.fna Results/homer_motifs/rTSS_motifs_nocpg -size given -len 3,4,5 


0

In [25]:
hg38_out = os.path.join(save_dir, "hg38_nocpg")
if not os.path.exists(hg38_out):
    os.mkdir(hg38_out)

hg38_genome = "/data/isshamie/genome/hg38/GCF_000001405.38_GRCh38.p12_genomic.fna"
cmd = f"nohup findMotifsGenome.pl {hg38_f_save} {hg38_genome} {hg38_out} -size given -len 3,4,5 {hg38}.log"
print(cmd)
os.system(cmd)

findMotifsGenome.pl Results/homer_motifs/hg38.narrow.bed /data/isshamie/genome/hg38/GCF_000001405.38_GRCh38.p12_genomic.fna Results/homer_motifs/hg38 -size given -len 3,4,5,6 


2

In [13]:
ref_dir = os.path.join(save_dir, "rTSS_motifs")
if not os.path.exists(ref_dir):
    os.mkdir(ref_dir)
cmd = f"findMotifsGenome.pl {exp_f_save} {ref_fa} {ref_dir} -size given -len 3,4,5 -cpg"
print(cmd)
os.system(cmd)

findMotifsGenome.pl Results/homer_motifs/TSS1.exp.narrow.bed /data/isshamie/genome/ncbi_anno_103/GCF_003668045.1_CriGri-PICR_genomic.fna Results/homer_motifs/rTSS_motifs -size given -len 3,4,5  -cpg


0

In [16]:
exp_bg_ref_dir = os.path.join(save_dir, "exp_bg_ref_dir_motifs")
if not os.path.exists(exp_bg_ref_dir):
    os.mkdir(exp_bg_ref_dir)
cmd = f"findMotifsGenome.pl {exp_f_save} {ref_fa} {exp_bg_ref_dir} -size given -bg {mrna_f_save} -len 3,4,5  -cpg"
print(cmd)
os.system(cmd)

findMotifsGenome.pl Results/homer_motifs/TSS1.exp.narrow.bed /data/isshamie/genome/ncbi_anno_103/GCF_003668045.1_CriGri-PICR_genomic.fna Results/homer_motifs/exp_bg_ref_dir_motifs -size given -bg Results/homer_motifs/TSS1.exp_refseq_centered.narrow.bed -len 3,4,5  -cpg


0

In [15]:
exp_dir = os.path.join(save_dir, "eTSS_motifs")
if not os.path.exists(exp_dir):
    os.mkdir(exp_dir)
cmd = f"findMotifsGenome.pl {exp_f_save} {ref_fa} {exp_dir} -size given -len 3,4,5  -cpg"
print(cmd)
os.system(cmd)

findMotifsGenome.pl Results/homer_motifs/TSS1.exp.narrow.bed /data/isshamie/genome/ncbi_anno_103/GCF_003668045.1_CriGri-PICR_genomic.fna Results/homer_motifs/eTSS_motifs -size given -len 3,4,5  -cpg


0

## Plot PWM and Results