# **base GRN - *from atlas of ATAC regulatory regions***

In [None]:
import pandas as pd

In [None]:
cd /home/jovyan/jm_jlab/data_indNeuro/consensus_atlas_ATACregions_hg38/GREAT_results/

In [None]:
great_output  = pd.read_csv("./consensus_peaks_hg38_GREAT.txt", sep='\t')

In [None]:
len(great_output)

In [None]:
great_output.head()

In [None]:
# Keep only the first two GREAT columns: the peak name and the genes
# associated with it.
# The header text GREAT writes ('# GREAT version 4.0.4', 'Species assembly:
# hg38') embeds the tool version and assembly, so matching on it would
# silently rename nothing for any other GREAT release. Name the columns by
# position instead.
# .copy() makes this an independent frame, so later column assignments do
# not act on a slice of the table that was read from disk.
great_output = great_output.iloc[:, 0:2].copy()
great_output.columns = ["peak_name", "gene_short_name"]

In [None]:
len(great_output)

In [None]:
#Cleaning output
great_output.head()

In [None]:
#Cleaning output
great_output.tail()

In [None]:
# Remove every parenthesised annotation that follows a gene name
# (presumably GREAT's signed distance to the TSS; verify against the file),
# then trim the string and split it on " , " into a list of gene names.
genes_without_annotation = great_output['gene_short_name'].str.replace(r"\(.*?\)", "", regex=True)
trimmed_genes = genes_without_annotation.str.strip()
great_output['gene_short_name'] = trimmed_genes.str.split(" , ")

In [None]:
great_output = great_output.explode('gene_short_name')

In [None]:
# Discard rows whose gene field contains "NONE". Missing values are flagged
# as well (na=True), so they are dropped exactly as before.
has_no_gene = great_output["gene_short_name"].str.contains("NONE", na=True)
great_output = great_output[~has_no_gene]

In [None]:
print(great_output.head())
print(len(great_output))

In [None]:
all_sorted = pd.read_csv("/home/jovyan/jm_jlab/data_indNeuro/consensus_atlas_ATACregions_hg38/consensus_signals.bed", sep='\t', header=None)

In [None]:
all_sorted.head(2)

In [None]:
# The BED file was read without a header, so its columns are numbered.
# Give the first four their names.
bed_column_names = {0: 'chr', 1: 'start', 2: 'end', 3: 'peak_name'}
all_sorted = all_sorted.rename(columns=bed_column_names)

In [None]:
# Attach the genomic coordinates (chr/start/end) to every peak-gene pair by
# matching on peak_name. merge() defaults to an inner join, so rows whose
# peak_name is absent from consensus_signals.bed are dropped.
great_output = great_output.merge(all_sorted, on = ['peak_name'])

In [None]:
great_output.head(2)

In [None]:
# Build the two columns of the annotated peak table:
#   peak_id         - "<chr>_<start>_<end>", the format decompose_chrstr parses
#   gene_short_name - gene associated with the peak
# Columns are selected by name rather than by position so the result does
# not depend on the column order produced by the merge.
data = {
    'peak_id': (
        great_output['chr'].astype(str)
        + "_" + great_output['start'].astype(str)
        + "_" + great_output['end'].astype(str)
    ),
    'gene_short_name': great_output['gene_short_name'],
}

In [None]:
annotated = pd.DataFrame(data)

In [None]:
annotated.head(2)

In [None]:
print(len(annotated))

# **Scan for TF binding motifs** - CELLORACLE

**Scan for TF binding motifs** - CELLORACLE

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


import seaborn as sns

import os, sys, shutil, importlib, glob
from tqdm.notebook import tqdm

import celloracle as co
from celloracle import motif_analysis as ma
from celloracle.utility import save_as_pickled_object
co.__version__

In [None]:
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

plt.rcParams['figure.figsize'] = (15,7)
plt.rcParams["savefig.dpi"] = 600

In [None]:
# Reference genome used for the motif scan. The ATAC atlas and the GREAT
# annotation loaded above are both on hg38 (see the input paths).
ref_genome = "hg38"

# Ask CellOracle whether this genome is already available locally.
# Both names must be defined before the check below and before
# check_peak_format / TFinfo, which take ref_genome as an argument.
genome_installation = ma.is_genome_installed(ref_genome=ref_genome)

if not genome_installation:
    # Download the genome through genomepy only when it is missing.
    import genomepy
    genomepy.install_genome(name=ref_genome, provider="UCSC")
else:
    print(ref_genome, "is installed.")

In [None]:
def decompose_chrstr(peak_str):
    """
    Split a peak identifier into its chromosome, start and end fields.

    Args:
        peak_str (str): peak identifier, e.g. 'chr1_3094484_3095479'

    Returns:
        tuple: (chromosome name, start, end); start and end are returned
        as the strings found in ``peak_str``, not converted to int.
    """
    # Only the last two "_"-separated fields are coordinates. Everything
    # before them is the chromosome name, which may itself contain
    # underscores, so it is re-joined.
    *name_parts, start, end = peak_str.split("_")
    return "_".join(name_parts), start, end

from genomepy import Genome

def check_peak_format(peaks_df, ref_genome):
    """
    Check peak format and drop peaks that fail the checks.

     (1) Check chromosome name: it must exist in the reference genome.
     (2) Check peak size (length) and remove short DNA sequences (<5bp)

    Args:
        peaks_df (pandas.DataFrame): table with a "peak_id" column whose
            values look like 'chr1_3094484_3095479'.
        ref_genome (str): genome name handed to ``Genome``.

    Returns:
        pandas.DataFrame: the rows of ``peaks_df`` that pass both checks.
        The input frame is not modified. A summary of the filtering is
        printed.
    """
    # Minimum accepted peak length in bp
    n_threshold = 5

    df = peaks_df.copy()
    n_peaks_before = df.shape[0]

    # Decompose peaks and make df. Passing the column names to the
    # constructor (instead of going through a numpy array) keeps the three
    # columns even when there are no peaks at all.
    decomposed = [decompose_chrstr(peak_str) for peak_str in df["peak_id"]]
    df_decomposed = pd.DataFrame(decomposed,
                                 columns=["chr", "start", "end"],
                                 index=peaks_df.index)
    df_decomposed["start"] = df_decomposed["start"].astype(int)
    df_decomposed["end"] = df_decomposed["end"].astype(int)

    # Load genome data
    genome_data = Genome(ref_genome)
    all_chr_list = list(genome_data.keys())

    # DNA length check (abs() tolerates peaks written as end < start)
    lengths = np.abs(df_decomposed["end"] - df_decomposed["start"])
    has_valid_length = lengths >= n_threshold

    # Chromosome name check
    has_valid_chr = df_decomposed["chr"].isin(all_chr_list)

    # Keep only peaks that pass both checks
    df = df[has_valid_length & has_valid_chr]

    # Data counting. A peak failing both checks is counted in both numbers.
    n_invalid_length = int((~has_valid_length).sum())
    n_peaks_invalid_chr = int((~has_valid_chr).sum())
    n_peaks_after = df.shape[0]

    # Report
    print("Peaks before filtering: ", n_peaks_before)
    print("Peaks with invalid chr_name: ", n_peaks_invalid_chr)
    print("Peaks with invalid length: ", n_invalid_length)
    print("Peaks after filtering: ", n_peaks_after)

    return df

In [None]:
peaks = check_peak_format(annotated, ref_genome)

In [None]:
peaks.head()

In [None]:
# Instantiate TFinfo object
tfi = ma.TFinfo(peak_data_frame=peaks,
                ref_genome=ref_genome)

In [None]:
import glob
import os
from gimmemotifs.motif import MotifConfig

# Ask gimmemotifs where its motif database files are stored
config = MotifConfig()
motif_dir = config.get_motif_dir()

# Get motif data names: every *.pfm file in that directory, alphabetically
motifs_data_name = sorted(
    file_name for file_name in os.listdir(motif_dir) if file_name.endswith(".pfm")
)
motifs_data_name

**HOCOMOCO**

In [None]:
# You can load motif files with "read_motifs"
from gimmemotifs.motif import read_motifs

path = os.path.join(motif_dir, "HOCOMOCOv11_HUMAN.pfm")
motifs = read_motifs(path)

# Check first 10 motifs
motifs[:10]

In [None]:
# Scan the peaks held by tfi for the motifs loaded above.
# NOTE(review): the trailing note reports an error with HOCOMOCO v11; the
# error itself is not recorded here - confirm whether it still occurs.
tfi.scan(motifs=motifs, verbose=True) #error with HOCOMOCO v11

In [None]:
# Check motif scan results
tfi.scanned_df.head()

In [None]:
# Reset filtering
tfi.reset_filtering()

# Do filtering
tfi.filter_motifs_by_score(threshold=10)

# Format post-filtering results.
tfi.make_TFinfo_dataframe_and_dictionary(verbose=True)

In [None]:
df = tfi.to_dataframe()
df.head()

In [None]:
pwd

In [None]:
# Save result as a dataframe
df.to_parquet("/home/jovyan/jm_jlab/data_indNeuro/consensus_atlas_ATACregions_hg38/base_GRN_dataframe_HOCOMOCOv11.parquet")