# GRN - based on scFates milestones (pcw16)

In [None]:
pwd

In [None]:
!cd /home/jovyan/jm_jlab/

import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns

import celloracle as co
co.__version__

# visualization settings
%config InlineBackend.figure_format = 'retina'
%matplotlib inline


save_folder = "/home/jovyan/jm_jlab/data_indNeuro/1.GRN/"
os.makedirs(save_folder, exist_ok=True)

from pathlib import Path

sc._settings.ScanpyConfig.cachedir = Path('/home/jovyan/jm_jlab/celloracle_data/cache')

In [None]:
# Folder for the per-milestone filtered GRN tables and rank plots.
save_filtered_links = "/home/jovyan/jm_jlab/data_indNeuro/consensus_atlas_ATACregions_hg38/GRN_trevino21/milestones_pcw16/"
# Create it up front so the later `to_csv` exports do not fail on a missing folder.
os.makedirs(save_filtered_links, exist_ok=True)

## **scRNA-seq processing**

In [None]:
# Expression matrix for all cells.
adata = sc.read_h5ad("/home/jovyan/jm_jlab/data_indNeuro/intermediate_files/rna_counts.h5ad")

# scFates output table, indexed by its first column
# (presumably cell barcodes — verify against the file).
milestones = pd.read_csv("/home/jovyan/jm_jlab/data_indNeuro/scFates_output/scFates_clustering.tsv", sep='\t', index_col=0)

# Align the scFates table to adata's own cell order before attaching it. A plain
# axis=1 concat outer-joins the two indexes, which can add or reorder rows when
# they differ and break the correspondence between obs and X.
adata.obs = pd.concat([adata.obs, milestones.reindex(adata.obs_names)], axis=1)

# NOTE(review): the embedding table is attached by row position, not by cell
# name — confirm its rows are in the same order as adata.
cellembeddings = pd.read_csv("/home/jovyan/jm_jlab/data_indNeuro/intermediate_files/cellembeddings.tsv", sep='\t')
cellembeddings.iloc[:,1] = cellembeddings.iloc[:,1]*-1 #for better geometry
adata.obsm['X_pca'] = cellembeddings.to_numpy() #sharing embeddings

# Restrict the analysis to the pcw16 cells.
adata = adata[adata.obs['Age'] == 'pcw16',:].copy()

# Drop genes detected in fewer than 50 cells.
sc.pp.filter_genes(adata, min_cells=50)
# flavor='seurat_v3' expects count data, so rank genes BEFORE the per-cell
# normalization (the original order ran it on normalized, non-integer values).
# Assumes rna_counts.h5ad stores raw counts in X — TODO confirm.
sc.pp.highly_variable_genes(adata, n_top_genes=4000, flavor='seurat_v3') #let's retain more genes before further filtering
sc.pp.normalize_per_cell(adata, key_n_counts='n_counts_all')

In [None]:
# Diagnostic plot of the highly-variable-gene selection.
sc.pl.highly_variable_genes(adata)

In [None]:
# Same diagnostic plot, restricted to genes whose mean is below 50.
low_mean_mask = adata.var['means'] < 50
sc.pl.highly_variable_genes(adata[:, low_mean_mask])

In [None]:
# Keep only the highly variable genes. Copy the slice so the following cells
# modify a real AnnData object rather than a view of the full one.
adata = adata[:, adata.var.highly_variable].copy()

In [None]:
# Renormalize per cell now that only the selected genes remain.
sc.pp.normalize_per_cell(adata, key_n_counts='n_counts_all')

In [None]:
# Snapshot the un-logged matrix twice: as `adata.raw` and as the "raw_count" layer.
# (These values were already per-cell normalized in the preceding cell.)
adata.raw = adata.copy()
adata.layers["raw_count"] = adata.X.copy()

# Log-transform X in place.
sc.pp.log1p(adata)

In [None]:
# Base GRN table (HOCOMOCO v11 according to the file name), read from parquet.
df = pd.read_parquet("/home/jovyan/jm_jlab/data_indNeuro/consensus_atlas_ATACregions_hg38/base_GRN_dataframe_HOCOMOCOv11.parquet")

# Peek at the first two rows.
df.head(2)

In [None]:
# Flag genes whose name matches a column of the base GRN table, skipping its
# first two columns (presumably non-TF metadata columns — verify).
tf_columns = df.columns[2:]
adata.var['TF_HOCOMOCO'] = adata.var_names.isin(tf_columns)

In [None]:
# Keep genes that are highly variable OR flagged as a HOCOMOCO TF.
# NOTE(review): adata was already reduced to highly variable genes in an earlier
# cell, so the TF clause cannot bring back any extra gene here — confirm whether
# TFs were meant to be retained before that subsetting.
is_hvg = adata.var['highly_variable'] == True
is_tf = adata.var['TF_HOCOMOCO'] == True
adata = adata[:, is_hvg | is_tf].copy()

In [None]:
# Use small square panels for the embedding plots.
sc.set_figure_params(figsize=(4,4))

# Shared embedding coloured by scFates milestone.
sc.pl.pca(adata, color=['milestones'])

In [None]:
# Milestones plus a panel of marker genes on the shared embedding
# (default `use_raw` behaviour).
marker_panel = ['milestones', 'EGR1', 'HOPX', 'EOMES', 'GLI3', 'PPP1R17', 'NHLH1', 'HES1', 'KLF6']
sc.pl.pca(adata, color=marker_panel)

In [None]:
# Same marker panel, but coloured from adata.X instead of adata.raw.
marker_panel = ['milestones', 'EGR1', 'HOPX', 'EOMES', 'GLI3', 'PPP1R17', 'NHLH1', 'HES1', 'KLF6']
sc.pl.pca(adata, color=marker_panel, use_raw=False)

**pseudotime** (from scFates)

In [None]:
# Embedding coloured by the 't' column (the scFates pseudotime, per the heading
# of this section) next to the milestone labels.
sc.pl.pca(adata, color=['t', 'milestones'])

**Final object**

In [None]:
# Display the summary of the processed AnnData object.
adata

## **PEAKS**

In [None]:
import os, sys, shutil, importlib, glob
from tqdm.notebook import tqdm

In [None]:
from celloracle import motif_analysis as ma
from celloracle.utility import save_as_pickled_object

In [None]:
# Inline, high-resolution figure rendering (repeats the settings from the first cell).
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

# Default figure size and the resolution used when figures are saved.
plt.rcParams['figure.figsize'] = (15,7)
plt.rcParams["savefig.dpi"] = 600

In [None]:
# Preview the base GRN table again before handing it to CellOracle.
df.head(2)

## CellOracle

In [None]:
# Create an empty CellOracle Oracle object; data is imported into it in later cells.
oracle = co.Oracle()

In [None]:
# Display the Oracle object before any data has been imported.
oracle

In [None]:
# List the obs columns and embeddings available for the Oracle import.
obs_columns = adata.obs.columns.tolist()
embedding_keys = list(adata.obsm)
print("metadata columns :", obs_columns)
print("dimensional reduction: ", embedding_keys)

In [None]:
# Hand Oracle the un-logged values kept in the "raw_count" layer rather than
# the log1p-transformed matrix currently in X.
adata.X = adata.layers["raw_count"].copy()

# Load expression, milestone labels and the shared PCA embedding into Oracle.
# Original author's note: this import performs the log-transform itself.
oracle.import_anndata_as_raw_count(
    adata=adata,
    cluster_column_name="milestones",
    embedding_name="X_pca",
)

**HOCOMOCO CELLORACLE OBJECT:**

In [None]:
# Register the base GRN table (loaded earlier as `df`) as the TF information
# for this Oracle object.
oracle.import_TF_data(TF_info_matrix=df)

In [None]:
# Display the Oracle object again now that expression and TF data are loaded.
oracle

In [None]:
# Run PCA inside the Oracle object.
oracle.perform_PCA()

# Cumulative explained variance, reused for the plot and for the cut-off search.
cum_explained = np.cumsum(oracle.pca.explained_variance_ratio_)
plt.plot(cum_explained[:100])

# Pick the first position where the per-component gain switches sides of the
# 0.002 threshold, and mark it on the plot.
gain_above_cutoff = np.diff(cum_explained) > 0.002
n_comps = np.where(np.diff(gain_above_cutoff))[0][0]
plt.axvline(n_comps, c="k")
print(n_comps)

# Never use more than 50 components.
n_comps = min(n_comps, 50)

In [None]:
# Number of cells held by the Oracle object (first dimension of its AnnData).
n_cell, _ = oracle.adata.shape
print(f"cell number is :{n_cell}")

In [None]:
# Neighbourhood size for the KNN imputation: 2.5% of the cells, truncated to an
# integer (this becomes 0 when there are fewer than 40 cells).
k = int(0.025*n_cell)
print(f"Auto-selected k is :{k}")

In [None]:
# KNN imputation on the selected PCs; the balanced-KNN limits are set to
# multiples of k (8x and 4x).
oracle.knn_imputation(
    n_pca_dims=n_comps,
    k=k,
    balanced=True,
    b_sight=k*8,
    b_maxl=k*4,
    n_jobs=14,
)

In [None]:
%%time
# Infer one GRN per milestone with the "bagging_ridge" model
# (alpha=25, 20 bagging rounds, all available cores).

links = oracle.get_links(cluster_name_for_GRN_unit='milestones', 
                         alpha=25, 
                         bagging_number=20, 
                         verbose_level=10, 
                         test_mode=False, 
                         model_method="bagging_ridge", 
                         ignore_warning=False, n_jobs=-1)

In [None]:
%%time
# Infer one GRN per milestone again, this time with the "bayesian_ridge" model,
# keeping the other arguments identical to the bagging-ridge run for comparison.
# This step may take some time (~30 minutes, per the original note).

links_bayesian_ridge = oracle.get_links(cluster_name_for_GRN_unit='milestones', 
                         alpha=25, 
                         bagging_number=20, 
                         verbose_level=10, 
                         test_mode=False, 
                         model_method="bayesian_ridge", 
                         ignore_warning=False, n_jobs=-1)

**P VALUE - bagging_ridge**

In [None]:
# Prune the bagging-ridge networks: p-value cut-off 0.001, edges weighted by
# `coef_abs`, threshold_number 2000 (presumably the number of edges kept per
# network — confirm against the CellOracle documentation).
links.filter_links(p=0.001, weight="coef_abs", threshold_number=2000)

**P VALUE - bayesian_ridge**

In [None]:
# Prune the bayesian-ridge networks with the same settings as the
# bagging-ridge ones (p=0.001, weight `coef_abs`, threshold_number=2000).
links_bayesian_ridge.filter_links(p=0.001, weight="coef_abs", threshold_number=2000)

In [None]:
# Write one CSV of filtered bagging-ridge links per milestone.
for milestone, grn_table in links.filtered_links.items():
    csv_path = save_filtered_links+"GRN_for_"+milestone+"_baggingridge_pcw16.csv"
    grn_table.to_csv(csv_path, index=False)

**bayesian_ridge**

In [None]:
# Write one CSV of filtered bayesian-ridge links per milestone.
# Iterate over the bayesian-ridge object itself: the original looped over the
# keys of `links` (the bagging-ridge result) while indexing this one, which
# would miss or fail on any key the two objects do not share.
for i, grn_table in links_bayesian_ridge.filtered_links.items():

    grn_table.to_csv(save_filtered_links+"alt_GRN_for_"+i+"_bayesianridge_pcw16.csv", index=False)

In [None]:
# Smaller default figure size for the network plots that follow.
plt.rcParams["figure.figsize"] = [6, 4.5]

In [None]:
# Degree distributions of the bagging-ridge networks, with the fitted model drawn.
# Saving is disabled; to enable it pass save=f"{save_folder}/degree_distribution/".
links.plot_degree_distributions(plot_model=True)

**The bagging ridge algorithm gives higher R² scores**

In [None]:
# Degree distributions of the bayesian-ridge networks, with the fitted model drawn.
# Saving is disabled; to enable it pass save=f"{save_folder}/degree_distribution/".
links_bayesian_ridge.plot_degree_distributions(plot_model=True)

In [None]:
# Calculate network scores for the bagging-ridge networks
# (the next cell reads them from `links.merged_score`).
links.get_network_score()

In [None]:
# Preview the merged network-score table of the bagging-ridge networks.
links.merged_score.head()

**links_bayesian_ridge**

In [None]:
# Calculate network scores for the bayesian-ridge networks.
links_bayesian_ridge.get_network_score()

In [None]:
# Preview the merged network-score table of the bayesian-ridge networks.
links_bayesian_ridge.merged_score.head()

In [None]:
# Save the bagging-ridge Links object to HDF5.
# NOTE(review): the file name spells "baggindridge" (sic); it is left unchanged
# because anything that loads this file depends on the exact path.
links.to_hdf5(file_path="/home/jovyan/jm_jlab/data_indNeuro/1.GRN/milestones_pcw16_baggindridge.celloracle.links")

In [None]:
# Save the bayesian-ridge Links object to HDF5, next to the bagging-ridge one
# and prefixed with "alt_".
links_bayesian_ridge.to_hdf5(file_path="/home/jovyan/jm_jlab/data_indNeuro/1.GRN/alt_milestones_pcw16_bayesianridge.celloracle.links")

## Network analysis

In [None]:
# Milestone map on the shared embedding, as a reference for the per-milestone
# network plots below.
sc.pl.pca(adata, color='milestones')

In [None]:
# Score rank plots (10 genes each) for every milestone of the bagging-ridge networks.
for milestone in links.filtered_links:
    print(milestone)
    links.plot_scores_as_rank(cluster=milestone, n_gene=10)

In [None]:
# Same rank plots as the previous cell, this time passing a per-milestone
# `save` location under the filtered-links folder.
for milestone in links.filtered_links:
    print(milestone)
    rank_plot_target = save_filtered_links+"GRN_for_"+milestone+"_pcw16"
    links.plot_scores_as_rank(cluster=milestone, n_gene=10, save=rank_plot_target)

# Save Oracle object pcw16

In [None]:
# Persist the Oracle object for pcw16 to HDF5 so it can be reloaded later.
oracle.to_hdf5("/home/jovyan/jm_jlab/data_indNeuro/1.GRN/tr21_pcw16.celloracle.oracle")