# Step 1: Preprocess the RNA/ATAC data and build the guidance graph

In [None]:
import anndata as ad
import networkx as nx
import scanpy as sc
import scglue
from matplotlib import rcParams
# Root directory (relative path) under which all inputs and outputs of this notebook live.
out_dir = "../../output"

# Load the RNA and ATAC AnnData objects from disk.
rna = ad.read_h5ad(f"{out_dir}/scRNA/adata_rna.h5ad")
atac = ad.read_h5ad(f"{out_dir}/scATAC/adata_atac.h5ad")

# Apply scglue's plotting defaults first, then override the figure size.
scglue.plot.set_publication_params()
rcParams.update({"figure.figsize": (4, 4)})


In [None]:
# RNA preprocessing. The statement order below matters: each call operates on
# the state left by the previous one.

# Keep a copy of X as loaded in the "counts" layer before X is transformed below
# (the model configuration later reads this layer via use_layer="counts").
rna.layers["counts"] = rna.X.copy()
# Select the top 2000 highly variable genes; this runs before normalisation, so it
# sees X as loaded. NOTE(review): presumably X holds raw counts here -- confirm.
sc.pp.highly_variable_genes(rna, n_top_genes=2000, flavor="seurat_v3")
# Normalise, log-transform and scale. Return values are discarded, so these calls
# are relied upon to modify `rna` in place.
sc.pp.normalize_total(rna)
sc.pp.log1p(rna)
sc.pp.scale(rna)
# 100-component PCA, then a cosine-metric neighbour graph and a UMAP layout.
sc.tl.pca(rna, n_comps=100, svd_solver="auto")
sc.pp.neighbors(rna, metric="cosine")
sc.tl.umap(rna)

In [None]:
# Dimension reduction of the ATAC data with scglue's LSI (100 components, 15 iterations).
scglue.data.lsi(atac, n_iter=15, n_components=100)

# Cosine-metric neighbour graph on the "X_lsi" representation, followed by UMAP.
sc.pp.neighbors(atac, metric="cosine", use_rep="X_lsi")
sc.tl.umap(atac)


In [None]:
# Annotate genes with genomic coordinates from the GENCODE v45 GTF, matching
# on the GTF's "gene_name" field.
scglue.data.get_gene_annotation(
    rna,
    gtf_by="gene_name",
    gtf=f"{out_dir}/infer/scglue/gencode.v45.annotation.gtf.gz",
)

# Preview the coordinate columns that the annotation step is expected to add.
rna.var[["chrom", "chromStart", "chromEnd"]].head()

In [None]:
# Drop genes that received no genomic coordinates (chromEnd is NaN).
# Slicing an AnnData returns a *view* of the parent object; later cells write
# into this object (rna.obsm["X_glue"], rna.varm["X_glue"], rna.var["name"],
# and the dataset configuration), which on a view triggers an implicit copy and
# a warning. Calling .copy() here makes `rna` an independent object up front.
rna = rna[:, rna.var["chromEnd"].notna()].copy()

In [None]:
# Derive genomic coordinates for each peak from its name by splitting on ":" or "-".
# NOTE(review): assumes peak names look like "chrom:start-end" / "chrom-start-end",
# i.e. at least three fields with integer 2nd and 3rd fields -- confirm.
split = atac.var_names.str.split(r"[:-]")

# First field is the chromosome name (kept as-is).
atac.var["chrom"] = split.map(lambda parts: parts[0])

# Second and third fields are the start / end coordinates, stored as integers.
for _pos, _col in enumerate(("chromStart", "chromEnd"), start=1):
    atac.var[_col] = split.map(lambda parts, i=_pos: parts[i]).astype(int)

atac.var.head()

In [None]:
# Build the guidance graph from the RNA and ATAC datasets.
# NOTE(review): presumably this uses the chrom/chromStart/chromEnd columns of
# rna.var and atac.var prepared in the preceding cells -- see the scglue docs
# for the exact linking rule and defaults.
guidance = scglue.genomics.rna_anchored_guidance_graph(rna, atac)
# Last expression of the cell: display the graph object.
guidance



In [None]:
# Run scglue's sanity check of the guidance graph against both datasets
# (see scglue.graph.check_graph for the checks it performs).
scglue.graph.check_graph(guidance, [rna, atac])


In [None]:
# Re-inspect the peak metadata after graph construction.
# NOTE(review): presumably rna_anchored_guidance_graph added columns such as
# "highly_variable" to atac.var (later cells query that column) -- confirm here.
atac.var.head()


In [None]:
# Persist the Step 1 results under {out_dir}/infer/scglue.
# The three writes are independent of one another.
nx.write_graphml(guidance, f"{out_dir}/infer/scglue/guidance.graphml.gz")
atac.write(
    f"{out_dir}/infer/scglue/atac-pp.h5ad",
    compression="gzip",
)
rna.write(
    f"{out_dir}/infer/scglue/rna-pp.h5ad",
    compression="gzip",
)

# Step 2: Train the GLUE model and compute cell and feature embeddings

In [None]:
from itertools import chain

import anndata as ad
import itertools
import networkx as nx
import pandas as pd
import scanpy as sc
import scglue
import seaborn as sns
from matplotlib import rcParams

In [None]:
# Apply scglue's plotting defaults, then override the default figure size.
scglue.plot.set_publication_params()
rcParams.update({"figure.figsize": (4, 4)})

In [None]:
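# Uncomment to resume from the Step 1 outputs in a fresh kernel
# (paths match where Step 1 wrote them; `out_dir` must be defined first).
# rna = ad.read_h5ad(f"{out_dir}/infer/scglue/rna-pp.h5ad")
# atac = ad.read_h5ad(f"{out_dir}/infer/scglue/atac-pp.h5ad")
# guidance = nx.read_graphml(f"{out_dir}/infer/scglue/guidance.graphml.gz")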

In [None]:
# Configure the RNA dataset for model training:
#   - "NB" probabilistic model (presumably negative binomial -- see scglue docs),
#   - only features flagged as highly variable,
#   - data taken from the "counts" layer saved during preprocessing,
#   - the PCA embedding ("X_pca") passed as use_rep.
scglue.models.configure_dataset(
    rna,
    "NB",
    use_rep="X_pca",
    use_layer="counts",
    use_highly_variable=True,
)

In [None]:
# scglue.models.configure_dataset(
#     atac, "NB", use_highly_variable=True,
#     use_rep="X_lsi"
# )

In [None]:
# Configure the ATAC dataset for model training with the "NB" probabilistic
# model, restricted to features flagged as highly variable.
# Unlike the commented-out variant in the preceding cell, no use_rep="X_lsi"
# is passed here, and no use_layer is given.
# NOTE(review): atac.var["highly_variable"] is never set explicitly in this
# notebook -- presumably rna_anchored_guidance_graph adds it; confirm.
scglue.models.configure_dataset(
    atac, "NB", use_highly_variable=True
)

In [None]:
# Restrict the guidance graph to highly variable genes and peaks.
# .copy() turns the subgraph view into an independent graph.
guidance_hvf = guidance.subgraph([
    *rna.var.query("highly_variable").index,
    *atac.var.query("highly_variable").index,
]).copy()

In [None]:
# Train the GLUE model on both modalities, guided by the HVF-restricted graph.
# Note: the "directory" passed via fit_kws is the relative path "glue", so it
# resolves against the current working directory rather than out_dir.
glue = scglue.models.fit_SCGLUE(
    {"rna": rna, "atac": atac},
    guidance_hvf,
    fit_kws=dict(directory="glue"),
)

In [None]:
# Save the trained model alongside the other scglue outputs.
glue.save(f"{out_dir}/infer/scglue/glue.dill")

In [None]:
# Compute scglue's integration-consistency diagnostic for the trained model,
# using the same datasets and guidance graph it was trained with.
# The result `dx` is plotted in the next cell via its "n_meta" and
# "consistency" columns.
dx = scglue.models.integration_consistency(
    glue, {"rna": rna, "atac": atac}, guidance_hvf
)

In [None]:
# Consistency score against n_meta, with a dashed dark-red reference line at 0.05.
# Assigning to `_` suppresses the repr of the returned line artist.
_ = sns.lineplot(data=dx, x="n_meta", y="consistency").axhline(
    y=0.05, ls="--", c="darkred"
)


In [None]:
# Encode the cells of each modality with the trained model and store the
# result as the "X_glue" embedding on the corresponding AnnData.
for _key, _adata in (("rna", rna), ("atac", atac)):
    _adata.obsm["X_glue"] = glue.encode_data(_key, _adata)

In [None]:
# Concatenate the two modalities into one AnnData so that the shared "X_glue"
# embedding can be analysed jointly in the next cell.
# NOTE(review): ad.concat is called with its defaults -- confirm which
# obs/var fields survive the join for these two feature spaces.
combined = ad.concat([rna, atac])

In [None]:
# Joint neighbour graph (cosine metric) on the "X_glue" embedding, then UMAP.
sc.pp.neighbors(combined, metric="cosine", use_rep="X_glue")
sc.tl.umap(combined)

In [None]:
# Encode the graph features with the trained model and label the rows with
# the model's vertex names.
feature_embeddings = pd.DataFrame(
    glue.encode_graph(guidance_hvf),
    index=glue.vertices,
)

# Preview the top-left 5x5 corner of the embedding table.
feature_embeddings.head(5).iloc[:, :5]

In [None]:
# Attach the feature embeddings to each dataset, aligned to its own var_names.
# reindex() yields NaN rows for any feature absent from the embedding table.
for _adata in (rna, atac):
    _adata.varm["X_glue"] = feature_embeddings.reindex(_adata.var_names).to_numpy()

In [None]:
# rna.write(f"{out_dir}/rna-emb.h5ad", compression="gzip")
# atac.write(f"{out_dir}/atac-emb.h5ad", compression="gzip")
# nx.write_graphml(guidance_hvf, f"{out_dir}/guidance-hvf.graphml.gz")

# Step 3: Regulatory inference and pySCENIC-based GRN construction

In [None]:
import anndata as ad
import networkx as nx
import numpy as np
import pandas as pd
import scglue
import seaborn as sns
from IPython import display
from matplotlib import rcParams
from networkx.algorithms.bipartite import biadjacency_matrix
from networkx.drawing.nx_agraph import graphviz_layout
# Plot defaults: scglue's publication parameters, then a fixed figure size.
scglue.plot.set_publication_params()
rcParams.update({"figure.figsize": (4, 4)})

# Expose the feature names as an explicit "name" column on both datasets.
for _adata in (rna, atac):
    _adata.var["name"] = _adata.var_names

# Highly variable genes and peaks.
genes = rna.var.query("highly_variable").index
peaks = atac.var.query("highly_variable").index

# All feature names and their embeddings, RNA first and ATAC second, so the
# rows of `feature_embeddings` line up with the entries of `features`.
features = pd.Index(np.concatenate((rna.var_names, atac.var_names)))
feature_embeddings = np.concatenate(
    (rna.varm["X_glue"], atac.varm["X_glue"]),
    axis=0,
)

# Skeleton graph: keep only the guidance edges whose "type" attribute is "fwd".
skeleton = guidance_hvf.edge_subgraph(
    edge for edge, attrs in guidance_hvf.edges.items()
    if attrs["type"] == "fwd"
).copy()

# Regulatory inference over the skeleton, with a fixed random state.
reginf = scglue.genomics.regulatory_inference(
    features,
    feature_embeddings,
    random_state=0,
    skeleton=skeleton,
)

# Keep the inferred edges with q-value below 0.05.
gene2peak = reginf.edge_subgraph(
    edge for edge, attrs in reginf.edges.items()
    if attrs["qval"] < 0.05
)

# Export all peaks as a 3-column BED file, and the significant gene-peak links
# (anchored at each gene's strand-specific start site) with their "score".
scglue.genomics.Bed(atac.var).write_bed(
    f"{out_dir}/infer/scglue/peaks.bed",
    ncols=3,
)
scglue.genomics.write_links(
    gene2peak,
    scglue.genomics.Bed(rna.var).strand_specific_start_site(),
    scglue.genomics.Bed(atac.var),
    f"{out_dir}/infer/scglue/gene2peak.links",
    keep_attrs=["score"],
)

In [None]:
# Alternative TF-binding source (JASPAR motif hits), currently disabled:
# motif_bed = scglue.genomics.read_bed(f"{out_dir}/JASPAR2022-hg38.bed.gz")
# Active TF-binding source: the ENCODE TF ChIP BED file under out_dir.
motif_bed = scglue.genomics.read_bed(f"{out_dir}/ENCODE-TF-ChIP-hg38.bed.gz")
# TFs = names in the BED "name" column that are also genes in the RNA dataset.
# The result follows the order of the BED-derived index (the left operand).
tfs = pd.Index(motif_bed["name"]).intersection(rna.var_names)
# NOTE(review): this bare expression is not the last statement of the cell, so
# its value is never displayed.
tfs.size
# Export the expression of highly variable genes plus TFs as a loom file, and
# the TF list as a plain text file (one name per line), for pySCENIC.
rna[:, np.union1d(genes, tfs)].write_loom(f"{out_dir}/infer/scglue/rna.loom")
np.savetxt(f"{out_dir}/infer/scglue/tfs.txt", tfs, fmt="%s")

In [None]:
!pyscenic grn rna.loom tfs.txt \
    -o draft_grn.csv --seed 0 --num_workers 20 \
    --cell_id_attribute cells --gene_attribute genes

In [None]:
# --- GLUE-based ranking: gene -> peak (inferred links) -> TF (binding overlap) ---

# TF-binding records overlapping the highly variable peaks (window size 0),
# restricted to edges whose target is one of the selected TFs.
peak_bed = scglue.genomics.Bed(atac.var.loc[peaks])
peak2tf = scglue.genomics.window_graph(peak_bed, motif_bed, 0, right_sorted=True)
peak2tf = peak2tf.edge_subgraph(
    edge for edge in peak2tf.edges if edge[1] in tfs
)

# Rank genes per TF through the gene2peak and peak2tf graphs; region lengths
# are the peak widths (chromEnd - chromStart).
gene2tf_rank_glue = scglue.genomics.cis_regulatory_ranking(
    gene2peak, peak2tf, genes, peaks, tfs,
    random_state=0,
    region_lens=atac.var.loc[peaks, "chromEnd"] - atac.var.loc[peaks, "chromStart"],
)
# Preview (not displayed: this is not the last expression of the cell).
gene2tf_rank_glue.head(5).iloc[:, :5]

# --- Supplementary ranking: TF binding in a +/-500 bp flank around each start site ---

flank_bed = (
    scglue.genomics.Bed(rna.var.loc[genes])
    .strand_specific_start_site()
    .expand(500, 500)
)
flank2tf = scglue.genomics.window_graph(flank_bed, motif_bed, 0, right_sorted=True)

# Each gene is linked to its own flank region via a self-loop graph.
gene2flank = nx.Graph(list(zip(genes, genes)))
gene2tf_rank_supp = scglue.genomics.cis_regulatory_ranking(
    gene2flank, flank2tf, genes, genes, tfs,
    n_samples=0,
)
# Preview (not displayed: this is not the last expression of the cell).
gene2tf_rank_supp.head(5).iloc[:, :5]

# Tag the TF columns by ranking source so both tables can coexist downstream.
gene2tf_rank_glue.columns += "_glue"
gene2tf_rank_supp.columns += "_supp"

# Write both rankings as feather databases for `pyscenic ctx`.
scglue.genomics.write_scenic_feather(
    gene2tf_rank_glue,
    f"{out_dir}/infer/scglue/glue.genes_vs_tracks.rankings.feather",
)
scglue.genomics.write_scenic_feather(
    gene2tf_rank_supp,
    f"{out_dir}/infer/scglue/supp.genes_vs_tracks.rankings.feather",
)

# Annotation table mapping every suffixed track id back to its TF; the
# remaining columns are filled with constant placeholder values.
pd.concat([
    pd.DataFrame({"#motif_id": tfs + suffix, "gene_name": tfs})
    for suffix in ("_glue", "_supp")
]).assign(
    motif_similarity_qvalue=0.0,
    orthologous_identity=1.0,
    description="placeholder",
).to_csv(f"{out_dir}/infer/scglue/ctx_annotation.tsv", sep="\t", index=False)



In [None]:
!pyscenic ctx draft_grn.csv \
    glue.genes_vs_tracks.rankings.feather \
    supp.genes_vs_tracks.rankings.feather \
    --annotations_fname ctx_annotation.tsv \
    --expression_mtx_fname rna.loom \
    --output pruned_grn.csv \
    --rank_threshold 500 --min_genes 1 \
    --num_workers 20 \
    --cell_id_attribute cells --gene_attribute genes 2> /dev/null

In [None]:
# Load the pruned GRN produced by `pyscenic ctx`; the file is read from the
# current working directory, which is where the ctx command writes it.
grn = scglue.genomics.read_ctx_grn("pruned_grn.csv")