## Tutorial: building a single-cell foundation model gene embedding network (scGENet)

### Step 1. Fine-tune the single-cell foundation models

In [None]:
# For fine-tuning scGPT, refer to the scGPT documentation:
# https://scgpt.readthedocs.io/en/latest/introduction.html

In [None]:
# For fine-tuning Geneformer, refer to the BioNeMo Geneformer example:
# https://docs.nvidia.com/bionemo-framework/latest/main/examples/bionemo-geneformer/geneformer-gene-embedding-GRN/

### Step 2. Build the gene network from the fine-tuned model's gene embeddings

The result file `scgpt_br_ft_sel_embeddings_network.txt` contains the t-SNE coordinates of the gene embeddings and the cluster assignment of each gene based on Louvain clustering.

In [None]:
# Build the gene network from the embedding CSV: the flags request a t-SNE
# layout (--method tsne) and Louvain clustering (--cluster louvain); the result
# is written to the --output file.
!python ./Scripts/build_scGENet.py --input scgpt_br_ft_sel_embeddings.csv --output scgpt_br_ft_sel_embeddings_network.txt --method tsne --cluster louvain

scgpt_br_ft_sel_embeddings.csv has 12837 genes, 512 dimensions.


### Step 3. Perform enrichment tests for the gene modules using GO Biological Process annotations

In [None]:
# Convert the network/cluster file from Step 2 into the matrix file that the
# gene set analysis below takes as input (-t matrix).
!python Scripts/convert_cluster_to_matrix.py -i scgpt_br_ft_sel_embeddings_network.txt -o scgpt_br_ft_sel_embeddings_network_mat.txt


Binary matrix written to scgpt_br_ft_sel_embeddings_network_mat.txt


In [12]:
# Gene set analysis of the module matrix against the GO Biological Process
# gene set file passed via -d; results use the output prefix given by -o.
# NOTE: this cell needs a Perl interpreter on PATH. The recorded run below
# failed with "'perl' is not recognized ...", i.e. Perl was not installed or
# not on PATH on that machine.
# NOTE(review): presumably a Fisher's exact test, judging by the script name — confirm.
!perl Scripts/gs-fisher_caller.pl -i scgpt_br_ft_sel_embeddings_network_mat.txt -t matrix -d Db/c5.go.bp.v2024.1.Hs.symbols.txt -o scgpt_br_ft_bp

'perl' is not recognized as an internal or external command,
operable program or batch file.


### Step 4. Build the dot plot

In [None]:
# See the Figure2 R script, which builds the dot plot and the bar plot.

### Step 5. Extract the gene interaction network

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [None]:

def compute_gene_cosine_similarities(embedding_scgpt_gene_embeddings_sel_cs_sel):
    """
    Compute pairwise cosine similarities between gene embeddings.

    Every unordered gene pair is reported exactly once, in the orientation
    where ``gene1`` compares strictly less than ``gene2``. The same strict
    comparison also drops self-pairs (and pairs of identically named genes).

    Parameters:
    embedding_scgpt_gene_embeddings_sel_cs_sel (pd.DataFrame): Gene x Embedding matrix (index = gene names)

    Returns:
    pd.DataFrame: DataFrame with columns ['gene1', 'gene2', 'similarity'].
        Rows are ordered gene2-major (all gene1 values for the first gene2,
        then the next gene2, ...), and the row labels are the positions of the
        kept pairs in that flattened n x n ordering. An input with no genes
        yields an empty frame with the same three columns.
    """
    gene_names = embedding_scgpt_gene_embeddings_sel_cs_sel.index.tolist()
    n_genes = len(gene_names)

    # No genes means no pairs; return early instead of handing an empty
    # matrix to cosine_similarity.
    if n_genes == 0:
        return pd.DataFrame(columns=['gene1', 'gene2', 'similarity'])

    # Compute the full (n_genes x n_genes) cosine similarity matrix.
    sim_matrix = np.asarray(
        cosine_similarity(embedding_scgpt_gene_embeddings_sel_cs_sel.values)
    )

    # Build the long table directly from arrays rather than via
    # reset_index()/melt(): that route relies on the auto-generated 'index'
    # column name, which pandas changes when a gene itself is labelled
    # 'index', silently corrupting the result.
    names = np.asarray(gene_names, dtype=object)
    gene1 = np.tile(names, n_genes)        # row label varies fastest
    gene2 = np.repeat(names, n_genes)      # column label varies slowest
    similarity = sim_matrix.ravel(order='F')  # column-major, matching gene1/gene2

    # Keep one orientation of each pair; the strict '<' also removes the diagonal.
    keep = np.flatnonzero(gene1 < gene2)

    return pd.DataFrame(
        {
            'gene1': gene1[keep],
            'gene2': gene2[keep],
            'similarity': similarity[keep],
        },
        index=keep,
    )


In [None]:
# Load the fine-tuned scGPT gene embeddings, using the first CSV column as the
# row index. The file is read relative to the notebook's working directory —
# the same file passed as --input to build_scGENet.py in Step 2 — instead of a
# machine-specific absolute path, so the notebook can run on other machines.
scgpt_gene_embeddings = pd.read_csv("scgpt_br_ft_sel_embeddings.csv", index_col=0)
scgpt_gene_embeddings

In [None]:
# Export the rows of all_de_v5_sel for the genes selected by DE and neurogenesis
# (the index of all_de_v5_sel_neuro).
# NOTE(review): neither all_de_v5_sel nor all_de_v5_sel_neuro is defined in any
# cell shown in this notebook, so this cell fails on a fresh kernel — add the
# cell(s) that load or derive them before this point.
all_de_v5_sel.loc[all_de_v5_sel_neuro.index].to_csv("all_de_rev5_mat_neuro.csv")

In [None]:
# Restrict the embeddings to the selected genes (index of all_de_v5_sel_neuro).
# .loc raises a KeyError if any selected gene has no embedding row.
# NOTE(review): all_de_v5_sel_neuro is not defined in the visible notebook — confirm where it comes from.
scgpt_gene_embeddings_sel=scgpt_gene_embeddings.loc[all_de_v5_sel_neuro.index]
scgpt_gene_embeddings_sel

In [None]:
# Pairwise cosine similarity between the selected genes, in long format
# (columns: gene1, gene2, similarity).
scgpt_gene_embeddings_sel_cs=compute_gene_cosine_similarities(scgpt_gene_embeddings_sel)
scgpt_gene_embeddings_sel_cs

Produce the network (edge list) file that Cytoscape uses to build the gene interaction network.

In [None]:
# Keep only the gene pairs whose cosine similarity exceeds the threshold and
# write them out as the edge list for Cytoscape.
SIMILARITY_THRESHOLD = 0.25
above_threshold = scgpt_gene_embeddings_sel_cs['similarity'] > SIMILARITY_THRESHOLD

# Report how many pairs pass the cut-off. A bare expression in the middle of a
# cell is evaluated but never displayed, so print the count explicitly.
print(f"{int(above_threshold.sum())} gene pairs with similarity > {SIMILARITY_THRESHOLD}")

scgpt_gene_embeddings_sel_cs_sel025 = scgpt_gene_embeddings_sel_cs.loc[above_threshold]

scgpt_gene_embeddings_sel_cs_sel025.to_csv("scgpt_gene_embeddings_sel_cs_sel_025.csv")

scgpt_gene_embeddings_sel_cs_sel025

### Step 6. Perform enrichment analysis using scGENet for bulk RNA-seq and snRNA-seq

In [None]:
# Gene set analysis of the matrix in de_combo.txt, using the scGENet cluster
# file passed via -d as the gene set database; results use the output prefix
# given by -o. Requires a Perl interpreter on PATH.
# NOTE(review): neither de_combo.txt nor
# scgpt_br_ft_sel_embeddings_network_cluster.txt is produced by any cell shown
# in this notebook (Step 2 writes ..._network.txt and Step 3 writes
# ..._network_mat.txt) — confirm how these inputs are generated.
!perl Scripts/gs-fisher_caller.pl -i de_combo.txt -t matrix -d scgpt_br_ft_sel_embeddings_network_cluster.txt -o scgpt_br_ft_cluster