# Find number of unique genes in each parent node 

In [None]:
from model_nodes_edges import load_nodes_edges
import os
from ontology_modify import find_children, parent_unique_genes, count_children
from model_nodes_edges import get_genes

# Root directory of the annotation project. Presumably the project helpers
# read it from the environment -- confirm against model_nodes_edges / file_io.
os.environ['MODEL_ANNOTATION_ROOT'] = 'Projects/cellmaps_annotate_hierarchy/cellmaps_annotate_hierarchy/'

# Which hierarchy to load: model name, release and file stem.
model_name, version = "MuSIC2_Maps", "May2023"
file_name = "muse_imgdim_1024_ppidim_1024_latentd_128_layered.chi_10.maxres_80.alg_leiden.pruned"

# Load the node table and the parent/child edge table of the hierarchy.
loaded_tables = load_nodes_edges(model_name, version, file_name)
node_table, edges_df = loaded_tables

# Sanity check: show the shape of both tables on a single line.
print(*(table.shape for table in (node_table, edges_df)))

# Every term that occurs in the 'parent' column of the edge table, in order of
# first appearance. Both tallies below walk this same sequence.
parent_terms = edges_df['parent'].unique()

# Unique-gene count per parent: the length of whatever parent_unique_genes
# returns for the parent and its children (presumably the genes of the parent
# that none of its children contain -- confirm in ontology_modify).
unique_genes_per_parent = {
    term: len(parent_unique_genes(term, find_children(term, edges_df), node_table))
    for term in parent_terms
}

# Number of children per parent, as reported by count_children.
children_counts = {
    term: count_children(term, edges_df)
    for term in parent_terms
}

# Leaf nodes: terms that occur as a child but never as a parent.
leaf_nodes = set(edges_df['child']) - set(edges_df['parent'])

# Leaves holding more than 50 genes, mapped to their gene count.
big_leaf_nodes = {}

# Walk the leaves in sorted order rather than raw set order. The iteration
# order of a set of strings changes between interpreter sessions (string hash
# randomisation), and the insertion order of big_leaf_nodes decides the order
# of the printed log, of the saved group list and of the step-child rows
# appended later. key=str keeps the sort from failing on non-string terms.
for leaf in sorted(leaf_nodes, key=str):
    # A leaf has no children, so all of its genes are counted.
    unique_genes_count = len(get_genes(leaf, node_table))
    # NOTE(review): strict '> 50' here, while parents are put in group 3 with
    # '>= 50' -- confirm that a leaf with exactly 50 genes should stay whole.
    if unique_genes_count > 50:
        big_leaf_nodes[leaf] = unique_genes_count
        print(f"Leaf node: {leaf}, Unique genes count: {unique_genes_count}")


## Ideas to split the large parent node

Divide the parent nodes into three groups by their number of unique genes (skip the root node)

**Group 1:** Unique genes <= 10 (and > 0)

* Keep as is; when creating the prompt, use the unique gene list (GO annotations, diseases, etc.) plus the summaries from the child nodes

**Group 2:** Unique genes > 10 and <50

* Create one stepchild node to the ontology (update both edges and nodes table)

* Query summary for stepchild and child nodes

* Create a prompt with summaries from all child nodes and stepchild

**Group 3:** Unique genes >= 50

* Run k means clustering and keep each cluster size < 50

* Add stepchild node for each cluster 

* Query for all stepchild and child nodes 

* Prompt for all summaries 

**Big leaf:** Group of leaves that have genes >50

* Run k means clustering and keep each cluster size < 50

* Add a stepchild node for each cluster of the leaf

* Prompt for all step children summaries 


NOTE: after creating the stepchild nodes, check the number of children of each parent again. Open question: what to do if a parent has more than 20 children? (Skipped for now.)


In [None]:
## Bucket every parent by its unique-gene count in a single pass:
##   group 1: count <= 10, group 2: 10 < count < 50, group 3: count >= 50
group_1_parents, group_2_parents, group_3_parents = [], [], []
for term, n_unique in unique_genes_per_parent.items():
    if n_unique <= 10:
        group_1_parents.append(term)
    elif n_unique < 50:
        group_2_parents.append(term)
    else:
        group_3_parents.append(term)

# Names of the oversized leaves, in the insertion order of big_leaf_nodes.
big_leaf = list(big_leaf_nodes)

parents_by_group = {
    'group_1': group_1_parents,
    'group_2': group_2_parents,
    'group_3': group_3_parents,
    'big_leaf': big_leaf
}

# Report the size of each group, in the order the dictionary lists them.
print(*(len(members) for members in parents_by_group.values()))

import json
import os 
from file_io import get_model_directory_path

model_name = "MuSIC2_Maps"
version = "May2023"

# Write the group -> node-names mapping as JSON into the directory that
# get_model_directory_path returns for this model and version.
group_json_path = os.path.join(
    get_model_directory_path(model_name, version),
    'parent_nodes_by_group.json',
)
with open(group_json_path, 'w') as json_file:
    json.dump(parents_by_group, json_file)


30 49 27 4


In [None]:
import pandas as pd 
from ontology_modify import find_children, parent_unique_genes, generate_step_child
# Create a function to return the unique genes list for a parent node
def get_unique_genes_list(node, edges, nodes):
    """Return the unique genes of ``node`` as one space-separated string.

    Args:
        node: Term to look up.
        edges: Edge table with a ``'parent'`` column; it is also handed to
            ``find_children``.
        nodes: Node table, passed through to ``parent_unique_genes``.

    Returns:
        The items returned by ``parent_unique_genes`` joined with single
        spaces, or ``''`` when ``node`` never occurs in ``edges['parent']``.
    """
    is_parent = node in edges['parent'].unique()
    if is_parent:
        child_terms = find_children(node, edges)
        return ' '.join(parent_unique_genes(node, child_terms, nodes))
    # Terms that are not a parent of anything get an empty gene string.
    return ''

# Work on copies so the tables loaded from disk (node_table / edges_df) stay
# untouched while step-child nodes are added.
nodes_copy = node_table.copy()
edges_copy = edges_df.copy()

# New column: the space-separated unique genes of each term ('' for terms that
# never occur as a parent).
nodes_copy['unique_genes'] = nodes_copy['term'].apply(
    lambda x: get_unique_genes_list(x, edges_df, node_table)
)

# Group 2: give every parent one step child that holds the parent's unique
# genes. The new rows are collected and concatenated once after the loop:
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# copies the whole table on every iteration.
step_child_node_rows = []
step_child_edge_rows = []
for parent in group_2_parents:
    step_child = generate_step_child(parent, 1)
    unique_genes_list = nodes_copy.loc[nodes_copy['term'] == parent, 'unique_genes'].values[0]

    # One-row frame for the step child; columns it does not set (size,
    # stability, unique_genes) come out as NaN after the concat.
    step_child_node_rows.append(
        pd.DataFrame({'term': step_child, 'genes': [unique_genes_list]})
    )
    # Matching parent -> step child edge.
    step_child_edge_rows.append(
        pd.DataFrame({'parent': parent, 'child': step_child, 'type': ['default']})
    )

nodes_copy = pd.concat([nodes_copy, *step_child_node_rows], ignore_index=True)
edges_copy = pd.concat([edges_copy, *step_child_edge_rows], ignore_index=True)

print(nodes_copy.head(), edges_copy.head())

         term    size                                              genes  \
0  Cluster0-0  5254.0  SLFN11 VPS11 CCNB1 FTSJ1 MEX3C TBCC SYTL5 PIDD...   
1  Cluster1-0  2652.0  DOCK10 ANKRD54 MAGI1 HOMER3 TJP1 CSPP1 PLEC RT...   
2  Cluster1-1  2150.0  DOCK10 RTCB PRKD2 PPP2R5D CTNND2 CREB1 ECD CDK...   
3  Cluster2-0  1890.0  FAAP24 ARL4C ANXA9 BPNT1 EEF1B2 LRRC42 HERC4 T...   
4  Cluster3-0  1340.0  FAAP24 ANXA9 BPNT1 MRFAP1 FBXO45 SYCE1 DAPK3 Z...   

   stability                                       unique_genes  
0       88.0  TBCC FTSJ1 FLCN NQO2 PLEKHO2 CDPF1 SELENOM SLF...  
1       14.0  C1QTNF1 CYTH3 NFX1 GIGYF1 LRP1B CREB1 ECD ITPR...  
2       15.0  PPOX BIRC6 PRX RRAGC MYO19 RABGGTB CDK5 SYTL4 ...  
3       26.0  SHTN1 ZNF576 LRRC42 DYNLL1 ZC4H2 COMMD1 CCDC12...  
4       36.0  SLF1 RPP14 GTF2IRD1 GAB1 DUSP22 JMJD6 FBXO33 G...          parent       child     type
0  Cluster0-0  Cluster1-0  default
1  Cluster0-0  Cluster1-1  default
2  Cluster0-0  Cluster1-2  default
3  Clus

In [None]:
from ontology_modify import perform_kmeans_clustering, generate_step_child
import pandas as pd 
# the embedding file from muse output (use for clustering)
emb = '/cellar/users/lvschaffer/Data/MuSIC/U2OS/coembedding/muse_pytorch/new_embeddings/U2OS_music_pipeline/outputs_run_muse_integration/muse_imgdim_1024_ppidim_1024_latentd_128_latent.txt'

# Number of k-means clusters each group-3 parent was split into.
parent_clusters = {}

# Step-child rows are collected here and concatenated once after the loop:
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# copies the whole table on every iteration.
group_3_node_rows = []
group_3_edge_rows = []

for parent in group_3_parents:
    print(f'Run k-means clustering on {parent}')
    # The unique genes are stored as one space-separated string; split it
    # back into a list of gene names.
    unique_genes_list = nodes_copy.loc[nodes_copy['term'] == parent, 'unique_genes'].values[0].split()

    # cluster_labels is indexed in parallel with unique_genes_list below.
    cluster_labels, num_clusters = perform_kmeans_clustering(unique_genes_list, emb)
    step_child_names = generate_step_child(parent, num_clusters)
    parent_clusters[parent] = num_clusters
    print(f"{parent} with unique genes of {len(unique_genes_list)} splits into {num_clusters} clusters")

    for idx, step_child in enumerate(step_child_names):
        # Genes whose label equals this step child's position; assumes the
        # labels are 0-based cluster indices -- confirm in ontology_modify.
        genes_in_cluster = ' '.join(
            gene for i, gene in enumerate(unique_genes_list) if cluster_labels[i] == idx
        )

        # One-row node frame and the matching parent -> step child edge.
        group_3_node_rows.append(
            pd.DataFrame({'term': step_child, 'genes': [genes_in_cluster]})
        )
        group_3_edge_rows.append(
            pd.DataFrame({'parent': parent, 'child': step_child, 'type': ['default']})
        )

nodes_copy = pd.concat([nodes_copy, *group_3_node_rows], ignore_index=True)
edges_copy = pd.concat([edges_copy, *group_3_edge_rows], ignore_index=True)

print(nodes_copy.tail(), edges_copy.tail())

Run k-means clustering on Cluster1-0
There is a big cluster with genes more than 50, add one more cluster
Cluster1-0 with unique genes of 59 splits into 2 clusters
Run k-means clustering on Cluster1-4
There is a big cluster with genes more than 50, add one more cluster
Cluster1-4 with unique genes of 84 splits into 3 clusters
Run k-means clustering on Cluster2-0
Cluster2-0 with unique genes of 61 splits into 2 clusters
Run k-means clustering on Cluster2-7
There is a big cluster with genes more than 50, add one more cluster
Cluster2-7 with unique genes of 150 splits into 6 clusters
Run k-means clustering on Cluster2-9
Cluster2-9 with unique genes of 97 splits into 3 clusters
Run k-means clustering on Cluster2-10
There is a big cluster with genes more than 50, add one more cluster
Cluster2-10 with unique genes of 52 splits into 2 clusters
Run k-means clustering on Cluster2-1
Cluster2-1 with unique genes of 70 splits into 2 clusters
Run k-means clustering on Cluster2-2
Cluster2-2 with uni

In [None]:
from model_nodes_edges import get_genes
from ontology_modify import perform_kmeans_clustering, generate_step_child
# the embedding file from muse output (use for clustering)
emb = '/cellar/users/lvschaffer/Data/MuSIC/U2OS/coembedding/muse_pytorch/new_embeddings/U2OS_music_pipeline/outputs_run_muse_integration/muse_imgdim_1024_ppidim_1024_latentd_128_latent.txt'

# Number of k-means clusters each oversized leaf was split into.
big_leaf_cluster = {}

# Step-child rows are collected here and concatenated once after the loop:
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# copies the whole table on every iteration.
big_leaf_node_rows = []
big_leaf_edge_rows = []

# Perform k-means clustering on big leaf nodes
for leaf in big_leaf_nodes:
    print(f'Run k-means clustering on {leaf}')
    genes_list = get_genes(leaf, node_table)

    # cluster_labels is indexed in parallel with genes_list below.
    cluster_labels, num_clusters = perform_kmeans_clustering(genes_list, emb)
    step_child_names = generate_step_child(leaf, num_clusters)
    big_leaf_cluster[leaf] = num_clusters
    print(f"{leaf} with unique genes of {len(genes_list)} splits into {num_clusters} clusters")

    for idx, step_child in enumerate(step_child_names):
        # Genes whose label equals this step child's position; assumes the
        # labels are 0-based cluster indices -- confirm in ontology_modify.
        genes_in_cluster = ' '.join(
            gene for i, gene in enumerate(genes_list) if cluster_labels[i] == idx
        )

        # One-row node frame and the matching leaf -> step child edge.
        big_leaf_node_rows.append(
            pd.DataFrame({'term': step_child, 'genes': [genes_in_cluster]})
        )
        big_leaf_edge_rows.append(
            pd.DataFrame({'parent': leaf, 'child': step_child, 'type': ['default']})
        )

nodes_copy = pd.concat([nodes_copy, *big_leaf_node_rows], ignore_index=True)
edges_copy = pd.concat([edges_copy, *big_leaf_edge_rows], ignore_index=True)

print(nodes_copy.tail(), edges_copy.tail())

Run k-means clustering on Cluster2-11
There is a big cluster with genes more than 50, add one more cluster
Cluster2-11 with unique genes of 86 splits into 3 clusters
Run k-means clustering on Cluster3-12
Cluster3-12 with unique genes of 64 splits into 2 clusters
Run k-means clustering on Cluster1-12
There is a big cluster with genes more than 50, add one more cluster
Cluster1-12 with unique genes of 53 splits into 2 clusters
Run k-means clustering on Cluster3-9
Cluster3-9 with unique genes of 102 splits into 3 clusters
              term  size                                              genes  \
471  Cluster1-12_1   NaN  ACADM ACO2 ACOT2 ALDH6A1 CIAPIN1 CPOX DTYMK EC...   
472  Cluster1-12_2   NaN  ACADVL AKR7A2 AKR7A3 ARG2 BID CPNE3 DCPS DDX28...   
473   Cluster3-9_1   NaN  EIF4A3 ACAT2 ARL2 C11orf68 CYRIB DDX19B DIS3 D...   
474   Cluster3-9_2   NaN  MDP1 YBEY ADPRS ALOX5 ARIH2 CELF1 CMPK1 CPNE1 ...   
475   Cluster3-9_3   NaN  AAMDC ADISSP AIDA ANXA11 ASNS BLVRB CNOT10 EIF...   



In [None]:
import os
from file_io import get_model_directory_path

model_name = "MuSIC2_Maps"
version = "May2023"
file_name = "muse_imgdim_1024_ppidim_1024_latentd_128_layered.chi_10.maxres_80.alg_leiden.pruned"

# Write the expanded node table, then the expanded edge table, as
# tab-separated files without index or header row. Both go into the directory
# that get_model_directory_path returns for this model and version.
expanded_outputs = (
    (nodes_copy, f'{file_name}.expand.nodes'),
    (edges_copy, f'{file_name}.expand.edges'),
)
for frame, out_name in expanded_outputs:
    out_path = os.path.join(get_model_directory_path(model_name, version), out_name)
    frame.to_csv(out_path, sep='\t', index=False, header=False)