# SNNomics demo 

Problem: There are millions of publicly available transcriptomics samples that can be reused to make novel discoveries about the molecular underpinnings of biological contexts such as tissues, diseases, phenotypes, and molecular pathways. However, only a small fraction of the millions of available samples are annotated to standardized terms denoting biological contexts. Thus, it's difficult to curate large collections of data for researchers to reuse to make discoveries.

Solution: To autonomously label transcriptomics samples whose biological context is unknown, we can use a Siamese neural network (SNN) to learn an embedding space that pulls samples derived from the same biological context together and pushes samples from different contexts apart. To identify samples that share a query sample's context, we embed the query alongside each database sample with the SNN and compare the two embeddings using cosine similarity.

![Overview](imgs/overview.png)

## Import modules

In [ ]:
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
from pathlib import Path
from SNNomics.network import SNN

## Load data

In [ ]:
# Expression vector of the sample we want to find matches for.
query_sample = np.load("data/query_sample.npy")

# np.load on an .npz archive returns a lazy NpzFile that keeps the file handle
# open. Pull out the arrays we need inside a `with` block so the handle is
# released; the extracted arrays stay valid after the archive is closed.
with np.load("data/test_database.npz") as database:
    database_expression = database["expression"]
    database_ids = database["gsms"]

# Sample-by-term label table; rows are looked up by sample id and a value of 1
# marks an assigned term (see assign_labels below).
labels = pd.read_csv("data/tissue_labels.csv", index_col=0)
# Lookup table with 'id' and 'name' columns used to translate term ids to names.
onto_map = pd.read_csv("data/onto_map.csv")

## Define similarity function

In [ ]:
def cosine_similarity(x: torch.Tensor, y: torch.Tensor, eps: float = 1e-8) -> torch.Tensor:
    """Return the cosine similarity between two 1-D tensors.

    Args:
        x: First vector (1-D, as required by ``torch.dot``).
        y: Second vector, same length and dtype as ``x``.
        eps: Lower bound applied to the product of the two norms. It keeps the
            division well-defined when either vector is all zeros, in which
            case the similarity is 0 instead of NaN.

    Returns:
        A 0-dimensional tensor holding ``dot(x, y) / max(||x|| * ||y||, eps)``.
    """
    dot_product = torch.dot(x, y)
    # Without the clamp a zero-norm input produces 0 / 0 = NaN, which would
    # silently poison any later ranking of the similarities.
    denominator = torch.clamp(torch.norm(x) * torch.norm(y), min=eps)
    return dot_product / denominator

## Compute similarities between the query and all other samples

In [ ]:
num_genes = len(query_sample)    # Number of genes for input dimension
num_samples = len(database_ids)  # Number of database samples being compared
model = SNN(num_genes)  # Initialize model
# NOTE(review): no saved parameters are loaded in this cell. Unless the SNN
# constructor restores a checkpoint itself, the embeddings below come from an
# untrained network -- confirm and load trained weights if needed.

# Inference only: put the network in evaluation mode so any training-time
# layers behave deterministically (assumes SNN is a torch.nn.Module -- confirm).
model.eval()

query = torch.from_numpy(query_sample)

results = {'id': [], 'similarity': []}
# Gradients are never used here, so skip building the autograd graph.
with torch.no_grad():
    for index, sample_id in enumerate(database_ids):
        z = torch.from_numpy(database_expression[index, :])
        # The network takes a pair of inputs and returns one embedding for each.
        query_emb, z_emb = model(query, z)
        similarity = cosine_similarity(query_emb, z_emb)

        results['id'].append(sample_id)
        results['similarity'].append(similarity.item())

# One row per database sample: its id and its similarity to the query.
results_df = pd.DataFrame.from_dict(results)

## Map tissue labels to samples

In [ ]:
def assign_labels(gsm: str, labels: pd.DataFrame, onto_map: pd.DataFrame) -> str:
    """Return the comma-separated term names assigned to one sample.

    Args:
        gsm: Row label of the sample in ``labels``.
        labels: Table indexed by sample id whose columns are term ids; a cell
            equal to 1 means the term is assigned to that sample.
        onto_map: Table with an ``id`` column (term id) and a ``name`` column
            (term name).

    Returns:
        The names of all assigned terms joined with ``,`` in column order, or
        an empty string when no column equals 1 for the sample.

    Raises:
        KeyError: If ``gsm`` is not in the index of ``labels``.
        ValueError: If an assigned term id does not match exactly one row of
            ``onto_map``.
    """
    sample_row = labels.loc[gsm]
    assigned_ids = labels.columns[sample_row == 1].tolist()
    return ','.join(
        onto_map.loc[onto_map['id'] == term_id, 'name'].item()
        for term_id in assigned_ids
    )

# Attach the tissue names of each database sample; the extra keyword arguments
# are forwarded by Series.apply to assign_labels on every call.
results_df['tissues'] = results_df['id'].apply(
    assign_labels,
    labels=labels,
    onto_map=onto_map,
)

print("Query tissue: heart")
# Rank by similarity first so the 20 rows shown are the closest matches to the
# query rather than simply the first 20 samples in database order.
print(results_df.sort_values('similarity', ascending=False).head(20))