## How to use orthology recap

In [17]:
import os

# Kernel working directory; every gene list and output file below is addressed relative to it.
cwd = os.getcwd()

# The database directory 'stagedb' sits four directory levels above the working directory.
ancestor_dir = cwd
for _level in range(4):
    ancestor_dir = os.path.dirname(ancestor_dir)
db_path = os.path.join(ancestor_dir, 'stagedb')

# Export everything the `!curategpt ...` shell cells reference as $VARIABLES.
os.environ.update({
    # Knowledge-graph archive and database location
    'DATA_URL': 'https://data.monarchinitiative.org/monarch-kg/2024-02-13/monarch-kg.tar.gz',
    'DB_PATH': db_path,
    # Gene lists
    'HUMAN_GENE_LIST': f'{cwd}/wrappers/knowledgegraph/human_hgnc.txt',
    'MOUSE_GENE_LIST': f'{cwd}/wrappers/knowledgegraph/mouse_mgi.txt',
    # Small ("quick") experiment: gene-pair inputs and output file stems
    'QUICK_ORTH': f'{cwd}/wrappers/knowledgegraph/quick_orth_gene_pairs.txt',
    'QUICK_NON_ORTH': f'{cwd}/wrappers/knowledgegraph/quick_non_orth_gene_pairs.txt',
    'QUICK_ORTH_OUT': f'{cwd}/wrappers/knowledgegraph/quick_orth_cosim',
    'QUICK_NON_ORTH_OUT': f'{cwd}/wrappers/knowledgegraph/quick_non_orth_cosim',
    # Full experiment: output file stems
    'ORTH_OUT': f'{cwd}/wrappers/knowledgegraph/orth_cosim',
    'NON_ORTH_OUT': f'{cwd}/wrappers/knowledgegraph/non_orth_cosim',
})

#### Check ChromaDB content. 
The workflow below requires both the `ont_hp` and `ont_mp` collections to exist in the database.

In [None]:
# index ont_hp
!curategpt ontology index --index-fields label, definition, relationships -p stagedb -c ont_hp -m openai: sqlite:obo:hp

In [None]:
# index ont_mp
!curategpt ontology index --index-fields label, definition, relationships -p stagedb -c ont_mp -m openai: sqlite:obo:mp

In [None]:
# Check Collections
# Lists the collection names in the database at db_path (defined in the configuration
# cell). "ont_hp" and "ont_mp" should both appear before continuing with the workflow.
from curate_gpt import ChromaDBAdapter

db = ChromaDBAdapter(db_path)
# Last expression of the cell, so the list of names is rendered as the cell output.
db.list_collection_names()

### Workflow

To upsert embeddings for a specific list of genes:

In [None]:
# Human gene list run: -f points at $HUMAN_GENE_LIST (human_hgnc.txt, set in the
# configuration cell); gene prefix HGNC:, phenotype prefix HP:, ontology collection ont_hp.
# NOTE(review): -s gene_by_gene presumably restricts the run to the genes in the file and
# -e presumably names the target collection (human_gene_list_collection) — confirm with
# `curategpt ontology make_gene_embeddings --help`.
!curategpt ontology make_gene_embeddings -u $DATA_URL --path $DB_PATH --collection ont_hp -g HGNC: -p HP: -f $HUMAN_GENE_LIST -s gene_by_gene -e human_gene_list_collection

To upsert embeddings for all genes with a given prefix (e.g. every MGI or every HGNC gene):

In [None]:
# MOUSE
# All-genes run (-s all, no gene list file): gene prefix MGI:, phenotype prefix MP:,
# ontology collection ont_mp. The -e value global_mgi is the name later passed as
# --collection_two to the full gene_orthology command.
!curategpt ontology make_gene_embeddings -u $DATA_URL --path $DB_PATH --collection ont_mp -g MGI: -p MP: -s all -e global_mgi


In [None]:
# HUMAN
!curategpt ontology make_gene_embeddings -u $DATA_URL --path $DB_PATH --collection ont_mp -g HGNC: -p HP: -s all -e global_hgnc


### Running the full comparison: 1000 orthologous vs. 1000 random non-orthologous pairs

In [None]:
!curategpt ontology gene_orthology -u $DATA_URL --path $DB_PATH --collection_one global_human --collection_two global_mgi --output_file_one $ORTH_OUT --output_file_two $NON_ORTH_OUT

## Quick experiments with smaller sets of orthologous and non-orthologous human–mouse gene pairs

In [None]:
# Quick run on the orthologous pairs listed in $QUICK_ORTH; output stem is $QUICK_ORTH_OUT
# (read back below as quick_orth_cosim.tsv).
# NOTE(review): no cell in this notebook creates collections named human_genes or
# mouse_genes — confirm they already exist in $DB_PATH or substitute the collection
# names produced by the make_gene_embeddings cells.
!curategpt ontology gene_orthology -u $DATA_URL --path $DB_PATH --collection_one human_genes --collection_two mouse_genes --gene_pairs_file $QUICK_ORTH --output_file_one $QUICK_ORTH_OUT

In [None]:
# Quick run on the non-orthologous pairs listed in $QUICK_NON_ORTH; output stem is
# $QUICK_NON_ORTH_OUT (read back below as quick_non_orth_cosim.tsv).
# NOTE(review): no cell in this notebook creates collections named human_genes or
# mouse_genes — confirm they already exist in $DB_PATH or substitute the collection
# names produced by the make_gene_embeddings cells.
!curategpt ontology gene_orthology -u $DATA_URL --path $DB_PATH --collection_one human_genes --collection_two mouse_genes --gene_pairs_file $QUICK_NON_ORTH --output_file_one $QUICK_NON_ORTH_OUT

## Comparison 1000 orthologous to 1000 non_orthologous pairs

In [None]:
import pandas as pd
import plotly.graph_objects as go

# Cosine-similarity tables from the full gene_orthology run
# (df1: orthologous pairs, df2: non-orthologous pairs).
df1 = pd.read_csv(f'{cwd}/wrappers/knowledgegraph/orth_cosim.tsv', sep='\t')
df2 = pd.read_csv(f'{cwd}/wrappers/knowledgegraph/non_orth_cosim.tsv', sep='\t')

# One summary value per group: mean cosine similarity and number of pairs.
avg_original = df1['CosineSimilarity'].mean()
avg_altered = df2['CosineSimilarity'].mean()
num_pairs_original = len(df1)
num_pairs_altered = len(df2)

# The bar label in `text` already reads "Number of Pairs: N", so the hover template
# inserts %{text} as-is; prefixing it again would show the label twice in the tooltip.
trace1 = go.Bar(x=['Average'], y=[avg_original], name='Orthologous',
                text=[f'Number of Pairs: {num_pairs_original}'],
                hovertemplate='Average Cosine Similarity: %{y:.3f}<br>%{text}',
                marker=dict(color='blue'))

trace2 = go.Bar(x=['Average'], y=[avg_altered], name='Non-Orthologous',
                text=[f'Number of Pairs: {num_pairs_altered}'],
                hovertemplate='Average Cosine Similarity: %{y:.3f}<br>%{text}',
                marker=dict(color='red'))

# Side-by-side bars sharing the single 'Average' category.
layout = go.Layout(
    title='Average CoSim Orth vs non-Orth',
    xaxis=dict(title=''),
    yaxis=dict(title='Cosine Similarity'),
    barmode='group'
)

fig = go.Figure(data=[trace1, trace2], layout=layout)

# Compact 400x400 figure with a boxed legend in the upper-right corner.
fig.update_layout(
    template='plotly_white',
    font=dict(size=12),
    legend=dict(x=0.8, y=0.95, borderwidth=1),
    margin=dict(l=50, r=50, t=80, b=50),
    width=400,
    height=400
)

fig.show()

### Comparison of smaller subset

In [None]:
import pandas as pd
import plotly.graph_objects as go

# Cosine-similarity tables from the quick (small subset) gene_orthology runs
# (df1: orthologous pairs, df2: non-orthologous pairs).
df1 = pd.read_csv(f'{cwd}/wrappers/knowledgegraph/quick_orth_cosim.tsv', sep='\t')
df2 = pd.read_csv(f'{cwd}/wrappers/knowledgegraph/quick_non_orth_cosim.tsv', sep='\t')

# One summary value per group: mean cosine similarity and number of pairs.
avg_original = df1['CosineSimilarity'].mean()
avg_altered = df2['CosineSimilarity'].mean()
num_pairs_original = len(df1)
num_pairs_altered = len(df2)

# The bar label in `text` already reads "Number of Pairs: N", so the hover template
# inserts %{text} as-is; prefixing it again would show the label twice in the tooltip.
trace1 = go.Bar(x=['Average'], y=[avg_original], name='Orthologous',
                text=[f'Number of Pairs: {num_pairs_original}'],
                hovertemplate='Average Cosine Similarity: %{y:.3f}<br>%{text}',
                marker=dict(color='blue'))

trace2 = go.Bar(x=['Average'], y=[avg_altered], name='Non-Orthologous',
                text=[f'Number of Pairs: {num_pairs_altered}'],
                hovertemplate='Average Cosine Similarity: %{y:.3f}<br>%{text}',
                marker=dict(color='red'))

# Side-by-side bars sharing the single 'Average' category.
layout = go.Layout(
    title='Average CoSim Orth vs non-Orth',
    xaxis=dict(title=''),
    yaxis=dict(title='Cosine Similarity'),
    barmode='group'
)

fig = go.Figure(data=[trace1, trace2], layout=layout)

# Compact 400x400 figure with a boxed legend in the upper-right corner.
fig.update_layout(
    template='plotly_white',
    font=dict(size=12),
    legend=dict(x=0.8, y=0.95, borderwidth=1),
    margin=dict(l=50, r=50, t=80, b=50),
    width=400,
    height=400
)

fig.show()