In [24]:
# Install the packages this notebook relies on through uv's pip interface.
# NOTE(review): versions are unpinned, so a re-run may resolve different releases — consider pinning.
!uv pip install notebook duckdb pandas fastparquet semsimian

[2mAudited [1m5 packages[0m [2min 26ms[0m[0m


Download and unzip the Monarch Graph and Phenio semantic SQL database

In [None]:
# Fetch and decompress both databases.
# -O pins the archive name, so a leftover .gz from an interrupted run is replaced
# instead of wget saving a second copy as "<name>.gz.1".
# gunzip -f overwrites an existing decompressed file; without it gunzip stops at an
# interactive "overwrite (y or n)?" prompt that a notebook cell cannot answer.
!wget -O monarch-kg.duckdb.gz https://data.monarchinitiative.org/monarch-kg/latest/monarch-kg.duckdb.gz
!gunzip -f monarch-kg.duckdb.gz
# consider hpo.db.gz instead of phenio.db.gz for speed
!wget -O phenio.db.gz https://data.monarchinitiative.org/monarch-kg/latest/phenio.db.gz
!gunzip -f phenio.db.gz

--2024-11-11 12:18:54--  https://data.monarchinitiative.org/monarch-kg/latest/monarch-kg.duckdb.gz
Resolving data.monarchinitiative.org (data.monarchinitiative.org)... 35.208.191.193
Connecting to data.monarchinitiative.org (data.monarchinitiative.org)|35.208.191.193|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1628735176 (1.5G) [application/octet-stream]
Saving to: ‘monarch-kg.duckdb.gz’


2024-11-11 12:19:44 (31.5 MB/s) - ‘monarch-kg.duckdb.gz’ saved [1628735176/1628735176]

monarch-kg.duckdb already exists -- do you wish to overwrite (y or n)? 

In [1]:
import duckdb

# Open the knowledge graph read-only: this cell only queries it and writes a parquet file.
db = duckdb.connect('monarch-kg.duckdb', read_only=True)

# Export one row per (disease, source) holding the list of phenotype terms asserted
# by that source, restricted to MONDO diseases that have has_phenotype edges from
# BOTH Orphanet and OMIM (the subquery's `having count(distinct ...) > 1`).
#
# The negation filter uses `is distinct from true` rather than `<> True`: in SQL,
# `NULL <> True` evaluates to NULL, so edges with no negation flag would be dropped
# along with the negated ones.
#
# NOTE(review): the subquery does not filter on negation, so a disease can be
# selected here even if one source contributes only negated edges; such a disease
# would then have no profile row for that source in the exported file.
db.sql("""
copy (
  select 
       subject as disease, 
       list(object) as phenotype, 
       primary_knowledge_source as source
    from edges
  where predicate = 'biolink:has_phenotype' 
       and negated is distinct from true
       and primary_knowledge_source in ('infores:orphanet','infores:omim')
       and subject in (
          select distinct subject
            from denormalized_edges
            where subject_namespace = 'MONDO'
              and predicate = 'biolink:has_phenotype'
              and primary_knowledge_source in ('infores:orphanet','infores:omim')
            group by all
            having count(distinct primary_knowledge_source) > 1
       )     
    group by all
) to 'phenotype_profiles_by_source.parquet';       
""") 

In [2]:
import pandas as pd

# Read the per-source phenotype profiles from parquet into a DataFrame,
# then preview the first five rows as the cell output.
phenotype_profiles = pd.read_parquet(path='phenotype_profiles_by_source.parquet')
phenotype_profiles.head(n=5)



Unnamed: 0,disease,phenotype,source
0,MONDO:0007167,"[HP:0000774, HP:0001156, HP:0001762, HP:000208...",infores:orphanet
1,MONDO:0008967,"[HP:0003236, HP:0005978, HP:0011892, HP:003051...",infores:orphanet
2,MONDO:0010547,"[HP:0002073, HP:0000639, HP:0001152, HP:000127...",infores:orphanet
3,MONDO:0008829,"[HP:0000501, HP:0001004, HP:0001482, HP:000154...",infores:orphanet
4,MONDO:0008620,"[HP:0002986, HP:0003022, HP:0009465]",infores:orphanet


Initialize semsimian from phenio.db

In [26]:
from semsimian import Semsimian

# Relation CURIEs handed to Semsimian.
# NOTE(review): presumably these are the edge types followed when collecting a
# term's ancestors — confirm against the Semsimian documentation.
predicates = ["rdfs:subClassOf", "BFO:0000050", "UPHENO:0000001"]

# Build the similarity engine from the local phenio.db file; no in-memory
# triples (spo) and no restriction on the pairwise attributes are supplied.
semsimian = Semsimian(
    resource_path="./phenio.db",
    predicates=predicates,
    spo=None,
    pairwise_similarity_attributes=None,
)


TypeError: argument 'subject_terms': 'list' object cannot be converted to 'PySet'

Using Semsimian, generate termset_pairwise_similarity scores for OMIM vs Orphanet, OMIM vs OMIM and Orphanet vs Orphanet using each metric. 

In [39]:
# Index every profile once as source -> disease -> set of phenotype terms.
# This avoids re-filtering the whole DataFrame twice per disease and rebuilding
# the same sets for every metric.
profile_terms = {'infores:omim': {}, 'infores:orphanet': {}}
for disease, phenotype, source in zip(
    phenotype_profiles['disease'],
    phenotype_profiles['phenotype'],
    phenotype_profiles['source'],
):
    by_disease = profile_terms.get(source)
    # Keep only the first row seen for a given (source, disease) pair.
    if by_disease is not None and disease not in by_disease:
        by_disease[disease] = set(phenotype)

# For each disease with both an OMIM and an Orphanet profile, score
# OMIM vs Orphanet plus each profile against itself, for every metric.
results = []
for disease in phenotype_profiles['disease'].unique():
    omim_terms = profile_terms['infores:omim'].get(disease)
    if omim_terms is None:
        print(f'No OMIM profile for {disease}, skipping')
        continue

    orphanet_terms = profile_terms['infores:orphanet'].get(disease)
    if orphanet_terms is None:
        print(f'No Orphanet profile for {disease}, skipping')
        continue

    result = {'disease': disease}
    for metric in ['ancestor_information_content', 'jaccard_similarity', 'phenodigm_score']:
        result[metric] = semsimian.termset_pairwise_similarity(
            omim_terms, orphanet_terms, metric
        )
        result[f"omim_self_{metric}"] = semsimian.termset_pairwise_similarity(
            omim_terms, omim_terms, metric
        )
        result[f"orphanet_self_{metric}"] = semsimian.termset_pairwise_similarity(
            orphanet_terms, orphanet_terms, metric
        )

    results.append(result)
    


No OMIM profile for MONDO:0017324, skipping
No OMIM profile for MONDO:0016241, skipping
No OMIM profile for MONDO:0018852, skipping
No OMIM profile for MONDO:0033925, skipping
No OMIM profile for MONDO:0018705, skipping
No OMIM profile for MONDO:0015826, skipping
No Orphanet profile for MONDO:0008856, skipping
No Orphanet profile for MONDO:0010024, skipping
No Orphanet profile for MONDO:0007597, skipping
No Orphanet profile for MONDO:0007091, skipping
No Orphanet profile for MONDO:0007447, skipping
No Orphanet profile for MONDO:0007186, skipping
No Orphanet profile for MONDO:0060650, skipping
No Orphanet profile for MONDO:0007810, skipping
No Orphanet profile for MONDO:0014912, skipping
No Orphanet profile for MONDO:0014784, skipping
No Orphanet profile for MONDO:0013754, skipping
No Orphanet profile for MONDO:0013519, skipping
No Orphanet profile for MONDO:0008787, skipping
No Orphanet profile for MONDO:0010520, skipping
No OMIM profile for MONDO:0002413, skipping
No OMIM profile for 

In [42]:
# Flatten each result into one row of average scores. For every metric there are
# three columns, in this order: OMIM vs Orphanet, OMIM vs itself, Orphanet vs itself.
score_df = pd.DataFrame([
    {
        'disease': entry['disease'],
        **{
            column: entry[result_key]['average_score']
            for metric_name in (
                'ancestor_information_content',
                'jaccard_similarity',
                'phenodigm_score',
            )
            for column, result_key in (
                (metric_name, metric_name),
                (f'{metric_name}_omim_self', f'omim_self_{metric_name}'),
                (f'{metric_name}_orphanet_self', f'orphanet_self_{metric_name}'),
            )
        },
    }
    for entry in results
])

In [43]:
# Show the score table as the cell output (the rendered view elides the middle rows).
score_df

Unnamed: 0,disease,ancestor_information_content,ancestor_information_content_omim_self,ancestor_information_content_orphanet_self,jaccard_similarity,jaccard_similarity_omim_self,jaccard_similarity_orphanet_self,phenodigm_score,phenodigm_score_omim_self,phenodigm_score_orphanet_self
0,MONDO:0007167,13.817717,17.012315,16.634543,0.772808,1.0,1.0,3.195493,4.119417,4.070784
1,MONDO:0008967,10.742168,17.482061,15.999629,0.571184,1.0,1.0,2.379026,4.179427,3.989719
2,MONDO:0010547,13.075760,15.739035,17.361246,0.806076,1.0,1.0,3.127593,3.961502,4.163276
3,MONDO:0008829,12.094596,16.958185,14.324329,0.744447,1.0,1.0,2.995891,4.114497,3.762925
4,MONDO:0008620,15.917020,17.564011,16.833542,0.802005,1.0,1.0,3.523730,4.189954,4.100690
...,...,...,...,...,...,...,...,...,...,...
2092,MONDO:0013737,14.584981,16.366311,15.814934,0.869527,1.0,1.0,3.507886,4.040150,3.967945
2093,MONDO:0013400,13.380031,16.584899,16.951649,0.723024,1.0,1.0,3.057376,4.064994,4.110144
2094,MONDO:0008087,12.853699,16.438208,15.836758,0.773554,1.0,1.0,3.086015,4.044986,3.972138
2095,MONDO:0011035,12.761761,17.142761,15.570460,0.756597,1.0,1.0,3.059558,4.135477,3.922544


Export the full results as JSON and the score dataframe as TSV

In [44]:
# json.dump below needs the json module in scope; importing it here keeps this
# cell runnable on a fresh kernel.
import json

# Write the full per-disease comparison results as JSON.
with open('phenotype_profile_comparison_results.json', 'w') as f:
    json.dump(results, f)

# Write the flattened average-score table as a tab-separated file, without the index column.
score_df.to_csv('phenotype_profile_comparison_scores.tsv', sep='\t', index=False)