In [131]:
import pandas as pd
import requests
import ast
from ncr_utils import get_hpo

In [132]:
GSC_PATH = "data/GSC+.csv"
NCR_GSC_PATH = "data/NCR_GSC.csv"
CLINPHEN_GSC_PATH = "data/clinphen_gsc.csv"
IHP_GSC_PATH = "data/IHP_GSC.csv"
BIOLINK_GSC_PATH = "data/biolink_GSC.csv"

In [133]:
GSC_dataset = pd.read_csv(GSC_PATH)
GSC_dataset.head(2)

Unnamed: 0,file,text,HPO_symptoms,HPO_codes,starts,ends
0,1003450,A syndrome of brachydactyly (absence of some m...,"['brachydactyly', 'absence of some middle or d...","['0001156', '0009881', '0001798', '0001792', '...","[14, 29, 74, 86, 105, 120, 166, 210, 232, 346,...","[27, 71, 103, 103, 118, 163, 208, 226, 253, 37..."
1,10051003,Townes-Brocks syndrome (TBS) is an autosomal d...,"['autosomal dominant disorder', 'autosomal dom...","['0000006', '0000006', '0000006', '0003828', '...","[35, 35, 45, 95, 95, 139, 148, 163, 177, 186, ...","[62, 53, 53, 114, 103, 161, 161, 175, 197, 197..."


## Neural Concept Recognizer

In [134]:
NCR_det = pd.read_csv(NCR_GSC_PATH)
NCR_det.head(2)

Unnamed: 0.1,Unnamed: 0,file,text,HPO_symptoms,HPO_codes,starts,ends
0,0,1003450,A syndrome of brachydactyly (absence of some m...,"['Type A brachydactyly', 'Aplasia/Hypoplasia o...","['0009370', '0008386', '0100266', '0001363', '...","[0, 74, 166, 210, 232]","[27, 103, 208, 226, 246]"
1,1,10051003,Townes-Brocks syndrome (TBS) is an autosomal d...,"['Abnormality of the outer ear', 'Hearing impa...","['0000356', '0000365', '0100258', '0001199', '...","[139, 163, 177, 202, 224, 246, 314, 344]","[161, 175, 197, 222, 240, 265, 333, 362]"


## Clinphen

In [135]:
clinphen_det = pd.read_csv(CLINPHEN_GSC_PATH)
clinphen_det.head()

Unnamed: 0.1,Unnamed: 0,file,text,HPO_symptoms,HPO_codes
0,0,1003450,A syndrome of brachydactyly (absence of some m...,[],[]
1,1,10051003,Townes-Brocks syndrome (TBS) is an autosomal d...,"['Abnormality of the outer ear', 'Abnormality ...","['0000356', '0000598', '0000365', '0010442', '..."
2,2,10066029,Nevoid basal cell carcinoma syndrome (NBCCS) i...,"['Palmar pits', 'Plantar pits', 'Basal cell ca...","['0010610', '0010612', '0002671', '0030731', '..."
3,3,10196695,Angelman syndrome (AS) is a neurodevelopmental...,[],[]
4,4,10417280,Prader-Willi syndrome (PWS) and Angelman syndr...,[],[]


## IHP

In [136]:
ihp_det = pd.read_csv(IHP_GSC_PATH)
ihp_det.head(2)

Unnamed: 0.1,Unnamed: 0,file,text,HPO_symptoms,HPO_codes,starts,ends
0,0,1003450,A syndrome of brachydactyly (absence of some m...,"['symphalangism', 'ankylois of proximal interp...","['0010179', None, '0001831', None, '0001363', ...","['105', '120', '14', '166', '210', '232', '29'...","['118', '163', '27', '208', '226', '253', '71'..."
1,1,10051003,Townes-Brocks syndrome (TBS) is an autosomal d...,"['ear anomalies', 'hearing loss', 'polydactyly...","['0000356', '0000365', '0010442', '0001199', '...","['148', '163', '186', '202', '224', '246', '35...","['161', '175', '197', '222', '240', '265', '36..."


## Biolink

In [137]:
biolink_det = pd.read_csv(BIOLINK_GSC_PATH)
biolink_det.head(2)

Unnamed: 0.1,Unnamed: 0,file,text,HPO_symptoms,HPO_codes,starts,ends
0,0,1003450,A syndrome of brachydactyly (absence of some m...,"['Brachydactyly', 'Distal', 'Small nail', 'Pro...","['0001156', '0012839', '0001792', '0012840', '...","[14, 55, 86, 132, 132, 210, 346]","[27, 61, 104, 140, 176, 227, 364]"
1,1,10051003,Townes-Brocks syndrome (TBS) is an autosomal d...,"['Autosomal dominant inheritance', 'Abnormalit...","['0000006', '0000356', '0000598', '0000365', '...","[35, 148, 148, 163, 186, 177, 202, 224, 246, 344]","[53, 162, 162, 176, 197, 197, 223, 241, 266, 362]"


## Evaluation

In [150]:
def jaccard_score(a, b):
    return len(set(a).intersection(set(b))) / len(set(a).union(set(b)))

def jaccard_evaluation(gsc_data, ncr_df, comparison):
    jaccard_scores = []
    for i, row in ncr_df.iterrows():
        gsc_row = gsc_data.iloc[i]
        gsc_codes = ast.literal_eval(gsc_row[comparison])
        ncr_codes = ast.literal_eval(row[comparison])
        score = jaccard_score(gsc_codes, ncr_codes)
        jaccard_scores.append(score)
    return jaccard_scores

def precision_score(a, b):
    try:
        return len(set(a).intersection(set(b))) / len(b)
    except: 
        return 0

def get_precision(gsc_data, ncr_df, comparison):
    prec_scores = []
    for i, row in ncr_df.iterrows():
        gsc_row = gsc_data.iloc[i]
        gsc_codes = ast.literal_eval(gsc_row[comparison])
        ncr_codes = ast.literal_eval(row[comparison])
        score = precision_score(gsc_codes, ncr_codes)
        prec_scores.append(score)
    return prec_scores

def recall_score(a, b):
    return len(set(a).intersection(set(b))) / len(a)

def get_recall(gsc_data, ncr_df, comparison):
    recall_scores = []
    for i, row in ncr_df.iterrows():
        gsc_row = gsc_data.iloc[i]
        gsc_codes = ast.literal_eval(gsc_row[comparison])
        ncr_codes = ast.literal_eval(row[comparison])
        score = recall_score(gsc_codes, ncr_codes)
        recall_scores.append(score)
    return recall_scores

In [154]:
ncr_jaccard_scores = jaccard_evaluation(GSC_dataset, NCR_det, "HPO_codes")
print(f"NCR Jaccard Score is {sum(ncr_jaccard_scores)/len(ncr_jaccard_scores)}")

clinphen_jaccard_scores = jaccard_evaluation(GSC_dataset, clinphen_det, "HPO_codes")
print(f"ClinPhen Jaccard Score is {sum(clinphen_jaccard_scores)/len(clinphen_jaccard_scores)}")


ihp_jaccard_scores = jaccard_evaluation(GSC_dataset, ihp_det, "HPO_symptoms")
print(f"IHP Jaccard Score is {sum(ihp_jaccard_scores)/len(ihp_jaccard_scores)}")

biolink_jaccard_scores = jaccard_evaluation(GSC_dataset, biolink_det, "HPO_codes")
print(f"Biolink Jaccard Score is {sum(biolink_jaccard_scores)/len(biolink_jaccard_scores)}")



print("-----------------------------------------------")


ncr_prec = get_precision(GSC_dataset, NCR_det, "HPO_codes")
print(f"NCR Precision is {sum(ncr_prec)/len(ncr_prec)}")

clin_prec = get_precision(GSC_dataset, clinphen_det, "HPO_codes")
print(f"ClinPhen Precision is {sum(clin_prec)/len(clin_prec)}")

biolink_prec = get_precision(GSC_dataset, biolink_det, "HPO_codes")
print(f"Biolink Precision is {sum(biolink_prec)/len(biolink_prec)}")

ihp_prec = get_precision(GSC_dataset, ihp_det, "HPO_symptoms")
print(f"IHP Precision is {sum(ihp_prec)/len(ihp_prec)}")

print("-----------------------------------------------")

ncr_recall = get_recall(GSC_dataset, NCR_det, "HPO_codes")
print(f"NCR Recall is {sum(ncr_recall)/len(ncr_recall)}")

clin_recall = get_recall(GSC_dataset, clinphen_det, "HPO_codes")
print(f"ClinPhen Recall is {sum(clin_recall)/len(clin_recall)}")

biolink_recall = get_recall(GSC_dataset, biolink_det, "HPO_codes")
print(f"Biolink Recall is {sum(biolink_recall)/len(biolink_recall)}")

ihp_recall = get_recall(GSC_dataset, ihp_det, "HPO_symptoms")
print(f"IHP Recall is {sum(ihp_recall)/len(ihp_recall)}")

NCR Jaccard Score is 0.36920936284125916
ClinPhen Jaccard Score is 0.24376771201209219
IHP Jaccard Score is 0.6785378552203412
Biolink Jaccard Score is 0.3725942054802809
-----------------------------------------------
NCR Precision is 0.49545837487623845
ClinPhen Precision is 0.4946183373851704
Biolink Precision is 0.414125976404112
IHP Precision is 0.848500866831831
-----------------------------------------------
NCR Recall is 0.2959833401711098
ClinPhen Recall is 0.1918913892039217
Biolink Recall is 0.3561891016277511
IHP Recall is 0.5802484600000741


In [140]:
comparison_endpoint = "https://api.monarchinitiative.org/api/sim/compare"

def compare(hpo1, hpo2):
    resp = requests.get(comparison_endpoint, params={"ref_id": hpo1, "query_id": hpo2})
    if resp.status_code != 200:
        print(resp.text)
        return 0
    try:
        score = resp['matches'][0]['score']
        print(score)
    except:
        score = 0
    return score

def semantic_similarity(gsc_data, df2, comparison):
    similarity_scores = []
    for i, row in df2.iterrows():
        gsc_row = gsc_data.iloc[i]
        gsc_codes = ast.literal_eval(gsc_row[comparison])
        ncr_codes = ast.literal_eval(row[comparison])
        gsc_codes = ["HP:" + code for code in gsc_codes]
        ncr_codes = ["HP:" + code for code in ncr_codes]
        score = compare(gsc_codes, ncr_codes)
        similarity_scores.append(score)
    return similarity_scores

In [141]:
#clin_semantic_scores = semantic_similarity(GSC_dataset, clinphen_det, "HPO_codes")
#print(f"ClinPhen Semantic Score is {sum(clin_semantic_scores)/len(cline_semantic_scores)}")