In [96]:
from os.path import join, exists
from os import listdir
from SPARQLWrapper import SPARQLWrapper, JSON
from unidecode import unidecode
from tqdm import tqdm
from functools import cache
import itertools

import hashlib
import random
import string
import json
import sys
import re

sys.path.append('..')

from lib.dataset_gen import setup

In [2]:
# Load the three objects every later cell relies on from the project helper:
# `guidelines` (indexable, entries carry a 'label'), `dataset` (records keyed by
# 'Q…' identifiers) and `search` (maps a name to a list of matching identifiers).
guidelines, dataset, search = setup()

 [+] Using cached query: d24ede18
 [+] Using cached query: b2ea7c42
 [+] Using cached query: b07e95d6
Unmatched proportion: 4.49% (59 elements)


In [3]:
# Sanity check: label of the first loaded guideline.
guidelines[0]['label']

'Iron Deficiency Anemia in Young Children'

In [4]:
# Show one record to document the schema used below
# (name, alt, subclass_of, study_by, health_speciality, symptoms_and_signs).
dataset['Q3299714']

{'id': 'Q3299714',
 'name': 'normocytic anemia',
 'alt': ['ANEMIA NORMOCYTIC', 'Normocytic anemia'],
 'subclass_of': ['anemia'],
 'study_by': [],
 'health_speciality': [],
 'symptoms_and_signs': []}

In [7]:
# `search` returns a list of identifiers for a name; the code below only ever uses the first hit.
search('health problem')

['Q748309', 'Q105434', 'Q12135']

In [73]:
# Entry names at which the upward walk stops: such an entry is neither recorded
# nor expanded.
STOP_TOKEN = { 'disease' }

@cache
def metric_search(q_init, n_max = 8):
    """
    Walk breadth-first up the ``subclass_of`` links starting at ``q_init``.

    For every visited entry (except those named in ``STOP_TOKEN``) one tuple
    ``(study_by + health_speciality, symptoms_and_signs)`` is recorded, in
    visiting order. Each parent name is resolved with ``search`` and only its
    first hit is followed.

    Parameters
    ----------
    q_init : key of ``dataset`` to start from.
    n_max  : maximum number of entries to record.

    Returns
    -------
    list of ``(fields, symptoms)`` tuples; at most ``n_max`` long and possibly
    empty (e.g. when ``q_init`` itself is a stop token).

    Notes
    -----
    The result is memoized by ``functools.cache``: the same list object is
    handed to every caller, so it must not be mutated, and the cache does not
    notice later changes to ``dataset``.
    """
    history = []

    queue = [q_init]
    # Entries are marked when they are *queued*, not when they are dequeued.
    # Otherwise a parent shared by two children (or reachable through two
    # paths) is queued twice before its first visit and ends up recorded
    # twice, which also wastes the n_max budget.
    visited = {q_init}

    while queue and len(history) < n_max:
        current = dataset[queue.pop(0)]
        if current['name'] in STOP_TOKEN:
            # Stop tokens do not count towards n_max and are not expanded.
            continue

        for parent_name in current['subclass_of']:
            hits = search(parent_name)
            if len(hits) > 0 and hits[0] not in visited:
                visited.add(hits[0])
                queue.append(hits[0])

        history.append(
            (current['study_by'] + current['health_speciality'], current['symptoms_and_signs'])
        )

    return history

In [140]:
def powerset(iterable):
    """
    Lazily enumerate every subset of ``iterable`` as a tuple, smallest first.

    powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)
    """
    # Materialise the input once so it can be traversed for every subset size.
    items = tuple(iterable)
    subset_sizes = range(len(items) + 1)
    # An iterator is returned (not a list); callers wrap it in list() if needed.
    return itertools.chain.from_iterable(
        map(lambda size: itertools.combinations(items, size), subset_sizes)
    )

def build_multiindex(dataset):
    """
    Index the dataset by every non-empty combination of an entry's fields.

    The fields of an entry are the union of its ``health_speciality`` and
    ``study_by`` lists. Every non-empty subset of that union, as a sorted
    tuple, becomes a key whose value is the list of entry keys having all of
    those fields.

    Parameters
    ----------
    dataset : mapping of entry key -> record with ``health_speciality`` and
              ``study_by`` lists.

    Returns
    -------
    dict mapping ``tuple`` of sorted field names -> ``list`` of entry keys.

    Notes
    -----
    The number of keys contributed by one entry grows as 2**k in its number of
    distinct fields.
    """
    index = {}
    for elem in dataset:
        record = dataset[elem]
        # De-duplicate: a field listed under both keys would otherwise yield
        # repeated-field keys such as ('x', 'x') and register `elem` twice
        # under ('x',), biasing any random draw from that bucket.
        spec = sorted(set(record['health_speciality'] + record['study_by']))
        for size in range(1, len(spec) + 1):
            # `spec` is sorted, so each combination is already a sorted tuple.
            for comb in itertools.combinations(spec, size):
                # Append in place instead of rebuilding the list on every insert.
                index.setdefault(comb, []).append(elem)
    return index

# Build the index once; find_matching_not_matching reads this module-level name.
multiindex = build_multiindex(dataset)

In [141]:
# Displays the entire index, which produces a very long output.
# NOTE(review): consider previewing only a few keys or bucket sizes instead.
multiindex

{('oncology',): ['Q18554604',
  'Q18556157',
  'Q786852',
  'Q18556253',
  'Q4763273',
  'Q18555112',
  'Q7168693',
  'Q18573407',
  'Q1827370',
  'Q18556262',
  'Q18554264',
  'Q18556809',
  'Q2030989',
  'Q17152512',
  'Q18556799',
  'Q5264831',
  'Q18556448',
  'Q18554044',
  'Q18558060',
  'Q5464665',
  'Q486090',
  'Q18557994',
  'Q1164529',
  'Q18556792',
  'Q18554477',
  'Q5370233',
  'Q209369',
  'Q18558073',
  'Q18557544',
  'Q720020',
  'Q1785791',
  'Q7005037',
  'Q25422732',
  'Q18558218',
  'Q18557977',
  'Q7046751',
  'Q18554321',
  'Q18555918',
  'Q18557394',
  'Q18554882',
  'Q18556461',
  'Q825923',
  'Q18554263',
  'Q2165399',
  'Q2095252',
  'Q4677933',
  'Q18557968',
  'Q18557976',
  'Q18556084',
  'Q16877679',
  'Q128581',
  'Q18556340',
  'Q827497',
  'Q18556089',
  'Q4667534',
  'Q1989240',
  'Q1892153',
  'Q1088072',
  'Q2072821',
  'Q18557843',
  'Q570875',
  'Q18556314',
  'Q5130798',
  'Q18555317',
  'Q28439793',
  'Q18555197',
  'Q18554887',
  'Q18555285',
 

In [156]:
def extract_field(domains, max_gap = 2):
    """
    Collect the fields found on the first levels of a hierarchy walk.

    ``domains`` is a sequence of field lists, one per level. Leading empty
    levels are skipped. Once a non-empty level has been seen, fields keep being
    collected until ``max_gap`` consecutive empty levels are met; everything
    after that point is ignored.

    Parameters
    ----------
    domains : sequence of lists of field names (assumed mutually comparable,
              e.g. all strings, so they can be sorted).
    max_gap : number of consecutive empty levels, after the first non-empty
              one, at which collection stops. Expected to be >= 1.

    Returns
    -------
    Sorted list of the distinct collected fields. Sorting keeps the result
    reproducible: the iteration order of a set of strings changes between
    interpreter runs.
    """
    fields = set()
    seen_non_empty = False
    gap = 0
    for level in domains:
        if len(level) >= 1:
            fields.update(level)
            seen_non_empty = True
            gap = 0
        elif seen_non_empty:
            # Only empty levels that follow a non-empty one count towards the gap.
            gap += 1
            if gap >= max_gap:
                break
    return sorted(fields)

def proximity_heuristic(d1, dbase):
    """
    Score how much of the reference ``dbase`` is covered by ``d1``.

    Counts the items of ``d1`` that also occur in ``dbase`` and divides by
    ``len(dbase)``. Repeated items in ``d1`` are counted each time, so the
    score can exceed 1 when ``d1`` contains duplicates.

    Parameters
    ----------
    d1    : iterable of candidate fields.
    dbase : sized container of reference fields.

    Returns
    -------
    float; ``0.0`` when ``dbase`` is empty (nothing can be matched), instead
    of raising ``ZeroDivisionError``.
    """
    if len(dbase) == 0:
        return 0.0

    score = sum(1 for d in d1 if d in dbase)
    return score / len(dbase)

def find_matching_not_matching(true_positive_q, ref_fields, n_max = 3):
    """
    Pick one distant and one related dataset entry for a reference entry.

    Parameters
    ----------
    true_positive_q : key of the reference entry; never returned as the
                      non-matching pick.
    ref_fields      : per-level field lists of the reference entry (the first
                      component of each ``metric_search`` tuple); reduced
                      with ``extract_field``.
    n_max           : number of random candidates drawn for the non-matching
                      pick.

    Returns
    -------
    ``(not_matching, matching)``:

    * ``not_matching`` - among the ``n_max`` random draws, the candidate with
      the lowest ``proximity_heuristic`` against the reference fields (first
      one wins on ties); ``None`` if no usable candidate was drawn.
    * ``matching`` - a random entry from ``multiindex`` stored under a random
      non-empty subset of the reference fields; ``None`` when no such subset
      is a key of the index (including when the reference has no fields).

    Notes
    -----
    Uses the global ``random`` state; seed it for reproducible picks.
    NOTE(review): ``matching`` may be ``true_positive_q`` itself - confirm
    whether that is intended.
    """
    ref_fields = extract_field(ref_fields)
    candidates = list(dataset.keys())  # hoisted: identical on every draw

    best_score = 0
    not_matching = None
    for _ in range(n_max):
        q_init = random.choice(candidates)
        if q_init == true_positive_q:
            continue

        # A comprehension instead of zip(*history): the history may be empty
        # (e.g. a stop-token entry), which zip-unpacking cannot handle.
        fields = extract_field([level_fields for level_fields, _ in metric_search(q_init)])
        # No reference fields: nothing can overlap, so score 0 rather than
        # dividing by an empty reference length.
        heuristic = proximity_heuristic(fields, ref_fields) if ref_fields else 0.0

        if not_matching is None or heuristic < best_score:
            not_matching = q_init
            best_score = heuristic

    # Only non-empty subsets that exist in the index can yield a match: the
    # reference fields are a union over several levels, so some combinations
    # may not be carried by any single entry.
    usable_keys = [
        key
        for key in (tuple(sorted(subset)) for subset in powerset(ref_fields))
        if len(key) > 0 and key in multiindex
    ]
    if not usable_keys:
        return not_matching, None

    matching = random.choice(multiindex[random.choice(usable_keys)])
    return not_matching, matching

In [157]:
def standardize(x):
    """Return ``x`` lower-cased, using the value's own ``lower()`` method."""
    lowered = x.lower()
    return lowered

In [158]:
# Draw a random entry to try the pipeline on. No seed is set in the visible
# cells, so the identifier shown below changes from run to run.
q_init = random.choice(list(dataset.keys())); q_init

'Q55786940'

In [159]:
# Walk up the hierarchy from the sampled entry; each history item is a
# (fields, symptoms) tuple, one per visited level.
A = metric_search(q_init)
# Keep only the per-level field lists; this unpacking requires a non-empty history.
domains, _ = list(zip(*A))
domains

([],
 ['medical genetics'],
 ['pulmonology'],
 ['pulmonology'],
 [],
 [],
 ['pulmonology'],
 ['medical genetics'])

In [160]:
# Fields kept for the sampled entry after the cut-off on consecutive empty levels.
extract_field(domains)

['medical genetics', 'pulmonology']

In [164]:
# min_q: least related of the randomly drawn candidates; max_q: an entry sharing
# some subset of the reference fields. Depends on the unseeded `random` state.
min_q, max_q = find_matching_not_matching(q_init, domains)

In [165]:
# Identifiers of the (non-matching, matching) picks.
min_q, max_q

('Q18558032', 'Q18928105')

In [166]:
# Names in the order: non-matching pick, reference entry, matching pick.
dataset[min_q]['name'], dataset[q_init]['name'], dataset[max_q]['name']

('skin melanoma',
 'congenital pulmonary airway malformation type 4',
 'Kartagener syndrome')

In [167]:
# Full record of the matching pick, to check which fields it shares with the reference.
dataset[max_q]

{'id': 'Q18928105',
 'name': 'Kartagener syndrome',
 'alt': ['PCD',
  "Kartagener's syndrome",
  'Immotile ciliary syndrome',
  'Primary ciliary dyskinesia, Kartagener type'],
 'subclass_of': ['disease', 'situs inversus', 'primary ciliary dyskinesia'],
 'study_by': [],
 'health_speciality': ['cardiology',
  'otolaryngology',
  'pulmonology',
  'medical genetics'],
 'symptoms_and_signs': []}