In [3]:



def find_outgoing_nodes_df(entity, limit):
    """Return the triples whose subject is ``wd:<entity>`` as an edge table.

    The query is sent through the ``sparql`` client object defined outside
    this function. Label and description predicates (``skos:prefLabel``,
    ``skos:altLabel``, ``schema:description``) are excluded by the query.

    Args:
        entity: Identifier appended to the ``wd:`` prefix, e.g. ``"Q18037663"``.
        limit: Value substituted into the query's ``limit`` clause.

    Returns:
        A DataFrame with columns ``"1"`` (subject), ``"2"`` (predicate) and
        ``"3"`` (object). Rows holding any non-URI value are dropped and
        duplicates are removed. If the query fails or yields no bindings, an
        empty, column-less DataFrame is returned, matching
        ``find_incoming_nodes_df``.
    """
    # After .format() the quadruple braces render as "{{ ... }}", i.e. a
    # group pattern nested inside another group pattern.
    query = """
        PREFIX wd: <https://www.wikidata.org/wiki/>
        PREFIX wdt: <https://www.wikidata.org/wiki/Property:>
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
        PREFIX schema: <http://www.schema.org/>


        select distinct ?a ?b ?c where {{{{
        ?a ?b ?c .

        filter(?a = wd:{}) .
        filter(?b not in (skos:prefLabel, skos:altLabel, schema:description)) .
        }}}}
        limit {}""".format(entity, limit)
    sparql.setQuery(query)
    sparql.method = 'GET'
    sparql.setReturnFormat(JSON)
    try:
        # The remote call lives inside the try so that an endpoint failure
        # degrades to "no edges" instead of aborting the caller's loop.
        results = sparql.query().convert()
        ndf = pd.DataFrame(results['results']['bindings'])
        # Keep only URI-valued cells; literals become NaN and are dropped below.
        ndf = ndf.applymap(lambda x: x['value'] if x["type"] == "uri" else np.nan)

        # Selecting the columns raises KeyError when there were no bindings,
        # which is handled as "no edges" as well.
        ndf = ndf[["a", "b", "c"]]
        ndf.columns = ["1", "2", "3"]

        return ndf.dropna().drop_duplicates()
    except Exception:
        return pd.DataFrame()


def find_incoming_nodes_df(entity, limit):
    """Return the triples whose object is ``wd:<entity>`` as an edge table.

    The query is sent through the ``sparql`` client object defined outside
    this function. Label and description predicates are excluded by the query.

    Args:
        entity: Identifier appended to the ``wd:`` prefix.
        limit: Value substituted into the query's ``limit`` clause.

    Returns:
        A DataFrame with columns ``"1"`` (subject), ``"2"`` (predicate) and
        ``"3"`` (object), without rows containing non-URI values and without
        duplicates. Any exception raised while querying or reshaping yields
        an empty, column-less DataFrame instead.
    """
    template = """
        PREFIX wd: <https://www.wikidata.org/wiki/>
        PREFIX wdt: <https://www.wikidata.org/wiki/Property:>
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
        PREFIX schema: <http://www.schema.org/>


        select distinct ?c ?b ?a where {{{{
        ?c ?b ?a .

        filter(?a = wd:{}) .
        filter(?b not in (skos:prefLabel, skos:altLabel, schema:description)) .
        }}}}
        limit {}"""
    sparql.setQuery(template.format(entity, limit))
    sparql.method = 'GET'
    sparql.setReturnFormat(JSON)

    def uri_or_nan(cell):
        # Only URI bindings are kept; everything else is marked for removal.
        if cell["type"] == "uri":
            return cell['value']
        return np.nan

    try:
        bindings = sparql.query().convert()['results']['bindings']
        edges = pd.DataFrame(bindings).applymap(uri_or_nan)

        # Reorder to subject / predicate / object before renaming.
        edges = edges[["c", "b", "a"]]
        edges.columns = ["1", "2", "3"]

        return edges.dropna().drop_duplicates()
    except Exception:
        return pd.DataFrame()

def find_neighbours(entity, cutoff_hub_spoke=200):
    """Collect the edges around ``entity``, expanding one more hop if it is small.

    Args:
        entity: Identifier passed to the incoming/outgoing edge queries.
        cutoff_hub_spoke: Edge-count threshold. At or above it the first-hop
            edges are returned unchanged.

    Returns:
        A ``(edges, expanded)`` tuple. ``expanded`` is ``True`` when the
        result came from ``find_second_neighbours`` and ``False`` when only
        the first-hop edges are returned.
    """
    incoming = find_incoming_nodes_df(entity, 1000000)
    outgoing = find_outgoing_nodes_df(entity, 1000000)
    direct_edges = pd.concat([incoming, outgoing])

    if len(direct_edges) >= cutoff_hub_spoke:
        return direct_edges, False

    return find_second_neighbours(entity, direct_edges, cutoff_hub_spoke), True
        
def find_second_neighbours(entity, ndf, cutoff_hub_spoke):
    """Extend a first-hop edge table with the edges of its small neighbours.

    For every node in columns ``"1"`` and ``"3"`` of ``ndf`` other than the
    entity itself, the incoming and outgoing edges are queried separately.
    Each result is kept only when it is non-empty and has fewer than
    ``cutoff_hub_spoke`` rows.

    Args:
        entity: Identifier of the centre entity; its URI
            (``https://www.wikidata.org/wiki/<entity>``) is excluded from the
            neighbour set.
        ndf: First-hop edge table with columns ``"1"``, ``"2"``, ``"3"``. May
            be empty, with or without columns.
        cutoff_hub_spoke: Row-count threshold above which a neighbour's edge
            list is discarded.

    Returns:
        A de-duplicated DataFrame containing the kept neighbour edges followed
        by ``ndf``. For an empty ``ndf`` a copy of it is returned.
    """
    if ndf.empty:
        # Nothing to expand. A column-less frame (both first-hop queries
        # failed or were empty) would otherwise raise KeyError on ndf["1"].
        return ndf.copy()

    entity_uri = "https://www.wikidata.org/wiki/" + entity
    first_neighbours = (set(ndf["1"].values) | set(ndf["3"].values)) - {entity_uri}

    neighbour_frames = []
    for neighbour in first_neighbours:
        # The identifier is the last path segment of the node URI.
        neighbour_entity = neighbour.split("/")[-1]

        # Ask for one row more than the cutoff so an over-sized neighbour is
        # detectable without fetching all of its edges.
        incoming = find_incoming_nodes_df(neighbour_entity, cutoff_hub_spoke + 1)
        outgoing = find_outgoing_nodes_df(neighbour_entity, cutoff_hub_spoke + 1)

        for frame in (incoming, outgoing):
            if 0 < len(frame) < cutoff_hub_spoke:
                neighbour_frames.append(frame)

    return pd.concat(neighbour_frames + [ndf]).drop_duplicates()


def is_taxon(entity):
    """Ask whether ``wd:<entity>`` reaches ``wd:Q21871294`` via P31/P279 links.

    Runs a SPARQL ``ASK`` query through the ``sparql`` client object defined
    outside this function. The path may have any length, including zero.

    Args:
        entity: Identifier appended to the ``wd:`` prefix.

    Returns:
        The ``"boolean"`` field of the converted query response.
    """
    # NOTE(review): Q21871294 is presumably the class that marks
    # taxon-related entities - confirm against the knowledge graph.
    ask_template = (
        "\n"
        "    PREFIX wd: <https://www.wikidata.org/wiki/>\n"
        "    PREFIX wdt: <https://www.wikidata.org/wiki/Property:>\n"
        "    ASK where {{ \n"
        "        wd:{} (wdt:P31|wdt:P279)* wd:Q21871294  .\n"
        "    }}\n"
        "    "
    )

    sparql.setQuery(ask_template.format(entity))
    sparql.method = 'GET'
    sparql.setReturnFormat(JSON)
    response = sparql.query().convert()
    return response["boolean"]

def find_superclasses(entity):
    """List every node reachable from ``wd:<entity>`` via P31/P279 links.

    Runs a SPARQL ``select`` query through the ``sparql`` client object
    defined outside this function. The path may have any length, including
    zero.

    Args:
        entity: Identifier appended to the ``wd:`` prefix.

    Returns:
        The ``.values`` array of a DataFrame built from the result bindings,
        where URI bindings are replaced by their value and anything else by
        NaN.
    """
    select_template = (
        "\n"
        "\tPREFIX wd: <https://www.wikidata.org/wiki/>\n"
        "\tPREFIX wdt: <https://www.wikidata.org/wiki/Property:>\n"
        "\tselect ?c where {{ \n"
        "\twd:{} (wdt:P31|wdt:P279)* ?c  .\n"
        "\t}}\n"
        "\t"
    )

    sparql.setQuery(select_template.format(entity))
    sparql.method = 'GET'
    sparql.setReturnFormat(JSON)
    bindings = sparql.query().convert()['results']['bindings']

    def uri_or_nan(cell):
        # Non-URI bindings carry no node identity, so they become NaN.
        if cell["type"] == "uri":
            return cell['value']
        return np.nan

    return pd.DataFrame(bindings).applymap(uri_or_nan).values



    
def find_subgraph(entity_matches):
    """Gather neighbourhood edges for every candidate entity, grouped by label.

    Args:
        entity_matches: Iterable of indexable records where item ``[0]`` is a
            candidate entity identifier and item ``[1]`` is the group it
            belongs to.

    Returns:
        A ``(dfs, groups, spoke)`` tuple:
          * ``dfs`` maps each identifier to the edge table returned by
            ``find_neighbours``.
          * ``groups`` maps each group key to the list of its identifiers, in
            input order. If no candidate passes ``is_taxon``, an extra group
            ``"no_taxon_so_assumed_human"`` holding ``"Q15978631"`` is added.
          * ``spoke`` maps each identifier to the boolean returned by
            ``find_neighbours``.
    """
    # Materialise once so a one-shot iterable survives both passes below.
    entity_matches = list(entity_matches)

    dfs = {}
    groups = {}
    spoke = {}

    for ent in entity_matches:
        idd = ent[0]
        groups.setdefault(ent[1], []).append(idd)
        dfs[idd], spoke[idd] = find_neighbours(idd)

    # any() stops at the first hit, so no further remote ASK queries are
    # issued once one candidate is known to be a taxon.
    taxon_present = any(is_taxon(ent[0]) for ent in entity_matches)

    if not taxon_present:
        groups["no_taxon_so_assumed_human"] = ["Q15978631"]

    return dfs, groups, spoke


def total_path_length(combination, G):
    """Sum the shortest-path lengths over every ordered pair in ``combination``.

    Each identifier is turned into a node name by prefixing
    ``https://www.wikidata.org/wiki/``. Pairs are taken in both directions
    and include an element paired with itself.

    Args:
        combination: Iterable of entity identifiers (strings).
        G: Graph handed to ``nx.shortest_path_length``.

    Returns:
        The accumulated length; ``0`` for an empty ``combination``.
        Exceptions raised by ``nx.shortest_path_length`` propagate.
    """
    uri_prefix = 'https://www.wikidata.org/wiki/'
    return sum(
        nx.shortest_path_length(G, source=uri_prefix + start, target=uri_prefix + end)
        for start in combination
        for end in combination
    )

def min_total_path_length(combinations, G):
    """Keep the combinations with the smallest total path length.

    Args:
        combinations: Iterable of entity-identifier combinations; each must
            support ``len()``.
        G: Graph forwarded to ``total_path_length``.

    Returns:
        A list of all combinations that share the minimum score, in input
        order. Empty only when ``combinations`` is empty.
    """
    # Start from infinity so that arbitrarily large totals still compete;
    # a fixed sentinel would silently drop every combination above it.
    best_length = float("inf")

    filtered = []
    for comb in combinations:
        try:
            tpl = total_path_length(comb, G)
        except Exception:
            # Scoring failed (e.g. no path or a node missing from G).
            # NOTE(review): the fallback score looks like a penalty for
            # unconnected combinations - confirm the intended value.
            tpl = len(comb) * 2 + 1

        if tpl < best_length:
            filtered = [comb]
            best_length = tpl
        elif tpl == best_length:
            filtered.append(comb)

    return filtered

def final_entities(combinations, G, groups, tdfs, spokes):
    """Pick the best entity combination and return its per-entity data.

    Candidates are the combinations kept by ``min_total_path_length``. When
    several remain, the one whose members have the most edge rows in ``tdfs``
    wins; ties go to the earliest candidate.

    Args:
        combinations: Candidate combinations, each ordered like
            ``groups.keys()``.
        G: Graph forwarded to ``min_total_path_length``.
        groups: Mapping whose keys are paired positionally with the members
            of the chosen combination.
        tdfs: Mapping from entity identifier to its edge table.
        spokes: Mapping from entity identifier to its expansion flag.

    Returns:
        A ``(entities, new_tdfs, new_spokes)`` tuple. ``entities`` is a list
        of ``[identifier, group_key]`` pairs, skipping the
        ``"no_taxon_so_assumed_human"`` group. The two dicts are ``tdfs`` and
        ``spokes`` restricted to the selected identifiers. All three are
        empty when there is no candidate.
    """
    candidates = min_total_path_length(combinations, G)
    if not candidates:
        return [], {}, {}

    def edge_count(ent_set):
        # Members without an edge table (e.g. the assumed-human placeholder)
        # contribute nothing.
        return sum(len(tdfs[x]) for x in ent_set if x in tdfs)

    # max() returns the first maximal candidate, so a set is still chosen
    # when every candidate scores 0.
    chosen = max(candidates, key=edge_count)

    selected = []
    new_tdfs = {}
    new_spokes = {}
    for group_key, ent in zip(groups.keys(), chosen):
        if group_key == "no_taxon_so_assumed_human":
            continue
        pair = [ent, group_key]
        if pair not in selected:
            selected.append(pair)
            new_tdfs[ent] = tdfs[ent]
            new_spokes[ent] = spokes[ent]

    return selected, new_tdfs, new_spokes

def find_final_entities(entity_matches, verbose=True):
    """Run the full disambiguation pipeline for a set of candidate matches.

    Fetches the neighbourhood edges via ``find_subgraph``, builds a
    ``networkx`` multigraph from them, enumerates one-entity-per-group
    combinations and delegates the selection to ``final_entities``.

    Args:
        entity_matches: Candidate records as accepted by ``find_subgraph``.
        verbose: When true, print a progress line before each stage.

    Returns:
        Whatever ``final_entities`` returns for the built graph.
    """
    def announce(message):
        if verbose:
            print(message)

    announce("Finding SubGraph Edges")
    tdfs, groups, spokes = find_subgraph(entity_matches)
    edge_table = pd.concat(list(tdfs.values()))

    announce("Making SubGraph")
    G = nx.from_pandas_edgelist(
        edge_table, "1", "3", edge_attr='2', create_using=nx.MultiGraph()
    )
    # One candidate per group, in the order of groups' values.
    combinations = list(itertools.product(*groups.values()))

    announce("Filtering Entities")
    return final_entities(combinations, G, groups, tdfs, spokes)


In [24]:
# Fetch the property/value bindings of Q18037663 (find_properties is defined
# outside this excerpt).
Ans = find_properties("Q18037663")

In [27]:
# Print each binding as "<property> : <value>".
for row in range(len(Ans)):
    prop_text = Ans['prop'][row]['value']
    value_text = Ans['c'][row]['value']
    print(prop_text, ":", value_text)

http://www.w3.org/2004/02/skos/core#prefLabel : PANX1
http://www.schema.org/description : protein-coding gene in the species Homo sapiens
https://www.wikidata.org/wiki/Property:P279 : https://www.wikidata.org/wiki/Q20747295
https://www.wikidata.org/wiki/Property:P31 : https://www.wikidata.org/wiki/Q7187
http://www.w3.org/2004/02/skos/core#altLabel : MRS1
http://www.w3.org/2004/02/skos/core#altLabel : PX1
http://www.w3.org/2004/02/skos/core#altLabel : UNQ2529
http://www.w3.org/2004/02/skos/core#altLabel : pannexin 1
http://www.w3.org/2004/02/skos/core#altLabel : OOMD7
http://www.w3.org/2004/02/skos/core#altLabel : Pannexin1
https://www.wikidata.org/wiki/Property:P703 : https://www.wikidata.org/wiki/Q15978631
https://www.wikidata.org/wiki/Property:P593 : 49416
https://www.wikidata.org/wiki/Property:P684 : https://www.wikidata.org/wiki/Q29744389
https://www.wikidata.org/wiki/Property:P684 : https://www.wikidata.org/wiki/Q24406886
https://www.wikidata.org/wiki/Property:P684 : https://www.w