In [1]:
import pandas as pd
import networkx as nx
from SPARQLWrapper import SPARQLWrapper
import warnings

warnings.filterwarnings('ignore')

### Constants

In [17]:
NODE = "wd:Q171076"
NODE_URL = "http://www.wikidata.org/entity/Q171076"

# Building the graph

In [27]:
def build_graph(node: str):
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setQuery(f"""
                    CONSTRUCT {{ 
                    {node} ?p ?object.
                    ?subject3 ?p4 ?object.
                    ?object ?p1 ?subject1.
                    ?subject ?p2 {node}.
                    ?subject ?p5 ?subject4.
                    ?subject2 ?p3 ?subject. 
                    {node} wdtn:P227 ?object2.
                    ?subject5 wdtn:P227 ?object2. 
                    }} 
                    WHERE {{
                        {{ 
                        {node} ?p ?object.
                        ?subject3 ?p4 ?object.
                        ?object ?p1 ?subject1.
                        OPTIONAL {{
                            {node} wdtn:P227 ?object2.
                            ?subject5 wdtn:P227 ?object2.}}
                        FILTER(isURI(?subject1))
                        FILTER(isURI(?object2))
                        }}
                    UNION
                        {{ 
                        ?subject ?p2 {node}.
                        ?subject2 ?p3 ?subject.
                        ?subject ?p5 ?subject4.
                        FILTER(isURI(?subject4))
                        }}
                    }}
                """)
    query_result = sparql.queryAndConvert() # Produces a RDFlib Graph object

    # Produces a dataframe based on the RDFlib graph
    query_df = pd.DataFrame(query_result, columns=["Subject", "Property", "Value"])
    # Cleaning unwanted wrappings around our objects
    query_df["Subject"] = query_df["Subject"].str.replace("rdflib.term.URIRef", "")
    query_df["Value"] = query_df["Value"].str.replace("rdflib.term.URIRef", "")
    query_df["Property"] = query_df["Property"].str.replace("rdflib.term.URIRef", "")
    discard_properties = ["P921", "P2860"]  # Properties to discard
    discard_sub_obj = ["/statement/"]  #  Subjects and Objects to discard
    query_df = query_df[~query_df.Property.str.contains('|'.join(discard_properties))]
    query_df = query_df[~query_df.Subject.str.contains('|'.join(discard_sub_obj))]
    query_df = query_df[~query_df.Value.str.contains('|'.join(discard_sub_obj))]

    # Convert dataframe to a networkx graph
    G = nx.from_pandas_edgelist(query_df, "Subject", "Value", edge_attr="Property")
    return G

In [28]:
q171076_graph = build_graph(NODE)
q171076_graph.nodes()

NodeView(('http://www.wikidata.org/entity/Q36091973', 'http://www.wikidata.org/entity/Q13442814', 'http://www.wikidata.org/entity/Q38919265', 'https://dblp.org/rec/journals/see/VochozkaMS18.rdf', 'http://www.wikidata.org/entity/Q36072972', 'http://dx.doi.org/10.1099/IJSEM.0.001210', 'http://www.wikidata.org/entity/Q42379261', 'http://www.wikidata.org/entity/Q15754394', 'http://www.wikidata.org/entity/Q39020160', 'http://www.wikidata.org/entity/Q7318358', 'http://www.wikidata.org/entity/Q35987442', 'http://www.wikidata.org/entity/Q564954', 'http://www.wikidata.org/entity/Q39097675', 'http://www.wikidata.org/entity/Q56912439', 'http://www.wikidata.org/entity/Q42816956', 'http://www.wikidata.org/entity/Q1860', 'http://www.wikidata.org/entity/Q67224781', 'http://www.wikidata.org/entity/Q41974008', 'http://www.wikidata.org/entity/Q39651404', 'http://www.wikidata.org/entity/Q38394488', 'http://www.wikidata.org/entity/Q2120036', 'http://www.wikidata.org/entity/Q36264971', 'http://www.wikidata

# Identifier heuristic

In [20]:
def identifier(graph: object, node: str):
    common_id = {}

    for item in graph.neighbors(node):
        property = graph[node][item]["Property"]
        if property == "http://www.wikidata.org/prop/direct-normalized/P227": 
            for item_neighbor in graph.neighbors(item):
                if item_neighbor == node:
                    continue
                else:
                    property2 = graph[item][item_neighbor]["Property"]
                    if property2 == property:
                        common_id[property] = item_neighbor
    return common_id

In [21]:
identifier(q171076_graph, NODE_URL)

NetworkXError: The node http://www.wikidata.org/entity/Q171076 is not in the graph.

# Neighborhood heuristic

In [None]:
def common_neighbors(graph: object, node: str):
    node_dict = {}

    for item in graph.neighbors(node):
        for neighbor in graph.neighbors(item):
            if neighbor == node or neighbor == 'http://wikiba.se/ontology#BestRank' or neighbor == 'http://wikiba.se/ontology#NormalRank':
                continue
            common_neighbors = sum(n in graph.neighbors(neighbor) for n in graph.neighbors(node))
            node_dict[neighbor] = common_neighbors
    return sorted(node_dict.items(), key=lambda item: item[1], reverse=True)        

In [None]:
common_neighbors(q171076_graph, NODE_URL)    