<a href="https://colab.research.google.com/github/koad7/uol_final/blob/main/f_work_Graph_normalisation_optimisation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
def _normalise_label(value):
    """Return *value* lower-cased and stripped, or ``None`` when it is null."""
    return None if pd.isnull(value) else value.lower().strip()


def create_graph_from_dataframe_normalisation(df):
    """Build a directed place-hierarchy graph from the rows of *df*.

    Each row contributes up to five nodes, keyed by the lower-cased and
    stripped values of its ``name``, ``city``, ``country``, ``sub-region``
    and ``region`` columns, chained by ``is_in`` edges::

        name -> city -> country -> sub-region -> region

    Null or empty labels are skipped, together with the edges that would
    touch them. A row whose ``name`` is set and whose ``osm_id`` was
    already recorded by an earlier named row is skipped entirely.

    Args:
        df: DataFrame providing the columns ``name``, ``city``, ``country``,
            ``sub-region``, ``region``, ``osm_id``, ``alternative_names``,
            ``state`` and ``capital``. ``osm_id`` values must be hashable.

    Returns:
        The populated ``nx.DiGraph``.
    """
    KG = nx.DiGraph()
    # A set gives O(1) duplicate checks and, unlike a name -> osm_id mapping,
    # never forgets an id when two rows share the same normalised name.
    seen_osm_ids = set()

    for _, row in df.iterrows():
        # Standardise node labels so that case/whitespace variants collapse.
        name = _normalise_label(row['name'])
        city = _normalise_label(row['city'])
        country = _normalise_label(row['country'])
        sub_region = _normalise_label(row['sub-region'])
        region = _normalise_label(row['region'])

        # Node deduplication based on osm_id.
        if name and row['osm_id'] in seen_osm_ids:
            continue

        # Add nodes only for labels that are present. Re-adding an existing
        # node updates its attributes with the values of the current row.
        if name:
            KG.add_node(name, alternative_names=row['alternative_names'], region=row['region'], osm_id=row['osm_id'], labels=name)
            seen_osm_ids.add(row['osm_id'])
        if city:
            KG.add_node(city, region=row['region'], state=row['state'], labels=city)
        if country:
            KG.add_node(country, capital=row['capital'], region=row['region'], related_places=get_related_places(country), labels=country)
        if sub_region:
            KG.add_node(sub_region, region=row['region'], labels=sub_region)
        if region:
            KG.add_node(region, labels=region)

        # Both endpoints of every edge below were added just above, so a
        # truthy label is enough to know the node exists.
        if name and city:
            KG.add_edge(name, city, relation="is_in", labels=f'{name} IS_IN {city}')
        # NOTE(review): `city` is lower-cased/stripped while row['capital'] is
        # the raw cell, so this only filters capitals whose raw spelling is
        # already normalised - confirm whether capitals should be excluded.
        if city and country and city != row['capital']:
            KG.add_edge(city, country, relation="is_in", labels=f'{city} IS_IN {country}')
        if country and sub_region:
            KG.add_edge(country, sub_region, relation="is_in", labels=f'{country} IS_IN {sub_region}')
        if sub_region and region:
            KG.add_edge(sub_region, region, relation="is_in", labels=f'{sub_region} IS_IN {region}')

    return KG


In [None]:
def get_embedding(node, tokenizer, model):
    """Compute a mean-pooled embedding for a single node label.

    Args:
        node: Value handed to ``tokenizer`` (the graph node label).
        tokenizer: Callable invoked as
            ``tokenizer(node, return_tensors="pt", padding=True, truncation=True)``
            whose result is unpacked as keyword arguments for ``model``.
        model: Callable whose result exposes a ``last_hidden_state`` tensor.

    Returns:
        A ``(node, embedding)`` tuple, where ``embedding`` is the NumPy array
        obtained by averaging ``last_hidden_state`` over dimension 1
        (presumably the token axis) and squeezing out size-1 dimensions.
    """
    encoded = tokenizer(node, return_tensors="pt", padding=True, truncation=True)

    # Inference only: the forward pass runs without gradient tracking.
    with torch.no_grad():
        model_output = model(**encoded)

    pooled = model_output.last_hidden_state.mean(dim=1)
    embedding = pooled.squeeze().numpy()
    return node, embedding


def distillmBERT_KGE(KG):
    """Embed every node of *KG* with multilingual DistilBERT and merge look-alikes.

    Embeddings are computed through ``get_embedding`` on a thread pool. A
    node whose embedding fails is reported on stdout and left out of both
    the result and the merge pass. Afterwards every pair of embedded nodes
    whose cosine similarity exceeds 0.9 is merged with ``merge_nodes``: the
    earlier node (in graph iteration order) survives and the later one is
    removed from *KG*.

    Args:
        KG: Graph whose nodes are the labels to embed. It is modified in
            place by the merge pass.

    Returns:
        Dict mapping each embedded node still present in *KG* to its
        embedding (NumPy array).
    """
    # Initialize DistilBERT tokenizer and model
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')
    model = DistilBertModel.from_pretrained('distilbert-base-multilingual-cased')

    # Initialize an empty dictionary to store embeddings
    distilmbert_embeddings = {}

    # Use ThreadPoolExecutor for parallel processing
    with ThreadPoolExecutor() as executor:
        future_to_node = {executor.submit(get_embedding, node, tokenizer, model): node for node in KG.nodes}
        for future in as_completed(future_to_node):
            node = future_to_node[future]
            try:
                node, node_embedding = future.result()
                distilmbert_embeddings[node] = np.array(node_embedding)  # Ensure it's a NumPy array
            except Exception as exc:
                # Best effort: report the failure and carry on with the rest.
                print(f"{node} generated an exception: {exc}")

    # Embedding similarity for node merging.
    similarity_threshold = 0.9
    # Snapshot the nodes (the graph is mutated below) and keep only those
    # that actually received an embedding, so lookups cannot raise KeyError.
    candidates = [node for node in KG.nodes if node in distilmbert_embeddings]
    merged_away = set()
    for i, node1 in enumerate(candidates):
        if node1 in merged_away:
            # Already folded into an earlier node; it no longer exists in KG.
            continue
        for node2 in candidates[i + 1:]:
            if node2 in merged_away:
                continue
            # cosine_similarity returns a 1x1 matrix; take the scalar.
            similarity = cosine_similarity(
                [distilmbert_embeddings[node1]], [distilmbert_embeddings[node2]]
            )[0][0]
            if similarity > similarity_threshold:
                merge_nodes(KG, node1, node2, distilmbert_embeddings)
                merged_away.add(node2)
                # Keep the result aligned with the nodes left in the graph.
                distilmbert_embeddings.pop(node2, None)

    return distilmbert_embeddings

In [None]:
def merge_nodes(graph, primary, secondary, embeddings):
    """Fold *secondary* into *primary* and remove *secondary* from *graph*.

    Edges of *secondary* are re-attached to *primary* (existing edges of
    *primary* win; edges between the two nodes are dropped, so no self-loop
    is created). On a directed graph both outgoing and incoming edges are
    carried over. Attributes missing on *primary* are copied from
    *secondary*; list and set attributes present on both are combined;
    other attributes present on both keep the value of *primary*.

    Args:
        graph: networkx graph, modified in place.
        primary: Node that survives the merge.
        secondary: Node that is merged away and removed.
        embeddings: Dict of node -> embedding; the entry for *primary* is
            replaced by the average of both embeddings. The entry for
            *secondary* is left untouched.

    Returns:
        None.
    """
    if primary == secondary:
        # Merging a node into itself would otherwise delete it.
        return

    # Transfer edges leaving secondary (all edges on an undirected graph).
    for neighbor in list(graph.neighbors(secondary)):
        if neighbor != primary and not graph.has_edge(primary, neighbor):
            graph.add_edge(primary, neighbor, **graph[secondary][neighbor])

    # On directed graphs neighbors() yields successors only, so the edges
    # pointing *at* secondary have to be carried over separately.
    if graph.is_directed():
        for predecessor in list(graph.predecessors(secondary)):
            if predecessor != primary and not graph.has_edge(predecessor, primary):
                graph.add_edge(predecessor, primary, **graph[predecessor][secondary])

    # Update attributes of the primary node.
    primary_attrs = graph.nodes[primary]
    for key, value in graph.nodes[secondary].items():
        if key not in primary_attrs:
            primary_attrs[key] = value
            continue
        current = primary_attrs[key]
        # A scalar (e.g. a string or NaN) is added as one element instead of
        # being iterated, which would raise or split a string into characters.
        value_is_collection = isinstance(value, (list, tuple, set, frozenset))
        if isinstance(current, list):
            if value_is_collection:
                current.extend(value)
            else:
                current.append(value)
        elif isinstance(current, set):
            if value_is_collection:
                current.update(value)
            else:
                current.add(value)
        # Any other type: the value already on primary is kept.

    # Update the embedding of the primary node (average of the two).
    embeddings[primary] = (embeddings[primary] + embeddings[secondary]) / 2

    # Remove the secondary node together with its remaining edges.
    graph.remove_node(secondary)




def distillmBERT_KGE(KG):
    """Compute a DistilBERT embedding for every node of *KG*.

    The multilingual cased DistilBERT tokenizer and model are loaded once
    and ``get_embedding`` is run for each node on a thread pool. A node
    whose task raises is reported on stdout and omitted from the result.

    Args:
        KG: Graph whose ``nodes`` are the labels to embed.

    Returns:
        Dict mapping each successfully embedded node to a NumPy array.
    """
    checkpoint = 'distilbert-base-multilingual-cased'
    bert_tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
    bert_model = DistilBertModel.from_pretrained(checkpoint)

    embeddings_by_node = {}

    with ThreadPoolExecutor() as pool:
        # Remember which node each task belongs to so that a failure can
        # still be attributed to its node.
        pending = {}
        for graph_node in KG.nodes:
            task = pool.submit(get_embedding, graph_node, bert_tokenizer, bert_model)
            pending[task] = graph_node

        # Collect results in completion order.
        for task in as_completed(pending):
            node = pending[task]
            try:
                node, vector = task.result()
                embeddings_by_node[node] = np.array(vector)  # always store an ndarray
            except Exception as exc:
                print(f"{node} generated an exception: {exc}")

    return embeddings_by_node