### Maximal Marginal Relevance (MMR)
Select k samples that are as different from each other as possible (based on cosine distance of embeddings).

### Version with a threshold - only add values that are dissimilar enough

In [5]:
from rdflib import Graph, Literal
from collections import defaultdict
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans

# Load RDF graph
graph = Graph()
graph.parse("src_data_graph_new.nt", format="nt")

# Extract literal values per datatype property
property_literals = defaultdict(set)
for s, p, o in graph:
    if isinstance(o, Literal):
        property_literals[str(p)].add(str(o))

# MMR (Maximal Marginal Relevance) with threshold
def mmr_thresholded(embeddings, texts, k=5, diversity=0.7, similarity_threshold=0.85):
    """
    Perform Maximal Marginal Relevance (MMR) sampling with a similarity threshold.

    This function selects a subset of `k` text values from a list using the MMR algorithm.
    MMR balances two criteria:
      - relevance (how representative a value is),
      - and diversity (how different a value is from those already selected).

    A candidate will only be added to the result if it is sufficiently dissimilar to the
    already selected items — determined by a maximum pairwise cosine similarity threshold.

    Parameters:
    -----------
    embeddings : np.ndarray
        A 2D numpy array of shape (n_samples, n_dimensions) containing the vector
        representations (e.g., from Sentence-BERT) of the input texts.

    texts : List[str]
        The original list of text strings corresponding to the embeddings. Must be the same
        length as `embeddings`.

    k : int, optional (default=5)
        The maximum number of values to select.

    diversity : float, optional (default=0.7)
        A weight between 0 and 1. Higher values increase the importance of dissimilarity;
        lower values favor relevance (centrality). Typically between 0.5 and 0.9.

    similarity_threshold : float, optional (default=0.85)
        The maximum allowed cosine similarity (range: 0 to 1) between a candidate and any
        already selected item. If the most similar match exceeds this threshold, the candidate
        is rejected. Lower values lead to higher diversity in the final selection.

    Returns:
    --------
    List[str]
        A list of selected texts, up to `k` in length, satisfying both MMR and the similarity
        constraint. May return fewer than `k` items if diversity constraints prevent more being added.
    """
    if len(texts) <= k:
        return texts

    selected = [0]
    candidates = list(range(1, len(embeddings)))

    while len(selected) < k and candidates:
        mmr_scores = []
        for i in candidates:
            sim_to_selected = max(cosine_similarity([embeddings[i]], [embeddings[j] for j in selected])[0])
            relevance = np.mean(embeddings[i])
            mmr_score = diversity * relevance - (1 - diversity) * sim_to_selected
            mmr_scores.append((i, mmr_score, sim_to_selected))

        mmr_scores.sort(key=lambda x: x[1], reverse=True)

        added = False
        for idx, score, max_sim in mmr_scores:
            if max_sim < similarity_threshold:
                selected.append(idx)
                candidates.remove(idx)
                added = True
                break

        if not added:
            break

    return [texts[i] for i in selected]

# Sentence-BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Sampled results
final_samples = {}
target_k = 5

# Loop over each property
for prop, values in property_literals.items():
    values = list(values)
    if len(values) == 0:
        continue

    prepped_values = [f"val: {v}" for v in values]
    embeddings = np.array(model.encode(prepped_values))

    # K-means clustering (optional, for initial variety)
    n_clusters = min(target_k * 2, len(values))  # more clusters to increase diversity
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(embeddings)
    cluster_labels = kmeans.labels_

    # One value per cluster
    seen_clusters = set()
    clustered_vals = []
    clustered_embeds = []

    for i, label in enumerate(cluster_labels):
        if label not in seen_clusters:
            seen_clusters.add(label)
            clustered_vals.append(values[i])  # un-prefixed value
            clustered_embeds.append(embeddings[i])

    # Apply MMR with threshold
    mmr_selected = mmr_thresholded(
        embeddings=np.array(clustered_embeds),
        texts=clustered_vals,
        k=target_k,
        diversity=0.7,
        similarity_threshold=0.85
    )

    final_samples[prop] = mmr_selected

# Show output
for prop, sample in final_samples.items():
    print(f"{prop}: {sample}")

http://example.org/src#samplingtime: ['12:49:00']
http://example.org/src#patient_cpr: ['te64687489', 'c0cef4fadfd', 'dc44b505e4e', 'afedd9d7f0', 'cdse4751d0']
http://example.org/src#analysiscode: ['DNK35312', 'NPU02070']
http://example.org/src#laboratorium_idcode: ['UKN', 'OUI', 'ESB', 'HDI', 'KPL']
http://example.org/src#referenceinterval_lowerlimit: ['50.0', '137.0', '27.0']
http://example.org/src#referenceinterval_upperlimit: ['30.0', '7.5', '105.0']
http://example.org/src#unit: ['U/L', 'mL/min', '10^6/l', 'mg/g', 'mol/l']
http://example.org/src#rekvirent_idtype: ['sorkode', 'sygehusafdelingsnummer', 'yaugethusgbdnummer', 'ydernummer']
http://example.org/src#samplingdate: ['2010-12-07', '2017-04-16', '2023-10-27']
http://example.org/src#resulttype: ['alfanumerisk', 'numerisk']
http://example.org/src#value: ['00', 'A RhD pos', '>175', 'NEG', '137']
http://example.org/src#operator: ['stoerre_end', 'mindre_end']
http://example.org/src#resultvalidation: ['for_hoej']
http://example.org/s