In [34]:
import xml.etree.ElementTree as ET

# Function to print full path with indented visual tree format
def print_schema_tree(node, path="", indent=""):
    current_path = f"{path}/{node.tag}" if path else node.tag
    print(f"{indent}{current_path} [-]")
    for child in node:
        print_schema_tree(child, current_path, indent + "  ")

# Load and parse the XML
tree = ET.parse(r"C:\Users\HP\Desktop\sDI project\movie.xml")
root = tree.getroot()

# Print Schema Tree with full paths
print("Schema Tree 1 (XDS)")
print_schema_tree(root)

Schema Tree 1 (XDS)
movie [-]
  movie/title [-]
  movie/director [-]
    movie/director/firstName [-]
    movie/director/lastName [-]
  movie/genre [-]
  movie/releaseYear [-]
  movie/actors [-]
    movie/actors/actor [-]
      movie/actors/actor/actorName [-]
      movie/actors/actor/characterName [-]
  movie/ratings [-]
    movie/ratings/rating [-]
      movie/ratings/rating/critic [-]
      movie/ratings/rating/score [-]
  movie/book [-]
    movie/book/title [-]
    movie/book/DOI [-]
  movie/ShootLocation [-]
    movie/ShootLocation/address [-]
    movie/ShootLocation/city [-]
    movie/ShootLocation/country [-]
    movie/ShootLocation/latitude [-]
    movie/ShootLocation/longitude [-]


In [35]:
import xml.etree.ElementTree as ET

# Function to print full path with indented visual tree format
def print_schema_tree(node, path="", indent=""):
    current_path = f"{path}/{node.tag}" if path else node.tag
    print(f"{indent}{current_path} [-]")
    for child in node:
        print_schema_tree(child, current_path, indent + "  ")

# Load and parse the XML (Mediated Schema)
tree = ET.parse(r"C:\Users\HP\Desktop\sDI project\mediated.xml")
root = tree.getroot()

# Print Schema Tree with full paths
print("Schema Tree 2 (XMS)")
print_schema_tree(root)

Schema Tree 2 (XMS)
mediatedSchema [-]
  mediatedSchema/movie [-]
    mediatedSchema/movie/title [-]
    mediatedSchema/movie/book [-]
      mediatedSchema/movie/book/title [-]
      mediatedSchema/movie/book/DOI [-]
      mediatedSchema/movie/book/publicationHouse [-]
        mediatedSchema/movie/book/publicationHouse/location [-]
          mediatedSchema/movie/book/publicationHouse/location/address [-]
          mediatedSchema/movie/book/publicationHouse/location/city [-]
          mediatedSchema/movie/book/publicationHouse/location/country [-]


### Levenshtein Edit Distance

In [36]:
!pip install python-Levenshtein



In [37]:
import pandas as pd
import numpy as np
import Levenshtein  # for edit distance
from IPython.display import display

paths = [
    "mediatedSchema",
    "mediatedSchema/movie",
    "mediatedSchema/movie/title",
    "mediatedSchema/movie/book",
    "mediatedSchema/movie/book/title",
    "mediatedSchema/movie/book/DOI",
    "mediatedSchema/movie/book/publicationHouse",
    "mediatedSchema/movie/book/publicationHouse/location",
    "mediatedSchema/movie/book/publicationHouse/location/address",
    "mediatedSchema/movie/book/publicationHouse/location/city",
    "mediatedSchema/movie/book/publicationHouse/location/country"
]


source_paths = [
    "movie",
    "movie/title",
    "movie/director",
    "movie/director/firstName",
    "movie/director/lastName",
    "movie/genre",
    "movie/releaseYear",
    "movie/actors",
    "movie/actors/actor",
    "movie/actors/actor/actorName",
    "movie/actors/actor/characterName",
    "movie/ratings",
    "movie/ratings/rating",
    "movie/ratings/rating/critic",
    "movie/ratings/rating/score",
    "movie/book",
    "movie/book/title",
    "movie/book/DOI",
    "movie/ShootLocation",
    "movie/ShootLocation/address",
    "movie/ShootLocation/city",
    "movie/ShootLocation/country",
    "movie/ShootLocation/latitude",
    "movie/ShootLocation/longitude"
]

nodes = [
    "movie", "title", "director", "firstName", "lastName", "genre", "releaseYear",
    "actors", "actor", "actorName", "characterName", "ratings", "rating",
    "critic", "score", "book", "DOI", "ShootLocation", "address", "city",
    "country", "latitude", "longitude"
]
mediated_nodes = [
    "mediatedSchema", "movie", "title", "book", "DOI", "publicationHouse", "location",
    "address", "city", "country"
]


In [38]:
def levenshtein_similarity_matrix(rows, cols):
    sim_matrix = pd.DataFrame(index=rows, columns=cols, dtype=float)

    for r in rows:
        for c in cols:
            dist = Levenshtein.distance(r, c)
            max_len = max(len(r), len(c))
            similarity = 1 - dist / max_len if max_len != 0 else 1
            sim_matrix.loc[r, c] = round(similarity, 6)
    return sim_matrix


In [39]:
# Compute similarity matrices
path_sim_matrix = levenshtein_similarity_matrix(paths, source_paths)
node_sim_matrix = levenshtein_similarity_matrix(mediated_nodes, nodes)

# Display as in your friend's output
print("\nPATH :")
print("Edit Distance Similarity Matrix")
display(path_sim_matrix)

print("\nNODE :")
print("Edit Distance Similarity Matrix")
display(node_sim_matrix)



PATH :
Edit Distance Similarity Matrix


Unnamed: 0,movie,movie/title,movie/director,movie/director/firstName,movie/director/lastName,movie/genre,movie/releaseYear,movie/actors,movie/actors/actor,movie/actors/actor/actorName,...,movie/ratings/rating/score,movie/book,movie/book/title,movie/book/DOI,movie/ShootLocation,movie/ShootLocation/address,movie/ShootLocation/city,movie/ShootLocation/country,movie/ShootLocation/latitude,movie/ShootLocation/longitude
mediatedSchema,0.214286,0.214286,0.142857,0.25,0.26087,0.214286,0.294118,0.285714,0.222222,0.214286,...,0.192308,0.214286,0.125,0.142857,0.157895,0.185185,0.166667,0.185185,0.142857,0.137931
mediatedSchema/movie,0.25,0.25,0.25,0.208333,0.26087,0.25,0.25,0.25,0.25,0.285714,...,0.230769,0.25,0.25,0.2,0.2,0.185185,0.208333,0.185185,0.214286,0.241379
mediatedSchema/movie/title,0.192308,0.423077,0.230769,0.269231,0.230769,0.269231,0.230769,0.230769,0.269231,0.214286,...,0.192308,0.230769,0.423077,0.230769,0.307692,0.185185,0.269231,0.185185,0.285714,0.241379
mediatedSchema/movie/book,0.2,0.2,0.24,0.16,0.16,0.2,0.24,0.2,0.24,0.178571,...,0.192308,0.4,0.2,0.24,0.28,0.148148,0.2,0.185185,0.142857,0.172414
mediatedSchema/movie/book/title,0.16129,0.354839,0.225806,0.258065,0.258065,0.225806,0.193548,0.225806,0.225806,0.225806,...,0.193548,0.322581,0.516129,0.354839,0.290323,0.225806,0.322581,0.225806,0.290323,0.258065
mediatedSchema/movie/book/DOI,0.172414,0.206897,0.206897,0.172414,0.206897,0.206897,0.206897,0.206897,0.206897,0.206897,...,0.172414,0.344828,0.310345,0.482759,0.241379,0.172414,0.241379,0.206897,0.172414,0.172414
mediatedSchema/movie/book/publicationHouse,0.119048,0.214286,0.214286,0.238095,0.238095,0.190476,0.190476,0.238095,0.214286,0.261905,...,0.214286,0.238095,0.333333,0.261905,0.309524,0.261905,0.309524,0.285714,0.333333,0.285714
mediatedSchema/movie/book/publicationHouse/location,0.098039,0.176471,0.215686,0.196078,0.215686,0.156863,0.215686,0.196078,0.27451,0.215686,...,0.235294,0.196078,0.27451,0.215686,0.294118,0.27451,0.313725,0.313725,0.313725,0.313725
mediatedSchema/movie/book/publicationHouse/location/address,0.084746,0.169492,0.20339,0.220339,0.237288,0.169492,0.20339,0.20339,0.254237,0.254237,...,0.305085,0.169492,0.254237,0.186441,0.254237,0.389831,0.271186,0.288136,0.338983,0.322034
mediatedSchema/movie/book/publicationHouse/location/city,0.089286,0.160714,0.196429,0.196429,0.214286,0.142857,0.196429,0.178571,0.25,0.232143,...,0.285714,0.178571,0.25,0.196429,0.267857,0.267857,0.357143,0.303571,0.321429,0.303571



NODE :
Edit Distance Similarity Matrix


Unnamed: 0,movie,title,director,firstName,lastName,genre,releaseYear,actors,actor,actorName,...,critic,score,book,DOI,ShootLocation,address,city,country,latitude,longitude
mediatedSchema,0.214286,0.214286,0.285714,0.142857,0.142857,0.142857,0.285714,0.142857,0.142857,0.142857,...,0.214286,0.071429,0.0,0.0,0.071429,0.214286,0.142857,0.071429,0.285714,0.214286
movie,1.0,0.2,0.0,0.111111,0.125,0.2,0.090909,0.0,0.0,0.222222,...,0.166667,0.2,0.2,0.0,0.153846,0.142857,0.0,0.142857,0.25,0.333333
title,0.2,1.0,0.25,0.333333,0.25,0.2,0.090909,0.166667,0.2,0.222222,...,0.333333,0.2,0.0,0.0,0.153846,0.142857,0.4,0.142857,0.5,0.333333
book,0.2,0.0,0.125,0.0,0.0,0.0,0.0,0.166667,0.2,0.111111,...,0.0,0.2,1.0,0.0,0.153846,0.0,0.0,0.142857,0.0,0.111111
DOI,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
publicationHouse,0.125,0.1875,0.1875,0.1875,0.1875,0.125,0.125,0.25,0.1875,0.25,...,0.1875,0.1875,0.1875,0.0,0.25,0.125,0.125,0.125,0.375,0.25
location,0.25,0.125,0.125,0.111111,0.125,0.0,0.090909,0.25,0.375,0.0,...,0.25,0.125,0.25,0.0,0.538462,0.0,0.25,0.25,0.25,0.333333
address,0.142857,0.142857,0.25,0.0,0.125,0.285714,0.181818,0.285714,0.142857,0.222222,...,0.0,0.285714,0.0,0.0,0.0,1.0,0.0,0.0,0.125,0.0
city,0.0,0.4,0.25,0.222222,0.125,0.0,0.0,0.166667,0.2,0.111111,...,0.5,0.2,0.0,0.0,0.153846,0.0,1.0,0.428571,0.25,0.222222
country,0.142857,0.142857,0.125,0.111111,0.0,0.285714,0.0,0.142857,0.142857,0.222222,...,0.285714,0.285714,0.142857,0.0,0.153846,0.0,0.428571,1.0,0.125,0.222222


### Jaccard Similiarity

In [40]:
def jaccard_similarity_matrix(rows, cols):
    sim_matrix = pd.DataFrame(index=rows, columns=cols, dtype=float)

    for r in rows:
        set_r = set(r.lower().split("/"))  # lowercase & tokenize
        for c in cols:
            set_c = set(c.lower().split("/"))
            intersection = set_r & set_c
            union = set_r | set_c
            similarity = len(intersection) / len(union) if union else 0
            sim_matrix.loc[r, c] = round(similarity, 6)
    return sim_matrix


In [41]:
# Compute Jaccard Similarity Matrices
jaccard_path_sim_matrix = jaccard_similarity_matrix(paths, source_paths)
jaccard_node_sim_matrix = jaccard_similarity_matrix(mediated_nodes, nodes)

# Display
print("\nPATH :")
print("Jaccard Similarity Matrix")
display(jaccard_path_sim_matrix)

print("\nNODE :")
print("Jaccard Similarity Matrix")
display(jaccard_node_sim_matrix)



PATH :
Jaccard Similarity Matrix


Unnamed: 0,movie,movie/title,movie/director,movie/director/firstName,movie/director/lastName,movie/genre,movie/releaseYear,movie/actors,movie/actors/actor,movie/actors/actor/actorName,...,movie/ratings/rating/score,movie/book,movie/book/title,movie/book/DOI,movie/ShootLocation,movie/ShootLocation/address,movie/ShootLocation/city,movie/ShootLocation/country,movie/ShootLocation/latitude,movie/ShootLocation/longitude
mediatedSchema,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mediatedSchema/movie,0.5,0.333333,0.333333,0.25,0.25,0.333333,0.333333,0.333333,0.25,0.2,...,0.2,0.333333,0.25,0.25,0.333333,0.25,0.25,0.25,0.25,0.25
mediatedSchema/movie/title,0.333333,0.666667,0.25,0.2,0.2,0.25,0.25,0.25,0.2,0.166667,...,0.166667,0.25,0.5,0.2,0.25,0.2,0.2,0.2,0.2,0.2
mediatedSchema/movie/book,0.333333,0.25,0.25,0.2,0.2,0.25,0.25,0.25,0.2,0.166667,...,0.166667,0.666667,0.5,0.5,0.25,0.2,0.2,0.2,0.2,0.2
mediatedSchema/movie/book/title,0.25,0.5,0.2,0.166667,0.166667,0.2,0.2,0.2,0.166667,0.142857,...,0.142857,0.5,0.75,0.4,0.2,0.166667,0.166667,0.166667,0.166667,0.166667
mediatedSchema/movie/book/DOI,0.25,0.2,0.2,0.166667,0.166667,0.2,0.2,0.2,0.166667,0.142857,...,0.142857,0.5,0.4,0.75,0.2,0.166667,0.166667,0.166667,0.166667,0.166667
mediatedSchema/movie/book/publicationHouse,0.25,0.2,0.2,0.166667,0.166667,0.2,0.2,0.2,0.166667,0.142857,...,0.142857,0.5,0.4,0.4,0.2,0.166667,0.166667,0.166667,0.166667,0.166667
mediatedSchema/movie/book/publicationHouse/location,0.2,0.166667,0.166667,0.142857,0.142857,0.166667,0.166667,0.166667,0.142857,0.125,...,0.125,0.4,0.333333,0.333333,0.166667,0.142857,0.142857,0.142857,0.142857,0.142857
mediatedSchema/movie/book/publicationHouse/location/address,0.166667,0.142857,0.142857,0.125,0.125,0.142857,0.142857,0.142857,0.125,0.111111,...,0.111111,0.333333,0.285714,0.285714,0.142857,0.285714,0.125,0.125,0.125,0.125
mediatedSchema/movie/book/publicationHouse/location/city,0.166667,0.142857,0.142857,0.125,0.125,0.142857,0.142857,0.142857,0.125,0.111111,...,0.111111,0.333333,0.285714,0.285714,0.142857,0.125,0.285714,0.125,0.125,0.125



NODE :
Jaccard Similarity Matrix


Unnamed: 0,movie,title,director,firstName,lastName,genre,releaseYear,actors,actor,actorName,...,critic,score,book,DOI,ShootLocation,address,city,country,latitude,longitude
mediatedSchema,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
movie,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
title,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
book,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DOI,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
publicationHouse,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
location,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
address,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
city,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
country,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [42]:
#TF-IDF Similarity

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [44]:
#Node Similarity
def tfidf_node_similarity(nodes1, nodes2):
    """
    Compute node similarity using TF-IDF vectorization and cosine similarity.
    Returns a transposed matrix: mediated nodes as rows, source nodes as columns.
    """
    all_labels = list(set(nodes1 + nodes2))
    vectorizer = TfidfVectorizer().fit(all_labels)

    vec1 = vectorizer.transform(nodes1)
    vec2 = vectorizer.transform(nodes2)

    sim_matrix = pd.DataFrame(
        cosine_similarity(vec1, vec2),
        index=nodes1,
        columns=nodes2
    )

    return sim_matrix.T.round(4)



In [45]:
tfidf_sim_matrix = tfidf_node_similarity(nodes, mediated_nodes)

print("TF-IDF Node Similarity Matrix:")
display(tfidf_node_similarity(nodes, mediated_nodes))


TF-IDF Node Similarity Matrix:


Unnamed: 0,movie,title,director,firstName,lastName,genre,releaseYear,actors,actor,actorName,...,critic,score,book,DOI,ShootLocation,address,city,country,latitude,longitude
mediatedSchema,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
movie,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
title,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
book,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DOI,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
publicationHouse,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
location,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
address,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
city,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
country,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [46]:
#Path Similarity
def tfidf_path_similarity(paths1, paths2):
    """
    Compute TF-IDF-based similarity for full XML paths using cosine similarity.
    Returns a transposed matrix: mediated paths as rows, source paths as columns.
    """
    vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split("/"))
    vectorizer.fit(paths1 + paths2)

    vec1 = vectorizer.transform(paths1)
    vec2 = vectorizer.transform(paths2)

    sim_matrix = pd.DataFrame(
        cosine_similarity(vec1, vec2),
        index=paths1,
        columns=paths2
    )

    return sim_matrix.T.round(4)



In [47]:
tfidf_path_sim_matrix = tfidf_path_similarity(source_paths, paths)

print("TF-IDF Path Similarity Matrix:")
display(tfidf_path_similarity(source_paths, paths))


TF-IDF Path Similarity Matrix:




Unnamed: 0,movie,movie/title,movie/director,movie/director/firstName,movie/director/lastName,movie/genre,movie/releaseYear,movie/actors,movie/actors/actor,movie/actors/actor/actorName,...,movie/ratings/rating/score,movie/book,movie/book/title,movie/book/DOI,movie/ShootLocation,movie/ShootLocation/address,movie/ShootLocation/city,movie/ShootLocation/country,movie/ShootLocation/latitude,movie/ShootLocation/longitude
mediatedSchema,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mediatedSchema/movie,0.44,0.1438,0.1347,0.088,0.088,0.1124,0.1124,0.1438,0.1008,0.0762,...,0.0762,0.1936,0.1196,0.1078,0.1598,0.1008,0.1008,0.1008,0.094,0.094
mediatedSchema/movie/title,0.2718,0.832,0.0832,0.0544,0.0544,0.0695,0.0695,0.0888,0.0623,0.0471,...,0.0471,0.1196,0.6922,0.0666,0.0987,0.0622,0.0622,0.0622,0.0581,0.0581
mediatedSchema/movie/book,0.3273,0.107,0.1002,0.0655,0.0655,0.0836,0.0836,0.107,0.075,0.0567,...,0.0567,0.744,0.4597,0.4144,0.1189,0.075,0.075,0.075,0.07,0.07
mediatedSchema/movie/book/title,0.2377,0.7275,0.0728,0.0476,0.0476,0.0607,0.0607,0.0777,0.0545,0.0412,...,0.0412,0.5403,0.8744,0.3009,0.0863,0.0544,0.0544,0.0544,0.0508,0.0508
mediatedSchema/movie/book/DOI,0.2192,0.0716,0.0671,0.0438,0.0438,0.056,0.056,0.0716,0.0502,0.0379,...,0.0379,0.4981,0.3078,0.8944,0.0796,0.0502,0.0502,0.0502,0.0468,0.0468
mediatedSchema/movie/book/publicationHouse,0.2447,0.0799,0.0749,0.0489,0.0489,0.0625,0.0625,0.0799,0.0561,0.0424,...,0.0424,0.5561,0.3436,0.3097,0.0889,0.056,0.056,0.056,0.0523,0.0523
mediatedSchema/movie/book/publicationHouse/location,0.1997,0.0653,0.0611,0.04,0.04,0.051,0.051,0.0653,0.0458,0.0346,...,0.0346,0.4539,0.2805,0.2528,0.0725,0.0457,0.0457,0.0457,0.0427,0.0427
mediatedSchema/movie/book/publicationHouse/location/address,0.1654,0.054,0.0506,0.0331,0.0331,0.0423,0.0423,0.054,0.0379,0.0286,...,0.0286,0.3759,0.2323,0.2094,0.0601,0.473,0.0379,0.0379,0.0353,0.0353
mediatedSchema/movie/book/publicationHouse/location/city,0.1654,0.054,0.0506,0.0331,0.0331,0.0423,0.0423,0.054,0.0379,0.0286,...,0.0286,0.3759,0.2323,0.2094,0.0601,0.0379,0.473,0.0379,0.0353,0.0353


In [48]:
#FuzzyWuzzy Similarity

In [49]:
!pip install rapidfuzz



In [50]:
#Fuzzy Wuzzy Node Similarity
from rapidfuzz.fuzz import token_sort_ratio

def fuzzy_node_similarity(source_nodes, mediated_nodes):
    """
    Compute fuzzy similarity between node labels using token sort ratio.
    Returns a matrix with mediated nodes as rows, source nodes as columns.
    """
    sim_matrix = pd.DataFrame(index=mediated_nodes, columns=source_nodes, dtype=float)

    for m in mediated_nodes:
        for s in source_nodes:
            score = token_sort_ratio(m, s) / 100  # normalize to [0,1]
            sim_matrix.loc[m, s] = round(score, 4)

    return sim_matrix


In [51]:
print("Fuzzy-Wuzzy Node Similarity Matrix")
display(fuzzy_node_similarity(nodes, mediated_nodes))


Fuzzy-Wuzzy Node Similarity Matrix


Unnamed: 0,movie,title,director,firstName,lastName,genre,releaseYear,actors,actor,actorName,...,critic,score,book,DOI,ShootLocation,address,city,country,latitude,longitude
mediatedSchema,0.3158,0.3158,0.3636,0.2609,0.2727,0.2105,0.4,0.2,0.2105,0.2609,...,0.3,0.2105,0.0,0.0,0.2222,0.2857,0.2222,0.0952,0.3636,0.3478
movie,1.0,0.4,0.3077,0.2857,0.3077,0.2,0.125,0.1818,0.2,0.2857,...,0.1818,0.4,0.2222,0.0,0.2222,0.1667,0.2222,0.1667,0.3077,0.4286
title,0.4,1.0,0.3077,0.4286,0.3077,0.2,0.25,0.1818,0.2,0.2857,...,0.3636,0.2,0.0,0.0,0.2222,0.1667,0.4444,0.1667,0.6154,0.4286
book,0.2222,0.0,0.1667,0.0,0.0,0.0,0.0,0.2,0.2222,0.1538,...,0.0,0.2222,1.0,0.0,0.2353,0.0,0.0,0.1818,0.0,0.1538
DOI,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
publicationHouse,0.1905,0.2857,0.3333,0.24,0.3333,0.1905,0.2963,0.3636,0.2857,0.32,...,0.2727,0.2857,0.3,0.0,0.4138,0.1739,0.2,0.2609,0.5,0.4
location,0.3077,0.3077,0.375,0.1176,0.375,0.1538,0.2105,0.4286,0.4615,0.3529,...,0.4286,0.3077,0.3333,0.0,0.6667,0.1333,0.3333,0.4,0.5,0.3529
address,0.1667,0.1667,0.4,0.25,0.2667,0.3333,0.3333,0.4615,0.3333,0.375,...,0.1538,0.3333,0.0,0.0,0.1,1.0,0.0,0.1429,0.4,0.25
city,0.2222,0.4444,0.3333,0.3077,0.1667,0.0,0.0,0.4,0.4444,0.3077,...,0.6,0.2222,0.0,0.0,0.2353,0.0,1.0,0.5455,0.3333,0.3077
country,0.1667,0.1667,0.4,0.125,0.1333,0.3333,0.1111,0.4615,0.5,0.375,...,0.3077,0.5,0.1818,0.0,0.3,0.1429,0.5455,1.0,0.1333,0.375


In [52]:
#Fuzzy Wuzzy Path Similarity
def fuzzy_path_similarity(source_paths, mediated_paths):
    """
    Compute fuzzy similarity between full XML paths using token sort ratio.
    Returns a matrix with mediated paths as rows, source paths as columns.
    """
    sim_matrix = pd.DataFrame(index=mediated_paths, columns=source_paths, dtype=float)

    for m in mediated_paths:
        for s in source_paths:
            score = token_sort_ratio(m, s) / 100
            sim_matrix.loc[m, s] = round(score, 4)

    return sim_matrix


In [53]:
print("Fuzzy-Wuzzy Node Similarity Matrix")
display(fuzzy_path_similarity(source_paths, paths))

Fuzzy-Wuzzy Node Similarity Matrix


Unnamed: 0,movie,movie/title,movie/director,movie/director/firstName,movie/director/lastName,movie/genre,movie/releaseYear,movie/actors,movie/actors/actor,movie/actors/actor/actorName,...,movie/ratings/rating/score,movie/book,movie/book/title,movie/book/DOI,movie/ShootLocation,movie/ShootLocation/address,movie/ShootLocation/city,movie/ShootLocation/country,movie/ShootLocation/latitude,movie/ShootLocation/longitude
mediatedSchema,0.3158,0.4,0.4286,0.3684,0.3784,0.32,0.3871,0.3077,0.3125,0.2857,...,0.35,0.25,0.3333,0.2143,0.3636,0.2927,0.3158,0.2927,0.3333,0.2791
mediatedSchema/movie,0.4,0.3871,0.4118,0.4091,0.4186,0.3226,0.3243,0.3125,0.3158,0.3333,...,0.3478,0.3333,0.3889,0.2941,0.359,0.3404,0.3636,0.3404,0.375,0.4082
mediatedSchema/movie/title,0.3226,0.5946,0.4,0.44,0.4082,0.3784,0.3721,0.3684,0.3636,0.3333,...,0.3846,0.3333,0.5238,0.3,0.3556,0.3396,0.4,0.3396,0.4444,0.4
mediatedSchema/movie/book,0.3333,0.3333,0.359,0.3673,0.375,0.3333,0.2857,0.3784,0.3721,0.3019,...,0.3137,0.5714,0.4878,0.5128,0.3636,0.3077,0.3265,0.3462,0.3396,0.3704
mediatedSchema/movie/book/title,0.2778,0.5238,0.3556,0.4,0.3704,0.3333,0.3333,0.3256,0.3673,0.3729,...,0.3509,0.4878,0.6809,0.4889,0.4,0.3793,0.4,0.3793,0.4407,0.4
mediatedSchema/movie/book/DOI,0.2941,0.3,0.3256,0.3396,0.3462,0.3,0.2609,0.3415,0.3404,0.3158,...,0.2909,0.5128,0.4889,0.6512,0.3333,0.3214,0.3396,0.3214,0.3158,0.3448
mediatedSchema/movie/book/publicationHouse,0.2128,0.3396,0.3571,0.3636,0.3692,0.3019,0.339,0.3704,0.3667,0.3714,...,0.3824,0.3846,0.4828,0.3929,0.459,0.4348,0.4242,0.4638,0.4571,0.4789
mediatedSchema/movie/book/publicationHouse/location,0.1786,0.2903,0.3385,0.32,0.3784,0.2581,0.3235,0.3175,0.4058,0.3797,...,0.4156,0.3279,0.4179,0.3385,0.4571,0.4103,0.4533,0.4615,0.481,0.45
mediatedSchema/movie/book/publicationHouse/location/address,0.1562,0.2857,0.3288,0.3373,0.3902,0.2857,0.3158,0.338,0.3896,0.4138,...,0.4471,0.2899,0.4,0.3014,0.4103,0.5581,0.4096,0.4419,0.4828,0.4545
mediatedSchema/movie/book/publicationHouse/location/city,0.1639,0.2687,0.3143,0.35,0.3544,0.2388,0.3014,0.2941,0.3784,0.4048,...,0.439,0.303,0.3889,0.3143,0.4267,0.4096,0.525,0.4819,0.4762,0.4706


In [54]:
# Similarity matrices
similarity_matrices = {
    "Levenshtein Node": node_sim_matrix,
    "Levenshtein Path": path_sim_matrix,
    "Jaccard Node": jaccard_node_sim_matrix,
    "Jaccard Path": jaccard_path_sim_matrix,
    "TF-IDF Node": tfidf_node_similarity(nodes, mediated_nodes),
    "TF-IDF Path": tfidf_path_similarity(source_paths, paths),
    "Fuzzy Node": fuzzy_node_similarity(nodes, mediated_nodes),
    "Fuzzy Path": fuzzy_path_similarity(source_paths, paths),
}




In [55]:
display(similarity_matrices)

{'Levenshtein Node':                      movie     title  director  firstName  lastName     genre  \
 mediatedSchema    0.214286  0.214286  0.285714   0.142857  0.142857  0.142857   
 movie             1.000000  0.200000  0.000000   0.111111  0.125000  0.200000   
 title             0.200000  1.000000  0.250000   0.333333  0.250000  0.200000   
 book              0.200000  0.000000  0.125000   0.000000  0.000000  0.000000   
 DOI               0.000000  0.000000  0.000000   0.000000  0.000000  0.000000   
 publicationHouse  0.125000  0.187500  0.187500   0.187500  0.187500  0.125000   
 location          0.250000  0.125000  0.125000   0.111111  0.125000  0.000000   
 address           0.142857  0.142857  0.250000   0.000000  0.125000  0.285714   
 city              0.000000  0.400000  0.250000   0.222222  0.125000  0.000000   
 country           0.142857  0.142857  0.125000   0.111111  0.000000  0.285714   
 
                   releaseYear    actors     actor  actorName  ...    critic

In [56]:
#Set Thresholds
NODE_THRESHOLD = 0.6
PATH_THRESHOLD = 0.75

In [57]:
def extract_alignments(sim_matrix, threshold=0.8):
    """
    Extract (source, mediated, similarity) triples from a similarity matrix
    where similarity >= threshold.
    """
    matches = []
    for row in sim_matrix.index:        # mediated item
        for col in sim_matrix.columns:  # source item
            score = sim_matrix.loc[row, col]
            if score >= threshold:
                matches.append((col, row, round(score, 4)))  # (source, mediated, score)
    return matches


In [58]:
# Step 4: Extract and display as schema alignment tables
all_alignments = {}

for name, sim_matrix in similarity_matrices.items():
    threshold = NODE_THRESHOLD if "Node" in name else PATH_THRESHOLD
    alignments = extract_alignments(sim_matrix, threshold=threshold)
    all_alignments[name] = alignments

    print(f"\n📌 {name} Alignments (Threshold = {threshold}):")
    
    df = pd.DataFrame(alignments, columns=["Source Attribute", "Mediated Attribute", "Similarity Score"])
    display(df)



📌 Levenshtein Node Alignments (Threshold = 0.6):


Unnamed: 0,Source Attribute,Mediated Attribute,Similarity Score
0,movie,movie,1.0
1,title,title,1.0
2,book,book,1.0
3,DOI,DOI,1.0
4,address,address,1.0
5,city,city,1.0
6,country,country,1.0



📌 Levenshtein Path Alignments (Threshold = 0.75):


Unnamed: 0,Source Attribute,Mediated Attribute,Similarity Score



📌 Jaccard Node Alignments (Threshold = 0.6):


Unnamed: 0,Source Attribute,Mediated Attribute,Similarity Score
0,movie,movie,1.0
1,title,title,1.0
2,book,book,1.0
3,DOI,DOI,1.0
4,address,address,1.0
5,city,city,1.0
6,country,country,1.0



📌 Jaccard Path Alignments (Threshold = 0.75):


Unnamed: 0,Source Attribute,Mediated Attribute,Similarity Score
0,movie/book/title,mediatedSchema/movie/book/title,0.75
1,movie/book/DOI,mediatedSchema/movie/book/DOI,0.75



📌 TF-IDF Node Alignments (Threshold = 0.6):


Unnamed: 0,Source Attribute,Mediated Attribute,Similarity Score
0,movie,movie,1.0
1,title,title,1.0
2,book,book,1.0
3,DOI,DOI,1.0
4,address,address,1.0
5,city,city,1.0
6,country,country,1.0



📌 TF-IDF Path Alignments (Threshold = 0.75):


Unnamed: 0,Source Attribute,Mediated Attribute,Similarity Score
0,movie/title,mediatedSchema/movie/title,0.832
1,movie/book/title,mediatedSchema/movie/book/title,0.8744
2,movie/book/DOI,mediatedSchema/movie/book/DOI,0.8944



📌 Fuzzy Node Alignments (Threshold = 0.6):


Unnamed: 0,Source Attribute,Mediated Attribute,Similarity Score
0,movie,movie,1.0
1,title,title,1.0
2,latitude,title,0.6154
3,book,book,1.0
4,DOI,DOI,1.0
5,ShootLocation,location,0.6667
6,address,address,1.0
7,critic,city,0.6
8,city,city,1.0
9,country,country,1.0



📌 Fuzzy Path Alignments (Threshold = 0.75):


Unnamed: 0,Source Attribute,Mediated Attribute,Similarity Score


In [59]:
#Levenst Distance 
from sklearn.metrics import precision_score, recall_score, f1_score
import pandas as pd

# -------- LEVENSHTEIN NODE SIMILARITY EVALUATION --------
lev_node_threshold = 0.6
y_true_node = (node_sim_matrix.values.flatten() > 0).astype(int)
y_pred_node = (node_sim_matrix.values.flatten() >= lev_node_threshold).astype(int)

# Compute scores
node_precision = precision_score(y_true_node, y_pred_node)
node_recall = recall_score(y_true_node, y_pred_node)
node_f1 = f1_score(y_true_node, y_pred_node)

# Thresholded binary DataFrame (node similarity)
thresholded_node_df = (node_sim_matrix >= lev_node_threshold).astype(int)

print(f"🔎 Levenshtein Node Similarity Evaluation (threshold = {lev_node_threshold})")
print(f"Precision: {node_precision:.3f}")
print(f"Recall:    {node_recall:.3f}")
print(f"F1 Score:  {node_f1:.3f}")
print("\nThresholded Levenshtein Node Similarity Matrix:")
display(thresholded_node_df)


# -------- LEVENSHTEIN PATH SIMILARITY EVALUATION --------
lev_path_threshold = 0.75
y_true_path = (path_sim_matrix.values.flatten() > 0).astype(int)
y_pred_path = (path_sim_matrix.values.flatten() >= lev_path_threshold).astype(int)

# Compute scores
path_precision = precision_score(y_true_path, y_pred_path)
path_recall = recall_score(y_true_path, y_pred_path)
path_f1 = f1_score(y_true_path, y_pred_path)

# Thresholded binary DataFrame (path similarity)
thresholded_path_df = (path_sim_matrix >= lev_path_threshold).astype(int)

print(f"\n🔎 Levenshtein Path Similarity Evaluation (threshold = {lev_path_threshold})")
print(f"Precision: {path_precision:.3f}")
print(f"Recall:    {path_recall:.3f}")
print(f"F1 Score:  {path_f1:.3f}")
print("\nThresholded Levenshtein Path Similarity Matrix:")
display(thresholded_path_df)


🔎 Levenshtein Node Similarity Evaluation (threshold = 0.6)
Precision: 1.000
Recall:    0.044
F1 Score:  0.084

Thresholded Levenshtein Node Similarity Matrix:


Unnamed: 0,movie,title,director,firstName,lastName,genre,releaseYear,actors,actor,actorName,...,critic,score,book,DOI,ShootLocation,address,city,country,latitude,longitude
mediatedSchema,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
movie,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
title,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
book,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
DOI,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
publicationHouse,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
location,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
address,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
city,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
country,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0



🔎 Levenshtein Path Similarity Evaluation (threshold = 0.75)
Precision: 0.000
Recall:    0.000
F1 Score:  0.000

Thresholded Levenshtein Path Similarity Matrix:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Unnamed: 0,movie,movie/title,movie/director,movie/director/firstName,movie/director/lastName,movie/genre,movie/releaseYear,movie/actors,movie/actors/actor,movie/actors/actor/actorName,...,movie/ratings/rating/score,movie/book,movie/book/title,movie/book/DOI,movie/ShootLocation,movie/ShootLocation/address,movie/ShootLocation/city,movie/ShootLocation/country,movie/ShootLocation/latitude,movie/ShootLocation/longitude
mediatedSchema,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
mediatedSchema/movie,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
mediatedSchema/movie/title,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
mediatedSchema/movie/book,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
mediatedSchema/movie/book/title,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
mediatedSchema/movie/book/DOI,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
mediatedSchema/movie/book/publicationHouse,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
mediatedSchema/movie/book/publicationHouse/location,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
mediatedSchema/movie/book/publicationHouse/location/address,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
mediatedSchema/movie/book/publicationHouse/location/city,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [60]:
#Jaccard Similarity

from sklearn.metrics import precision_score, recall_score, f1_score

# --- Jaccard NODE ---
node_threshold = 0.6  # or use your value

y_true_node = (jaccard_node_sim_matrix.values.flatten() > 0).astype(int)
y_pred_node = (jaccard_node_sim_matrix.values.flatten() >= node_threshold).astype(int)

precision_node = precision_score(y_true_node, y_pred_node)
recall_node = recall_score(y_true_node, y_pred_node)
f1_node = f1_score(y_true_node, y_pred_node)

thresholded_jaccard_node_df = (jaccard_node_sim_matrix >= node_threshold).astype(int)

print(f"🔎 Jaccard Node Similarity Evaluation (threshold = {node_threshold})")
print(f"Precision: {precision_node:.3f}")
print(f"Recall:    {recall_node:.3f}")
print(f"F1 Score:  {f1_node:.3f}")
print("\nThresholded Jaccard Node Similarity Matrix:")
display(thresholded_jaccard_node_df)



# --- Jaccard PATH ---
path_threshold = 0.75  # or use your value

y_true_path = (jaccard_path_sim_matrix.values.flatten() > 0).astype(int)
y_pred_path = (jaccard_path_sim_matrix.values.flatten() >= path_threshold).astype(int)

precision_path = precision_score(y_true_path, y_pred_path)
recall_path = recall_score(y_true_path, y_pred_path)
f1_path = f1_score(y_true_path, y_pred_path)

thresholded_jaccard_path_df = (jaccard_path_sim_matrix >= path_threshold).astype(int)

print(f"\n🔎 Jaccard Path Similarity Evaluation (threshold = {path_threshold})")
print(f"Precision: {precision_path:.3f}")
print(f"Recall:    {recall_path:.3f}")
print(f"F1 Score:  {f1_path:.3f}")
print("\nThresholded Jaccard Path Similarity Matrix:")
display(thresholded_jaccard_path_df)


🔎 Jaccard Node Similarity Evaluation (threshold = 0.6)
Precision: 1.000
Recall:    1.000
F1 Score:  1.000

Thresholded Jaccard Node Similarity Matrix:


Unnamed: 0,movie,title,director,firstName,lastName,genre,releaseYear,actors,actor,actorName,...,critic,score,book,DOI,ShootLocation,address,city,country,latitude,longitude
mediatedSchema,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
movie,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
title,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
book,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
DOI,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
publicationHouse,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
location,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
address,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
city,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
country,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0



🔎 Jaccard Path Similarity Evaluation (threshold = 0.75)
Precision: 1.000
Recall:    0.008
F1 Score:  0.017

Thresholded Jaccard Path Similarity Matrix:


Unnamed: 0,movie,movie/title,movie/director,movie/director/firstName,movie/director/lastName,movie/genre,movie/releaseYear,movie/actors,movie/actors/actor,movie/actors/actor/actorName,...,movie/ratings/rating/score,movie/book,movie/book/title,movie/book/DOI,movie/ShootLocation,movie/ShootLocation/address,movie/ShootLocation/city,movie/ShootLocation/country,movie/ShootLocation/latitude,movie/ShootLocation/longitude
mediatedSchema,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
mediatedSchema/movie,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
mediatedSchema/movie/title,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
mediatedSchema/movie/book,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
mediatedSchema/movie/book/title,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
mediatedSchema/movie/book/DOI,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
mediatedSchema/movie/book/publicationHouse,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
mediatedSchema/movie/book/publicationHouse/location,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
mediatedSchema/movie/book/publicationHouse/location/address,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
mediatedSchema/movie/book/publicationHouse/location/city,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [61]:
#Tf-idf

from sklearn.metrics import precision_score, recall_score, f1_score
import pandas as pd

# -------- TF-IDF NODE SIMILARITY EVALUATION --------
node_threshold = 0.6
y_true = (tfidf_sim_matrix.values.flatten() > 0).astype(int)
y_pred = (tfidf_sim_matrix.values.flatten() >= node_threshold).astype(int)

# Compute scores
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

# Thresholded binary DataFrame (node similarity)
thresholded_node_df = (tfidf_sim_matrix >= node_threshold).astype(int)

print(f"🔎 TF-IDF Node Similarity Evaluation (threshold = {node_threshold})")
print(f"Precision: {precision:.3f}")
print(f"Recall:    {recall:.3f}")
print(f"F1 Score:  {f1:.3f}")
print("\nThresholded TF-IDF Node Similarity Matrix:")
display(thresholded_node_df)


# -------- TF-IDF PATH SIMILARITY EVALUATION --------
path_threshold = 0.75
y_true = (tfidf_path_sim_matrix.values.flatten() > 0).astype(int)
y_pred = (tfidf_path_sim_matrix.values.flatten() >= path_threshold).astype(int)

# Compute scores
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

# Thresholded binary DataFrame (path similarity)
thresholded_path_df = (tfidf_path_sim_matrix >= path_threshold).astype(int)

print(f"\n🔎 TF-IDF Path Similarity Evaluation (threshold = {path_threshold})")
print(f"Precision: {precision:.3f}")
print(f"Recall:    {recall:.3f}")
print(f"F1 Score:  {f1:.3f}")
print("\nThresholded TF-IDF Path Similarity Matrix:")
display(thresholded_path_df)


🔎 TF-IDF Node Similarity Evaluation (threshold = 0.6)
Precision: 1.000
Recall:    1.000
F1 Score:  1.000

Thresholded TF-IDF Node Similarity Matrix:


Unnamed: 0,movie,title,director,firstName,lastName,genre,releaseYear,actors,actor,actorName,...,critic,score,book,DOI,ShootLocation,address,city,country,latitude,longitude
mediatedSchema,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
movie,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
title,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
book,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
DOI,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
publicationHouse,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
location,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
address,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
city,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
country,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0



🔎 TF-IDF Path Similarity Evaluation (threshold = 0.75)
Precision: 1.000
Recall:    0.013
F1 Score:  0.025

Thresholded TF-IDF Path Similarity Matrix:


Unnamed: 0,movie,movie/title,movie/director,movie/director/firstName,movie/director/lastName,movie/genre,movie/releaseYear,movie/actors,movie/actors/actor,movie/actors/actor/actorName,...,movie/ratings/rating/score,movie/book,movie/book/title,movie/book/DOI,movie/ShootLocation,movie/ShootLocation/address,movie/ShootLocation/city,movie/ShootLocation/country,movie/ShootLocation/latitude,movie/ShootLocation/longitude
mediatedSchema,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
mediatedSchema/movie,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
mediatedSchema/movie/title,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
mediatedSchema/movie/book,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
mediatedSchema/movie/book/title,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
mediatedSchema/movie/book/DOI,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
mediatedSchema/movie/book/publicationHouse,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
mediatedSchema/movie/book/publicationHouse/location,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
mediatedSchema/movie/book/publicationHouse/location/address,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
mediatedSchema/movie/book/publicationHouse/location/city,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [62]:
#Fuzzy-

from sklearn.metrics import precision_score, recall_score, f1_score
import pandas as pd

# -------- FUZZY NODE SIMILARITY EVALUATION --------
node_threshold = 0.6
y_true = (node_sim_matrix.values.flatten() > 0).astype(int)
y_pred = (node_sim_matrix.values.flatten() >= node_threshold).astype(int)

# Compute scores
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

# Thresholded binary DataFrame (node similarity)
thresholded_node_df = (node_sim_matrix >= node_threshold).astype(int)

print(f"🔎 Fuzzy Node Similarity Evaluation (threshold = {node_threshold})")
print(f"Precision: {precision:.3f}")
print(f"Recall:    {recall:.3f}")
print(f"F1 Score:  {f1:.3f}")
print("\nThresholded Fuzzy Node Similarity Matrix:")
display(thresholded_node_df)

# -------- FUZZY PATH SIMILARITY EVALUATION --------
path_threshold = 0.75
y_true = (path_sim_matrix.values.flatten() > 0).astype(int)
y_pred = (path_sim_matrix.values.flatten() >= path_threshold).astype(int)

# Compute scores
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

# Thresholded binary DataFrame (path similarity)
thresholded_path_df = (path_sim_matrix >= path_threshold).astype(int)

print(f"\n🔎 Fuzzy Path Similarity Evaluation (threshold = {path_threshold})")
print(f"Precision: {precision:.3f}")
print(f"Recall:    {recall:.3f}")
print(f"F1 Score:  {f1:.3f}")
print("\nThresholded Fuzzy Path Similarity Matrix:")
display(thresholded_path_df)


🔎 Fuzzy Node Similarity Evaluation (threshold = 0.6)
Precision: 1.000
Recall:    0.044
F1 Score:  0.084

Thresholded Fuzzy Node Similarity Matrix:


Unnamed: 0,movie,title,director,firstName,lastName,genre,releaseYear,actors,actor,actorName,...,critic,score,book,DOI,ShootLocation,address,city,country,latitude,longitude
mediatedSchema,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
movie,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
title,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
book,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
DOI,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
publicationHouse,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
location,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
address,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
city,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
country,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0



🔎 Fuzzy Path Similarity Evaluation (threshold = 0.75)
Precision: 0.000
Recall:    0.000
F1 Score:  0.000

Thresholded Fuzzy Path Similarity Matrix:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Unnamed: 0,movie,movie/title,movie/director,movie/director/firstName,movie/director/lastName,movie/genre,movie/releaseYear,movie/actors,movie/actors/actor,movie/actors/actor/actorName,...,movie/ratings/rating/score,movie/book,movie/book/title,movie/book/DOI,movie/ShootLocation,movie/ShootLocation/address,movie/ShootLocation/city,movie/ShootLocation/country,movie/ShootLocation/latitude,movie/ShootLocation/longitude
mediatedSchema,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
mediatedSchema/movie,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
mediatedSchema/movie/title,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
mediatedSchema/movie/book,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
mediatedSchema/movie/book/title,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
mediatedSchema/movie/book/DOI,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
mediatedSchema/movie/book/publicationHouse,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
mediatedSchema/movie/book/publicationHouse/location,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
mediatedSchema/movie/book/publicationHouse/location/address,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
mediatedSchema/movie/book/publicationHouse/location/city,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
