# Calcularemos un dataset con las distancias entre las cadenas dentro de cada grupo isomorfo

In [1]:
!pip install levenshtein

Collecting levenshtein
  Downloading levenshtein-0.27.1-cp311-cp311-macosx_11_0_arm64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from levenshtein)
  Downloading rapidfuzz-3.14.1-cp311-cp311-macosx_11_0_arm64.whl.metadata (12 kB)
Downloading levenshtein-0.27.1-cp311-cp311-macosx_11_0_arm64.whl (156 kB)
Downloading rapidfuzz-3.14.1-cp311-cp311-macosx_11_0_arm64.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, levenshtein
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [levenshtein]
[1A[2KSuccessfully installed levenshtein-0.27.1 rapidfuzz-3.14.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [1]:
from Levenshtein import distance
import pandas as pd

df_groups = pd.read_csv("../BBDD_isomorphic_labeled_distance_filtered_>5.csv", sep=";")
df_groups.head()


Unnamed: 0,wl_signature,sentences,Compleja,Compuesta,Compuesta-Compleja,Indeterminada (múltiples verbos sin subordinación/coordinación clara),Simple,Unimembre
0,"0_():1|10_(11,):2|11_(9, 10, 10):1|1_(2, 4):1|...","[9953, 5536, 14132, 13015, 14125, 11144, 6406,...",0,0,0,0,13,0
1,"0_():1|10_(4, 9, 9, 9, 12):1|11_(12,):1|12_(10...","[2507, 7972, 13922, 3686, 772, 7284]",6,0,0,0,0,0
2,"0_():1|1_(1, 2, 4):2|2_(1,):2|3_(4,):4|4_(1, 3...","[4715, 1016, 842, 8950, 9433, 13509, 12767, 49...",0,0,0,0,23,0
3,"0_():1|1_(2, 2, 2):1|2_(1,):3|3_(3,):2|4_(5,):...","[6072, 12536, 10037, 11987, 3688, 8764, 13216,...",0,0,0,0,10,0
4,"0_():1|1_(2, 2, 4):2|2_(1,):4|3_(3,):2|4_(1, 1):1","[14141, 11914, 13420, 6376, 12262, 5328, 8104,...",0,0,0,0,10,0


In [2]:
import networkx as nx
import itertools

def get_phrase(file):
    G = nx.read_graphml(file)
    return G.graph.get('phrase', 'No phrase found')

def get_phrases(id_list):
    phrases = {}
    for id in id_list:
        file = f"../UD_Spanish-GSD/es_gsd-ud-train_{id}.graphml"
        phrase = get_phrase(file)
        phrases[id] = phrase
    return phrases

def compute_distances(phrases):
    max_len = max(len(p) for p in phrases.values())
    distances = {}
    for (id_1, phrase_1), (id_2, phrase_2) in itertools.combinations(phrases.items(), 2):
        dist = distance(phrase_1, phrase_2) / max_len
        distances[(id_1, id_2)] = dist
    return distances

In [3]:
from ast import literal_eval

distances_by_group = {}

for index, row in df_groups.iterrows():
    id_list = literal_eval(row['sentences'])
    phrases = get_phrases(id_list)
    distances = compute_distances(phrases)
    distances_by_group[row["wl_signature"]] = distances

print(distances_by_group)

{'0_():1|10_(11,):2|11_(9, 10, 10):1|1_(2, 4):1|2_(1,):1|3_(4,):2|4_(1, 3, 3, 6, 7):1|5_(6,):2|6_(4, 5, 5):1|7_(4, 9):1|8_(9,):2|9_(7, 8, 8, 11):1': {(9953, 5536): 0.10989010989010989, (9953, 14132): 0.10989010989010989, (9953, 13015): 0.10989010989010989, (9953, 14125): 0.08791208791208792, (9953, 11144): 0.17582417582417584, (9953, 6406): 0.10989010989010989, (9953, 2541): 0.0989010989010989, (9953, 12970): 0.0989010989010989, (9953, 12597): 0.08791208791208792, (9953, 13273): 0.08791208791208792, (9953, 5411): 0.0989010989010989, (9953, 4592): 0.0989010989010989, (5536, 14132): 0.06593406593406594, (5536, 13015): 0.07692307692307693, (5536, 14125): 0.0989010989010989, (5536, 11144): 0.16483516483516483, (5536, 6406): 0.0989010989010989, (5536, 2541): 0.08791208791208792, (5536, 12970): 0.08791208791208792, (5536, 12597): 0.10989010989010989, (5536, 13273): 0.0989010989010989, (5536, 5411): 0.06593406593406594, (5536, 4592): 0.08791208791208792, (14132, 13015): 0.10989010989010989, (

# Guardamos las distancias

In [7]:
def ids_without_close_neighbors(dist, threshold, assume_missing=None, ensure_nonempty=True):
    """
    Return the set of IDs that have no distance < threshold to any other ID.

    Parameters
    ----------
    dist : mapping from (id1, id2) -> distance
        Keys may be ordered or unordered; the function checks both (i, j) and (j, i).
    threshold : float
        Cutoff; IDs must be at least this far from everyone else.
    assume_missing : float or None
        If None, raise KeyError when a pair distance is missing.
        If a float (e.g., float('inf')), treat missing distances as that value.

    Example
    -------
    >>> d = {('A','B'):0.3, ('A','C'):0.6, ('B','C'):0.8}
    >>> ids_without_close_neighbors(d, 0.5)
    {'C'}
    """
    # collect all unique IDs present in keys
    ids = {x for pair in dist for x in pair}

    def get_distance(a, b):
        if a == b:
            return 0.0
        if (a, b) in dist:
            return dist[(a, b)]
        if (b, a) in dist:
            return dist[(b, a)]
        if assume_missing is None:
            raise KeyError(f"Missing distance for pair ({a}, {b}) and ({b}, {a}).")
        return assume_missing

    # keep those i whose distances to all others are >= threshold
    result = {i for i in ids if all(get_distance(i, j) >= threshold for j in ids if j != i)}
    if result or not ensure_nonempty:
        return result
    
    else:
        try:
            return [list(ids)[0]]
        except IndexError:
            return []

final_phrases = {}
for sign, dist in distances_by_group.items():
    ids = ids_without_close_neighbors(dist, 0.5)
    final_phrases[sign] = list(ids)
print(final_phrases)


{'0_():1|10_(11,):2|11_(9, 10, 10):1|1_(2, 4):1|2_(1,):1|3_(4,):2|4_(1, 3, 3, 6, 7):1|5_(6,):2|6_(4, 5, 5):1|7_(4, 9):1|8_(9,):2|9_(7, 8, 8, 11):1': [5536], '0_():1|10_(4, 9, 9, 9, 12):1|11_(12,):1|12_(10, 11):1|13_(13,):2|14_(15,):1|15_(14, 16):1|16_(1, 1, 1, 15):1|1_(16,):3|2_(3,):2|3_(2, 2, 4):1|4_(3, 6, 10):1|5_(6,):2|6_(4, 5, 5, 8):1|7_(8,):1|8_(6, 7):1|9_(10,):3': [13922], '0_():1|1_(1, 2, 4):2|2_(1,):2|3_(4,):4|4_(1, 3, 3):2|5_(6,):2|6_(5, 5):1': [4745], '0_():1|1_(2, 2, 2):1|2_(1,):3|3_(3,):2|4_(5,):2|5_(4, 4):1': [12536], '0_():1|1_(2, 2, 4):2|2_(1,):4|3_(3,):2|4_(1, 1):1': [12262], '0_():1|1_(2, 5):1|2_(1,):1|3_(3,):2|4_(5,):4|5_(1, 4, 4, 4, 4):1': [3543], '0_():1|1_(2,):2|2_(1, 1, 5):1|3_(3,):2|4_(5,):3|5_(2, 4, 4, 4):1': [9576, 5579, 2095, 903], '0_():1|1_(2,):6|2_(1, 1, 1, 2):2': [14113, 12699, 13941], '0_():1|1_(3, 5, 5, 8):1|2_(3,):1|3_(1, 2):1|4_(5,):4|5_(1, 4, 4):2|6_(7,):2|7_(6, 6):1|8_(1,):1': [3844], '0_():1|1_(5, 6):1|2_(3,):2|3_(2, 2, 5):1|4_(5,):2|5_(1, 3, 4, 4):

In [8]:
import json, datetime


with open("final_groups_distances.json", "w", encoding="utf-8") as f:
    json.dump(final_phrases, f, ensure_ascii=False, indent=2)