In [2]:
%pip install sentence-transformers

from sentence_transformers import SentenceTransformer, util
import torch
import re
import unicodedata

# ------------------------
# Load the embedding model
# ------------------------
# Module-level encoder shared by find_duplicates(), which calls model.encode().
# This cell must run before any cell that calls find_duplicates().
# NOTE(review): the addresses processed below are Spanish-language; confirm
# that this checkpoint is a suitable choice for Spanish text.
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

# ------------------------
# Address Normalization (Mexico-specific)
# ------------------------
def normalize_address(addr: str) -> str:
    """Return a canonical, comparison-friendly form of a Mexican address string.

    The steps are applied in this order:

    1. lowercase the text;
    2. strip accents by NFD-decomposing and dropping combining marks
       ("méxico" -> "mexico");
    3. replace every character other than a-z, 0-9 and whitespace with a space;
    4. collapse whitespace runs to a single space and trim both ends;
    5. expand common abbreviations ("av" -> "avenida",
       "cdmx" -> "ciudad de mexico", ...).

    Args:
        addr: raw address text.

    Returns:
        The normalized address: lowercase ASCII letters and digits separated by
        single spaces, with no leading or trailing whitespace.
    """
    # Lowercase
    text = addr.lower()

    # Remove accents (México → Mexico): after NFD decomposition each accent is a
    # separate combining mark (category 'Mn') that can simply be dropped.
    text = ''.join(
        c for c in unicodedata.normalize('NFD', text)
        if unicodedata.category(c) != 'Mn'
    )

    # Remove punctuation
    text = re.sub(r"[^a-z0-9\s]", " ", text)

    # Collapse whitespace *before* expanding, so that multi-word patterns such
    # as "edo mex" (from "Edo. Mex") see exactly one space between tokens.
    text = re.sub(r"\s+", " ", text).strip()

    # Expand common Mexican abbreviations. The patterns are anchored with \b, so
    # the surrounding spaces are left untouched; the expansions therefore must
    # not carry padding of their own, otherwise the result would contain a
    # leading space or double spaces and would no longer equal the normalized
    # form of the spelled-out address.
    replacements = {
        r"\bav\b": "avenida",
        r"\bavda\b": "avenida",
        r"\bblvd\b": "bulevar",
        r"\bcol\b": "colonia",
        r"\bfracc\b": "fraccionamiento",
        r"\bno\b": "numero",
        r"\bcd\b": "ciudad",
        r"\bcdmx\b": "ciudad de mexico",
        r"\bedo mex\b": "estado de mexico"
    }
    for pattern, expansion in replacements.items():
        text = re.sub(pattern, expansion, text)

    return text

# ------------------------
# Duplicate Detection
# ------------------------
def find_duplicates(addresses, threshold=0.8):
    """Flag every address that is semantically close to an earlier one.

    Each address is normalized with normalize_address(), embedded with the
    module-level ``model``, and compared by cosine similarity against all
    entries that precede it in ``addresses``.

    Args:
        addresses: sequence of dicts, each providing an "id" and an "address"
            key.
        threshold: minimum cosine similarity (inclusive) for two entries to be
            treated as duplicates.

    Returns:
        A list with one dict per input entry, in input order, with keys
        "id", "address" (the original, un-normalized text) and "duplicate_of":
        the id of the first earlier entry whose similarity reaches
        ``threshold``, or None when there is none. The referenced entry may
        itself be flagged as a duplicate of a still earlier one; matches are
        not resolved transitively.
    """
    # Nothing to compare; also avoids handing the encoder an empty batch.
    if len(addresses) == 0:
        return []

    # Normalize for embeddings
    texts = [normalize_address(a["address"]) for a in addresses]

    # Encode in batch
    embeddings = model.encode(texts, convert_to_tensor=True)

    results = []
    for i, addr in enumerate(addresses):
        duplicate_of = None
        if i > 0:
            # Score entry i against all earlier entries in a single call
            # instead of one cos_sim call and tensor-to-float conversion per
            # pair. cos_sim returns a (1, i) matrix; take its only row as
            # plain Python floats.
            scores = util.cos_sim(embeddings[i], embeddings[:i])[0].tolist()
            first_hit = next(
                (j for j, score in enumerate(scores) if score >= threshold),
                None,
            )
            if first_hit is not None:
                duplicate_of = addresses[first_hit]["id"]
        results.append({
            "id": addr["id"],
            "address": addr["address"],
            "duplicate_of": duplicate_of
        })
    return results



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3.13 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# ------------------------
# Example Run
# ------------------------
if __name__ == "__main__":
    # (id, raw address) pairs; each consecutive pair spells the same address
    # once with abbreviations/accents and once written out in full.
    sample_rows = (
        ("a1", "Av. Insurgentes Sur 123, CDMX"),
        ("a2", "Avenida Insurgentes Sur 123, Ciudad de México"),
        ("a3", "Blvd. Adolfo López Mateos 456, Edo. Mex"),
        ("a4", "Bulevar Adolfo Lopez Mateos 456, Estado de Mexico"),
    )
    input_addresses = [
        {"id": entry_id, "address": raw_text}
        for entry_id, raw_text in sample_rows
    ]

    # Run the detector and show one result dict per line.
    output = find_duplicates(input_addresses, threshold=0.8)

    for row in output:
        print(row)


{'id': 'a1', 'address': 'Av. Insurgentes Sur 123, CDMX', 'duplicate_of': None}
{'id': 'a2', 'address': 'Avenida Insurgentes Sur 123, Ciudad de México', 'duplicate_of': 'a1'}
{'id': 'a3', 'address': 'Blvd. Adolfo López Mateos 456, Edo. Mex', 'duplicate_of': None}
{'id': 'a4', 'address': 'Bulevar Adolfo Lopez Mateos 456, Estado de Mexico', 'duplicate_of': 'a3'}
