In [3]:
from transformers import AutoModel, AutoTokenizer
import torch
from scipy.spatial.distance import cosine
import numpy as np

In [4]:
# Load the pretrained BERT encoder and its matching tokenizer; they are used
# here as a source of word vectors (the role Word2Vec vectors would play).
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Example words
words = ["king", "queen", "man", "woman", "cat", "dog"]


def _embed_word(text):
    """Return a single NumPy vector for `text`.

    The text is tokenized on its own, passed through the model without
    gradient tracking, and the last hidden states are averaged over dim 1.
    """
    encoded = tokenizer(text, return_tensors="pt")
    with torch.no_grad():  # inference only: no autograd graph is built
        hidden_states = model(**encoded).last_hidden_state
        # NOTE(review): the tokenizer is called with default settings, so the
        # average presumably also covers the special-token positions
        # ([CLS]/[SEP]) -- confirm that this is intended.
        pooled = hidden_states.mean(dim=1)
        return pooled.squeeze().numpy()


# Tokenize each word and store its embedding, keyed by the word itself
word_vectors = {text: _embed_word(text) for text in words}

# Cosine similarity for every ordered pair of words.
# scipy's `cosine` is a distance, so similarity = 1 - distance.
n_words = len(words)
similarity_matrix = np.empty((n_words, n_words))
for i, first in enumerate(words):
    for j, second in enumerate(words):
        similarity_matrix[i, j] = 1 - cosine(word_vectors[first], word_vectors[second])

In [5]:
# Show the similarities: a header line, then one line per word with that
# word's row of the similarity matrix.
print("Similitud coseno entre palabras:")
for label, scores in zip(words, similarity_matrix):
    print(f"{label}: {scores}")

Similitud coseno entre palabras:
king: [0.99999998 0.93882426 0.86181645 0.87864462 0.87267842 0.85303606]
queen: [0.93882426 1.         0.87879222 0.89415062 0.8840131  0.83682315]
man: [0.86181645 0.87879222 0.99999999 0.9260298  0.86900343 0.83045134]
woman: [0.87864462 0.89415062 0.9260298  1.         0.87642809 0.85893852]
cat: [0.87267842 0.8840131  0.86900343 0.87642809 0.99999998 0.90085159]
dog: [0.85303606 0.83682315 0.83045134 0.85893852 0.90085159 0.99999998]
