# Comparar frases con un modelo de embeddings

## Instalación de librería de embeddings

In [None]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0 (from sentence-transformers)
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0 (from sentence-transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB

## Carga de modelo y procesamiento de embeddings

In [None]:
from sentence_transformers import SentenceTransformer, util

# Identifier of the pretrained checkpoint handed to SentenceTransformer.
# NOTE(review): the sentences below are Spanish; confirm this checkpoint is
# adequate for Spanish text, otherwise consider a multilingual one.
MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME)

# Sentences that will be compared against each other.
sentences = [
    "el gato juega afuera",
    "me gusta tocar guitarra",
    "me encanta la pasta",
    "esta pelicula estuvo asombrosa",
    "el perro esta afuera",
    "te gusta la musica?",
    "estuvo muy buena la pelicula",
    "te gustaria una pizza?",
]

# Encode the whole list in a single call to the model.
embeddings = model.encode(sentences)

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
# Display the value returned by model.encode(sentences) as this cell's output.
embeddings

array([[-0.03232959,  0.11775171, -0.04929117, ..., -0.00081667,
        -0.03978717, -0.02792935],
       [-0.03385794,  0.00732809, -0.04184823, ...,  0.04272254,
        -0.0227952 , -0.06084861],
       [-0.05483966, -0.03731453, -0.01410467, ...,  0.1172901 ,
         0.07456188, -0.0974494 ],
       ...,
       [ 0.02240627,  0.00977139,  0.01616971, ...,  0.02968126,
         0.01667443, -0.05944228],
       [ 0.0229008 ,  0.00147286,  0.0013893 , ...,  0.08682209,
         0.08835244, -0.13047363],
       [-0.06596519,  0.03551989, -0.04087367, ...,  0.09201567,
         0.00076539, -0.08679891]], dtype=float32)

## Comparar frases con modelo de embeddings

In [None]:
# Cosine similarity of the embeddings against themselves (every sentence vs. every other).
cosine_scores = util.cos_sim(embeddings, embeddings)

# Collect each unordered pair (i < j) exactly once together with its score;
# the diagonal (a sentence compared with itself) is skipped.
n_items = len(cosine_scores)
pairs = [
    {'index': [i, j], 'score': cosine_scores[i][j]}
    for i in range(n_items - 1)
    for j in range(i + 1, n_items)
]

# Put the highest-scoring pairs first.
pairs.sort(key=lambda entry: entry['score'], reverse=True)

# Print the two sentences of each of the five best-scoring pairs.
for best in pairs[:5]:
    i, j = best['index']
    print("{} ----> {}".format(sentences[i], sentences[j]))


esta pelicula estuvo asombrosa ----> estuvo muy buena la pelicula
te gusta la musica? ----> te gustaria una pizza?
el gato juega afuera ----> el perro esta afuera
me gusta tocar guitarra ----> te gusta la musica?
me encanta la pasta ----> te gustaria una pizza?
