# CH07c Semantic similarity experiment with FLAIR

In [None]:
!pip install flair

In [None]:
import pandas as pd

In [None]:
# Sentence pairs that express (nearly) the same meaning; a good sentence
# embedding should give each pair a high similarity score.
# Stray leading/trailing whitespace left over from copy-paste has been removed
# so the strings display cleanly in the DataFrame below.
similar = [
    (
        "A black dog walking beside a pool.",
        "A black dog is walking along the side of a pool.",
    ),
    (
        "A blonde woman looks for medical supplies for work in a suitcase.",
        "The blond woman is searching for medical supplies in a suitcase.",
    ),
    (
        "A doubly decker red bus driving down the road.",
        "A red double decker bus driving down a street.",
    ),
    (
        "There is a black dog jumping into a swimming pool.",
        "A black dog is leaping into a swimming pool.",
    ),
    (
        "The man used a sword to slice a plastic bottle.",
        "A man sliced a plastic bottle with a sword.",
    ),
]
# Show the pairs as a two-column table (cell output).
pd.DataFrame(similar, columns=["sen1", "sen2"])

In [None]:
import pandas as pd

# Sentence pairs that describe unrelated scenes; a good sentence embedding
# should give each pair a low similarity score.
# The stray trailing space in the first sentence has been removed so the
# strings display cleanly in the DataFrame below.
dissimilar = [
    (
        "A little girl and boy are reading books.",
        "An older child is playing with a doll while gazing out the window.",
    ),
    (
        "Two horses standing in a field with trees in the background.",
        "A black and white bird on a body of water with grass in the background.",
    ),
    (
        "Two people are walking by the ocean.",
        "Two men in fleeces and hats looking at the camera.",
    ),
    ("A cat is pouncing on a trampoline.", "A man is slicing a tomato."),
    ("A woman is riding on a horse.", "A man is turning over tables in anger."),
]
# Show the pairs as a two-column table (cell output).
pd.DataFrame(dissimilar, columns=["sen1", "sen2"])

In [None]:
import torch, numpy as np


def sim(s1, s2):
    """Return the cosine similarity of two embedded sentences, rounded to 2 decimals.

    Args:
        s1: Object exposing an ``embedding`` tensor (here: a sentence that has
            already been embedded).
        s2: Second object of the same kind, with an embedding of the same size.

    Returns:
        The cosine similarity as a NumPy float rounded to two decimal places.
        Note that cosine similarity lies in the range [-1, 1], not [0, 1]:
        vectors pointing in opposite directions score -1.
    """
    # Assumes each embedding is a 1-D vector, so the reduction runs over dim 0
    # and yields a single scalar that ``.item()`` can unwrap.
    score = torch.cosine_similarity(s1.embedding, s2.embedding, dim=0).item()
    return np.round(score, 2)


def evaluate(embeddings, myPairList):
    """Score every sentence pair with the given embedding model.

    Args:
        embeddings: Object with an ``embed(sentence)`` method that attaches an
            embedding to the sentence it is given.
        myPairList: Iterable of ``(text_a, text_b)`` string pairs.

    Returns:
        A tuple ``(scores, mean_score)``: the list of per-pair similarity
        scores from ``sim`` and their mean rounded to two decimals.

    Note:
        ``Sentence`` must already be in scope when this function is called.
    """

    def _pair_score(text_a, text_b):
        # Wrap both texts, embed them in order, then compare the two vectors.
        left, right = Sentence(text_a), Sentence(text_b)
        for sentence in (left, right):
            embeddings.embed(sentence)
        return sim(left, right)

    pair_scores = [_pair_score(text_a, text_b) for text_a, text_b in myPairList]
    mean_score = np.round(np.mean(pair_scores), 2)
    return pair_scores, mean_score

## Document Pool Embedding

Document pool embeddings apply a mean-pooling operation over all the words in a sentence: the average of all word embeddings is computed to obtain the sentence embedding.

In [None]:
from flair.data import Sentence
from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings

# Word-level "glove" embeddings, then a document embedding built on top of them.
# NOTE(review): DocumentPoolEmbeddings is used with its default pooling here;
# the notebook text describes this as mean pooling — confirm against the flair docs.
glove_embedding = WordEmbeddings("glove")
glove_pool_embeddings = DocumentPoolEmbeddings([glove_embedding])

In [None]:
# Pooled GloVe on the similar pairs; the cell shows (per-pair scores, mean score).
evaluate(glove_pool_embeddings, similar)

In [None]:
# Pooled GloVe on the dissimilar pairs; the cell shows (per-pair scores, mean score).
evaluate(glove_pool_embeddings, dissimilar)

## RNN-based Document Embeddings

In [None]:
from flair.embeddings import WordEmbeddings, DocumentRNNEmbeddings

# Document embedding that runs an RNN over the GloVe word embeddings.
# NOTE(review): no training step is visible in this notebook, so the RNN
# weights are presumably untrained and its scores may not be meaningful — confirm.
gru_embeddings = DocumentRNNEmbeddings([glove_embedding])

In [None]:
# RNN document embeddings on the similar pairs; shows (per-pair scores, mean score).
evaluate(gru_embeddings, similar)

In [None]:
# RNN document embeddings on the dissimilar pairs; shows (per-pair scores, mean score).
evaluate(gru_embeddings, dissimilar)

## Transformer-based BERT Embeddings

In [None]:
from flair.embeddings import TransformerDocumentEmbeddings
from flair.data import Sentence

# Document embeddings taken from the "bert-base-uncased" transformer model.
bert_embeddings = TransformerDocumentEmbeddings("bert-base-uncased")

In [None]:
# BERT document embeddings on the similar pairs; shows (per-pair scores, mean score).
evaluate(bert_embeddings, similar)

In [None]:
# BERT document embeddings on the dissimilar pairs; shows (per-pair scores, mean score).
evaluate(bert_embeddings, dissimilar)

## SentenceBERT

In [None]:
!pip install sentence-transformers

In [None]:
from flair.data import Sentence
from flair.embeddings import SentenceTransformerDocumentEmbeddings

# Initialise Sentence-BERT document embeddings from the
# "bert-base-nli-mean-tokens" model.
sbert_embeddings = SentenceTransformerDocumentEmbeddings("bert-base-nli-mean-tokens")

In [None]:
# Sentence-BERT on the similar pairs; shows (per-pair scores, mean score).
evaluate(sbert_embeddings, similar)

In [None]:
# Sentence-BERT on the dissimilar pairs; shows (per-pair scores, mean score).
evaluate(sbert_embeddings, dissimilar)

In [None]:
# Tricky pairs

In [None]:
# Each pair uses the same words (up to capitalisation) with the two nouns
# swapped, so the sentences mean different things despite identical vocabulary.
tricky_pairs = [
    ("An elephant is bigger than a lion", "A lion is bigger than an elephant"),
    ("the cat sat on the mat", "the mat sat on the cat"),
]

In [None]:
# Pooled GloVe on the word-order-swapped pairs; shows (per-pair scores, mean score).
evaluate(glove_pool_embeddings, tricky_pairs)

In [None]:
# RNN document embeddings on the word-order-swapped pairs; shows (per-pair scores, mean score).
evaluate(gru_embeddings, tricky_pairs)

In [None]:
# BERT document embeddings on the word-order-swapped pairs; shows (per-pair scores, mean score).
evaluate(bert_embeddings, tricky_pairs)

In [None]:
# Sentence-BERT on the word-order-swapped pairs; shows (per-pair scores, mean score).
evaluate(sbert_embeddings, tricky_pairs)