## Semantic Search Tutorial

### BM25

In [None]:
%%bash
# Install the notebook's third-party dependencies in a single pip call.
# The extras spec is quoted so the shell never treats "[...]" as a glob.
pip install -q faiss-gpu datasets \
               evaluate "transformers[sentencepiece]" \
               rank_bm25 langchain-community

In [None]:
import torch
from torch import nn
from torch.functional import tensordot
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torch.nn import CosineEmbeddingLoss
from torch import Tensor

from transformers import AutoModel
from transformers import AutoTokenizer
from transformers import DPRContextEncoder
from typing import List, Dict

from datasets import Dataset
import numpy as np

import pandas as pd

In [None]:
# Load the movie table from a CSV file in the working directory.
df = pd.read_csv("imdb_top_10k.csv")
# Optional clean-up steps, currently disabled:
# df = df[['Movie Name', 'Genre', 'Plot', 'Directors']]
# df = df.dropna()
# NOTE: later cells also read the "Votes", "Rating" and "Metascore" columns,
# so the column subset above would need to include them if it is re-enabled.
df.head()

In [None]:
# Show the (rows, columns) size of the loaded table.
print(df.shape)

In [None]:
def get_contexts(df: pd.DataFrame) -> List[Dict]:
    """Turn every row of the movie table into a retrieval "context" record.

    Args:
        df: Table with the columns "Movie Name", "Plot", "Genre",
            "Directors", "Votes", "Rating" and "Metascore".

    Returns:
        One dict per row, in row order, with the keys "title", "text" and a
        nested "meta" dict holding the remaining fields.
    """

    def to_context(record) -> Dict:
        # The plot is the searchable text; everything else rides along as
        # metadata.
        details = {
            "genre": record["Genre"],
            "director": record["Directors"],
            "votes": record["Votes"],
            "rating": record["Rating"],
            "metascore": record["Metascore"],
        }
        return {
            "title": record["Movie Name"],
            "text": record["Plot"],
            "meta": details,
        }

    return [to_context(record) for _, record in df.iterrows()]


# Build the list of context records from the loaded table.
contexts = get_contexts(df)

### Reference Implementation of BM25

In [None]:
from rank_bm25 import BM25Okapi


class BM25Search:
    """Keyword search over a fixed corpus backed by ``rank_bm25.BM25Okapi``.

    Documents and queries are tokenized with plain ``str.split()``, so
    matching is sensitive to case and attached punctuation.
    """

    def __init__(self, documents: List[str]):
        """Tokenize the corpus and build the BM25 index.

        Args:
            documents: Raw document strings. Indices returned by ``search``
                refer to positions in this list.
        """
        self.documents = documents
        self.tokenized_documents = [document.split() for document in documents]
        self.bm25 = BM25Okapi(self.tokenized_documents)

    def search(self, query: str, top_k: int = 5) -> List[int]:
        """Return the indices of the best-scoring documents for ``query``.

        Args:
            query: Free-text query, tokenized with ``str.split()``.
            top_k: Maximum number of results to return.

        Returns:
            Up to ``top_k`` document indices as plain Python ints, highest
            score first. Empty when ``top_k`` is not positive.
        """
        if top_k <= 0:
            # A negative slice bound would silently return "all but the last
            # few" documents, which is never what a caller wants.
            return []
        doc_scores = self.bm25.get_scores(query.split())
        ranked = np.argsort(doc_scores)[::-1]
        # Convert to a real list of ints so the result matches the annotation
        # rather than leaking a numpy array.
        return ranked[:top_k].tolist()


# Index the plot texts and show the best matches for a sample query.
corpus_texts = [context["text"] for context in contexts]
bm25_search = BM25Search(corpus_texts)
query = "Batman"
retrieved_indices = bm25_search.search(query)
print(retrieved_indices)
for doc_idx in retrieved_indices:
    hit = contexts[doc_idx]
    print(hit["title"], hit["text"])
    print()

You can also use a BM25 retriever from your favorite RAG package, e.g. LangChain.

In [None]:
from langchain_community.retrievers import BM25Retriever

# Build LangChain's BM25 retriever from the same plot texts.
plot_texts = [context["text"] for context in contexts]
retriever = BM25Retriever.from_texts(plot_texts)
# NOTE(review): despite its name, this variable holds whatever `invoke`
# returns — presumably LangChain document objects rather than integer
# indices; confirm before indexing `contexts` with it.
retrieved_indices = retriever.invoke(query)
retrieved_indices

### BM25 from scratch

In [None]:
def get_term_freq(term: str, document: str) -> int:
    """Count the whitespace-delimited tokens of ``document`` equal to ``term``.

    Tokens are compared exactly, using the same ``split()`` tokenization as
    ``get_doc_length`` so that term frequency and document length agree.
    Substring hits do not count: "cat" is not found in "concatenate".
    """
    return document.split().count(term)


def get_doc_length(document: str) -> int:
    """Return the number of whitespace-delimited tokens in ``document``."""
    return len(document.split())


def get_avg_doc_length(documents: List[str]) -> float:
    """Return the mean token count over ``documents`` (0.0 for no documents)."""
    if not documents:
        return 0.0
    return sum(get_doc_length(d) for d in documents) / len(documents)


def get_num_containing_docs(term: str, documents: List[str]) -> int:
    """Count the documents that contain ``term`` as a whole token."""
    return sum(term in d.split() for d in documents)


def bm25(
    term: str,
    document: str,
    documents: List[str],
    k1: float = 1.5,
    b: float = 0.75,
    *,
    avgdl: float = None,
    nq: int = None,
) -> float:
    """Score one query term against one document with Okapi BM25.

    Args:
        term: A single query token.
        document: The document being scored.
        documents: The whole corpus, used for the corpus-level statistics.
        k1: Term-frequency saturation parameter.
        b: Strength of the document-length normalization.
        avgdl: Precomputed average document length of ``documents``.
            Computed here when omitted.
        nq: Precomputed number of documents containing ``term``.
            Computed here when omitted.

    Returns:
        The product of the saturated term frequency and the inverse document
        frequency; 0.0 when the term does not occur in ``document``.
    """
    term_freq = get_term_freq(term, document)
    if term_freq == 0:
        # The tf factor is zero, so the product is zero; skip the corpus
        # statistics entirely.
        return 0.0

    N = len(documents)
    D = get_doc_length(document)
    if avgdl is None:
        avgdl = get_avg_doc_length(documents)
    if nq is None:
        nq = get_num_containing_docs(term, documents)

    # avgdl is 0 only when the corpus holds no tokens at all; fall back to a
    # neutral length ratio instead of dividing by zero.
    length_ratio = D / avgdl if avgdl > 0 else 1.0

    tf = term_freq * (k1 + 1) / (term_freq + k1 * (1 - b + b * length_ratio))
    idf = np.log((N - nq + 0.5) / (nq + 0.5) + 1)

    return float(tf * idf)


def bm25_similarity(
    query: str,
    document: str,
    documents: List[str],
    *,
    avgdl: float = None,
    doc_freqs: Dict[str, int] = None,
) -> float:
    """Sum the BM25 scores of every query token for one document.

    Args:
        query: Free-text query, tokenized with ``str.split()``.
        document: The document being scored.
        documents: The whole corpus.
        avgdl: Precomputed average document length. Computed once here when
            omitted.
        doc_freqs: Precomputed ``{term: number of containing documents}``.
            Terms missing from it are counted on demand.

    Returns:
        The summed score (0 for an empty query).
    """
    if avgdl is None:
        avgdl = get_avg_doc_length(documents)
    if doc_freqs is None:
        doc_freqs = {}
    return sum(
        bm25(term, document, documents, avgdl=avgdl, nq=doc_freqs.get(term))
        for term in query.split()
    )


def get_bm25_topk(query: str, documents: List[str], k: int = 5) -> List[int]:
    """Return the indices of the ``k`` documents that best match ``query``.

    Corpus-level statistics (average length, per-term document frequency) are
    computed once up front instead of once per (term, document) pair.

    Args:
        query: Free-text query, tokenized with ``str.split()``.
        documents: The corpus to rank.
        k: Maximum number of indices to return; non-positive values give an
            empty list.

    Returns:
        Document indices ordered by descending score; ties keep corpus order.
    """
    avgdl = get_avg_doc_length(documents)
    doc_freqs = {
        term: get_num_containing_docs(term, documents)
        for term in set(query.split())
    }
    scores = [
        bm25_similarity(
            query, document, documents, avgdl=avgdl, doc_freqs=doc_freqs
        )
        for document in documents
    ]
    ranked = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)
    return ranked[: max(k, 0)]


# Sanity check: rank the plot texts for `query` with the hand-written BM25.
plot_texts = [context["text"] for context in contexts]
retrieved_indices = get_bm25_topk(query, plot_texts)
for doc_idx in retrieved_indices:
    hit = contexts[doc_idx]
    print(hit["title"], hit["text"])
    print()

### Neural Retrieval

Let's convert the data to a Hugging Face dataset, purely for convenience.

In [None]:
# Convert the pandas DataFrame into a `datasets.Dataset` and display it.
movie_dataset = Dataset.from_pandas(df)
movie_dataset

In [None]:
def concatenate_text(data):
    """Build the single "text" field that is embedded for one movie.

    The fields "Movie Name", "Genre", "Plot", "Directors", "Votes", "Rating"
    and "Metascore" are joined, in that order, with " \\n" between them.

    Args:
        data: One record (mapping) containing the fields listed above.

    Returns:
        A dict with the single key "text".
    """
    field_names = (
        "Movie Name",
        "Genre",
        "Plot",
        "Directors",
        "Votes",
        "Rating",
        "Metascore",
    )
    # A None value (e.g. an empty cell in the source table) becomes an empty
    # string instead of raising TypeError on `str + None`.
    parts = ["" if data[name] is None else str(data[name]) for name in field_names]
    return {"text": " \n".join(parts)}


# Add the combined "text" field to every record.
movie_dataset = movie_dataset.map(concatenate_text)

In [None]:
# Inspect a single record of the dataset.
movie_dataset[1]

In [None]:
class Transformer_embedder(nn.Module):
    """Sentence embedder built on a pretrained transformer.

    The wrapped feature extractor turns token ids into hidden states, which
    are pooled into one vector per input (the extractor's ``pooler_output``
    for DPR checkpoints, masked mean pooling otherwise) and, when
    ``self.normalize`` is true, L2-normalized.
    """

    def __init__(self, feat_extractor_name: str = ""):
        """Load the feature extractor and its tokenizer.

        Args:
            feat_extractor_name (str): Checkpoint name or path passed to
                ``from_pretrained``. Names containing "dpr" are loaded with
                ``DPRContextEncoder``, everything else with ``AutoModel``.

        Raises:
            ValueError: If ``feat_extractor_name`` is empty.
        """
        super().__init__()

        if not feat_extractor_name:
            # Fail early with a clear message rather than deep inside
            # `from_pretrained`.
            raise ValueError("feat_extractor_name must be a non-empty checkpoint name")

        # Device that was available at construction time. The module is not
        # moved here; callers still call `.to(device)` themselves, and this
        # attribute is not updated when they do.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.feat_extractor_name = feat_extractor_name

        if "dpr" in feat_extractor_name.lower():
            feat_extractor = DPRContextEncoder.from_pretrained(feat_extractor_name)
        else:
            feat_extractor = AutoModel.from_pretrained(feat_extractor_name)

        self.tokenizer = AutoTokenizer.from_pretrained(feat_extractor_name)

        self.normalize = True
        self.feat_extractor = feat_extractor
        # Attribute name (including its spelling) is kept for existing users.
        self.embeding_shape = self.get_extractor_output_shape()

    def get_extractor_output_shape(self):
        """Return the output width reported by the feature extractor.

        Uses ``out_features`` of the first sub-module of the extractor's last
        child when that exists, and ``config.hidden_size`` otherwise.
        """
        children = list(self.feat_extractor.named_children())
        if children:
            _, last_child = children[-1]
            # modules() yields the module itself first, then its descendants;
            # a leaf module therefore has no element at index 1.
            submodules = list(last_child.modules())
            if len(submodules) > 1 and hasattr(submodules[1], "out_features"):
                return submodules[1].out_features

        return self.feat_extractor.config.hidden_size

    def mean_pooling(self, model_output, attention_mask: Tensor):
        """Average the token embeddings, ignoring masked-out positions.

        Args:
            model_output: Extractor output whose first element holds all
                token embeddings.
            attention_mask: Mask with one entry per token; it is expanded to
                the shape of the token embeddings and used as weights.

        Returns:
            The mask-weighted sum over dimension 1 divided by the mask total
            (clamped to at least 1e-9 to avoid division by zero).
        """
        token_embeddings = model_output[0]
        input_mask_expanded = (
            attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        )
        summed = torch.sum(token_embeddings * input_mask_expanded, 1)
        counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return summed / counts

    def pool(self, embedding, attention_mask: Tensor, pool_type: str = "mean"):
        """Reduce the extractor output to one vector per input.

        Args:
            embedding: Output of the feature extractor.
            attention_mask: Mask used by mean pooling.
            pool_type: Any value containing "mean" selects masked mean
                pooling; anything else takes the hidden state at position 0.
        """
        if "mean" in pool_type:
            return self.mean_pooling(embedding, attention_mask)
        return embedding.last_hidden_state[:, 0, :]

    def forward(
        self, input_ids: Tensor, attention_mask: Tensor, labels: Tensor = None, **kwargs
    ):
        """Embed a tokenized batch.

        Defined as ``forward`` (not ``__call__``) so that calling the module
        goes through ``nn.Module``'s own call machinery, hooks included.

        Args:
            input_ids: Token ids produced by ``self.tokenizer``.
            attention_mask: Matching attention mask.
            labels: Accepted for call compatibility; unused.
            **kwargs: Extra tokenizer outputs; accepted and ignored.

        Returns:
            One pooled vector per input, L2-normalized along dimension 1 when
            ``self.normalize`` is true.
        """
        # Keyword arguments keep this correct for extractors whose second
        # positional parameter is not the attention mask.
        embedding = self.feat_extractor(
            input_ids=input_ids, attention_mask=attention_mask
        )

        if "dpr" in self.feat_extractor_name.lower():
            pooled = embedding.pooler_output
        else:
            pooled = self.pool(embedding, attention_mask, pool_type="mean")

        if self.normalize:
            pooled = F.normalize(pooled, p=2, dim=1)

        return pooled

In [None]:
# Checkpoint used to embed both the movies and the queries.
# A bigger alternative is "intfloat/e5-mistral-7b-instruct"; it might require
# slight modifications to the code, see
# https://huggingface.co/intfloat/e5-mistral-7b-instruct
model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"

# Use the GPU when one is available and move the embedder onto it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
embedder = Transformer_embedder(model_ckpt).to(device)


def get_embeddings(text_list):
    """Embed text with the module-level ``embedder``.

    Args:
        text_list: Text input handed to the embedder's tokenizer, which pads
            and truncates it and returns PyTorch tensors.

    Returns:
        The embedder's output for the tokenized input, computed in eval mode
        under ``torch.inference_mode()`` with the inputs moved to ``device``.
    """
    tokens = embedder.tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    embedder.eval()
    with torch.inference_mode():
        on_device = {name: tensor.to(device) for name, tensor in tokens.items()}
        return embedder(**on_device)


# Embed every movie text and store the vectors as numpy arrays, which is the
# form used for the FAISS indexing that follows.
# The map runs in batches so the model processes many texts per forward pass
# instead of one text per call; `get_embeddings` pads each batch and passes
# the attention mask along.
embeddings_dataset = movie_dataset.map(
    lambda batch: {
        "embeddings": get_embeddings(batch["text"]).cpu().detach().numpy()
    },
    batched=True,
    batch_size=32,
)

In [None]:
# Return the embeddings and every original column as numpy values.
numpy_columns = [
    "embeddings",
    "text",
    "Movie Name",
    "Genre",
    "Plot",
    "Directors",
    "Votes",
    "Rating",
    "Metascore",
]
embeddings_dataset.set_format(type="numpy", columns=numpy_columns)

In [None]:
# Check the shape of a single stored embedding.
embeddings_dataset[1]["embeddings"].shape

In [None]:
# Build a FAISS index over the "embeddings" column for nearest-neighbour lookup.
# NOTE(review): no metric is specified, so the library default applies —
# presumably L2 distance; confirm before interpreting the returned scores.
embeddings_dataset.add_faiss_index(column="embeddings")

In [None]:
question = "Armenia"
# Embed the question as a batch of one and keep its single vector.
question_batch = get_embeddings([question])
question_embedding = question_batch.cpu().detach().numpy()[0]
question_embedding.shape

In [None]:
# Look up the 5 stored movies closest to the question embedding.
scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=5
)
# Drop the raw vectors; only the readable columns are displayed.
samples = {k: v for k, v in samples.items() if k != "embeddings"}
samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
# The index is created without a metric type, so FAISS's default L2 metric
# applies and the scores are distances: smaller means closer. Sort ascending
# so the best match comes first.
samples_df.sort_values("scores", ascending=True, inplace=True)

In [None]:
# Print one report per retrieved movie, in the order of `samples_df`.
for _, hit in samples_df.iterrows():
    report_lines = [
        f"Series Title: {hit['Movie Name']}",
        f"Overview: {hit['Plot']}",
        f"Genre: {hit['Genre']}",
        f"Scores: {hit['scores']}",
        f"Votes: {hit['Votes']}",
        f"Rating: {hit['Rating']}",
        f"Metascore: {hit['Metascore']}",
        f"Directors: {hit['Directors']}",
        "=" * 50,
        "",  # blank line separating consecutive reports
    ]
    print("\n".join(report_lines))

In [None]:
# Write the raw embedding column to its own .npy file.
embeddings_path = "imdb_top_10k_embeddings.npy"
embeddings = embeddings_dataset["embeddings"]
np.save(embeddings_path, embeddings)

# Write the full dataset to disk; the FAISS index is detached first.
dataset_dir = "imdb_top_10k_embeddings_dataset"
embeddings_dataset.drop_index("embeddings")
embeddings_dataset.save_to_disk(dataset_dir)