# Semantic search with FAISS

In [17]:
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
import faiss

## Loading and preparing the dataset

In [2]:
issues_dataset = load_dataset("lewtun/github-issues", split="train")
issues_dataset

Repo card metadata block was not found. Setting CardData to empty.


Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app', 'is_pull_request'],
    num_rows: 3019
})

In [3]:
# filter out the pull requests, as these tend to be rarely used for answering user queries and will introduce noise in our search engine
issues_dataset = issues_dataset.filter(
    lambda x: (x["is_pull_request"] == False and len(x["comments"]) > 0)
)
issues_dataset

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app', 'is_pull_request'],
    num_rows: 808
})

In [4]:
# remove the columns that we don't need
columns = issues_dataset.column_names
columns_to_keep = ["title", "body", "html_url", "comments"]
columns_to_remove = set(columns_to_keep).symmetric_difference(columns)
issues_dataset = issues_dataset.remove_columns(columns_to_remove)
issues_dataset

Dataset({
    features: ['html_url', 'title', 'comments', 'body'],
    num_rows: 808
})

In [5]:
issues_dataset.set_format("pandas")
df = issues_dataset[:]

df["comments"][0].tolist()

['Cool, I think we can do both :)',
 '@lhoestq now the 2 are implemented.\r\n\r\nPlease note that for the the second protection, finally I have chosen to protect the master branch only from **merge commits** (see update comment above), so no need to disable/re-enable the protection on each release (direct commits, different from merge commits, can be pushed to the remote master branch; and eventually reverted without messing up the repo history).']

In [6]:
comments_df = df.explode("comments", ignore_index=True)
comments_df.head(4)

Unnamed: 0,html_url,title,comments,body
0,https://github.com/huggingface/datasets/issues...,Protect master branch,"Cool, I think we can do both :)",After accidental merge commit (91c55355b634d0d...
1,https://github.com/huggingface/datasets/issues...,Protect master branch,@lhoestq now the 2 are implemented.\r\n\r\nPle...,After accidental merge commit (91c55355b634d0d...
2,https://github.com/huggingface/datasets/issues...,Backwards compatibility broken for cached data...,Hi ! I guess the caching mechanism should have...,## Describe the bug\r\nAfter upgrading to data...
3,https://github.com/huggingface/datasets/issues...,Backwards compatibility broken for cached data...,"If it's easy enough to implement, then yes ple...",## Describe the bug\r\nAfter upgrading to data...


In [7]:
comments_dataset = Dataset.from_pandas(comments_df)
comments_dataset

Dataset({
    features: ['html_url', 'title', 'comments', 'body'],
    num_rows: 2964
})

In [8]:
# add a column for the length of the comments
comments_dataset = comments_dataset.map(
    lambda x: {"comment_length": len(x["comments"].split())}
)

Map:   0%|          | 0/2964 [00:00<?, ? examples/s]

In [9]:
# filter out comments that are too short like "Thanks!" or "cc @lewtun"
comments_dataset = comments_dataset.filter(lambda x: x["comment_length"] > 15)
comments_dataset

Filter:   0%|          | 0/2964 [00:00<?, ? examples/s]

Dataset({
    features: ['html_url', 'title', 'comments', 'body', 'comment_length'],
    num_rows: 2175
})

In [10]:
# concatenate the title, body and comments into a single text field
def concatenate_text(examples):
    return {
        "text": examples["title"]
        + " \n "
        + examples["body"]
        + " \n "
        + examples["comments"]
    }


comments_dataset = comments_dataset.map(concatenate_text)

Map:   0%|          | 0/2175 [00:00<?, ? examples/s]

## Creating text embeddings

In [11]:
model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

In [12]:
device = torch.device("cuda")
model.to(device)

MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): Linear(in_features=768, out_features=768, bias=True)
            (k): Linear(in_features=768, out_features=768, bias=True)
            (v): Linear(in_features=768, out_features=768, bias=True)
            (o): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (intermediate): MPNetIntermediate(
          (dense): Linear(in_

In [13]:
def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]

In [14]:
def get_embeddings(text_list):
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    model_output = model(**encoded_input)
    return cls_pooling(model_output)

In [15]:
embedding = get_embeddings(comments_dataset["text"][0])
embedding.shape

torch.Size([1, 768])

In [16]:
# add the embeddings to the dataset
embeddings_dataset = comments_dataset.map(
    lambda x: {"embeddings": get_embeddings(x["text"]).detach().cpu().numpy()[0]}
)

Map:   0%|          | 0/2175 [00:00<?, ? examples/s]

## Using FAISS for effiecient similarity search

In [None]:
embeddings_dataset.add_faiss_index(column="embeddings")

  0%|          | 0/3 [00:00<?, ?it/s]

Dataset({
    features: ['html_url', 'title', 'comments', 'body', 'comment_length', 'text', 'embeddings'],
    num_rows: 2175
})

In [19]:
question = "How can I load a dataset offline?"
question_embedding = get_embeddings([question]).cpu().detach().numpy()
question_embedding.shape

(1, 768)

In [20]:
scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=5
)

In [21]:
samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
samples_df.sort_values("scores", ascending=False, inplace=True)

In [23]:
display(samples_df[["text", "scores"]])

Unnamed: 0,text,scores
4,Discussion using datasets in offline mode \n `...,25.50502
3,Discussion using datasets in offline mode \n `...,24.555546
2,Discussion using datasets in offline mode \n `...,24.148987
1,Discussion using datasets in offline mode \n `...,22.894001
0,Discussion using datasets in offline mode \n `...,22.406652


In [22]:
for _, row in samples_df.iterrows():
    print(f"COMMENT: {row.comments}")
    print(f"SCORE: {row.scores}")
    print(f"TITLE: {row.title}")
    print(f"URL: {row.html_url}")
    print("=" * 50)
    print()

COMMENT: Requiring online connection is a deal breaker in some cases unfortunately so it'd be great if offline mode is added similar to how `transformers` loads models offline fine.

@mandubian's second bullet point suggests that there's a workaround allowing you to use your offline (custom?) dataset with `datasets`. Could you please elaborate on how that should look like?
SCORE: 25.505020141601562
TITLE: Discussion using datasets in offline mode
URL: https://github.com/huggingface/datasets/issues/824

COMMENT: The local dataset builders (csv, text , json and pandas) are now part of the `datasets` package since #1726 :)
You can now use them offline
```python
datasets = load_dataset('text', data_files=data_files)
```

We'll do a new release soon
SCORE: 24.555545806884766
TITLE: Discussion using datasets in offline mode
URL: https://github.com/huggingface/datasets/issues/824

COMMENT: I opened a PR that allows to reload modules that have already been loaded once even if there's no intern

## Examples

In [27]:
query_questions = [
    "How can I load a dataset offline?",
    "How do I install the library?",
    "What is the best way to preprocess the data?",
    "How do I train the model?",
    "How do I evaluate the model?"
]

for i, query in enumerate(query_questions):
    print(f"Query {i}: {query}")
    query_embeddings = get_embeddings(query).cpu().detach().numpy()
    
    scores, samples = embeddings_dataset.get_nearest_examples(
        "embeddings", query_embeddings, k=5
    )
    
    samples_df = pd.DataFrame.from_dict(samples)
    samples_df["scores"] = scores
    samples_df.sort_values("scores", ascending=False, inplace=False)
    
    print("Top 5 comments:")
    for j, row in samples_df.iterrows():
        print(f" {j + 1}: {row.comments} (score: {row.scores:.4f})")
    print("=" * 100, "\n")
    

Query 0: How can I load a dataset offline?
Top 5 comments:
 1: here is my way to load a dataset offline, but it **requires** an online machine
1. (online machine)
```
import datasets
data = datasets.load_dataset(...)
data.save_to_disk(/YOUR/DATASET/DIR)
```
2. copy the dir from online to the offline machine
3. (offline machine)
```
import datasets
data = datasets.load_from_disk(/SAVED/DATA/DIR)
```

HTH. (score: 22.4067)
 2: > here is my way to load a dataset offline, but it **requires** an online machine
> 
> 1. (online machine)
> 
> ```
> 
> import datasets
> 
> data = datasets.load_dataset(...)
> 
> data.save_to_disk(/YOUR/DATASET/DIR)
> 
> ```
> 
> 2. copy the dir from online to the offline machine
> 
> 3. (offline machine)
> 
> ```
> 
> import datasets
> 
> data = datasets.load_from_disk(/SAVED/DATA/DIR)
> 
> ```
> 
> 
> 
> HTH.

 (score: 22.8940)
 3: I opened a PR that allows to reload modules that have already been loaded once even if there's no internet.

Let me know if you kno

### **Podsumowanie uzyskanych rezultatów**

#### Zapytanie 0: "How can I load a dataset offline?"
- Wyniki były związane z tematem ładowania danych.
- Pokazywały konkretne metody, np. użycie funkcji *load_from_disk* lub zapisanie danyc lokalnie *save_to_disk*
- Rezultaty miały różne poziomy szczegółowości, ale były trafne i odpowiadały na zadane pytanie.

#### Zapytanie 1: "How do I install the library?"
- Wyniki w większości były związanie z instalacją bibliotek, odpowiedzi były trafne i pokazywały rozwiązanie na zadane pytanie.
- Sugerowały instalacje poprzez *pip install git+https://github.com/huggingface/nlp.git*

Błędy:
- odpowiedź z najwyższym scorem nie odpowiadała na zadane pytanie, tylko informowała, że coś zostało naprawione i wkrótce zostanie zaktualizowane.
- odpowiedzi często dotyczyły instalacji specyficznych bibliotek

#### Zapytanie 2: "What is the best way to preprocess the data?"
- wyniki były mieszane. Niektóre odpowiedzi omawiały użycie funkcji map do augmentacji danych, niekoniecznie przedstawiając najlepsze sposoby preprocessingu.

Błędy:
- odpowiedzi skupiały się na używaniu map do augmenatcji, zamiast przedstawiać sposoby preproccesingu danych.

#### Zapytanie 3: "How do I train the model?"
- Wyniki były mało związane z tematem, skupiały się bardziej na zbiorach danych test i train.
  
Błędy:
- odpowiedzi skupiały się na tym jak załadować zbiór danych, zamiast opisywać procedury trenowania modelu lub pokazywać kod do tego
- Wyniki nie były pomocne dla tego tematu (np. przeformułowanie pytania lub pytania dotyczące połączenia sieciowego)

#### Zapytanie 4: "How do I evaluate the model?"
- mało konkretne odpowiedzi do zadanego tematu
- pojawiły się wyniki dotyczące metryk (accuracy i dice score)

Błędu:
- brak konkretnych odpowiedzi o możliwościach ewaluacji modelu
- wyniki mało związane z tematm, np. odnosiły się do dokumentacji 'puzzles' albo do konkretnych problemów danego użytkownika


# Zadanie 4

## Dodanie modułu *re-rankingu*

In [28]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [30]:
reranker_model = AutoModelForSequenceClassification.from_pretrained("cross-encoder/ms-marco-MiniLM-L-6-v2")
reranker_tokenizer = AutoTokenizer.from_pretrained("cross-encoder/ms-marco-MiniLM-L-6-v2")

reranker_model.eval()

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 384, padding_idx=0)
      (position_embeddings): Embedding(512, 384)
      (token_type_embeddings): Embedding(2, 384)
      (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-5): 6 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=384, out_features=384, bias=True)
              (key): Linear(in_features=384, out_features=384, bias=True)
              (value): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=384, out_features=384, bias=True)
              (LayerNorm): LayerNorm((384,), eps=1e-1

In [29]:
def rerank(query, model, tokenizer, results):
    scores = []
    for candidate in results:
        inputs = tokenizer(query, candidate, return_tensors="pt", truncation=True, padding=True)
    
        with torch.no_grad():
            logits = model(**inputs).logits
            score = logits.item()
            scores.append(score)
    
    ranked_results = sorted(zip(results, scores), key=lambda x: x[1], reverse=True)
    return [result[0] for result in ranked_results]

In [33]:
query_questions = [
    "How can I load a dataset offline?",
    "How do I install the library?",
    "What is the best way to preprocess the data?",
    "How do I train the model?",
    "How do I evaluate the model?"
]

for i, query in enumerate(query_questions):
    print(f"Query {i}: {query}")
    query_embeddings = get_embeddings(query).cpu().detach().numpy()
    
    scores, samples = embeddings_dataset.get_nearest_examples(
        "embeddings", query_embeddings, k=5
    )
    
    samples_df = pd.DataFrame.from_dict(samples)
    samples_df["scores"] = scores
    samples_df.sort_values("scores", ascending=False, inplace=False)
    
    results = samples_df["comments"].tolist()
    
    results_reranked = rerank(query, reranker_model, reranker_tokenizer, results)
    
    for res in results_reranked:
        print(f"COMMENT: {res}")
    print("=" * 50)
    

Query 0: How can I load a dataset offline?
COMMENT: > here is my way to load a dataset offline, but it **requires** an online machine
> 
> 1. (online machine)
> 
> ```
> 
> import datasets
> 
> data = datasets.load_dataset(...)
> 
> data.save_to_disk(/YOUR/DATASET/DIR)
> 
> ```
> 
> 2. copy the dir from online to the offline machine
> 
> 3. (offline machine)
> 
> ```
> 
> import datasets
> 
> data = datasets.load_from_disk(/SAVED/DATA/DIR)
> 
> ```
> 
> 
> 
> HTH.


COMMENT: here is my way to load a dataset offline, but it **requires** an online machine
1. (online machine)
```
import datasets
data = datasets.load_dataset(...)
data.save_to_disk(/YOUR/DATASET/DIR)
```
2. copy the dir from online to the offline machine
3. (offline machine)
```
import datasets
data = datasets.load_from_disk(/SAVED/DATA/DIR)
```

HTH.
COMMENT: I opened a PR that allows to reload modules that have already been loaded once even if there's no internet.

Let me know if you know other ways that can make the offl