In [1]:
from datasets import load_dataset

issues_dataset = load_dataset("lewtun/github-issues", split="train")

Downloading readme:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.
Downloading data: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12.2M/12.2M [00:03<00:00, 3.51MB/s]


Generating train split: 0 examples [00:00, ? examples/s]

In [2]:
issues_dataset = issues_dataset.filter(
    lambda x: (x["is_pull_request"] == False and len(x["comments"]) > 0)
)

Filter:   0%|          | 0/3019 [00:00<?, ? examples/s]

In [4]:
columns = issues_dataset.column_names
columns_to_keep = ["title", "body", "html_url", "comments"]
columns_to_remove = set(columns_to_keep).symmetric_difference(columns)
issues_dataset = issues_dataset.remove_columns(columns_to_remove)

In [5]:
issues_dataset.set_format("pandas")
df = issues_dataset[:]

In [6]:
df["comments"][0].tolist()

['Cool, I think we can do both :)',
 '@lhoestq now the 2 are implemented.\r\n\r\nPlease note that for the the second protection, finally I have chosen to protect the master branch only from **merge commits** (see update comment above), so no need to disable/re-enable the protection on each release (direct commits, different from merge commits, can be pushed to the remote master branch; and eventually reverted without messing up the repo history).']

In [16]:
df.loc[0, 'comments']

array(['Cool, I think we can do both :)',
       '@lhoestq now the 2 are implemented.\r\n\r\nPlease note that for the the second protection, finally I have chosen to protect the master branch only from **merge commits** (see update comment above), so no need to disable/re-enable the protection on each release (direct commits, different from merge commits, can be pushed to the remote master branch; and eventually reverted without messing up the repo history).'],
      dtype=object)

In [17]:
comments_df = df.explode("comments", ignore_index=True)
comments_df.head(4)

Unnamed: 0,html_url,title,comments,body
0,https://github.com/huggingface/datasets/issues...,Protect master branch,"Cool, I think we can do both :)",After accidental merge commit (91c55355b634d0d...
1,https://github.com/huggingface/datasets/issues...,Protect master branch,@lhoestq now the 2 are implemented.\r\n\r\nPle...,After accidental merge commit (91c55355b634d0d...
2,https://github.com/huggingface/datasets/issues...,Backwards compatibility broken for cached data...,Hi ! I guess the caching mechanism should have...,## Describe the bug\r\nAfter upgrading to data...
3,https://github.com/huggingface/datasets/issues...,Backwards compatibility broken for cached data...,"If it's easy enough to implement, then yes ple...",## Describe the bug\r\nAfter upgrading to data...


In [18]:
from datasets import Dataset

comments_dataset = Dataset.from_pandas(comments_df)
comments_dataset

Dataset({
    features: ['html_url', 'title', 'comments', 'body'],
    num_rows: 2964
})

In [23]:
issues_dataset = issues_dataset.map(lambda batch: batch.explode('comments', ignore_index=True), batched=True)

Map:   0%|          | 0/808 [00:00<?, ? examples/s]

In [24]:
comments_dataset = comments_dataset.map(
    lambda x: {"comment_length": len(x["comments"].split())}
)

Map:   0%|          | 0/2964 [00:00<?, ? examples/s]

In [25]:
comments_dataset = comments_dataset.filter(lambda x: x["comment_length"] > 15)

Filter:   0%|          | 0/2964 [00:00<?, ? examples/s]

In [26]:
def concatenate_text(examples):
    return {
        "text": examples["title"]
        + " \n "
        + examples["body"]
        + " \n "
        + examples["comments"]
    }


In [27]:
comments_dataset = comments_dataset.map(concatenate_text)

Map:   0%|          | 0/2175 [00:00<?, ? examples/s]

In [1]:
from transformers import AutoTokenizer, AutoModel

model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

In [2]:
def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]

In [3]:
import torch

In [4]:
def get_embeddings(text_list):
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    model_output = model(**encoded_input)
    return cls_pooling(model_output)

In [34]:
embeddings_dataset = comments_dataset.map(
    lambda x: {"embeddings": get_embeddings(x["text"]).detach().numpy()[0]}
)

Map:   0%|          | 0/2175 [00:00<?, ? examples/s]

In [44]:
embeddings_dataset.save_to_disk('embeddings')

Saving the dataset (0/1 shards):   0%|          | 0/2175 [00:00<?, ? examples/s]

In [5]:
from datasets import load_from_disk

In [6]:
embeddings_dataset = load_from_disk('embeddings')

In [8]:
embeddings_dataset.add_faiss_index(column="embeddings")

  0%|          | 0/3 [00:00<?, ?it/s]

Dataset({
    features: ['html_url', 'title', 'comments', 'body', 'comment_length', 'text', 'embeddings'],
    num_rows: 2175
})

In [10]:
question = "How can I load a dataset offline?"
question_embedding = get_embeddings([question]).detach().numpy()

In [11]:
question_embedding.shape

(1, 768)

In [None]:
scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=5
)

In [9]:
import psutil
import time

while True:
    memory_usage = psutil.virtual_memory()
    print(f"Memory Usage: {memory_usage.percent}%")
    time.sleep(1)

Memory Usage: 64.0%
Memory Usage: 64.0%
Memory Usage: 64.0%
Memory Usage: 64.0%
Memory Usage: 63.9%
Memory Usage: 63.8%
Memory Usage: 63.6%
Memory Usage: 63.6%
Memory Usage: 63.5%
Memory Usage: 63.7%
Memory Usage: 63.7%
Memory Usage: 63.8%
Memory Usage: 63.8%


KeyboardInterrupt: 