In [1]:
import torch
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModel
import pandas as pd

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
# device = "mps"
device

'mps'

In [3]:
issues_dataset = load_dataset("lewtun/github-issues", split="train")
issues_dataset

Repo card metadata block was not found. Setting CardData to empty.


Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app', 'is_pull_request'],
    num_rows: 3019
})

In [4]:
issues_dataset = issues_dataset.filter(
    lambda x: (x["is_pull_request"] == False and len(x["comments"]) > 0)
)
issues_dataset

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app', 'is_pull_request'],
    num_rows: 808
})

In [5]:
columns = issues_dataset.column_names
columns_to_keep = ["title", "body", "html_url", "comments"]
columns_to_remove = set(columns_to_keep).symmetric_difference(columns)
issues_dataset = issues_dataset.remove_columns(columns_to_remove)
issues_dataset

Dataset({
    features: ['html_url', 'title', 'comments', 'body'],
    num_rows: 808
})

In [6]:
issues_dataset.set_format("pandas")
df = issues_dataset[:]

df["comments"][0].tolist()

['Cool, I think we can do both :)',
 '@lhoestq now the 2 are implemented.\r\n\r\nPlease note that for the the second protection, finally I have chosen to protect the master branch only from **merge commits** (see update comment above), so no need to disable/re-enable the protection on each release (direct commits, different from merge commits, can be pushed to the remote master branch; and eventually reverted without messing up the repo history).']

In [7]:
comments_df = df.explode("comments", ignore_index=True)
comments_df.head(2)

Unnamed: 0,html_url,title,comments,body
0,https://github.com/huggingface/datasets/issues...,Protect master branch,"Cool, I think we can do both :)",After accidental merge commit (91c55355b634d0d...
1,https://github.com/huggingface/datasets/issues...,Protect master branch,@lhoestq now the 2 are implemented.\r\n\r\nPle...,After accidental merge commit (91c55355b634d0d...


In [8]:
comments_dataset = Dataset.from_pandas(comments_df)
comments_dataset

Dataset({
    features: ['html_url', 'title', 'comments', 'body'],
    num_rows: 2964
})

In [9]:
comments_dataset = comments_dataset.map(
    lambda x: {"comment_length": len(x["comments"].split())}
)

comments_dataset = comments_dataset.filter(lambda x: x["comment_length"] > 15)
comments_dataset

Map:   0%|          | 0/2964 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2964 [00:00<?, ? examples/s]

Dataset({
    features: ['html_url', 'title', 'comments', 'body', 'comment_length'],
    num_rows: 2175
})

In [10]:
def concatenate_text(examples):
    return {
        "text": examples["title"]
        + " \n "
        + examples["body"]
        + " \n "
        + examples["comments"]
    }


comments_dataset = comments_dataset.map(concatenate_text)
comments_dataset

Map:   0%|          | 0/2175 [00:00<?, ? examples/s]

Dataset({
    features: ['html_url', 'title', 'comments', 'body', 'comment_length', 'text'],
    num_rows: 2175
})

In [11]:
model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)
model.to(device)

MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): Linear(in_features=768, out_features=768, bias=True)
            (k): Linear(in_features=768, out_features=768, bias=True)
            (v): Linear(in_features=768, out_features=768, bias=True)
            (o): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (intermediate): MPNetIntermediate(
          (dense): Linear(in_

In [12]:
def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]

In [13]:
def get_embeddings(text_list):
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    ).to(device)
    
    model_output = model(**encoded_input)
    return cls_pooling(model_output)

In [14]:
embedding = get_embeddings(comments_dataset["text"][0])
embedding.shape

torch.Size([1, 768])

In [15]:
embeddings_dataset = comments_dataset.map(
    lambda x: {"embeddings": get_embeddings(x["text"]).detach().cpu().numpy()[0]}
)

Map:   0%|          | 0/2175 [00:00<?, ? examples/s]

In [16]:
type(embeddings_dataset["embeddings"][0]), len(embeddings_dataset["embeddings"][0])

(list, 768)

In [17]:
embeddings_dataset.add_faiss_index(column="embeddings")

  0%|          | 0/3 [00:00<?, ?it/s]

Dataset({
    features: ['html_url', 'title', 'comments', 'body', 'comment_length', 'text', 'embeddings'],
    num_rows: 2175
})

In [18]:
def print_samples(samples_df):
    for _, row in samples_df.iterrows():
        print(f"SCORE: {row.scores}")
        print(f"TITLE: {row.title}")
        print(f"COMMENT: {row.comments}")
        print("=" * 50)
        print()

In [19]:
def analyze_question(question):
    question_embedding = get_embeddings([question]).detach().cpu().numpy()
    
    scores, samples = embeddings_dataset.get_nearest_examples(
        "embeddings", question_embedding, k=5
    )

    samples_df = pd.DataFrame.from_dict(samples)
    samples_df["scores"] = scores
    samples_df.sort_values("scores", ascending=False, inplace=True)

    print_samples(samples_df)

In [20]:
analyze_question("How can I load a dataset offline?")

SCORE: 25.505016326904297
TITLE: Discussion using datasets in offline mode
COMMENT: Requiring online connection is a deal breaker in some cases unfortunately so it'd be great if offline mode is added similar to how `transformers` loads models offline fine.

@mandubian's second bullet point suggests that there's a workaround allowing you to use your offline (custom?) dataset with `datasets`. Could you please elaborate on how that should look like?

SCORE: 24.55552101135254
TITLE: Discussion using datasets in offline mode
COMMENT: The local dataset builders (csv, text , json and pandas) are now part of the `datasets` package since #1726 :)
You can now use them offline
```python
datasets = load_dataset('text', data_files=data_files)
```

We'll do a new release soon

SCORE: 24.14898109436035
TITLE: Discussion using datasets in offline mode
COMMENT: I opened a PR that allows to reload modules that have already been loaded once even if there's no internet.

Let me know if you know other ways

In [21]:
analyze_question("How do merge two datasets with the same columns into one?")

SCORE: 31.586475372314453
TITLE: add a new column 
COMMENT: Hi ! Currently you have to use `map` . You can see an example of how to do it in this comment: https://github.com/huggingface/datasets/issues/853#issuecomment-727872188

In the future we'll add support for a more native way of adding a new column ;)

SCORE: 30.84308624267578
TITLE: how can I combine 2 dataset with different/same features?
COMMENT: Hi ! Currently we don't have a way to `zip` datasets but we plan to add this soon :)
For now you'll need to use `map` to add the fields from one dataset to the other. See the comment here for more info : https://github.com/huggingface/datasets/issues/853#issuecomment-727872188

SCORE: 30.727935791015625
TITLE: how can I combine 2 dataset with different/same features?
COMMENT: Good to hear.
Currently I did not use map , just fetch src and tgt from the 2 dataset and merge them.
It will be a release if you can deal with it at the backend.
Thanks.

SCORE: 30.592056274414062
TITLE: [Quest

In [22]:
analyze_question("How do concatenate two datasets with the same columns into one?")

SCORE: 33.38499450683594
TITLE: [Question] Combine 2 datasets which have the same columns
COMMENT: We are thinking about ways to combine datasets for T5 in #217, feel free to share your thoughts about this.

SCORE: 32.46791458129883
TITLE: save_to_disk doesn't work when we use concatenate_datasets function before creating the final dataset_object.
COMMENT: Hi ! We refactored save_to_disk in #2025 so this doesn't happen.
Feel free to try it on master for now
We'll do a new release soon

SCORE: 30.74243927001953
TITLE: Concatenate several datasets with removed columns is not working.
COMMENT: Hi,

did you fill out the env info section manually or by copy-pasting the output of the `datasets-cli env` command?

This code should work without issues on 1.6.2 version (I'm working on master (1.6.2.dev0 version) and can't reproduce this error).

SCORE: 30.71234703063965
TITLE: concatenate_datasets support axis=0 or 1 ？
COMMENT: Actually it's doable but requires to update the `Dataset._data_files