DATASETS Library

In [1]:
from datasets import load_dataset
data_files = {'train': 'data/drugsComTrain_raw.tsv', 'test': 'data/drugsComTest_raw.tsv'}
drug_dataset = load_dataset('csv', data_files=data_files, delimiter='\t')

In [2]:
drug_dataset.shape

{'train': (161297, 7), 'test': (53766, 7)}

In [3]:
drug_dataset['train'].features

{'Unnamed: 0': Value(dtype='int64', id=None),
 'drugName': Value(dtype='string', id=None),
 'condition': Value(dtype='string', id=None),
 'review': Value(dtype='string', id=None),
 'rating': Value(dtype='float64', id=None),
 'date': Value(dtype='string', id=None),
 'usefulCount': Value(dtype='int64', id=None)}

In [4]:
drug_dataset['train'][:3]

{'Unnamed: 0': [206461, 95260, 92703],
 'drugName': ['Valsartan', 'Guanfacine', 'Lybrel'],
 'condition': ['Left Ventricular Dysfunction', 'ADHD', 'Birth Control'],
 'review': ['"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"',
  '"My son is halfway through his fourth week of Intuniv. We became concerned when he began this last week, when he started taking the highest dose he will be on. For two days, he could hardly get out of bed, was very cranky, and slept for nearly 8 hours on a drive home from school vacation (very unusual for him.) I called his doctor on Monday morning and she said to stick it out a few days. See how he did at school, and with getting up in the morning. The last two days have been problem free. He is MUCH more agreeable than ever. He is less emotional (a good thing), less cranky. He is remembering all the things he should. Overall his behavior is better. \r\nWe have tried many different medications and so far this is the most effect

In [5]:
drug_sample = drug_dataset['train'].shuffle(seed=42).select(range(1000))
drug_sample[:3]

{'Unnamed: 0': [87571, 178045, 80482],
 'drugName': ['Naproxen', 'Duloxetine', 'Mobic'],
 'condition': ['Gout, Acute', 'ibromyalgia', 'Inflammatory Conditions'],
 'review': ['"like the previous person mention, I&#039;m a strong believer of aleve, it works faster for my gout than the prescription meds I take. No more going to the doctor for refills.....Aleve works!"',
  '"I have taken Cymbalta for about a year and a half for fibromyalgia pain. It is great\r\nas a pain reducer and an anti-depressant, however, the side effects outweighed \r\nany benefit I got from it. I had trouble with restlessness, being tired constantly,\r\ndizziness, dry mouth, numbness and tingling in my feet, and horrible sweating. I am\r\nbeing weaned off of it now. Went from 60 mg to 30mg and now to 15 mg. I will be\r\noff completely in about a week. The fibro pain is coming back, but I would rather deal with it than the side effects."',
  '"I have been taking Mobic for over a year with no side effects other than 

In [6]:
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [7]:
for split in drug_dataset.keys():
    assert len(drug_dataset[split]) == len(drug_dataset[split].unique('Unnamed: 0'))

In [8]:
drug_dataset = drug_dataset.rename_column(original_column_name='Unnamed: 0', new_column_name='patient_id')
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [9]:
def lowercase_condition(example):
    return {'condition': example['condition'].lower()}

In [10]:
drug_dataset['train'][:3]

{'patient_id': [206461, 95260, 92703],
 'drugName': ['Valsartan', 'Guanfacine', 'Lybrel'],
 'condition': ['Left Ventricular Dysfunction', 'ADHD', 'Birth Control'],
 'review': ['"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"',
  '"My son is halfway through his fourth week of Intuniv. We became concerned when he began this last week, when he started taking the highest dose he will be on. For two days, he could hardly get out of bed, was very cranky, and slept for nearly 8 hours on a drive home from school vacation (very unusual for him.) I called his doctor on Monday morning and she said to stick it out a few days. See how he did at school, and with getting up in the morning. The last two days have been problem free. He is MUCH more agreeable than ever. He is less emotional (a good thing), less cranky. He is remembering all the things he should. Overall his behavior is better. \r\nWe have tried many different medications and so far this is the most effect

In [11]:
def filter_nones(x):
    return x['condition'] is not None

In [12]:
drug_dataset = drug_dataset.filter(lambda x: x['condition'] is not None)
drug_dataset

Filter:   0%|          | 0/161297 [00:00<?, ? examples/s]

Filter: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 161297/161297 [00:01<00:00, 149286.04 examples/s]
Filter: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 53766/53766 [00:00<00:00, 165832.36 examples/s]


DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 160398
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53471
    })
})

In [13]:
drug_dataset = drug_dataset.map(lowercase_condition)
drug_dataset['train'][:3]

Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 160398/160398 [00:18<00:00, 8488.32 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 53471/53471 [00:05<00:00, 9056.15 examples/s]


{'patient_id': [206461, 95260, 92703],
 'drugName': ['Valsartan', 'Guanfacine', 'Lybrel'],
 'condition': ['left ventricular dysfunction', 'adhd', 'birth control'],
 'review': ['"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"',
  '"My son is halfway through his fourth week of Intuniv. We became concerned when he began this last week, when he started taking the highest dose he will be on. For two days, he could hardly get out of bed, was very cranky, and slept for nearly 8 hours on a drive home from school vacation (very unusual for him.) I called his doctor on Monday morning and she said to stick it out a few days. See how he did at school, and with getting up in the morning. The last two days have been problem free. He is MUCH more agreeable than ever. He is less emotional (a good thing), less cranky. He is remembering all the things he should. Overall his behavior is better. \r\nWe have tried many different medications and so far this is the most effect

In [14]:
def compute_review_length(example):
    return {'review_length': len(example['review'].split())}

In [15]:
drug_dataset = drug_dataset.map(compute_review_length)
drug_dataset['train'][0]

Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 160398/160398 [00:13<00:00, 11849.69 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 53471/53471 [00:04<00:00, 11929.43 examples/s]


{'patient_id': 206461,
 'drugName': 'Valsartan',
 'condition': 'left ventricular dysfunction',
 'review': '"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"',
 'rating': 9.0,
 'date': 'May 20, 2012',
 'usefulCount': 27,
 'review_length': 17}

In [16]:
drug_dataset['train'].sort('review_length')[:3]

{'patient_id': [111469, 13653, 53602],
 'drugName': ['Ledipasvir / sofosbuvir',
  'Amphetamine / dextroamphetamine',
  'Alesse'],
 'condition': ['hepatitis c', 'adhd', 'birth control'],
 'review': ['"Headache"', '"Great"', '"Awesome"'],
 'rating': [10.0, 10.0, 10.0],
 'date': ['February 3, 2015', 'October 20, 2009', 'November 23, 2015'],
 'usefulCount': [41, 3, 0],
 'review_length': [1, 1, 1]}

In [17]:
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 160398
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 53471
    })
})

In [18]:
drug_dataset = drug_dataset.filter(lambda x: x['review_length'] > 30)
print(drug_dataset.num_rows)

Filter: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 160398/160398 [00:01<00:00, 142847.01 examples/s]
Filter: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 53471/53471 [00:00<00:00, 149399.72 examples/s]

{'train': 138514, 'test': 46108}





In [19]:
import html
drug_dataset = drug_dataset.map(lambda x: {'review': [html.unescape(o) for o in x['review']]}, batched=True)

Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 138514/138514 [00:07<00:00, 19577.39 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 46108/46108 [00:02<00:00, 20529.43 examples/s]


In [20]:
drug_dataset.num_rows

{'train': 138514, 'test': 46108}

In [21]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

def tokenize_function(examples):
    return tokenizer(examples['review'], truncation=True)



In [22]:
%time tokenized_dataset = drug_dataset.map(tokenize_function, batched=True)

Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 138514/138514 [00:22<00:00, 6194.49 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 46108/46108 [00:07<00:00, 6506.98 examples/s]

CPU times: user 1min 2s, sys: 1.61 s, total: 1min 4s
Wall time: 29.5 s





In [23]:
%time tokenized_dataset = drug_dataset.map(tokenize_function, batched=False)

Map:   0%|          | 0/138514 [00:00<?, ? examples/s]

Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 138514/138514 [01:15<00:00, 1844.71 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 46108/46108 [00:24<00:00, 1846.34 examples/s]

CPU times: user 1min 39s, sys: 672 ms, total: 1min 39s
Wall time: 1min 40s





In [24]:
slow_tokenizer = AutoTokenizer.from_pretrained('bert-base-cased', use_fast=False)
def slow_tokenize_function(examples):
    return slow_tokenizer(examples['review'], truncation=True)

%time tokenized_dataset = drug_dataset.map(slow_tokenize_function, batched=True)

Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 138514/138514 [02:02<00:00, 1126.22 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 46108/46108 [00:40<00:00, 1129.36 examples/s]

CPU times: user 2min 43s, sys: 545 ms, total: 2min 44s
Wall time: 2min 44s





In [25]:
%time tokenized_dataset = drug_dataset.map(slow_tokenize_function, batched=False)

Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 138514/138514 [02:30<00:00, 922.05 examples/s] 
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 46108/46108 [00:49<00:00, 924.77 examples/s] 

CPU times: user 3min 18s, sys: 3.39 s, total: 3min 21s
Wall time: 3min 20s





In [26]:
def tokenize_and_split(examples):
    return tokenizer(
        examples["review"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )

In [27]:
drug_dataset['train'][0]

{'patient_id': 95260,
 'drugName': 'Guanfacine',
 'condition': 'adhd',
 'review': '"My son is halfway through his fourth week of Intuniv. We became concerned when he began this last week, when he started taking the highest dose he will be on. For two days, he could hardly get out of bed, was very cranky, and slept for nearly 8 hours on a drive home from school vacation (very unusual for him.) I called his doctor on Monday morning and she said to stick it out a few days. See how he did at school, and with getting up in the morning. The last two days have been problem free. He is MUCH more agreeable than ever. He is less emotional (a good thing), less cranky. He is remembering all the things he should. Overall his behavior is better. \r\nWe have tried many different medications and so far this is the most effective."',
 'rating': 8.0,
 'date': 'April 27, 2010',
 'usefulCount': 192,
 'review_length': 141}

In [28]:
drug_dataset['train'][0]['review']

'"My son is halfway through his fourth week of Intuniv. We became concerned when he began this last week, when he started taking the highest dose he will be on. For two days, he could hardly get out of bed, was very cranky, and slept for nearly 8 hours on a drive home from school vacation (very unusual for him.) I called his doctor on Monday morning and she said to stick it out a few days. See how he did at school, and with getting up in the morning. The last two days have been problem free. He is MUCH more agreeable than ever. He is less emotional (a good thing), less cranky. He is remembering all the things he should. Overall his behavior is better. \r\nWe have tried many different medications and so far this is the most effective."'

In [29]:
result = tokenize_and_split(drug_dataset["train"][0])
[len(inp) for inp in result["input_ids"]]

[128, 49]

In [30]:
drug_dataset['train'][:3]

{'patient_id': [95260, 92703, 138000],
 'drugName': ['Guanfacine', 'Lybrel', 'Ortho Evra'],
 'condition': ['adhd', 'birth control', 'birth control'],
 'review': ['"My son is halfway through his fourth week of Intuniv. We became concerned when he began this last week, when he started taking the highest dose he will be on. For two days, he could hardly get out of bed, was very cranky, and slept for nearly 8 hours on a drive home from school vacation (very unusual for him.) I called his doctor on Monday morning and she said to stick it out a few days. See how he did at school, and with getting up in the morning. The last two days have been problem free. He is MUCH more agreeable than ever. He is less emotional (a good thing), less cranky. He is remembering all the things he should. Overall his behavior is better. \r\nWe have tried many different medications and so far this is the most effective."',
  '"I used to take another oral contraceptive, which had 21 pill cycle, and was very happy-

In [31]:
drug_dataset.set_format('pandas')
drug_dataset['train'][:3]

Unnamed: 0,patient_id,drugName,condition,review,rating,date,usefulCount,review_length
0,95260,Guanfacine,adhd,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192,141
1,92703,Lybrel,birth control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17,134
2,138000,Ortho Evra,birth control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10,89


In [32]:
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 138514
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 46108
    })
})

In [33]:
drug_dataset_clean = drug_dataset["train"].train_test_split(train_size=0.8, seed=42)


In [34]:
drug_dataset_clean

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 110811
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 27703
    })
})

In [35]:
# Rename the default "test" split to "validation"
drug_dataset_clean["validation"] = drug_dataset_clean.pop("test")
# Add the "test" set to our `DatasetDict`
drug_dataset_clean["test"] = drug_dataset["test"]
drug_dataset_clean

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 110811
    })
    validation: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 27703
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 46108
    })
})

In [36]:
drug_dataset_clean.items()

dict_items([('train', Dataset({
    features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
    num_rows: 110811
})), ('validation', Dataset({
    features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
    num_rows: 27703
})), ('test', Dataset({
    features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
    num_rows: 46108
}))])

Semantic search

In [37]:
from datasets import load_dataset
issue_dataset = load_dataset('lewtun/github-issues', split='train')
issue_dataset

Repo card metadata block was not found. Setting CardData to empty.


Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app', 'is_pull_request'],
    num_rows: 3019
})

In [38]:
import pandas as pd
pd.DataFrame(issue_dataset)[:3].T

Unnamed: 0,0,1,2
url,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...
repository_url,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets,https://api.github.com/repos/huggingface/datasets
labels_url,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...
comments_url,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...
events_url,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...,https://api.github.com/repos/huggingface/datas...
html_url,https://github.com/huggingface/datasets/pull/2955,https://github.com/huggingface/datasets/pull/2954,https://github.com/huggingface/datasets/pull/2952
id,1003999469,1003904803,1002704096
node_id,PR_kwDODunzps4sHuRu,PR_kwDODunzps4sHa8O,PR_kwDODunzps4sDU8S
number,2955,2954,2952
title,Update legacy Python image for CI tests in Linux,Run tests in parallel,Fix missing conda deps


In [39]:
issue_dataset = issue_dataset.filter(lambda x: x['is_pull_request'] == False and len(x['comments']) > 0)
issue_dataset

Filter: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3019/3019 [00:00<00:00, 12771.38 examples/s]


Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app', 'is_pull_request'],
    num_rows: 808
})

In [40]:
columns = issue_dataset.column_names
columns_to_keep = ['title', 'body', 'html_url', 'comments']
columns_to_remove = set(columns_to_keep).symmetric_difference(columns)
issue_dataset = issue_dataset.remove_columns(columns_to_remove)
issue_dataset

Dataset({
    features: ['html_url', 'title', 'comments', 'body'],
    num_rows: 808
})

In [41]:
pd.DataFrame(issue_dataset)[:3]

Unnamed: 0,html_url,title,comments,body
0,https://github.com/huggingface/datasets/issues...,Protect master branch,"[Cool, I think we can do both :), @lhoestq now...",After accidental merge commit (91c55355b634d0d...
1,https://github.com/huggingface/datasets/issues...,Backwards compatibility broken for cached data...,[Hi ! I guess the caching mechanism should hav...,## Describe the bug\r\nAfter upgrading to data...
2,https://github.com/huggingface/datasets/issues...,OSCAR unshuffled_original_ko: NonMatchingSplit...,[I tried `unshuffled_original_da` and it is al...,## Describe the bug\r\n\r\nCannot download OSC...


In [42]:
issue_dataset.set_format('pandas')
df = issue_dataset[:]
df.head()

Unnamed: 0,html_url,title,comments,body
0,https://github.com/huggingface/datasets/issues...,Protect master branch,"[Cool, I think we can do both :), @lhoestq now...",After accidental merge commit (91c55355b634d0d...
1,https://github.com/huggingface/datasets/issues...,Backwards compatibility broken for cached data...,[Hi ! I guess the caching mechanism should hav...,## Describe the bug\r\nAfter upgrading to data...
2,https://github.com/huggingface/datasets/issues...,OSCAR unshuffled_original_ko: NonMatchingSplit...,[I tried `unshuffled_original_da` and it is al...,## Describe the bug\r\n\r\nCannot download OSC...
3,https://github.com/huggingface/datasets/issues...,load_dataset using default cache on Windows ca...,"[Hi @daqieq, thanks for reporting.\r\n\r\nUnfo...",## Describe the bug\r\nStandard process to dow...
4,https://github.com/huggingface/datasets/issues...,to_tf_dataset keeps a reference to the open da...,"[I did some investigation and, as it seems, th...",To reproduce:\r\n```python\r\nimport datasets ...


In [43]:
df['comments'][0].tolist()

['Cool, I think we can do both :)',
 '@lhoestq now the 2 are implemented.\r\n\r\nPlease note that for the the second protection, finally I have chosen to protect the master branch only from **merge commits** (see update comment above), so no need to disable/re-enable the protection on each release (direct commits, different from merge commits, can be pushed to the remote master branch; and eventually reverted without messing up the repo history).']

In [44]:
comments_df = df.explode('comments', ignore_index=True)
comments_df.head(3).T

Unnamed: 0,0,1,2
html_url,https://github.com/huggingface/datasets/issues...,https://github.com/huggingface/datasets/issues...,https://github.com/huggingface/datasets/issues...
title,Protect master branch,Protect master branch,Backwards compatibility broken for cached data...
comments,"Cool, I think we can do both :)",@lhoestq now the 2 are implemented.\r\n\r\nPle...,Hi ! I guess the caching mechanism should have...
body,After accidental merge commit (91c55355b634d0d...,After accidental merge commit (91c55355b634d0d...,## Describe the bug\r\nAfter upgrading to data...


In [45]:
from datasets import Dataset
comments_dataset = Dataset.from_pandas(comments_df)
comments_dataset

Dataset({
    features: ['html_url', 'title', 'comments', 'body'],
    num_rows: 2964
})

In [46]:
comments_dataset = comments_dataset.map(lambda x: {'comment_length': len(x['comments'].split())})
comments_dataset

Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2964/2964 [00:00<00:00, 11278.76 examples/s]


Dataset({
    features: ['html_url', 'title', 'comments', 'body', 'comment_length'],
    num_rows: 2964
})

In [47]:
comments_dataset = comments_dataset.filter(lambda x: x['comment_length'] > 15)
comments_dataset

Filter: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2964/2964 [00:00<00:00, 120654.88 examples/s]


Dataset({
    features: ['html_url', 'title', 'comments', 'body', 'comment_length'],
    num_rows: 2175
})

In [48]:
comments_dataset[:5]

{'html_url': ['https://github.com/huggingface/datasets/issues/2945',
  'https://github.com/huggingface/datasets/issues/2943',
  'https://github.com/huggingface/datasets/issues/2943',
  'https://github.com/huggingface/datasets/issues/2943',
  'https://github.com/huggingface/datasets/issues/2943'],
 'title': ['Protect master branch',
  'Backwards compatibility broken for cached datasets that use `.filter()`',
  'Backwards compatibility broken for cached datasets that use `.filter()`',
  'Backwards compatibility broken for cached datasets that use `.filter()`',
  'Backwards compatibility broken for cached datasets that use `.filter()`'],
 'comments': ['@lhoestq now the 2 are implemented.\r\n\r\nPlease note that for the the second protection, finally I have chosen to protect the master branch only from **merge commits** (see update comment above), so no need to disable/re-enable the protection on each release (direct commits, different from merge commits, can be pushed to the remote master

In [49]:
def concatenate_text(example):
    return {'text':example['title'] + ' \n' + example['body'] + ' \n' + example['comments']}

In [50]:
comments_dataset = comments_dataset.map(concatenate_text)

Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2175/2175 [00:00<00:00, 8426.96 examples/s]


In [51]:
comments_dataset[0]

{'html_url': 'https://github.com/huggingface/datasets/issues/2945',
 'title': 'Protect master branch',
 'comments': '@lhoestq now the 2 are implemented.\r\n\r\nPlease note that for the the second protection, finally I have chosen to protect the master branch only from **merge commits** (see update comment above), so no need to disable/re-enable the protection on each release (direct commits, different from merge commits, can be pushed to the remote master branch; and eventually reverted without messing up the repo history).',
 'body': 'After accidental merge commit (91c55355b634d0dc73350a7ddee1a6776dbbdd69) into `datasets` master branch, all commits present in the feature branch were permanently added to `datasets` master branch history, as e.g.:\r\n- 00cc036fea7c7745cfe722360036ed306796a3f2\r\n- 13ae8c98602bbad8197de3b9b425f4c78f582af1\r\n- ...\r\n\r\nI propose to protect our master branch, so that we avoid we can accidentally make this kind of mistakes in the future:\r\n- [x] For Pul

In [52]:
from transformers import AutoTokenizer, AutoModel
model_ckpt = 'sentence-transformers/bert-base-nli-mean-tokens'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)



In [53]:
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [54]:
def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]

In [55]:
def get_embeddings(text_list):
    encoded_input = tokenizer(text_list, padding=True, truncation=True,max_length=512, return_tensors='pt')
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    # print(f'encoded_input size : {encoded_input['input_ids'].size()}')
    model_output = model(**encoded_input)
    # print(f'model_output last hidden state size : {model_output.last_hidden_state.size()}')
    return cls_pooling(model_output)

In [56]:
comments_dataset

Dataset({
    features: ['html_url', 'title', 'comments', 'body', 'comment_length', 'text'],
    num_rows: 2175
})

In [57]:
embedding = get_embeddings(comments_dataset['text'][0])
embedding.shape

torch.Size([1, 768])

In [58]:
embedding.detach().cpu().numpy()[0].shape

(768,)

In [72]:
embeddings_dataset = comments_dataset.map(lambda x: {'embeddings': get_embeddings(x['text']).detach().numpy()[0]})

Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2175/2175 [18:14<00:00,  1.99 examples/s]


In [73]:
embeddings_dataset.add_faiss_index(column='embeddings')

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3/3 [00:00<00:00, 268.45it/s]


Dataset({
    features: ['html_url', 'title', 'comments', 'body', 'comment_length', 'text', 'embeddings'],
    num_rows: 2175
})

In [74]:
question = "How can I load a dataset offline?"
question_embedding = get_embeddings(question).detach().cpu().numpy()
question_embedding.shape

(1, 768)

In [75]:
scores, samples = embeddings_dataset.get_nearest_examples('embeddings', question_embedding, k=5)

In [76]:
type(scores),type(samples)

(numpy.ndarray, dict)

In [77]:
samples_df = pd.DataFrame.from_dict(samples)
samples_df['scores'] = scores
samples_df.sort_values('scores', ascending=False, inplace=True)

In [78]:
for _, row in samples_df.iterrows():
    print(_)
    print(f"COMMENT: {row.comments}")
    print(f"SCORE: {row.scores}")
    print(f"TITLE: {row.title}")
    print(f"URL: {row.html_url}")
    print("=" * 50)
    print()

4
COMMENT: There are already a few transformations that you can apply on a dataset using methods like `dataset.map()`.
You can find examples in the documentation here:
https://huggingface.co/docs/datasets/processing.html

You can merge two datasets with `concatenate_datasets()` or do label extraction with `dataset.map()` for example
SCORE: 203.49624633789062
TITLE: Transformer Class on dataset
URL: https://github.com/huggingface/datasets/issues/2596

3
COMMENT: I'm not sure I understand your issue, can you elaborate ?

`cache_file_name` is indeed an argument you can set to specify the cache file that will be used for the processed dataset. By default the file is named with something like `cache-<fingerprint>.arrow` where the fingerprint is a hash.
SCORE: 203.44866943359375
TITLE: is there a way to override a dataset object saved with save_to_disk?
URL: https://github.com/huggingface/datasets/issues/2055

2
COMMENT: @lhoestq How can we make sure that the data we upload on HuggingFace hu