# 5.1 Datasets
Key Functions:
* `load_dataset(file_type, data_files, field, delimiter)` -> `DatasetDict` object
* `DatasetDict` methods: `rename_column`, `filter`, `map`
* Turn on `batched` flag to enable processing of many rows at once

In [71]:
import html
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer

### Load Local Dataset

In [11]:
squad_path = "/n/fs/nlp-jy1682/hf_datasets/misc/SQuAD-it/"
# `field="data"` tells the JSON loader which top-level key holds the records.
train_file = squad_path + "SQuAD_it-train.json"
squad_it_dataset = load_dataset("json", data_files=train_file, field="data")

Using custom data configuration default-53afaf4c2d40b352
Found cached dataset json (/n/fs/nlp-jy1682/hf_datasets/json/default-53afaf4c2d40b352/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/1 [00:00<?, ?it/s]

In [12]:
# Inspect the loaded splits, their columns and row counts
squad_it_dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'paragraphs'],
        num_rows: 442
    })
})

In [13]:
# Look at the first paragraph of the first article: a context plus its question/answer list
squad_it_dataset["train"][0]['paragraphs'][0]

{'context': "Il terremoto del Sichuan del 2008 o il terremoto del Gran Sichuan, misurato a 8.0 Ms e 7.9 Mw, e si è verificato alle 02:28:01 PM China Standard Time all' epicentro (06:28:01 UTC) il 12 maggio nella provincia del Sichuan, ha ucciso 69.197 persone e lasciato 18.222 dispersi.",
 'qas': [{'answers': [{'answer_start': 29, 'text': '2008'}],
   'id': '56cdca7862d2951400fa6826',
   'question': 'In quale anno si è verificato il terremoto nel Sichuan?'},
  {'answers': [{'answer_start': 232, 'text': '69.197'}],
   'id': '56cdca7862d2951400fa6828',
   'question': 'Quante persone sono state uccise come risultato?'},
  {'answers': [{'answer_start': 29, 'text': '2008'}],
   'id': '56d4f9902ccc5a1400d833c0',
   'question': 'Quale anno ha avuto luogo il terremoto del Sichuan?'},
  {'answers': [{'answer_start': 78, 'text': '8.0 Ms e 7.9 Mw'}],
   'id': '56d4f9902ccc5a1400d833c1',
   'question': 'Che cosa ha fatto la misura di sisma?'},
  {'answers': [{'answer_start': 183, 'text': '12 maggi

In [14]:
# Load both train and test data files

# `data_files` accepts any of:
# * a single file path
# * a list of file paths
# * a dictionary mapping split name to file path
# * a glob pattern matching several files (e.g. data_files="*.json")
data_files = dict(
    train=squad_path + "SQuAD_it-train.json",
    test=squad_path + "SQuAD_it-test.json",
)

# The loading script can also decompress input files automatically, so compressed
# files may be passed directly.
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")
squad_it_dataset

Using custom data configuration default-910cd934e5f15cb1


Downloading and preparing dataset json/default to /n/fs/nlp-jy1682/hf_datasets/json/default-910cd934e5f15cb1/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /n/fs/nlp-jy1682/hf_datasets/json/default-910cd934e5f15cb1/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['title', 'paragraphs'],
        num_rows: 442
    })
    test: Dataset({
        features: ['title', 'paragraphs'],
        num_rows: 48
    })
})

### Load Remote Dataset

In [15]:
# Same as loading local files, except that the file paths are URLs
url = "https://github.com/crux82/squad-it/raw/master/"
train_url = url + "SQuAD_it-train.json.gz"
test_url = url + "SQuAD_it-test.json.gz"
data_files = {"train": train_url, "test": test_url}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")

Using custom data configuration default-57dcee3ea6992346


Downloading and preparing dataset json/default to /n/fs/nlp-jy1682/hf_datasets/json/default-57dcee3ea6992346/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/7.73M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /n/fs/nlp-jy1682/hf_datasets/json/default-57dcee3ea6992346/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

### Dataset Slicing

In [17]:
# Drug Review Dataset from the UCI ML Repository (tab-separated files)
drugsCom_path = "/n/fs/nlp-jy1682/hf_datasets/misc/drugsCom/"
train_tsv = drugsCom_path + "drugsComTrain_raw.tsv"
test_tsv = drugsCom_path + "drugsComTest_raw.tsv"
data_files = dict(train=train_tsv, test=test_tsv)
drug_dataset = load_dataset("csv", data_files=data_files, delimiter="\t")

Using custom data configuration default-ec8026ee3cb4f326


Downloading and preparing dataset csv/default to /n/fs/nlp-jy1682/hf_datasets/csv/default-ec8026ee3cb4f326/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /n/fs/nlp-jy1682/hf_datasets/csv/default-ec8026ee3cb4f326/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [20]:
# Inspect splits, columns and row counts of the drug review data
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [26]:
# Quick preview: shuffle reproducibly, keep the first 1000 rows, show three of them
shuffled_train = drug_dataset["train"].shuffle(seed=42)
drug_sample = shuffled_train.select(range(1000))
drug_sample[:3]

Loading cached shuffled indices for dataset at /n/fs/nlp-jy1682/hf_datasets/csv/default-ec8026ee3cb4f326/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-b3b196d3d2fb285f.arrow


{'Unnamed: 0': [87571, 178045, 80482],
 'drugName': ['Naproxen', 'Duloxetine', 'Mobic'],
 'condition': ['Gout, Acute', 'ibromyalgia', 'Inflammatory Conditions'],
 'review': ['"like the previous person mention, I&#039;m a strong believer of aleve, it works faster for my gout than the prescription meds I take. No more going to the doctor for refills.....Aleve works!"',
  '"I have taken Cymbalta for about a year and a half for fibromyalgia pain. It is great\r\nas a pain reducer and an anti-depressant, however, the side effects outweighed \r\nany benefit I got from it. I had trouble with restlessness, being tired constantly,\r\ndizziness, dry mouth, numbness and tingling in my feet, and horrible sweating. I am\r\nbeing weaned off of it now. Went from 60 mg to 30mg and now to 15 mg. I will be\r\noff completely in about a week. The fibro pain is coming back, but I would rather deal with it than the side effects."',
  '"I have been taking Mobic for over a year with no side effects other than 

Let's do some data cleaning...

In [29]:
# Check that every value in 'Unnamed: 0' is unique within each split, i.e. the
# number of distinct values equals the number of rows
for split, split_data in drug_dataset.items():
    assert len(split_data) == len(split_data.unique("Unnamed: 0"))

# ...and it checks out, so rename the column to something more interpretable
drug_dataset = drug_dataset.rename_column("Unnamed: 0", "patient_id")

In [36]:
def has_condition(example):
    """Return True when the row has a non-null `condition`."""
    return example["condition"] is not None


def lowercase_condition(example):
    """Return the row's `condition` label in lowercase."""
    return {"condition": example["condition"].lower()}


# Drop rows without a condition first, then lowercase the remaining labels
# (`.lower()` would fail on a missing value).
drug_dataset = drug_dataset.filter(has_condition).map(lowercase_condition)

  0%|          | 0/161 [00:00<?, ?ba/s]

  0%|          | 0/54 [00:00<?, ?ba/s]

  0%|          | 0/160398 [00:00<?, ?ex/s]

  0%|          | 0/53471 [00:00<?, ?ex/s]

In [41]:
# Check that the previous cell executed properly: conditions should now be lowercase
drug_dataset["train"][:5]["condition"]

['left ventricular dysfunction',
 'adhd',
 'birth control',
 'birth control',
 'opiate dependence']

Now let's create a new column...

In [43]:
def compute_review_length(example):
    """Count the whitespace-separated words in a review."""
    words = example["review"].split()
    return {"review_length": len(words)}


# Add a new `review_length` column to every split
drug_dataset = drug_dataset.map(compute_review_length)
drug_dataset["train"][0]

  0%|          | 0/160398 [00:00<?, ?ex/s]

  0%|          | 0/53471 [00:00<?, ?ex/s]

{'patient_id': 206461,
 'drugName': 'Valsartan',
 'condition': 'left ventricular dysfunction',
 'review': '"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"',
 'rating': 9.0,
 'date': 'May 20, 2012',
 'usefulCount': 27,
 'review_length': 17}

In [47]:
# Shortest reviews first. Observation: some reviews are super short!
drug_dataset["train"].sort("review_length")[:3]

Loading cached sorted indices for dataset at /n/fs/nlp-jy1682/hf_datasets/csv/default-ec8026ee3cb4f326/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-90f5a81ece31af8b.arrow


{'patient_id': [103488, 23627, 20558],
 'drugName': ['Loestrin 21 1 / 20', 'Chlorzoxazone', 'Nucynta'],
 'condition': ['birth control', 'muscle spasm', 'pain'],
 'review': ['"Excellent."', '"useless"', '"ok"'],
 'rating': [10.0, 1.0, 6.0],
 'date': ['November 4, 2008', 'March 24, 2017', 'August 20, 2016'],
 'usefulCount': [5, 2, 10],
 'review_length': [1, 1, 1]}

In [48]:
# Drop the short reviews: they are probably not useful for sentiment analysis
MIN_REVIEW_WORDS = 30  # keep only reviews with more words than this
print("Before:", drug_dataset.num_rows)
drug_dataset = drug_dataset.filter(lambda ex: ex["review_length"] > MIN_REVIEW_WORDS)
print("After:", drug_dataset.num_rows)

Before: {'train': 160398, 'test': 53471}


  0%|          | 0/161 [00:00<?, ?ba/s]

  0%|          | 0/54 [00:00<?, ?ba/s]

After: {'train': 138514, 'test': 46108}


And now let's deal with the HTML character codes (e.g. `&#039;`) in the reviews

In [50]:
def unescape_review(example):
    """Replace HTML character references in the review with the characters they encode."""
    return {"review": html.unescape(example["review"])}


drug_dataset = drug_dataset.map(unescape_review)

  0%|          | 0/138514 [00:00<?, ?ex/s]

  0%|          | 0/46108 [00:00<?, ?ex/s]

In [51]:
# We can make the above faster by turning on the "batched" flag.
# With batched=True the function receives a batch: a dict mapping each column name
# to a list of values (not a list of row dicts), and must return lists as well.
new_drug_dataset = drug_dataset.map(
    lambda batch: {"review": list(map(html.unescape, batch["review"]))},
    batched=True,
)

  0%|          | 0/139 [00:00<?, ?ba/s]

  0%|          | 0/47 [00:00<?, ?ba/s]

In [53]:
# Timing comparison of tokenizers: the cells below time `map` with this tokenizer
# unbatched and batched, then repeat the batched run with the slow tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
def tokenize_function(examples):
    """Tokenize the `review` field with the module-level `tokenizer`, truncating long inputs.

    `examples` is either a single row or a batch; its "review" entry is passed to the
    tokenizer unchanged and the tokenizer's output is returned as-is.
    """
    reviews = examples["review"]
    return tokenizer(reviews, truncation=True)

In [55]:
# Baseline: with batched=False, tokenize_function is called once per example
%time tokenized_dataset = drug_dataset.map(tokenize_function, batched=False)

  0%|          | 0/138514 [00:00<?, ?ex/s]

  0%|          | 0/46108 [00:00<?, ?ex/s]

CPU times: user 1min 28s, sys: 740 ms, total: 1min 29s
Wall time: 1min 30s


In [54]:
# With batched=True, tokenize_function receives a whole batch of examples per call
%time tokenized_dataset = drug_dataset.map(tokenize_function, batched=True)

  0%|          | 0/139 [00:00<?, ?ba/s]

  0%|          | 0/47 [00:00<?, ?ba/s]

CPU times: user 6min 3s, sys: 4.11 s, total: 6min 7s
Wall time: 19.2 s


In [56]:
# Timing comparison with the slow tokenizer: use_fast=False asks for the
# non-fast tokenizer implementation of the same checkpoint
tokenizer_slow = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=False)
def tokenize_function_slow(examples):
    """Tokenize the `review` field with `tokenizer_slow`, truncating long inputs."""
    reviews = examples["review"]
    return tokenizer_slow(reviews, truncation=True)
%time tokenized_dataset = drug_dataset.map(tokenize_function_slow, batched=True)

  0%|          | 0/139 [00:00<?, ?ba/s]

  0%|          | 0/47 [00:00<?, ?ba/s]

CPU times: user 4min 34s, sys: 554 ms, total: 4min 34s
Wall time: 4min 35s


We can also change the number of elements in a dataset with `Dataset.map`

⚠️ TODO: Didn't fully understand this section, revisit

In [60]:
def tokenize_and_split(examples):
    """Tokenize reviews into chunks of at most 128 tokens.

    Because overflowing tokens are returned instead of discarded, one review can
    produce several features, so the output may have more entries than the input.
    """
    split_options = dict(
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )
    return tokenizer(examples["review"], **split_options)
# The first example in the train set became 2 features because it was tokenized to
# more than the maximum number of tokens specified
result = tokenize_and_split(drug_dataset["train"][0])
list(map(len, result["input_ids"]))

[128, 45]

In [61]:
# This won't work because the column lengths no longer match: in the first batch,
# 1000 examples -> 1447 new features, while the original columns still have 1000 rows
# (this is the ArrowInvalid shape error shown below)
tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True)

  0%|          | 0/139 [00:00<?, ?ba/s]

ArrowInvalid: Column 8 named input_ids expected length 1000 but got length 1447

In [63]:
# Solutions
# 1. Remove the columns from the old dataset
# 2. Make them the same size as they are in the new dataset

# Solution 1: drop every original column so only the tokenizer outputs remain
original_columns = drug_dataset["train"].column_names
tokenized_dataset = drug_dataset.map(
    tokenize_and_split,
    batched=True,
    remove_columns=original_columns,
)
tokenized_dataset

  0%|          | 0/139 [00:00<?, ?ba/s]

  0%|          | 0/47 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'overflow_to_sample_mapping'],
        num_rows: 204198
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'overflow_to_sample_mapping'],
        num_rows: 68023
    })
})

In [64]:
# Solution 2: make old columns same size as new ones
def tokenize_and_split(examples):
    """Tokenize reviews into chunks of at most 128 tokens and keep the original columns.

    Each original column value is repeated once per chunk produced from its row, so
    every returned column has the same length as the tokenizer outputs.
    """
    encoded = tokenizer(
        examples["review"], truncation=True, max_length=128, return_overflowing_tokens=True
    )
    # For each new feature, the index of the input row it came from
    source_rows = encoded.pop("overflow_to_sample_mapping")
    for column, column_values in examples.items():
        encoded[column] = [column_values[row] for row in source_rows]
    return encoded

# The original columns are now expanded to the new length, so the batched map succeeds
tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True)
tokenized_dataset

  0%|          | 0/139 [00:00<?, ?ba/s]

  0%|          | 0/47 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 204198
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 68023
    })
})

### Conversion `Dataset` <-> `DataFrame`

In [65]:
# Note: the object's type is still a Dataset; `set_format` only changes the format
# returned when indexing (pandas here). It is undone later with `reset_format()`.
drug_dataset.set_format("pandas")

In [68]:
# Slicing now returns a pandas DataFrame instead of a dict of lists
drug_dataset["train"][:3]

Unnamed: 0,patient_id,drugName,condition,review,rating,date,usefulCount,review_length
0,95260,Guanfacine,adhd,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192,141
1,92703,Lybrel,birth control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17,134
2,138000,Ortho Evra,birth control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10,89


In [69]:
# To get a DataFrame of all the values, slice the whole split; with the pandas
# format set earlier this returns the full training split as a DataFrame
train_df = drug_dataset["train"][:]

In [70]:
# Now we can do Pandas stuff if we want to.
# Count how many training reviews mention each condition, most frequent first.
# The index and the counts are named explicitly rather than renaming whatever
# columns `reset_index` produces: pandas >= 2.0 names the `value_counts` result
# "count" and puts the column name on the index, so the previous
# {"index": "condition", "condition": "frequency"} rename yielded the wrong columns.
# This form gives `condition` / `frequency` on both old and new pandas versions.
frequencies = (
    train_df["condition"]
    .value_counts()
    .rename_axis("condition")
    .reset_index(name="frequency")
)
frequencies.head()

Unnamed: 0,condition,frequency
0,birth control,27655
1,depression,8023
2,acne,5209
3,anxiety,4991
4,pain,4744


In [72]:
# We can convert this DataFrame back to a Dataset too!
freq_dataset = Dataset.from_pandas(frequencies)
freq_dataset

Dataset({
    features: ['condition', 'frequency'],
    num_rows: 819
})

In [73]:
# Switch the output format back from pandas to the default Arrow-backed format
drug_dataset.reset_format()

### Create Validation Set

In [75]:
# Split the original training data 80/20 (seeded for reproducibility)
drug_dataset_clean = drug_dataset["train"].train_test_split(train_size=0.8, seed=42)
# Rename the default "test" split to "validation"
held_out = drug_dataset_clean.pop("test")
drug_dataset_clean["validation"] = held_out
# Add the original "test" set to the `DatasetDict`
drug_dataset_clean["test"] = drug_dataset["test"]
drug_dataset_clean

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 110811
    })
    validation: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 27703
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 46108
    })
})

### Save Dataset

In [77]:
# Options for saving: save_to_disk, to_csv, to_json
save_dir = "/n/fs/nlp-jy1682/hf_datasets/misc/drug-reviews"
drug_dataset_clean.save_to_disk(save_dir)

Flattening the indices:   0%|          | 0/111 [00:00<?, ?ba/s]

Saving the dataset (0/1 shards):   0%|          | 0/110811 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/28 [00:00<?, ?ba/s]

Saving the dataset (0/1 shards):   0%|          | 0/27703 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/46108 [00:00<?, ? examples/s]

In [None]:
# load_from_disk (the counterpart of save_to_disk) reloads the saved dataset