# Data Loader (Input Pipeline)

In [1]:
# Read the pickled list of dataset file paths from disk.
# NOTE(review): pickle.load executes arbitrary code from the file — only load files you produced yourself.
import pickle

with open('data/all_dataset_path', 'rb') as path_file:
    all_dataset_path = pickle.load(path_file)

In [2]:
# Number of file paths loaded from the pickle
len(all_dataset_path)

1363

In [4]:
# Peek at the first ten entries of the merged path list
all_dataset_path[:10]

['data/id_oscar/text_543.txt',
 'data/id_oscar/text_155.txt',
 'data/id_oscar/text_528.txt',
 'data/id_oscar/text_582.txt',
 'data/id_oscar/text_983.txt',
 'data/id_oscar/text_919.txt',
 'data/id_oscar/text_729.txt',
 'data/id_oscar/text_857.txt',
 'data/id_oscar/text_246.txt',
 'data/id_oscar/text_222.txt']

In [5]:
# Optional (disabled): keep only the first two files, e.g. for a quick trial run of the pipeline
# sample_dataset_path = all_dataset_path[0:2]
# len(sample_dataset_path)

In [6]:
%%time

from datasets import load_dataset

dataset = load_dataset('text', data_files=all_dataset_path)

Resolving data files:   0%|          | 0/1363 [00:00<?, ?it/s]

Using custom data configuration default-8d5c9df5ba894650
Reusing dataset text (/home/tel-user/.cache/huggingface/datasets/text/default-8d5c9df5ba894650/0.0.0/acc32f2f2ef863c93c2f30c52f7df6cc9053a1c2230b8d7da0d210404683ca08)


  0%|          | 0/1 [00:00<?, ?it/s]

CPU times: user 5.65 s, sys: 2.27 s, total: 7.91 s
Wall time: 13.1 s


In [7]:
# Show which Arrow cache file(s) on disk back each split of the loaded dataset
dataset.cache_files

{'train': [{'filename': '/home/tel-user/.cache/huggingface/datasets/text/default-8d5c9df5ba894650/0.0.0/acc32f2f2ef863c93c2f30c52f7df6cc9053a1c2230b8d7da0d210404683ca08/text-train.arrow'}]}

In [8]:
# Display the splits, their features and row counts
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 13571936
    })
})

In [9]:
# Hold out 10% of the rows as a test split.
# train_test_split shuffles before splitting; without a seed the partition differs on
# every run, so results cannot be reproduced. A fixed seed makes the split deterministic
# across kernel restarts.
SPLIT_SEED = 42

split_dataset = dataset['train'].train_test_split(test_size=0.1, seed=SPLIT_SEED)
split_dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 12214742
    })
    test: Dataset({
        features: ['text'],
        num_rows: 1357194
    })
})

In [10]:
# Instantiate the RoBERTa tokenizer.
# NOTE(review): 'feel-in' is resolved by from_pretrained (local directory or hub id) — confirm which is intended.
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path='feel-in')

In [11]:
# Callback for Dataset.map: tokenizes the raw texts of one batch.

def tokenize_function(examples):
    """Run the notebook-level ``tokenizer`` over the 'text' field of ``examples``.

    Args:
        examples: mapping with a 'text' entry (a batch of strings when used
            with ``map(..., batched=True)``).

    Returns:
        Whatever the tokenizer returns for those texts.
    """
    texts = examples['text']
    return tokenizer(texts)

In [12]:
%%time

# Apply it to all the splits in our datasets object, using batched=True and 4 processes to speed up the preprocessing. 
# We won't need the text column afterward, so we discard it.

# tokenizing the train dataset
train_dataset = split_dataset['train'].map(tokenize_function,
                                                batched=True, 
                                                num_proc=15, 
                                                remove_columns=['text'])

# tokenizing the testing dataset
test_dataset = split_dataset['test'].map(tokenize_function,
                                                batched=True, 
                                                num_proc=15, 
                                                remove_columns=['text'])

                  

#0:   0%|          | 0/815 [00:00<?, ?ba/s]

#1:   0%|          | 0/815 [00:00<?, ?ba/s]

#4:   0%|          | 0/815 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/815 [00:00<?, ?ba/s]

    

#2:   0%|          | 0/815 [00:00<?, ?ba/s]

 

#8:   0%|          | 0/815 [00:00<?, ?ba/s]

#6:   0%|          | 0/815 [00:00<?, ?ba/s]

#13:   0%|          | 0/815 [00:00<?, ?ba/s]

  

#10:   0%|          | 0/815 [00:00<?, ?ba/s]

  

#9:   0%|          | 0/815 [00:00<?, ?ba/s]

 

#12:   0%|          | 0/815 [00:00<?, ?ba/s]

#11:   0%|          | 0/815 [00:00<?, ?ba/s]

#5:   0%|          | 0/815 [00:00<?, ?ba/s]

#7:   0%|          | 0/815 [00:00<?, ?ba/s]

 

#14:   0%|          | 0/815 [00:00<?, ?ba/s]

                 

#1:   0%|          | 0/91 [00:00<?, ?ba/s]

#0:   0%|          | 0/91 [00:00<?, ?ba/s]

   

#3:   0%|          | 0/91 [00:00<?, ?ba/s]

  

#5:   0%|          | 0/91 [00:00<?, ?ba/s]

#4:   0%|          | 0/91 [00:00<?, ?ba/s]

 

#9:   0%|          | 0/91 [00:00<?, ?ba/s]

 

#8:   0%|          | 0/91 [00:00<?, ?ba/s]

#2:   0%|          | 0/91 [00:00<?, ?ba/s]

#6:   0%|          | 0/91 [00:00<?, ?ba/s]

   

#7:   0%|          | 0/91 [00:00<?, ?ba/s]

 

#10:   0%|          | 0/91 [00:00<?, ?ba/s]

#11:   0%|          | 0/91 [00:00<?, ?ba/s]

 

#14:   0%|          | 0/91 [00:00<?, ?ba/s]

#13:   0%|          | 0/91 [00:00<?, ?ba/s]

 

#12:   0%|          | 0/91 [00:00<?, ?ba/s]

CPU times: user 1min 32s, sys: 12.7 s, total: 1min 45s
Wall time: 1h 18min 35s


In [13]:
# Main data processing function: concatenates all tokenized texts of a batch and cuts them into chunks of max_length.
# Adapted from https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py
from itertools import chain

# Maximum sequence length (tokens per chunk). Lowering it makes training faster and leaves room for a larger batch size.
max_length=512

def group_texts(examples):
    """Concatenate the sequences of a batch and re-split them into fixed-size chunks.

    Args:
        examples: mapping from column name (e.g. "input_ids") to a list of
            per-example lists. Must contain an "input_ids" column.

    Returns:
        dict with the same columns, each a list of chunks of exactly
        ``max_length`` items, plus a "labels" column that is a (shallow) copy
        of the chunked "input_ids". The trailing remainder shorter than
        ``max_length`` is dropped.
    """
    # Flatten each column in linear time. sum(list_of_lists, []) re-copies the
    # accumulated list on every addition, which is quadratic in the batch size.
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])

    # Drop the small remainder so every chunk has exactly max_length items
    # (padding the last chunk would be the alternative).
    total_length = (total_length // max_length) * max_length

    # Split every column into consecutive chunks of max_length.
    result = {
        k: [t[i : i + max_length] for i in range(0, total_length, max_length)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [14]:
%%time

# Note that with `batched=True`, this map processes 1,000 texts together, 
# so group_texts throws away a remainder for each of those groups of 1,000 texts. 
# We can adjust that batch_size here but a higher value might be slower to preprocess. To speed up this part, we use multiprocessing.
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map

lm_train_dataset = train_dataset.map(group_texts, 
                                  batched=True, 
                                  num_proc=5,
                                  desc=f'Grouping texts in chunks of {max_length}')

lm_test_dataset = test_dataset.map(group_texts, 
                                batched=True, 
                                num_proc=5,
                                desc=f'Grouping texts in chunks of {max_length}')

# convert lists to torch tensors
lm_train_dataset.set_format('torch')
lm_test_dataset.set_format('torch')

         

Grouping texts in chunks of 512 #2:   0%|          | 0/2443 [00:00<?, ?ba/s]

Grouping texts in chunks of 512 #4:   0%|          | 0/2443 [00:00<?, ?ba/s]

Grouping texts in chunks of 512 #3:   0%|          | 0/2443 [00:00<?, ?ba/s]

 

Grouping texts in chunks of 512 #0:   0%|          | 0/2443 [00:00<?, ?ba/s]

Grouping texts in chunks of 512 #1:   0%|          | 0/2443 [00:00<?, ?ba/s]

          

Grouping texts in chunks of 512 #0:   0%|          | 0/272 [00:00<?, ?ba/s]

Grouping texts in chunks of 512 #4:   0%|          | 0/272 [00:00<?, ?ba/s]

Grouping texts in chunks of 512 #3:   0%|          | 0/272 [00:00<?, ?ba/s]

Grouping texts in chunks of 512 #1:   0%|          | 0/272 [00:00<?, ?ba/s]

Grouping texts in chunks of 512 #2:   0%|          | 0/272 [00:00<?, ?ba/s]

CPU times: user 35.9 s, sys: 14.5 s, total: 50.4 s
Wall time: 2h 38min 18s


In [15]:
# Number of max_length-sized chunks in the train and test splits after grouping
len(lm_train_dataset), len(lm_test_dataset)

(5559028, 617624)

In [None]:
# Persist both processed splits so the expensive tokenize/group steps need not be repeated
lm_train_dataset.save_to_disk(dataset_path='data/lm_train')
lm_test_dataset.save_to_disk(dataset_path='data/lm_test')