In [1]:
import datasets
import json
import os
import tiktoken

import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Module-level tokenizer shared by every get_* exporter in this notebook:
# the tiktoken encoding registered for the "gpt2" model name.
tokenizer = tiktoken.encoding_for_model("gpt2")

In [3]:
def get_wikipedia():
    """Export the English Wikipedia dump as one JSON file per article.

    Loads the ``train`` split of ``wikimedia/wikipedia`` (config
    ``20231101.en``) and, for the article at position ``i``, writes:

    * ``data/wikipedia/text/train_<i>.json``   containing ``{"text": ...}``
    * ``data/wikipedia/tokens/train_<i>.json`` containing ``{"tokens": [...]}``

    ``<i>`` is zero-padded to 12 digits. Tokens are produced by the
    module-level ``tokenizer``. A progress line is printed every 5000 items.
    """
    print("get_wikipedia")
    data_type = "train"
    dataset = datasets.load_dataset("wikimedia/wikipedia", "20231101.en", split=data_type)

    text_dir = "data/wikipedia/text"
    tokens_dir = "data/wikipedia/tokens"
    # exist_ok=True replaces the exists()/makedirs() pair, which could fail
    # if the directory appeared between the check and the creation.
    os.makedirs(text_dir, exist_ok=True)
    os.makedirs(tokens_dir, exist_ok=True)

    total = len(dataset)
    for i, element in enumerate(dataset):
        text = element["text"]
        # By default tiktoken raises ValueError when the text contains a
        # special-token string such as "<|endoftext|>"; disallowed_special=()
        # encodes such strings as ordinary text instead of aborting the export.
        tokens = tokenizer.encode(text, disallowed_special=())

        file_name = f"{data_type}_{i:012d}.json"
        with open(f"{text_dir}/{file_name}", "w") as f:
            json.dump({"text": text}, f)

        with open(f"{tokens_dir}/{file_name}", "w") as f:
            json.dump({"tokens": tokens}, f)

        if (i + 1) % 5000 == 0:
            print(f"{i + 1}|{total}")
    print()

In [4]:
def get_tatsu_lab_alpaca():
    """Export the ``tatsu-lab/alpaca`` train split as one JSON file per example.

    Each example is rendered as three sections, in this order::

        Information: \\n<input>\\n\\n
        Instruction: \\n<instruction>\\n\\n
        Answer: \\n<output>\\n\\n

    and written to ``data/tatsu_lab_alpaca/text/train_<i>.json`` (``{"text": ...}``)
    and ``data/tatsu_lab_alpaca/tokens/train_<i>.json`` (``{"tokens": [...]}``),
    with ``<i>`` zero-padded to 12 digits. Tokens are produced by the
    module-level ``tokenizer``. A progress line is printed every 5000 items.
    """
    print("get_tatsu_lab_alpaca")
    data_type = "train"
    dataset = datasets.load_dataset("tatsu-lab/alpaca", split=data_type)

    text_dir = "data/tatsu_lab_alpaca/text"
    tokens_dir = "data/tatsu_lab_alpaca/tokens"
    # exist_ok=True replaces the racy exists()/makedirs() pair.
    os.makedirs(text_dir, exist_ok=True)
    os.makedirs(tokens_dir, exist_ok=True)

    total = len(dataset)
    for i, element in enumerate(dataset):
        # NOTE(review): this previously read element["text"]. Per the dataset
        # card, "text" is the pre-formatted full prompt (instruction + input +
        # response), so the instruction and answer were emitted twice. The
        # "input" column is the optional context for the instruction - confirm
        # against the dataset card before regenerating data.
        information = "Information: \n" + element["input"] + "\n\n"
        instruction = "Instruction: \n" + element["instruction"] + "\n\n"
        output = "Answer: \n" + element["output"] + "\n\n"
        text = information + instruction + output
        # disallowed_special=() makes tiktoken encode special-token strings
        # (e.g. "<|endoftext|>") as ordinary text instead of raising ValueError.
        tokens = tokenizer.encode(text, disallowed_special=())

        file_name = f"{data_type}_{i:012d}.json"
        with open(f"{text_dir}/{file_name}", "w") as f:
            json.dump({"text": text}, f)

        with open(f"{tokens_dir}/{file_name}", "w") as f:
            json.dump({"tokens": tokens}, f)

        if (i + 1) % 5000 == 0:
            print(f"{i + 1}|{total}")
    print()

In [5]:
def get_truthful_qa_generation():
    """Export the TruthfulQA ``generation`` validation split, one JSON file per item.

    Each item is rendered as four sections, in this order: ``Question``,
    ``Correct answers``, ``Incorrect answers`` and ``Best answer``. The
    answer lists are joined with newlines. Output goes to
    ``data/truthful_qa/text/validation_generation_<i>.json`` (``{"text": ...}``)
    and ``data/truthful_qa/tokens/validation_generation_<i>.json``
    (``{"tokens": [...]}``), with ``<i>`` zero-padded to 12 digits. Tokens are
    produced by the module-level ``tokenizer``. A progress line is printed
    every 5000 items.
    """
    print("get_truthful_qa_generation")
    set_ = "generation"
    data_type = "validation"
    dataset = datasets.load_dataset("truthful_qa", set_, split=data_type)

    text_dir = "data/truthful_qa/text"
    tokens_dir = "data/truthful_qa/tokens"
    # exist_ok=True replaces the racy exists()/makedirs() pair.
    os.makedirs(text_dir, exist_ok=True)
    os.makedirs(tokens_dir, exist_ok=True)

    total = len(dataset)
    for i, element in enumerate(dataset):
        question = "Question: \n" + element["question"] + "\n\n"
        best_answer = "Best answer: \n" + element["best_answer"] + "\n\n"
        correct_answers = "Correct answers: \n" + "\n".join(element["correct_answers"]) + "\n\n"
        incorrect_answers = "Incorrect answers: \n" + "\n".join(element["incorrect_answers"]) + "\n\n"
        # Fix: the question used to be concatenated twice while the
        # correct_answers section was built but never included.
        text = question + correct_answers + incorrect_answers + best_answer
        # disallowed_special=() makes tiktoken encode special-token strings
        # (e.g. "<|endoftext|>") as ordinary text instead of raising ValueError.
        tokens = tokenizer.encode(text, disallowed_special=())

        file_name = f"{data_type}_{set_}_{i:012d}.json"
        with open(f"{text_dir}/{file_name}", "w") as f:
            json.dump({"text": text}, f)

        with open(f"{tokens_dir}/{file_name}", "w") as f:
            json.dump({"tokens": tokens}, f)

        if (i + 1) % 5000 == 0:
            print(f"{i + 1}|{total}")
    print()

In [6]:
def get_truthful_qa_multiple_choice():
    """Export the TruthfulQA ``multiple_choice`` validation split, one JSON file per item.

    Each item is rendered as ``Question``, ``Choices`` (the ``mc1_targets``
    choices joined with newlines) and ``Correct answer`` (the choice at the
    position of the largest ``mc1_targets`` label). Output goes to
    ``data/truthful_qa/text/validation_multiple_choice_<i>.json`` and
    ``data/truthful_qa/tokens/validation_multiple_choice_<i>.json``, with
    ``<i>`` zero-padded to 12 digits. Tokens are produced by the module-level
    ``tokenizer``. A progress line is printed every 5000 items.
    """
    print("get_truthful_qa_multiple_choice")
    set_ = "multiple_choice"
    data_type = "validation"
    dataset = datasets.load_dataset("truthful_qa", set_, split=data_type)

    text_dir = "data/truthful_qa/text"
    tokens_dir = "data/truthful_qa/tokens"
    # exist_ok=True replaces the racy exists()/makedirs() pair.
    os.makedirs(text_dir, exist_ok=True)
    os.makedirs(tokens_dir, exist_ok=True)

    total = len(dataset)
    for i, element in enumerate(dataset):
        targets = element["mc1_targets"]
        question = "Question: \n" + element["question"] + "\n\n"
        choices = "Choices: \n" + "\n".join(targets["choices"]) + "\n\n"
        # argmax picks the first position holding the largest label value.
        idx = int(np.argmax(targets["labels"]))
        # Unlike the other sections, this one has no trailing blank line;
        # kept as-is so regenerated files match previously exported ones.
        correct_answer = "Correct answer: \n" + targets["choices"][idx]
        text = question + choices + correct_answer
        # disallowed_special=() makes tiktoken encode special-token strings
        # (e.g. "<|endoftext|>") as ordinary text instead of raising ValueError.
        tokens = tokenizer.encode(text, disallowed_special=())

        file_name = f"{data_type}_{set_}_{i:012d}.json"
        with open(f"{text_dir}/{file_name}", "w") as f:
            json.dump({"text": text}, f)

        with open(f"{tokens_dir}/{file_name}", "w") as f:
            json.dump({"tokens": tokens}, f)

        if (i + 1) % 5000 == 0:
            print(f"{i + 1}|{total}")
    print()

In [7]:
def get_open_orca():
    """Export the ``Open-Orca/OpenOrca`` train split as one JSON file per example.

    Each example is rendered as ``System prompt``, ``Question`` and
    ``Response`` sections and written to
    ``data/open_orca/text/train_<i>.json`` (``{"text": ...}``) and
    ``data/open_orca/tokens/train_<i>.json`` (``{"tokens": [...]}``), with
    ``<i>`` zero-padded to 12 digits. Tokens are produced by the module-level
    ``tokenizer``. A progress line is printed every 5000 items.
    """
    print("get_open_orca")
    data_type = "train"
    dataset = datasets.load_dataset("Open-Orca/OpenOrca", split=data_type)

    text_dir = "data/open_orca/text"
    tokens_dir = "data/open_orca/tokens"
    # exist_ok=True replaces the racy exists()/makedirs() pair.
    os.makedirs(text_dir, exist_ok=True)
    os.makedirs(tokens_dir, exist_ok=True)

    total = len(dataset)
    for i, element in enumerate(dataset):
        system_prompt = "System prompt: \n" + element["system_prompt"] + "\n\n"
        question = "Question: \n" + element["question"] + "\n\n"
        response = "Response: \n" + element["response"] + "\n\n"
        text = system_prompt + question + response
        # disallowed_special=() makes tiktoken encode special-token strings
        # (e.g. "<|endoftext|>") as ordinary text instead of raising ValueError.
        tokens = tokenizer.encode(text, disallowed_special=())

        file_name = f"{data_type}_{i:012d}.json"
        with open(f"{text_dir}/{file_name}", "w") as f:
            json.dump({"text": text}, f)

        with open(f"{tokens_dir}/{file_name}", "w") as f:
            json.dump({"tokens": tokens}, f)

        if (i + 1) % 5000 == 0:
            print(f"{i + 1}|{total}")
    print()

In [8]:
def get_stingning_ultrachat():
    """Export the ``stingning/ultrachat`` train split as one JSON file per example.

    Each example is rendered as a ``Question`` section built from
    ``element["data"][0]`` and an ``Answer`` section built from
    ``element["data"][1]``. Output goes to
    ``data/stingning_ultrachat/text/train_<i>.json`` (``{"text": ...}``) and
    ``data/stingning_ultrachat/tokens/train_<i>.json`` (``{"tokens": [...]}``),
    with ``<i>`` zero-padded to 12 digits. Tokens are produced by the
    module-level ``tokenizer``. A progress line is printed every 5000 items.
    """
    print("get_stingning_ultrachat")
    data_type = "train"
    dataset = datasets.load_dataset("stingning/ultrachat", split=data_type)

    text_dir = "data/stingning_ultrachat/text"
    tokens_dir = "data/stingning_ultrachat/tokens"
    # exist_ok=True replaces the racy exists()/makedirs() pair.
    os.makedirs(text_dir, exist_ok=True)
    os.makedirs(tokens_dir, exist_ok=True)

    total = len(dataset)
    for i, element in enumerate(dataset):
        # NOTE(review): only the first two entries of "data" are exported;
        # presumably the list holds alternating dialogue turns, in which case
        # any later turns are dropped here - confirm this is intended.
        turns = element["data"]
        question = "Question: \n" + turns[0] + "\n\n"
        answer = "Answer: \n" + turns[1] + "\n\n"
        text = question + answer
        # disallowed_special=() makes tiktoken encode special-token strings
        # (e.g. "<|endoftext|>") as ordinary text instead of raising ValueError.
        tokens = tokenizer.encode(text, disallowed_special=())

        file_name = f"{data_type}_{i:012d}.json"
        with open(f"{text_dir}/{file_name}", "w") as f:
            json.dump({"text": text}, f)

        with open(f"{tokens_dir}/{file_name}", "w") as f:
            json.dump({"tokens": tokens}, f)

        if (i + 1) % 5000 == 0:
            print(f"{i + 1}|{total}")
    print()

In [9]:
def get_xtreme():
    """Export the ``xtreme`` ``MLQA.en.en`` validation and test splits.

    For each split, every example is rendered as ``Title``, ``Context``,
    ``Question`` and ``Answer`` sections (the answer texts joined with
    newlines) and written to ``data/xtreme/text/<split>_<i>.json``
    (``{"text": ...}``) and ``data/xtreme/tokens/<split>_<i>.json``
    (``{"tokens": [...]}``), with ``<i>`` zero-padded to 12 digits. Tokens are
    produced by the module-level ``tokenizer``. A progress line is printed
    every 5000 items.
    """
    print("get_xtreme")

    text_dir = "data/xtreme/text"
    tokens_dir = "data/xtreme/tokens"
    # Both splits share the same output directories, so create them once up
    # front; exist_ok=True replaces the racy exists()/makedirs() pair.
    os.makedirs(text_dir, exist_ok=True)
    os.makedirs(tokens_dir, exist_ok=True)

    for data_type in ["validation", "test"]:
        print("get_xtreme", data_type)
        dataset = datasets.load_dataset("xtreme", "MLQA.en.en", split=data_type)

        total = len(dataset)
        for i, element in enumerate(dataset):
            title = "Title: \n" + element["title"] + "\n\n"
            context = "Context: \n" + element["context"] + "\n\n"
            question = "Question: \n" + element["question"] + "\n\n"
            answer = "Answer: \n" + "\n".join(element["answers"]["text"]) + "\n\n"
            text = title + context + question + answer
            # disallowed_special=() makes tiktoken encode special-token strings
            # (e.g. "<|endoftext|>") as ordinary text instead of raising ValueError.
            tokens = tokenizer.encode(text, disallowed_special=())

            # The split name is part of the file name, so the two splits do
            # not overwrite each other inside the shared directories.
            file_name = f"{data_type}_{i:012d}.json"
            with open(f"{text_dir}/{file_name}", "w") as f:
                json.dump({"text": text}, f)

            with open(f"{tokens_dir}/{file_name}", "w") as f:
                json.dump({"tokens": tokens}, f)

            if (i + 1) % 5000 == 0:
                print(f"{i + 1}|{total}")
    print()

In [10]:
def get_openwebtext():
    """Export the ``Skylion007/openwebtext`` train split as one JSON file per document.

    For the document at position ``i``, writes:

    * ``data/openwebtext/text/train_<i>.json``   containing ``{"text": ...}``
    * ``data/openwebtext/tokens/train_<i>.json`` containing ``{"tokens": [...]}``

    ``<i>`` is zero-padded to 12 digits. Tokens are produced by the
    module-level ``tokenizer``. A progress line is printed every 5000 items.
    """
    print("get_openwebtext")
    data_type = "train"
    dataset = datasets.load_dataset("Skylion007/openwebtext", split=data_type)

    text_dir = "data/openwebtext/text"
    tokens_dir = "data/openwebtext/tokens"
    # exist_ok=True replaces the exists()/makedirs() pair, which could fail
    # if the directory appeared between the check and the creation.
    os.makedirs(text_dir, exist_ok=True)
    os.makedirs(tokens_dir, exist_ok=True)

    total = len(dataset)
    for i, element in enumerate(dataset):
        text = element["text"]
        # By default tiktoken raises ValueError when the text contains a
        # special-token string such as "<|endoftext|>"; disallowed_special=()
        # encodes such strings as ordinary text instead of aborting the export.
        tokens = tokenizer.encode(text, disallowed_special=())

        file_name = f"{data_type}_{i:012d}.json"
        with open(f"{text_dir}/{file_name}", "w") as f:
            json.dump({"text": text}, f)

        with open(f"{tokens_dir}/{file_name}", "w") as f:
            json.dump({"tokens": tokens}, f)

        if (i + 1) % 5000 == 0:
            print(f"{i + 1}|{total}")
    print()

In [11]:
# Driver cell: each get_* function writes its dataset under data/.
# Uncomment the exporter(s) to run; only the active line is executed.
# get_wikipedia()
# get_tatsu_lab_alpaca()
# get_truthful_qa_generation()
# get_truthful_qa_multiple_choice()
# get_open_orca()
# get_stingning_ultrachat()
# get_xtreme()
get_openwebtext()

get_openwebtext


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Downloading builder script: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2.73k/2.73k [00:00<00:00, 35.7MB/s]
Downloading readme: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7.33k/7.33k [00:00<00:00, 52.7MB/s]
Downloading data: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 633M/633M [00:18

5000|8013769
10000|8013769
15000|8013769
20000|8013769
25000|8013769
30000|8013769
35000|8013769
40000|8013769
45000|8013769
50000|8013769
55000|8013769
60000|8013769
65000|8013769
70000|8013769
75000|8013769
80000|8013769
85000|8013769
90000|8013769
95000|8013769
100000|8013769
105000|8013769
110000|8013769
115000|8013769
120000|8013769
125000|8013769
130000|8013769
135000|8013769
140000|8013769
145000|8013769
150000|8013769
155000|8013769
160000|8013769
165000|8013769
170000|8013769
175000|8013769
180000|8013769
185000|8013769
190000|8013769
195000|8013769
200000|8013769
205000|8013769
210000|8013769
215000|8013769
220000|8013769
225000|8013769
230000|8013769
235000|8013769
240000|8013769
245000|8013769
250000|8013769
255000|8013769
260000|8013769
265000|8013769
270000|8013769
275000|8013769
280000|8013769
285000|8013769
290000|8013769
295000|8013769
300000|8013769
305000|8013769
310000|8013769
315000|8013769
320000|8013769
325000|8013769
330000|8013769
335000|8013769
340000|8013769
