In [1]:
import datasets
import json
import os
import tiktoken

import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
DATA_PATH = "/mnt/"

In [3]:
tokenizer = tiktoken.encoding_for_model("gpt2")

In [4]:
def get_wikipedia():
    print("get_wikipedia")
    data_type = "train"
    dataset = datasets.load_dataset("wikimedia/wikipedia", "20231101.en", split=data_type)

    directory = os.path.join(DATA_PATH, "data/wikipedia/text")
    if not os.path.exists(directory):
        os.makedirs(directory)

    directory = os.path.join(DATA_PATH, "data/wikipedia/tokens")
    if not os.path.exists(directory):
        os.makedirs(directory)
    
    for i, element in enumerate(dataset):
    
        text = element["text"]
        tokens = tokenizer.encode(text)
        
        with open(os.path.join(DATA_PATH, f"data/wikipedia/text/{data_type}_{i:012d}.json"), "w") as f:
            json.dump({"text": text}, f)
            
        with open(os.path.join(DATA_PATH, f"data/wikipedia/tokens/{data_type}_{i:012d}.json"), "w") as f:
            json.dump({"tokens": tokens}, f)
            
        if (i + 1) % 5000 == 0:
            print(f"{i + 1}|{len(dataset)}")
    print()

In [5]:
def get_tatsu_lab_alpaca():
    print("get_tatsu_lab_alpaca")
    data_type = "train"
    dataset = datasets.load_dataset("tatsu-lab/alpaca", split=data_type)

    directory = os.path.join(DATA_PATH, "data/tatsu_lab_alpaca/text")
    if not os.path.exists(directory):
        os.makedirs(directory)

    directory = os.path.join(DATA_PATH, "data/tatsu_lab_alpaca/tokens")
    if not os.path.exists(directory):
        os.makedirs(directory)
    
    for i, element in enumerate(dataset):
        
        text = "Information: \n" + element["text"] + "\n\n"
        instruction = "Instruction: \n" + element["instruction"] + "\n\n"
        output = "Answer: \n" + element["output"] + "\n\n"
        text = text + instruction + output
        tokens = tokenizer.encode(text)
        
        with open(os.path.join(DATA_PATH, f"data/tatsu_lab_alpaca/text/{data_type}_{i:012d}.json"), "w") as f:
            json.dump({"text": text}, f)
            
        with open(os.path.join(DATA_PATH, f"data/tatsu_lab_alpaca/tokens/{data_type}_{i:012d}.json"), "w") as f:
            json.dump({"tokens": tokens}, f)
            
        if (i + 1) % 5000 == 0:
            print(f"{i + 1}|{len(dataset)}")
    print()

In [6]:
def get_truthful_qa_generation():
    print("get_truthful_qa_generation")
    set_ = "generation"
    data_type = "validation"
    dataset = datasets.load_dataset("truthful_qa", set_, split=data_type)

    directory = os.path.join(DATA_PATH, "data/truthful_qa/text")
    if not os.path.exists(directory):
        os.makedirs(directory)

    directory = os.path.join(DATA_PATH, "data/truthful_qa/tokens")
    if not os.path.exists(directory):
        os.makedirs(directory)
    
    for i, element in enumerate(dataset):
        
        question = "Question: \n" + element["question"] + "\n\n"
        best_answer = "Best answer: \n" + element["best_answer"] + "\n\n"
        correct_answers = "Correct answers: \n" + "\n".join(element["correct_answers"]) + "\n\n"
        incorrect_answers = "Incorrect answers: \n" + "\n".join(element["incorrect_answers"]) + "\n\n"
        text = question + question + incorrect_answers + best_answer
        tokens = tokenizer.encode(text)
        
        with open(os.path.join(DATA_PATH, f"data/truthful_qa/text/{data_type}_{set_}_{i:012d}.json"), "w") as f:
            json.dump({"text": text}, f)
            
        with open(os.path.join(DATA_PATH, f"data/truthful_qa/tokens/{data_type}_{set_}_{i:012d}.json"), "w") as f:
            json.dump({"tokens": tokens}, f)
            
        if (i + 1) % 5000 == 0:
            print(f"{i + 1}|{len(dataset)}")
    print()

In [7]:
def get_truthful_qa_multiple_choice():
    print("get_truthful_qa_multiple_choice")
    set_ = "multiple_choice"
    data_type = "validation"
    dataset = datasets.load_dataset("truthful_qa", set_, split=data_type)

    directory = os.path.join(DATA_PATH, "data/truthful_qa/text")
    if not os.path.exists(directory):
        os.makedirs(directory)

    directory = os.path.join(DATA_PATH, "data/truthful_qa/tokens")
    if not os.path.exists(directory):
        os.makedirs(directory)
    
    for i, element in enumerate(dataset):
        
        question = "Question: \n" + element["question"] + "\n\n"
        choices = "Choices: \n" + "\n".join(element["mc1_targets"]["choices"]) + "\n\n"
        idx = np.argmax(element["mc1_targets"]["labels"])
        correct_answer = "Correct answer: \n" + element["mc1_targets"]["choices"][idx]
        text = question + choices + correct_answer
        tokens = tokenizer.encode(text)
        
        with open(os.path.join(DATA_PATH, f"data/truthful_qa/text/{data_type}_{set_}_{i:012d}.json"), "w") as f:
            json.dump({"text": text}, f)
            
        with open(os.path.join(DATA_PATH, f"data/truthful_qa/tokens/{data_type}_{set_}_{i:012d}.json"), "w") as f:
            json.dump({"tokens": tokens}, f)
            
        if (i + 1) % 5000 == 0:
            print(f"{i + 1}|{len(dataset)}")
    print()

In [8]:
def get_open_orca():
    print("get_open_orca")
    data_type = "train"
    dataset = datasets.load_dataset("Open-Orca/OpenOrca", split=data_type)

    directory = os.path.join(DATA_PATH, "data/open_orca/text")
    if not os.path.exists(directory):
        os.makedirs(directory)

    directory = os.path.join(DATA_PATH, "data/open_orca/tokens")
    if not os.path.exists(directory):
        os.makedirs(directory)
    
    for i, element in enumerate(dataset):
        
        system_prompt = "System prompt: \n" + element["system_prompt"] + "\n\n"
        question = "Question: \n" + element["question"] + "\n\n"
        response = "Response: \n" + element["response"] + "\n\n"
        text = system_prompt + question + response
        tokens = tokenizer.encode(text)
        
        with open(os.path.join(DATA_PATH, f"data/open_orca/text/{data_type}_{i:012d}.json"), "w") as f:
            json.dump({"text": text}, f)
            
        with open(os.path.join(DATA_PATH, f"data/open_orca/tokens/{data_type}_{i:012d}.json"), "w") as f:
            json.dump({"tokens": tokens}, f)
            
        if (i + 1) % 5000 == 0:
            print(f"{i + 1}|{len(dataset)}")
    print()

In [9]:
def get_stingning_ultrachat():
    print("get_stingning_ultrachat")
    data_type = "train"
    dataset = datasets.load_dataset("stingning/ultrachat", split=data_type)

    directory = os.path.join(DATA_PATH, f"data/stingning_ultrachat/text")
    if not os.path.exists(directory):
        os.makedirs(directory)

    directory = os.path.join(DATA_PATH, f"data/stingning_ultrachat/tokens")
    if not os.path.exists(directory):
        os.makedirs(directory)
    
    for i, element in enumerate(dataset):
        
        question = "Question: \n" + element["data"][0] + "\n\n"
        answer = "Answer: \n" + element["data"][1] + "\n\n"
        text = question + answer
        tokens = tokenizer.encode(text)
        
        with open(os.path.join(DATA_PATH, f"data/stingning_ultrachat/text/{data_type}_{i:012d}.json"), "w") as f:
            json.dump({"text": text}, f)
            
        with open(os.path.join(DATA_PATH, f"data/stingning_ultrachat/tokens/{data_type}_{i:012d}.json"), "w") as f:
            json.dump({"tokens": tokens}, f)
            
        if (i + 1) % 5000 == 0:
            print(f"{i + 1}|{len(dataset)}")
    print()

In [10]:
def get_xtreme():
    print("get_xtreme")
    for data_type in ["validation", "test"]:
        print("get_xtreme", data_type)
        dataset = datasets.load_dataset("xtreme", "MLQA.en.en", split=data_type)
    
        directory = os.path.join(DATA_PATH, f"data/xtreme/text")
        if not os.path.exists(directory):
            os.makedirs(directory)
    
        directory = os.path.join(DATA_PATH, f"data/xtreme/tokens")
        if not os.path.exists(directory):
            os.makedirs(directory)
        
        for i, element in enumerate(dataset):
            
            title = "Title: \n" + element["title"] + "\n\n"
            context = "Context: \n" + element["context"] + "\n\n"
            question = "Question: \n" + element["question"] + "\n\n"
            answer = "Answer: \n" + "\n".join(element["answers"]["text"]) + "\n\n"
            text = title + context + question + answer
            tokens = tokenizer.encode(text)
            
            with open(os.path.join(DATA_PATH, f"data/xtreme/text/{data_type}_{i:012d}.json"), "w") as f:
                json.dump({"text": text}, f)
                
            with open(os.path.join(DATA_PATH, f"data/xtreme/tokens/{data_type}_{i:012d}.json"), "w") as f:
                json.dump({"tokens": tokens}, f)
                
            if (i + 1) % 5000 == 0:
                print(f"{i + 1}|{len(dataset)}")
    print()

In [11]:
def get_openwebtext():
    print("get_openwebtext")
    data_type = "train"
    dataset = datasets.load_dataset("Skylion007/openwebtext", split=data_type)

    directory = os.path.join(DATA_PATH, f"data/openwebtext/text")
    if not os.path.exists(directory):
        os.makedirs(directory)

    directory = os.path.join(DATA_PATH, f"data/openwebtext/tokens")
    if not os.path.exists(directory):
        os.makedirs(directory)
    
    for i, element in enumerate(dataset):
    
        text = element["text"]
        tokens = tokenizer.encode(text)
        
        with open(os.path.join(DATA_PATH, f"data/openwebtext/text/{data_type}_{i:012d}.json"), "w") as f:
            json.dump({"text": text}, f)
            
        with open(os.path.join(DATA_PATH, f"data/openwebtext/tokens/{data_type}_{i:012d}.json"), "w") as f:
            json.dump({"tokens": tokens}, f)
            
        if (i + 1) % 5000 == 0:
            print(f"{i + 1}|{len(dataset)}")
    print()

In [12]:
def get_oscar():
    print("get_oscar")
    data_type = "train"
    dataset = datasets.load_dataset("oscar", "unshuffled_deduplicated_en", split=data_type)

    directory = os.path.join(DATA_PATH, f"data/oscar/text")
    if not os.path.exists(directory):
        os.makedirs(directory)

    directory = os.path.join(DATA_PATH, f"data/oscar/tokens")
    if not os.path.exists(directory):
        os.makedirs(directory)
    
    for i, element in enumerate(dataset):
    
        text = element["text"]
        tokens = tokenizer.encode(text)
        
        with open(os.path.join(DATA_PATH, f"data/oscar/text/{data_type}_{i:012d}.json"), "w") as f:
            json.dump({"text": text}, f)
            
        with open(os.path.join(DATA_PATH, f"data/oscar/tokens/{data_type}_{i:012d}.json"), "w") as f:
            json.dump({"tokens": tokens}, f)
            
        if (i + 1) % 5000 == 0:
            print(f"{i + 1}|{len(dataset)}")
    print()

In [13]:
def get_minipile():
    print("get_minipile")
    data_type = "train"
    dataset = datasets.load_dataset("JeanKaddour/minipile", split=data_type)

    directory = os.path.join(DATA_PATH, f"data/minipile/text")
    if not os.path.exists(directory):
        os.makedirs(directory)

    directory = os.path.join(DATA_PATH, f"data/minipile/tokens")
    if not os.path.exists(directory):
        os.makedirs(directory)
    
    for i, element in enumerate(dataset):
    
        text = element["text"]
        tokens = tokenizer.encode(text)
        
        with open(os.path.join(DATA_PATH, f"data/minipile/text/{data_type}_{i:012d}.json"), "w") as f:
            json.dump({"text": text}, f)
            
        with open(os.path.join(DATA_PATH, f"data/minipile/tokens/{data_type}_{i:012d}.json"), "w") as f:
            json.dump({"tokens": tokens}, f)
            
        if (i + 1) % 5000 == 0:
            print(f"{i + 1}|{len(dataset)}")
    print()

In [14]:
def get_cc100():
    print("get_cc100")
    data_type = "train"
    dataset = datasets.load_dataset("cc100", "en", split=data_type)

    directory = os.path.join(DATA_PATH, f"data/cc100/text")
    if not os.path.exists(directory):
        os.makedirs(directory)

    directory = os.path.join(DATA_PATH, f"data/cc100/tokens")
    if not os.path.exists(directory):
        os.makedirs(directory)
    
    for i, element in enumerate(dataset):
    
        text = element["text"]
        tokens = tokenizer.encode(text)
        
        with open(os.path.join(DATA_PATH, f"data/cc100/text/{data_type}_{i:012d}.json"), "w") as f:
            json.dump({"text": text}, f)
            
        with open(os.path.join(DATA_PATH, f"data/cc100/tokens/{data_type}_{i:012d}.json"), "w") as f:
            json.dump({"tokens": tokens}, f)
            
        if (i + 1) % 5000 == 0:
            print(f"{i + 1}|{len(dataset)}")
    print()

In [None]:
# get_wikipedia()
# get_tatsu_lab_alpaca()
# get_truthful_qa_generation()
# get_truthful_qa_multiple_choice()
# get_open_orca()
# get_stingning_ultrachat()
# get_xtreme()
# get_openwebtext()
get_oscar()
get_minipile()
get_cc100()

get_oscar


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Downloading builder script: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 14.8k/14.8k [00:00<00:00, 32.8MB/s]
Downloading readme: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 303k/303k [00:00<00:00, 4.24MB/s]
Downloading data: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 56.2k/56.2k [00:00<00:00, 524kB/s]
Downloading data: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 732M/732M [01:05<00:00, 11.1MB/s]
Downloading data: 100%|███████████