In [2]:
import datasets
import json
import os
import time
import tiktoken
import uuid

import pyarrow.parquet as pq
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
DATA_PATH = "/mnt/"

In [4]:
tokenizer = tiktoken.encoding_for_model("gpt2")

In [5]:
def get_wikipedia():
    data_type = "train"
    directory_name = "wikipedia"
    dataset = datasets.load_dataset("wikimedia/wikipedia", "20231101.en", split=data_type, cache_dir="/mnt/data/.cache/huggingface")

    directory = os.path.join(DATA_PATH, f"data/{directory_name}")
    if not os.path.exists(directory):
        os.makedirs(directory)

    M = 128
    N = len(dataset) // M
    count = 0
    for i in range(N):
        data = {"text": [], "tokens": []}
        for j in range(M):
            text = dataset[count]["text"]
            data["text"].append(text)
            data["tokens"].append(tokenizer.encode(text))
            count += 1

            print(f"\rget_wikipedia(): {i + 1}|{N}, {j + 1}|{M}, {count=}", end="")
            
        with open(os.path.join(DATA_PATH, f"data/{directory_name}/{data_type}_{count:09d}.json"), "w") as f:
            json.dump(data, f)
    print()

In [6]:
def get_tatsu_lab_alpaca():
    data_type = "train"
    directory_name = "tatsu_lab_alpaca"
    dataset = datasets.load_dataset("tatsu-lab/alpaca", split=data_type, cache_dir="/mnt/data/.cache/huggingface")

    directory = os.path.join(DATA_PATH, f"data/{directory_name}")
    if not os.path.exists(directory):
        os.makedirs(directory)

    M = 128
    N = len(dataset) // M
    count = 0
    for i in range(N):
        data = {"text": [], "tokens": []}
        for j in range(M):
            text = dataset[count]["text"]
            data["text"].append(text)
            data["tokens"].append(tokenizer.encode(text))
            count += 1

            print(f"\rget_tatsu_lab_alpaca(): {i + 1}|{N}, {j + 1}|{M}, {count=}", end="")
            
        with open(os.path.join(DATA_PATH, f"data/{directory_name}/{data_type}_{count:09d}.json"), "w") as f:
            json.dump(data, f)
    print()

In [7]:
def get_truthful_qa_generation():
    set_ = "generation"
    data_type = "validation"
    directory_name = "truthful_qa_generation"
    dataset = datasets.load_dataset("truthful_qa", set_, split=data_type, cache_dir="/mnt/data/.cache/huggingface")

    directory = os.path.join(DATA_PATH, f"data/{directory_name}")
    if not os.path.exists(directory):
        os.makedirs(directory)
    
    M = 128
    N = len(dataset) // M
    count = 0
    for i in range(N):
        data = {"text": [], "tokens": []}
        for j in range(M):
            element = dataset[count]
            question = "Question: \n" + element["question"] + "\n\n"
            best_answer = "Best answer: \n" + element["best_answer"] + "\n\n"
            correct_answers = "Correct answers: \n" + "\n".join(element["correct_answers"]) + "\n\n"
            incorrect_answers = "Incorrect answers: \n" + "\n".join(element["incorrect_answers"]) + "\n\n"
            text = question + question + incorrect_answers + best_answer
            data["text"].append(text)
            data["tokens"].append(tokenizer.encode(text))
            count += 1

            print(f"\rget_truthful_qa_generation(): {i + 1}|{N}, {j + 1}|{M}, {count=}", end="")
            
        with open(os.path.join(DATA_PATH, f"data/{directory_name}/{data_type}_{count:09d}.json"), "w") as f:
            json.dump(data, f)
    print()

In [8]:
def get_truthful_qa_multiple_choice():
    set_ = "multiple_choice"
    data_type = "validation"
    directory_name = "truthful_qa_multiple_choice"
    dataset = datasets.load_dataset("truthful_qa", set_, split=data_type, cache_dir="/mnt/data/.cache/huggingface")

    directory = os.path.join(DATA_PATH, f"data/{directory_name}")
    if not os.path.exists(directory):
        os.makedirs(directory)
    
    M = 128
    N = len(dataset) // M
    count = 0
    for i in range(N):
        data = {"text": [], "tokens": []}
        for j in range(M):
            element = dataset[count]
            question = "Question: \n" + element["question"] + "\n\n"
            choices = "Choices: \n" + "\n".join(element["mc1_targets"]["choices"]) + "\n\n"
            idx = np.argmax(element["mc1_targets"]["labels"])
            correct_answer = "Correct answer: \n" + element["mc1_targets"]["choices"][idx]
            text = question + choices + correct_answer
            data["text"].append(text)
            data["tokens"].append(tokenizer.encode(text))
            count += 1

            print(f"\rget_truthful_qa_multiple_choice(): {i + 1}|{N}, {j + 1}|{M}, {count=}", end="")
            
        with open(os.path.join(DATA_PATH, f"data/{directory_name}/{data_type}_{count:09d}.json"), "w") as f:
            json.dump(data, f)
    print()

In [9]:
def get_open_orca():
    data_type = "train"
    directory_name = "open_orca"
    dataset = datasets.load_dataset("Open-Orca/OpenOrca", split=data_type, cache_dir="/mnt/data/.cache/huggingface")

    directory = os.path.join(DATA_PATH, f"data/{directory_name}")
    if not os.path.exists(directory):
        os.makedirs(directory)
    
    M = 128
    N = len(dataset) // M
    count = 0
    for i in range(N):
        data = {"text": [], "tokens": []}
        for j in range(M):
            element = dataset[count]
            system_prompt = "System prompt: \n" + element["system_prompt"] + "\n\n"
            question = "Question: \n" + element["question"] + "\n\n"
            response = "Response: \n" + element["response"] + "\n\n"
            text = system_prompt + question + response
            data["text"].append(text)
            data["tokens"].append(tokenizer.encode(text))
            count += 1

            print(f"\rget_open_orca(): {i + 1}|{N}, {j + 1}|{M}, {count=}", end="")
            
        with open(os.path.join(DATA_PATH, f"data/{directory_name}/{data_type}_{count:09d}.json"), "w") as f:
            json.dump(data, f)
    print()

In [10]:
def get_stingning_ultrachat():
    data_type = "train"
    directory_name = "stingning_ultrachat"
    dataset = datasets.load_dataset("stingning/ultrachat", split=data_type, cache_dir="/mnt/data/.cache/huggingface")

    directory = os.path.join(DATA_PATH, f"data/{directory_name}")
    if not os.path.exists(directory):
        os.makedirs(directory)
    
    M = 128
    N = len(dataset) // M
    count = 0
    for i in range(N):
        data = {"text": [], "tokens": []}
        for j in range(M):
            element = dataset[count]
            question = "Question: \n" + element["data"][0] + "\n\n"
            answer = "Answer: \n" + element["data"][1] + "\n\n"
            text = question + answer
            data["text"].append(text)
            data["tokens"].append(tokenizer.encode(text))
            count += 1

            print(f"\rget_stingning_ultrachat(): {i + 1}|{N}, {j + 1}|{M}, {count=}", end="")
            
        with open(os.path.join(DATA_PATH, f"data/{directory_name}/{data_type}_{count:09d}.json"), "w") as f:
            json.dump(data, f)
    print()

In [11]:
def get_xtreme():
    for data_type in ["validation", "test"]:
        directory_name = "xtreme"
        dataset = datasets.load_dataset("xtreme", "MLQA.en.en", split=data_type, cache_dir="/mnt/data/.cache/huggingface")

        directory = os.path.join(DATA_PATH, f"data/{directory_name}")
        if not os.path.exists(directory):
            os.makedirs(directory)
        
        M = 128
        N = len(dataset) // M
        count = 0
        for i in range(N):
            data = {"text": [], "tokens": []}
            for j in range(M):
                element = dataset[count]
                title = "Title: \n" + element["title"] + "\n\n"
                context = "Context: \n" + element["context"] + "\n\n"
                question = "Question: \n" + element["question"] + "\n\n"
                answer = "Answer: \n" + "\n".join(element["answers"]["text"]) + "\n\n"
                text = title + context + question + answer
                data["text"].append(text)
                data["tokens"].append(tokenizer.encode(text))
                count += 1
    
                print(f"\rget_xtreme(): {i + 1}|{N}, {j + 1}|{M}, {count=}", end="")
                
            with open(os.path.join(DATA_PATH, f"data/{directory_name}/{data_type}_{count:09d}.json"), "w") as f:
                json.dump(data, f)
    print()

In [12]:
def get_openwebtext():
    data_type = "train"
    directory_name = "openwebtext"
    dataset = datasets.load_dataset("Skylion007/openwebtext", split=data_type, cache_dir="/mnt/data/.cache/huggingface")

    directory = os.path.join(DATA_PATH, f"data/{directory_name}")
    if not os.path.exists(directory):
        os.makedirs(directory)

    M = 128
    N = len(dataset) // M
    count = 0
    for i in range(N):
        data = {"text": [], "tokens": []}
        for j in range(M):
            text = dataset[count]["text"]
            data["text"].append(text)
            data["tokens"].append(tokenizer.encode(text))
            count += 1

            print(f"\rget_openwebtext(): {i + 1}|{N}, {j + 1}|{M}, {count=}", end="")
            
        with open(os.path.join(DATA_PATH, f"data/{directory_name}/{data_type}_{count:09d}.json"), "w") as f:
            json.dump(data, f)
    print()

In [13]:
def get_oscar_direct_from_source():
    print("get_oscar_direct_from_source")

    directory = os.path.join(DATA_PATH, f"data/oscar/text")
    if not os.path.exists(directory):
        os.makedirs(directory)

    directory = os.path.join(DATA_PATH, f"data/oscar/tokens")
    if not os.path.exists(directory):
        os.makedirs(directory)

    _BASE_DATA_URL_FORMAT_STR = (
        "https://s3.amazonaws.com/datasets.huggingface.co/oscar/1.0/{shuffled}/{deduplicated}/{language}/"
    )
    _BASE_CHECKSUM_FILE_NAME = "{language}_sha256.txt"

    language = "en"
    shuffled_str = "shuffled"
    deduplicated_str = "deduplicated"
    base_data_url = _BASE_DATA_URL_FORMAT_STR.format(
        shuffled=shuffled_str, language=language, deduplicated=deduplicated_str
    )
    print(base_data_url)

    dl_manager = datasets.DownloadManager()
    checksum_url = base_data_url + _BASE_CHECKSUM_FILE_NAME.format(language=language)
    print(checksum_url)
    #checksum_file = dl_manager.download(checksum_url)
    
    # with open(checksum_file, encoding="utf-8") as f:
    #     data_filenames = [line.split("\t")[0] for line in f if line]
    #     data_urls = [base_url + data_filename for data_filename in data_filenames]
    # downloaded_files = dl_manager.download(data_urls)
    
    # for i, element in enumerate(dataset):
    
    #     text = element["text"]
    #     tokens = tokenizer.encode(text)
        
    #     with open(os.path.join(DATA_PATH, f"data/oscar/text/{data_type}_{i:012d}.json"), "w") as f:
    #         pass # json.dump({"text": text}, f)
            
    #     with open(os.path.join(DATA_PATH, f"data/oscar/tokens/{data_type}_{i:012d}.json"), "w") as f:
    #         pass # json.dump({"tokens": tokens}, f)
            
    #     if (i + 1) % 5000 == 0:
    #         print(f"{i + 1}|{len(dataset)}")
    # print()

In [14]:
def get_minipile():
    data_type = "train"
    directory_name = "minipile"
    dataset = datasets.load_dataset("JeanKaddour/minipile", split=data_type)#, cache_dir="/mnt/data/.cache/huggingface")

    directory = os.path.join(DATA_PATH, f"data/{directory_name}")
    if not os.path.exists(directory):
        os.makedirs(directory)

    M = 128
    N = len(dataset) // M
    count = 0
    for i in range(N):
        data = {"text": [], "tokens": []}
        for j in range(M):
            text = dataset[count]["text"]
            data["text"].append(text)
            data["tokens"].append(tokenizer.encode(text, allowed_special={'<|endoftext|>'}))
            count += 1

            print(f"\rget_minipile(): {i + 1}|{N}, {j + 1}|{M}, {count=}", end="")
            
        with open(os.path.join(DATA_PATH, f"data/{directory_name}/{data_type}_{count:09d}.json"), "w") as f:
            json.dump(data, f)
    print()

In [15]:
def get_cc100_en_from_parquet():
    directory_name = "cc100"

    directory = f"/mnt/data/{directory_name}"
    if not os.path.exists(directory):
        os.makedirs(directory)
    
    for j in range(0, 1):
        id = f"{j:04d}"
        key = id + ".parquet"
        filename = os.path.join("/mnt/data/parquet", key)
        
        table = pq.read_table(filename)
        dataset = table.to_pandas()
        dataset = np.asarray(dataset)[:, -1]
    
        M = 128
        N = len(dataset) // M
        count = 0
        for i in range(N):
            data = {"text": [], "tokens": []}
            for j in range(M):
                text = dataset[count]
                data["text"].append(text)
                data["tokens"].append(tokenizer.encode(text))
                count += 1
    
                if ((count + 1) % 2500) == 0:
                    print(f"\rget_cc100_en_from_parquet(): {id} {i + 1}|{N}, {j + 1}|{M}, {count=}", end="")
                
            with open(os.path.join(DATA_PATH, f"data/{directory_name}/{id}_{count:09d}.json"), "w") as f:
                json.dump(data, f)
        print()

In [16]:
# get_wikipedia()
# get_tatsu_lab_alpaca()
# get_truthful_qa_generation()
# get_truthful_qa_multiple_choice()
# get_open_orca()
# get_stingning_ultrachat()
# get_xtreme()
# get_openwebtext()
# get_minipile()
get_cc100_en_from_parquet()

get_cc100_en_from_parquet(): 0000 24083|24093, 3|128, count=308249999
