# SlimPajama as a Reference Corpus

[SlimPajama](https://huggingface.co/datasets/cerebras/SlimPajama-627B) is the largest extensively deduplicated, multi-corpora, open-source dataset for training large language models.

|  Data source  | SlimPajama |
|:-------------:|:----------:|
| Commoncrawl   | 52.2%      |
| C4            | 26.7%      |
| GitHub        | 5.2%       |
| Books         | 4.2%       |
| ArXiv         | 4.6%       |
| Wikipedia     | 3.8%       |
| StackExchange | 3.3%       |

In [1]:
from pathlib import Path

import pandas as pd
import spacy
from datasets import load_dataset
from tqdm.auto import tqdm

# spaCy pipeline used further down to parse the corpus texts.
nlp = spacy.load("en_core_web_lg")

# Test split is 500M tokens
# streaming=True is requested so the split is consumed by iterating over
# `ds_streamed` (see the next cell) instead of being returned as a table.
ds_streamed = load_dataset("cerebras/SlimPajama-627B", split="test", streaming=True)

README.md: 0.00B [00:00, ?B/s]

Resolving data files:   0%|          | 0/59166 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/31428 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/31411 [00:00<?, ?it/s]

In [None]:
# Drain the streamed split into memory, with a progress bar, and build a
# DataFrame from the collected examples. `list(...)` replaces the identity
# comprehension: same result, no per-item Python-level loop body.
data = list(tqdm(ds_streamed))
df = pd.DataFrame(data)

## Parse metadata and count tokens

In [None]:
# Count how often each distinct `meta` value occurs.
# NOTE(review): `meta` is only flattened to plain strings in the next cell; if
# it still holds dicts at this point, value_counts() may raise on unhashable
# values — confirm the intended cell order.
df.meta.value_counts()

In [None]:
# Expand the per-row metadata records into columns, then overwrite the
# `meta` column with just the subset label.
# NOTE(review): not re-runnable as-is — after one run `meta` no longer holds
# records for json_normalize to expand.
meta = pd.json_normalize(df["meta"])
df["meta"] = meta["redpajama_set_name"]

In [None]:
# Display the DataFrame for a visual check.
df

In [None]:
# Whitespace-delimited token count summed over every row of `text`,
# shown with thousands separators.
token_total = df["text"].str.split().str.len().sum()
f"Tokens: {token_total:,}"

## Save version without code to disk

In [None]:
# Rows are kept only when their subset label is one of these four; every
# other subset is dropped for the "no code" version.
kept_sets = ["RedPajamaC4", "RedPajamaCommonCrawl", "RedPajamaBook", "RedPajamaWikipedia"]
keep_mask = df["meta"].isin(kept_sets)
df_no_code = df[keep_mask]
df_no_code

In [None]:
# zstd is an efficient compression algorithm for text
# it has excellent decompression speed
parquet_path = Path("data/slim-pajama-test-no-code.parquet")
# The parquet writer expects the target directory to exist, so create it
# up front instead of failing on a fresh checkout.
parquet_path.parent.mkdir(parents=True, exist_ok=True)
df_no_code.to_parquet(parquet_path, compression="zstd")

## Read from disk

In [None]:
# Reload the filtered frame from the parquet file written above, then
# display it. Rebinds `df_no_code`, which the cells below read from.
df_no_code = pd.read_parquet("data/slim-pajama-test-no-code.parquet")
df_no_code

In [None]:
# Batch geometry for the spaCy pass: batch_size is chosen so the frame is
# covered by at most 200 batches.
df_len = len(df_no_code)
batch_size = df_len // 200 + 1
# Ceiling division, so an exact multiple of batch_size (or an empty frame)
# is not counted as one batch too many.
num_batches = (df_len + batch_size - 1) // batch_size

# Output directory for the serialized docs.
# NOTE(review): the parquet cells use "data/..." while this uses
# "../data/..." — confirm both resolve to the intended directory.
spacy_path = Path("../data/slim_pajama_docbins")
# parents=True so a missing "../data" does not abort the run.
spacy_path.mkdir(parents=True, exist_ok=True)
print(f"Data will be saved to {spacy_path}")


def proc_texts(batch):
    """Add every item of ``batch`` to the module-level ``doc_bin``.

    ``doc_bin`` is looked up as a global on each iteration, so it must be
    bound before this is called with a non-empty batch.
    """
    for parsed in batch:
        doc_bin.add(parsed)


# Parse the texts batch by batch, one DocBin per batch. Each batch maps to a
# checkpoint file named after its [i, end) row range; batches whose
# checkpoint already exists are not re-parsed.
# Zero-pad offsets to a common width so checkpoint names sort in row order.
width = len(str(df_len))
for batch_num, i in enumerate(range(0, df_len, batch_size), start=1):
    doc_bin = spacy.tokens.DocBin(store_user_data=True)
    end = min(df_len, i + batch_size)
    checkpoint = spacy_path / f"slim_pajama_{i:0{width}d}_{end:0{width}d}.docbin"

    batch = df_no_code["text"].iloc[i:end]

    # NOTE(review): nothing visible in this cell writes doc_bin to
    # `checkpoint` — confirm it is persisted once the batch is parsed.
    if not checkpoint.exists():
        for doc in tqdm(
            nlp.pipe(batch, disable=["ner"]),
            # tqdm's keyword is `desc`; report the 1-based batch number
            # rather than the row offset.
            desc=f"Processing {checkpoint.stem}...{batch_num} of {num_batches}",
            # The final batch can be shorter than batch_size.
            total=len(batch),
        ):
            doc_bin.add(doc)