# Data for Training Deep Neural Networks
Sourcing, cleaning, tokenization, mapping, and dataset cards.

In [None]:
!pip -q install -U datasets transformers pandas


In [None]:
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer
import pandas as pd

# Example: IMDB for text classification
ds = load_dataset("imdb")
print(ds)

# Train/validation/test subsets. IMDB is already split into "train" and "test",
# so the small validation set is carved out of the shuffled training data.
# The index ranges are disjoint, so no example is in both train and validation.
shuffled_train = ds["train"].shuffle(seed=42)  # fixed seed => reproducible subsets
small_train = shuffled_train.select(range(5000))
small_val   = shuffled_train.select(range(5000, 5500))
small_test  = ds["test"].shuffle(seed=42).select(range(2000))
raw = DatasetDict({"train": small_train, "validation": small_val, "test": small_test})
raw


In [None]:
# Tokenization (no extra text cleaning is done here beyond what the tokenizer applies)
tok = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def preprocess(ex):
    """Tokenize the "text" field of a batch, truncating and padding to max_length=256."""
    return tok(
        ex["text"],
        max_length=256,
        padding="max_length",
        truncation=True,
    )


# Encode every split in batches, drop the raw text column once it is encoded,
# and rename the target column from "label" to "labels".
tok_ds = raw.map(
    preprocess,
    batched=True,
    remove_columns=["text"],
).rename_column("label", "labels")
tok_ds.set_format(type="torch")  # request torch-formatted outputs when indexing
tok_ds


In [None]:
import json
import pprint

# Minimal data card (template): records where the data came from, how many
# examples each split holds, what processing was applied, and known caveats.
data_card = {
    "source": "IMDB from Hugging Face Datasets",
    "license": "See dataset card",
    # Number of examples per split, in the order the splits are stored.
    "splits": {split: len(tok_ds[split]) for split in tok_ds},
    "processing": "Lowercasing via tokenizer; truncation to 256 tokens; no deduping for demo",
    "known_issues": ["Small subset; sentiment domain only; minimal cleaning"],
}
# sort_dicts=False keeps the keys in the order they were written above.
pprint.pprint(data_card, sort_dicts=False)


**Next**: Use these prepared datasets in the SFT (supervised fine-tuning) and PEFT (parameter-efficient fine-tuning) notebooks.