In [1]:
import gc
import os
import json
import datasets
from dotenv import load_dotenv
import minds
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import torch
from huggingface_hub import HfApi, HfFolder, login
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import warnings

from honeybee.loaders import (
    PDFreport,
    Scan,
    Slide,
    generate_summary_from_json,
    get_chunk_text,
)
from honeybee.models import REMEDIS, UNI, HuggingFaceEmbedder, TissueDetector

os.environ["TOKENIZERS_PARALLELISM"] = "false"
warnings.filterwarnings("ignore")
load_dotenv()

True

In [2]:
def manifest_to_df(manifest_path, modality):
    with open(manifest_path, "r") as f:
        manifest = json.load(f)

    # Initialize an empty DataFrame for the modality
    modality_df = pd.DataFrame()

    # Process each patient in the manifest
    for patient in manifest:
        patient_id = patient["PatientID"]
        gdc_case_id = patient["gdc_case_id"]

        # Check if the current patient has the requested modality
        if modality in patient:
            # Convert the list of dictionaries into a DataFrame
            df = pd.DataFrame(patient[modality])
            # Add 'PatientID' and 'gdc_case_id' columns
            df["PatientID"] = patient_id
            df["gdc_case_id"] = gdc_case_id

            # Append the new data to the existing DataFrame for this modality
            modality_df = pd.concat([modality_df, df], ignore_index=True)

    # Check if the modality DataFrame is not empty before returning
    if not modality_df.empty:
        return modality_df
    else:
        return None

## Pathology Reports

In [10]:
!rm -rf "/mnt/d/TCGA/parquet/Pathology Report (gatortron-base)/"

In [4]:
PROJECTS = [
    "TCGA-ACC",
    # "TCGA-COAD",
    # "TCGA-KICH",
    # "TCGA-LIHC",
    # "TCGA-PAAD",
    # "TCGA-SKCM",
    # "TCGA-UCEC",
    # "TCGA-BLCA",
    # "TCGA-DLBC",
    # "TCGA-KIRC",
    # "TCGA-LUAD",
    # "TCGA-PCPG",
    # "TCGA-STAD",
    # "TCGA-UCS",
    # "TCGA-BRCA",
    # "TCGA-ESCA",
    # "TCGA-KIRP",
    # "TCGA-LUSC",
    # "TCGA-PRAD",
    # "TCGA-TGCT",
    # "TCGA-UVM",
    # "TCGA-CESC",
    # "TCGA-GBM",
    # "TCGA-LAML",
    # "TCGA-MESO",
    # "TCGA-READ",
    # "TCGA-THCA",
    # "TCGA-CHOL",
    # "TCGA-HNSC",
    # "TCGA-LGG",
    # "TCGA-OV",
    # "TCGA-SARC",
    # "TCGA-THYM",
]


def setup_writer(parquet_path, schema):
    """Setup or return existing ParquetWriter."""
    return pq.ParquetWriter(
        parquet_path,
        schema,
        flavor="spark",
        compression="snappy",
    )

for PROJECT in PROJECTS:
    DATA_DIR = f"/mnt/d/TCGA/raw/{PROJECT}"
    MANIFEST_PATH = DATA_DIR + "/manifest.json"
    MODALITY = "Pathology Report"
    PARQUET = f"/mnt/d/TCGA/parquet/{MODALITY}_gatortron-base.parquet"

    df = manifest_to_df(MANIFEST_PATH, MODALITY)
    embedding_model = HuggingFaceEmbedder(model_name="UFNLP/gatortron-base")
    pdf_report = PDFreport(chunk_size=512, chunk_overlap=10)

    if df is None:
        print(f"No data found for {PROJECT}")
        continue

    writer = None
    schema = None

    for index, row in tqdm(df.iterrows(), total=len(df), desc=f"Processing {PROJECT}"):
        try:
            file_path = f"{DATA_DIR}/raw/{row['PatientID']}/{MODALITY}/{row['id']}/{row['file_name']}"
            report_text = pdf_report.load(file_path)

            if report_text:
                embeddings = embedding_model.generate_embeddings(report_text)
                embedding_bytes = embeddings.tobytes()
                df.at[index, "report_text"] = report_text
                df.at[index, "embedding"] = embedding_bytes
                df.at[index, "embedding_shape"] = embeddings.shape
            else:
                raise ValueError("No report text loaded.")

        except Exception as e:
            print(f"Error processing {row['PatientID']}: {e}")

        table = pa.Table.from_pandas(df.iloc[[index]])
        if writer is None or not table.schema.equals(schema):
            if writer is not None:
                writer.close()
            schema = table.schema
            writer = setup_writer(PARQUET, schema)

        writer.write_table(table)

    if writer is not None:
        writer.close()

    gc.collect()
    torch.cuda.empty_cache()

Processing TCGA-ACC:   4%|▍         | 4/92 [00:00<00:05, 16.46it/s]

Error processing TCGA-OR-A5JL: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5LL: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5J8: Must have equal len keys and value when setting with an iterable
Error processing TCGA-P6-A5OG: Must have equal len keys and value when setting with an iterable


Processing TCGA-ACC:   9%|▊         | 8/92 [00:00<00:04, 17.66it/s]

Error processing TCGA-OR-A5KB: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5J7: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5JP: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5KS: Must have equal len keys and value when setting with an iterable


Processing TCGA-ACC:  13%|█▎        | 12/92 [00:00<00:04, 18.43it/s]

Error processing TCGA-OR-A5K8: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5LT: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5JQ: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5LO: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5KV: Must have equal len keys and value when setting with an iterable


Processing TCGA-ACC:  17%|█▋        | 16/92 [00:00<00:04, 18.29it/s]

Error processing TCGA-OR-A5JD: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5JY: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5L5: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5K6: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5KX: Must have equal len keys and value when setting with an iterable


Processing TCGA-ACC:  24%|██▍       | 22/92 [00:01<00:03, 19.84it/s]

Error processing TCGA-OR-A5JV: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5L1: Must have equal len keys and value when setting with an iterable
Error processing TCGA-PK-A5H8: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5JX: Must have equal len keys and value when setting with an iterable


Processing TCGA-ACC:  28%|██▊       | 26/92 [00:01<00:03, 17.98it/s]

Error processing TCGA-OR-A5JA: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5KU: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5JH: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5JO: Must have equal len keys and value when setting with an iterable


Processing TCGA-ACC:  32%|███▏      | 29/92 [00:01<00:03, 19.22it/s]

Error processing TCGA-OR-A5L8: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5LN: Must have equal len keys and value when setting with an iterable
Error processing TCGA-PK-A5HA: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5J4: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5KY: Must have equal len keys and value when setting with an iterable


Processing TCGA-ACC:  38%|███▊      | 35/92 [00:01<00:02, 20.33it/s]

Error processing TCGA-OR-A5KW: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5KP: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5K2: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5JW: Must have equal len keys and value when setting with an iterable


Processing TCGA-ACC:  41%|████▏     | 38/92 [00:01<00:02, 19.73it/s]

Error processing TCGA-OR-A5K4: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5JG: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5LR: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5LF: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5J5: Must have equal len keys and value when setting with an iterable


Processing TCGA-ACC:  45%|████▍     | 41/92 [00:02<00:02, 20.09it/s]

Error processing TCGA-OR-A5JS: Must have equal len keys and value when setting with an iterable
Error processing TCGA-PK-A5H9: Must have equal len keys and value when setting with an iterable
Error processing TCGA-P6-A5OF: Must have equal len keys and value when setting with an iterable
Error processing TCGA-PK-A5HC: Must have equal len keys and value when setting with an iterable


Processing TCGA-ACC:  52%|█████▏    | 48/92 [00:02<00:02, 18.27it/s]

Error processing TCGA-OR-A5J2: Must have equal len keys and value when setting with an iterable
Error processing TCGA-P6-A5OH: Must have equal len keys and value when setting with an iterable
Error processing TCGA-PA-A5YG: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5L4: Must have equal len keys and value when setting with an iterable


Processing TCGA-ACC:  55%|█████▌    | 51/92 [00:02<00:02, 18.98it/s]

Error processing TCGA-OR-A5JZ: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5KZ: Must have equal len keys and value when setting with an iterable
Error processing TCGA-PK-A5HB: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5LK: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5JJ: Must have equal len keys and value when setting with an iterable


Processing TCGA-ACC:  58%|█████▊    | 53/92 [00:02<00:02, 19.21it/s]

Error processing TCGA-OR-A5JR: Must have equal len keys and value when setting with an iterable


Processing TCGA-ACC:  63%|██████▎   | 58/92 [00:03<00:02, 15.56it/s]

Error processing TCGA-OR-A5L2: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5KQ: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5JK: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5J9: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OU-A5PI: Must have equal len keys and value when setting with an iterable


Processing TCGA-ACC:  68%|██████▊   | 63/92 [00:03<00:01, 17.39it/s]

Error processing TCGA-OR-A5L3: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5J1: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5KO: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5JI: Must have equal len keys and value when setting with an iterable


Processing TCGA-ACC:  74%|███████▍  | 68/92 [00:03<00:01, 18.84it/s]

Error processing TCGA-OR-A5JE: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5JC: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5JT: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5L9: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5LE: Must have equal len keys and value when setting with an iterable


Processing TCGA-ACC:  78%|███████▊  | 72/92 [00:03<00:01, 18.80it/s]

Error processing TCGA-OR-A5K3: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5J6: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5LH: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5JU: Must have equal len keys and value when setting with an iterable


Processing TCGA-ACC:  80%|████████  | 74/92 [00:04<00:00, 18.45it/s]

Error processing TCGA-OR-A5K5: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5JB: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5K9: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5LD: Must have equal len keys and value when setting with an iterable


Processing TCGA-ACC:  86%|████████▌ | 79/92 [00:04<00:00, 19.15it/s]

Error processing TCGA-OR-A5JF: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5KT: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5J3: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5LP: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5LM: Must have equal len keys and value when setting with an iterable


Processing TCGA-ACC:  92%|█████████▏| 85/92 [00:04<00:00, 20.79it/s]

Error processing TCGA-OR-A5LA: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5K0: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5LI: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5LS: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5LC: Must have equal len keys and value when setting with an iterable


Processing TCGA-ACC:  99%|█████████▉| 91/92 [00:04<00:00, 20.80it/s]

Error processing TCGA-OR-A5K1: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5LB: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5LG: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5JM: Must have equal len keys and value when setting with an iterable
Error processing TCGA-OR-A5LJ: Must have equal len keys and value when setting with an iterable


Processing TCGA-ACC: 100%|██████████| 92/92 [00:04<00:00, 18.73it/s]


Error processing TCGA-OR-A5L6: Must have equal len keys and value when setting with an iterable


## Slide Image

In [7]:
DATA_DIR = "/mnt/d/TCGA/raw/TCGA-LUAD/"
MANIFEST_PATH = "/mnt/d/TCGA/raw/TCGA-LUAD/manifest.json"
MODALITY = "Slide Image"
PARQUET = f"/mnt/d/TCGA/parquet/{MODALITY}.parquet"
HE_DETECTOR_PATH = "/mnt/f/Projects/Multimodal-Transformer/models/deep-tissue-detector_densenet_state-dict.pt"
EMBEDDING_MODEL_PATH = (
    "/mnt/d/ckpts/vit_large_patch16_224.dinov2.uni_mass100k/pytorch_model.bin"
)

df = manifest_to_df(MANIFEST_PATH, MODALITY)
tissue_detector = TissueDetector(model_path=HE_DETECTOR_PATH)
embedding_model_path = EMBEDDING_MODEL_PATH
uni = UNI()

df["embedding"] = None
df["embedding_shape"] = None
writer = None
for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing"):
    try:
        slide_image_path = f"{DATA_DIR}/raw/{row['PatientID']}/{MODALITY}/{row['id']}/{row['file_name']}"
        slide = Slide(
            slide_image_path,
            tileSize=512,
            max_patches=100,
            visualize=False,
            tissue_detector=tissue_detector,
        )
        patches = slide.load_patches_concurrently(target_patch_size=224)

        if patches.shape[0] == 0:
            slide = Slide(
                slide_image_path,
                tileSize=512,
                max_patches=1000,
                visualize=True,
                tissue_detector=tissue_detector,
            )
            patches = slide.load_patches_concurrently(target_patch_size=224)

            if patches.shape[0] == 0:
                with open("errors.txt", "a") as f:
                    f.write(f"{slide_image_path} | No patches extracted.\n")
                raise ValueError("No patches extracted.")

        embedding = uni.load_model_and_predict(embedding_model_path, patches)
        df.at[index, "embedding_shape"] = embedding.shape
        embedding = embedding.reshape(-1)
        embedding = np.array(embedding, dtype=np.float32)
        embedding = embedding.tobytes()
        df.at[index, "embedding"] = embedding
    except Exception as e:
        with open("errors.txt", "a") as f:
            f.write(f"{slide_image_path} | {e}\n")
        df.at[index, "embedding"] = None
        continue

    if writer is None:
        table = pa.Table.from_pandas(df.iloc[[index]])
        writer = pq.ParquetWriter(PARQUET, table.schema)
    else:
        table = pa.Table.from_pandas(df.iloc[[index]])
        writer.write_table(table)

    del slide, patches, embedding, table
    gc.collect()
    torch.cuda.empty_cache()

if writer is not None:
    writer.close()

Processing: 100%|██████████| 1608/1608 [00:07<00:00, 215.71it/s]


## CT

In [None]:
DATA_DIR = "/mnt/d/TCGA/raw/TCGA-LUAD"
MANIFEST_PATH = "/mnt/d/TCGA/raw/TCGA-LUAD/manifest.json"
MODALITY = "CT"
PARQUET = f"/mnt/d/TCGA-LUAD/parquet/{MODALITY}.parquet"

df = manifest_to_df(MANIFEST_PATH, MODALITY)

# --- CONFIGURATION ---
embedding_model_path = "/mnt/d/Models/REMEDIS/onnx/cxr-50x1-remedis-m.onnx"

# Define a consistent schema
schema = pa.schema(
    [
        ("StudyInstanceUID", pa.string()),
        ("SeriesInstanceUID", pa.string()),
        ("SeriesDate", pa.string()),
        ("BodyPartExamined", pa.string()),
        ("SeriesNumber", pa.string()),
        ("Collection", pa.string()),
        ("Manufacturer", pa.string()),
        ("ManufacturerModelName", pa.string()),
        ("SoftwareVersions", pa.string()),
        ("Visibility", pa.string()),
        ("ImageCount", pa.int64()),
        ("PatientID", pa.string()),
        ("gdc_case_id", pa.string()),
        ("ProtocolName", pa.string()),
        ("SeriesDescription", pa.string()),
        ("embedding", pa.binary()),
        ("embedding_shape", pa.list_(pa.int64())),
        ("__index_level_0__", pa.int64()),
    ]
)

df["embedding"] = None
df["embedding_shape"] = None
writer = None
for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing"):
    try:
        file_path = f"{DATA_DIR}/raw/{row['PatientID']}/{MODALITY}/{row['SeriesInstanceUID']}/{row['SeriesInstanceUID']}"
        scanner = Scan(file_path, modality="CT")
        patches = scanner.load_patches(target_patch_size=448)
        embedding = REMEDIS.load_model_and_predict(embedding_model_path, patches)
        df.at[index, "embedding_shape"] = embedding.shape
        embedding = embedding.reshape(-1)
        embedding = embedding.tobytes()
        df.at[index, "embedding"] = embedding
    except Exception as e:
        print(f"\033[91mError: {e}\033[0m")
        df.at[index, "embedding"] = None
        scanner = None
        patches = None
        embedding = None
        table = None

    if writer is None:
        table = pa.Table.from_pandas(df.iloc[[index]])
        writer = pq.ParquetWriter(PARQUET, schema)
    else:
        table = pa.Table.from_pandas(df.iloc[[index]], schema=schema)
        writer.write_table(table)

    del scanner, patches, embedding, table
    gc.collect()
    torch.cuda.empty_cache()

if writer is not None:
    writer.close()

## Clinical Data

In [None]:
def process_group(group):
    common_fields = {}
    nested_objects = []
    for col in group.columns:
        unique_values = group[col].dropna().unique()
        if len(unique_values) == 1:
            # If only one unique value exists, it's a common field
            common_fields[col] = unique_values[0]

    # Create nested objects for fields that are not common
    for idx, row in group.iterrows():
        nested_object = {
            col: row[col]
            for col in group.columns
            if col not in common_fields and pd.notna(row[col])
        }
        if nested_object:  # Only add if the nested object is not empty
            nested_objects.append(nested_object)

    return common_fields, nested_objects


PROJECTS = [
    "TCGA-ACC",
    "TCGA-COAD",
    "TCGA-KICH",
    "TCGA-LIHC",
    "TCGA-PAAD",
    "TCGA-SKCM",
    "TCGA-UCEC",
    "TCGA-BLCA",
    "TCGA-DLBC",
    "TCGA-KIRC",
    "TCGA-LUAD",
    "TCGA-PCPG",
    "TCGA-STAD",
    "TCGA-UCS",
    "TCGA-BRCA",
    "TCGA-ESCA",
    "TCGA-KIRP",
    "TCGA-LUSC",
    "TCGA-PRAD",
    "TCGA-TGCT",
    "TCGA-UVM",
    "TCGA-CESC",
    "TCGA-GBM",
    "TCGA-LAML",
    "TCGA-MESO",
    "TCGA-READ",
    "TCGA-THCA",
    "TCGA-CHOL",
    "TCGA-HNSC",
    "TCGA-LGG",
    "TCGA-OV",
    "TCGA-SARC",
    "TCGA-THYM",
]

embedding_model = HuggingFaceEmbedder(model_name="UFNLP/gatortron-medium")

for PROJECT in PROJECTS:
    print(f"Processing {PROJECT}")
    DATA_DIR = f"/mnt/d/TCGA/raw/{PROJECT}"
    MANIFEST_PATH = DATA_DIR + "/manifest.json"
    MODALITY = "Clinical Data"
    PARQUET = f"/mnt/d/TCGA/parquet/{PROJECT}/{MODALITY}.parquet"

    tables = minds.get_tables()
    json_objects = {}
    for table in tqdm(tables, desc="Getting data from tables"):
        query = f"SELECT * FROM minds.{table} WHERE project_id='{PROJECT}'"
        df = minds.query(query)
        for case_id, group in tqdm(df.groupby("case_submitter_id"), leave=False):
            if case_id not in json_objects:
                json_objects[case_id] = {}
            common_fields, nested_objects = process_group(group)
            json_objects[case_id].update(common_fields)
            json_objects[case_id][table] = nested_objects

    df = []
    for case_id, patient_data in tqdm(json_objects.items()):
        summary = generate_summary_from_json(patient_data)
        if len(summary) > 0:
            summary_chunks = get_chunk_text(summary)
            chunk_embeddings = []
            for chunk in summary_chunks:
                chunk_embedding = embedding_model.generate_embeddings([chunk])
                chunk_embeddings.append(chunk_embedding)
            clinical_embedding = np.array(chunk_embeddings)
        else:
            clinical_embedding = None
        patient_data["text"] = summary
        patient_data["embedding_shape"] = clinical_embedding.shape
        clinical_embedding = clinical_embedding.reshape(-1)
        clinical_embedding = np.array(clinical_embedding, dtype=np.float32)
        clinical_embedding = clinical_embedding.tobytes()
        patient_data["embedding"] = clinical_embedding
        # Create a new dictionary for DataFrame conversion, excluding lists
        patient_data_for_df = {
            key: value
            for key, value in patient_data.items()
            if not isinstance(value, list)
        }
        df.append(patient_data_for_df)

    clinical_df = pd.DataFrame(df)
    clinical_df.to_parquet(PARQUET, index=False)

## Uploading datasets to huggingface

### Upload the Local HF datasets to HuggingFace datasets hub

In [None]:
# api = HfApi()
# api.upload_folder(
#     folder_path="/mnt/d/TCGA-LUAD/parquet/",
#     repo_id="aakashtripathi/TCGA-LUAD",
#     repo_type="dataset",
#     multi_commits=True,
#     multi_commits_verbose=True,
# )

## Loading Dataset from HuggingFace into PyTorch Dataloader