In [1]:
pip install datasets pillow tqdm




In [2]:
pip uninstall pyarrow -y

Found existing installation: pyarrow 22.0.0Note: you may need to restart the kernel to use updated packages.

Uninstalling pyarrow-22.0.0:
  Successfully uninstalled pyarrow-22.0.0


You can safely remove it manually.
You can safely remove it manually.


In [1]:
pip install pyarrow --upgrade

Note: you may need to restart the kernel to use updated packages.


In [1]:
# Smoke test: the import raises if the reinstalled pyarrow cannot load its
# Parquet submodule, so reaching the print means the reinstall worked.
from pyarrow import parquet as pq
print("PyArrow Parquet OK !")

PyArrow Parquet OK !


In [2]:
pip install huggingface_hub[hf_xet]

Note: you may need to restart the kernel to use updated packages.


In [1]:
from datasets import load_dataset
from PIL import Image
import os
from tqdm import tqdm

# 1. Download the first 30% of the train split
print("Téléchargement du dataset (30%) ...")
dataset = load_dataset("timbrooks/instructpix2pix-clip-filtered", split="train[:30%]")

# 2. Split into train (80%), validation (10%) and test (10%).
#    First carve off 20% of the data, then cut that held-out part in half.
#    Both calls use the same fixed seed so the partition is reproducible.
train_vs_rest = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_vs_rest["train"]
temp_dataset = train_vs_rest["test"]

val_vs_test = temp_dataset.train_test_split(test_size=0.5, seed=42)
val_dataset = val_vs_test["train"]
test_dataset = val_vs_test["test"]

# Name -> dataset mapping consumed by the export step
splits = dict(train=train_dataset, val=val_dataset, test=test_dataset)

# 3. Helpers that export one split: 256x256 PNG images plus a metadata.tsv index
def _sanitize_tsv_field(text):
    """Return ``text`` with tab, newline and carriage-return characters replaced by spaces.

    Carriage returns must be removed too: files opened in text mode are read with
    universal newlines, so a leftover carriage return would come back as a line
    break and split the row in two.
    """
    return text.replace("\t", " ").replace("\n", " ").replace("\r", " ")


def save_dataset(split_name, split_dataset, base_dir="dataset_leditspp_256"):
    """Write one split to disk as resized PNG pairs and a tab-separated metadata file.

    Creates ``<base_dir>/<split_name>/original`` and ``.../edited``, saves every
    sample's two images resized to 256x256 (LANCZOS) as ``NNNNNN.png``, and writes
    ``<base_dir>/<split_name>/metadata.tsv`` with one row per exported sample.

    Args:
        split_name: Name of the sub-directory created under ``base_dir``.
        split_dataset: Sized iterable of samples. Each sample is indexed with the
            keys ``original_image``, ``edited_image``, ``edit_prompt``,
            ``original_prompt`` and ``edited_prompt``; the images must offer
            ``resize`` and ``save`` (presumably PIL images - confirm against the
            dataset), the prompts must be strings.
        base_dir: Root output directory.

    A sample that raises is reported on stdout and skipped. Any image already
    written for that sample is deleted, so the image folders and metadata.tsv
    always describe the same set of samples. File numbering follows the sample's
    position in ``split_dataset``, so skipped samples leave gaps.
    """
    split_dir = os.path.join(base_dir, split_name)
    # Creating the two leaf folders also creates split_dir itself.
    for sub_dir in ("original", "edited"):
        os.makedirs(os.path.join(split_dir, sub_dir), exist_ok=True)

    prompt_keys = ("edit_prompt", "original_prompt", "edited_prompt")
    meta_path = os.path.join(split_dir, "metadata.tsv")
    with open(meta_path, "w", encoding="utf-8") as meta:
        meta.write("original_image_path\tedited_image_path\tedit_prompt\toriginal_prompt\tedited_prompt\n")

        for i, sample in tqdm(enumerate(split_dataset), total=len(split_dataset), desc=f"Processing {split_name}"):
            # Paths are stored relative to split_dir so the exported folder can be moved.
            orig_path = f"original/{i:06d}.png"
            edit_path = f"edited/{i:06d}.png"

            try:
                # Validate and clean the text first: an unusable prompt must not
                # leave image files behind without a metadata row.
                prompts = [_sanitize_tsv_field(sample[key]) for key in prompt_keys]

                # Resize both images to 256x256
                orig_img = sample["original_image"].resize((256, 256), Image.LANCZOS)
                edit_img = sample["edited_image"].resize((256, 256), Image.LANCZOS)

                orig_img.save(os.path.join(split_dir, orig_path))
                edit_img.save(os.path.join(split_dir, edit_path))

                meta.write("\t".join([orig_path, edit_path, *prompts]) + "\n")

            except Exception as e:
                print(f"Erreur sur l’échantillon {i}: {e}")
                # Best-effort cleanup of whatever was written for this sample.
                for rel_path in (orig_path, edit_path):
                    try:
                        os.remove(os.path.join(split_dir, rel_path))
                    except OSError:
                        pass  # nothing was written for this path, or it cannot be removed

# 4. Export every split to disk, in the order they were registered in `splits`
for split_name in splits:
    split_dataset = splits[split_name]
    save_dataset(split_name, split_dataset)

print("Tous les splits sont prêts dans :", "dataset_leditspp_256")


Téléchargement du dataset (30%) ...


Resolving data files:   0%|          | 0/262 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/58 [00:00<?, ?it/s]

Processing train: 100%|████████████████████████████████████████████████████████| 75122/75122 [2:13:32<00:00,  9.38it/s]
Processing val: 100%|██████████████████████████████████████████████████████████████| 9390/9390 [16:05<00:00,  9.73it/s]
Processing test: 100%|█████████████████████████████████████████████████████████████| 9391/9391 [16:05<00:00,  9.72it/s]

Tous les splits sont prêts dans : dataset_leditspp_256





In [3]:
# Rewrite the train metadata, keeping only rows with exactly 5 tab-separated columns.
input_path = "dataset_leditspp_256/train/metadata.tsv"
output_path_fixed = "dataset_leditspp_256/train/metadata_fixed.tsv"

cleaned = 0  # rows copied to the fixed file
removed = 0  # rows dropped because of a wrong column count

with open(input_path, "r", encoding="utf-8") as fin, open(output_path_fixed, "w", encoding="utf-8") as fout:
    header = fin.readline().strip()
    fout.write(header + "\n")

    for line in fin:
        # Drop the line terminator, every double quote, and stray spaces at both ends.
        # Tabs are deliberately kept: a bare .strip() would also remove the trailing
        # tab of a row whose last column is empty, leaving 4 fields and getting a
        # valid row discarded.
        line = line.rstrip("\n").replace('"', '').strip(" ")

        # Split on tabs
        parts = line.split("\t")

        # Anything other than exactly 5 columns is malformed -> skip the row
        if len(parts) == 5:
            fout.write("\t".join(parts) + "\n")
            cleaned += 1
        else:
            removed += 1

print(f"Nettoyage final terminé : {cleaned} lignes gardées, {removed} lignes supprimées.")
print("Nouveau fichier corrigé :", output_path_fixed)


Nettoyage final terminé : 75122 lignes gardées, 0 lignes supprimées.
Nouveau fichier corrigé : dataset_leditspp_256/train/metadata_fixed.tsv


In [9]:
# Rewrite the test metadata, keeping only rows with exactly 5 tab-separated columns.
input_path = "dataset_leditspp_256/test/metadata.tsv"
output_path_fixed = "dataset_leditspp_256/test/metadata_fixed.tsv"

cleaned = 0  # rows copied to the fixed file
removed = 0  # rows dropped because of a wrong column count

with open(input_path, "r", encoding="utf-8") as fin, open(output_path_fixed, "w", encoding="utf-8") as fout:
    header = fin.readline().strip()
    fout.write(header + "\n")

    for line in fin:
        # Drop the line terminator, every double quote, and stray spaces at both ends.
        # Tabs are deliberately kept: a bare .strip() would also remove the trailing
        # tab of a row whose last column is empty, leaving 4 fields and getting a
        # valid row discarded.
        line = line.rstrip("\n").replace('"', '').strip(" ")

        # Split on tabs
        parts = line.split("\t")

        # Anything other than exactly 5 columns is malformed -> skip the row
        if len(parts) == 5:
            fout.write("\t".join(parts) + "\n")
            cleaned += 1
        else:
            removed += 1

print(f"Nettoyage final terminé : {cleaned} lignes gardées, {removed} lignes supprimées.")
print("Nouveau fichier corrigé :", output_path_fixed)


Nettoyage final terminé : 9391 lignes gardées, 0 lignes supprimées.
Nouveau fichier corrigé : dataset_leditspp_256/test/metadata_fixed.tsv


In [10]:
# Rewrite the validation metadata, keeping only rows with exactly 5 tab-separated columns.
input_path = "dataset_leditspp_256/val/metadata.tsv"
output_path_fixed = "dataset_leditspp_256/val/metadata_fixed.tsv"

cleaned = 0  # rows copied to the fixed file
removed = 0  # rows dropped because of a wrong column count

with open(input_path, "r", encoding="utf-8") as fin, open(output_path_fixed, "w", encoding="utf-8") as fout:
    header = fin.readline().strip()
    fout.write(header + "\n")

    for line in fin:
        # Drop the line terminator, every double quote, and stray spaces at both ends.
        # Tabs are deliberately kept: a bare .strip() would also remove the trailing
        # tab of a row whose last column is empty, leaving 4 fields and getting a
        # valid row discarded.
        line = line.rstrip("\n").replace('"', '').strip(" ")

        # Split on tabs
        parts = line.split("\t")

        # Anything other than exactly 5 columns is malformed -> skip the row
        if len(parts) == 5:
            fout.write("\t".join(parts) + "\n")
            cleaned += 1
        else:
            removed += 1

print(f"Nettoyage final terminé : {cleaned} lignes gardées, {removed} lignes supprimées.")
print("Nouveau fichier corrigé :", output_path_fixed)


Nettoyage final terminé : 9390 lignes gardées, 0 lignes supprimées.
Nouveau fichier corrigé : dataset_leditspp_256/val/metadata_fixed.tsv


In [4]:
import pandas as pd

# Sanity check: load the cleaned *train* metadata back with pandas.
# The path is spelled out instead of reusing the global `output_path_fixed`:
# that variable holds whichever split was cleaned last, so a top-to-bottom run
# would silently load the validation file rather than the train file.
train_metadata_path = "dataset_leditspp_256/train/metadata_fixed.tsv"
df = pd.read_csv(train_metadata_path, sep="\t", engine="python")
print("Chargement réussi :", len(df), "lignes valides")
print(df.head())

Chargement réussi : 75122 lignes valides
   original_image_path  edited_image_path                   edit_prompt  \
0  original/000000.png  edited/000000.png  Make it a landscape painting   
1  original/000001.png  edited/000001.png       Add a desert background   
2  original/000002.png  edited/000002.png             Add a storm cloud   
3  original/000003.png  edited/000003.png      make the sheep a giraffe   
4  original/000004.png  edited/000004.png          make it a watercolor   

                                     original_prompt  \
0         Milky way over Reessor lake by Alan Dyer ©   
1  Juan Bosco Forest Animals - Elizabeth Taylor a...   
2  Wooden barn on top of alpine meadow with rugge...   
3  Click image for larger version.  Name:sheep-wi...   
4  KEVIN BEERS Clark Point oil on board, 18 x 24 ...   

                                       edited_prompt  
0  Landscape painting of Milky way over Reessor l...  
1  Juan Bosco Forest Animals - Elizabeth Taylor a...  
2  Woo