# Download the dataset

git lfs install  # Set up the Git LFS (Large File Storage) hooks so the clone below downloads the large dataset files

git clone https://huggingface.co/datasets/DefectSpectrum/Defect_Spectrum



## Load the data into a pandas DataFrame

In [1]:
import os
import pandas as pd

# Dataset folders, grouped by how many directory levels sit above the files.
# Flat layout: <dataset>/<type>/<file>
normal_datasets = [
    "DS-Cotton-Fabric",
    "DS-DAGM",
    "synthetic_Cotton_Fabric",
]
# One extra level: <dataset>/<variant>/<type>/<file>
variant_datasets = [
    "DS-VISION",
    "synthetic_MVTec",
    "synthetic_VISION",
]
# Two extra levels: <dataset>/<variant>/<type>/<part>/<file>
captioned_dataset = ["DS-MVTec"]

# Folder created by the clone, relative to the working directory.
root_dir = "./Defect_Spectrum"

# Collects one dict per image file found while walking root_dir.
entries = []


def read_defect(dataset_path, dataset_name, type, variant="None", part="None"):
    """Collect one record per image file located directly in ``dataset_path``.

    The scan is not recursive. A file counts as an image when its lower-cased
    name ends with ``.png``, ``.jpg``, ``.jpeg`` or ``.bmp``.

    Args:
        dataset_path: Directory to scan.
        dataset_name: Stored under the ``"dataset"`` key of every record.
        type: Stored under the ``"type"`` key. The name shadows the builtin
            but is kept because it is part of the existing signature.
        variant: Stored under ``"variant"``; defaults to the string ``"None"``.
        part: Stored under ``"part"``; defaults to the string ``"None"``.

    Returns:
        A list of dicts with the keys ``file_path``, ``file_name``,
        ``dataset``, ``type``, ``variant`` and ``part``, ordered by file name.
        The list is empty when ``dataset_path`` is not a directory.
    """
    if not os.path.isdir(dataset_path):
        return []  # Not a directory: nothing to collect

    image_suffixes = ('.png', '.jpg', '.jpeg', '.bmp')
    # os.listdir makes no ordering promise; sorting keeps the row order (and
    # anything downstream that picks the "first" match) stable across runs.
    return [
        {
            "file_path": os.path.join(dataset_path, file_name),
            "file_name": file_name,
            "dataset": dataset_name,
            "type": type,
            "variant": variant,
            "part": part,
        }
        for file_name in sorted(os.listdir(dataset_path))
        if file_name.lower().endswith(image_suffixes)
    ]

# Walk every known dataset folder and gather image records. The three groups
# differ only in how many directory levels sit above the image files.
# The loop variable is called type_name (not type) so the builtin type()
# stays usable in the rest of the notebook.
for dataset in os.listdir(root_dir):
    dataset_path = os.path.join(root_dir, dataset)

    if dataset in normal_datasets:
        # <dataset>/<type>/<file>
        for type_name in os.listdir(dataset_path):
            type_path = os.path.join(dataset_path, type_name)
            entries.extend(read_defect(type_path, dataset, type_name))

    elif dataset in variant_datasets:
        # <dataset>/<variant>/<type>/<file>
        for variant in os.listdir(dataset_path):
            variant_path = os.path.join(dataset_path, variant)
            if not os.path.isdir(variant_path):
                continue
            for type_name in os.listdir(variant_path):
                type_path = os.path.join(variant_path, type_name)
                entries.extend(
                    read_defect(type_path, dataset, type_name, variant=variant)
                )

    elif dataset in captioned_dataset:
        # <dataset>/<variant>/<type>/<part>/<file>
        for variant in os.listdir(dataset_path):
            variant_path = os.path.join(dataset_path, variant)
            if not os.path.isdir(variant_path):
                continue
            for type_name in os.listdir(variant_path):
                type_path = os.path.join(variant_path, type_name)
                if not os.path.isdir(type_path):
                    continue
                for part in os.listdir(type_path):
                    part_path = os.path.join(type_path, part)
                    entries.extend(
                        read_defect(
                            part_path, dataset, type_name,
                            variant=variant, part=part,
                        )
                    )

# One row per discovered image file.
df = pd.DataFrame(entries)
len(df)

13069

In [2]:
# Remove the role markers from each file name so that an image and its mask
# end up with the same file_name value.
def _strip_role_markers(name):
    for marker in ("_rgb", "_rbg", "_mask", "_image", "_converted"):
        name = name.replace(marker, "")
    return name


df["file_name"] = df["file_name"].apply(_strip_role_markers)
len(df)

13069

In [3]:
# Keep only rows that come from an image or mask folder. The explicit copy
# makes df an independent frame, so the later in-place write to the "type"
# column does not target a slice of the unfiltered frame
# (avoids pandas' SettingWithCopyWarning).
df = df[df["type"].isin(["image", "images", "masks", "rgb_mask", "rbg_mask"])].copy()
len(df)

8850

In [4]:
def fix_type(s):
    """Collapse the folder-name spellings into ``"image"`` or ``"mask"``.

    ``"images"`` becomes ``"image"``; ``"masks"``, ``"rbg_mask"`` and
    ``"rgb_mask"`` become ``"mask"``. Any other value is returned unchanged.
    """
    if s == "images":
        return "image"
    if s in ("masks", "rbg_mask", "rgb_mask"):
        return "mask"
    return s

# Rewrite the "type" column through fix_type, element by element.
normalized_types = df["type"].apply(fix_type)
df.loc[:, "type"] = normalized_types

In [5]:
# Spread the paths into one column per type: each distinct
# (file_name, dataset, variant, part) combination becomes a single row.
# When several files share a key and a type, the first path is kept.
pivot_keys = ['file_name', 'dataset', 'variant', 'part']
paths_by_type = df.pivot_table(
    index=pivot_keys,
    columns='type',
    values='file_path',
    aggfunc='first',
)
result = paths_by_type.reset_index()
result

type,file_name,dataset,variant,part,image,mask
0,0.png,DS-DAGM,,,./Defect_Spectrum/DS-DAGM/image/0.png,./Defect_Spectrum/DS-DAGM/rbg_mask/0_rgb_mask.png
1,000.png,DS-MVTec,bottle,broken_large,./Defect_Spectrum/DS-MVTec/bottle/image/broken...,./Defect_Spectrum/DS-MVTec/bottle/rbg_mask/bro...
2,000.png,DS-MVTec,bottle,broken_small,./Defect_Spectrum/DS-MVTec/bottle/image/broken...,./Defect_Spectrum/DS-MVTec/bottle/rbg_mask/bro...
3,000.png,DS-MVTec,bottle,contamination,./Defect_Spectrum/DS-MVTec/bottle/image/contam...,./Defect_Spectrum/DS-MVTec/bottle/rbg_mask/con...
4,000.png,DS-MVTec,bottle,good,./Defect_Spectrum/DS-MVTec/bottle/image/good/0...,
...,...,...,...,...,...,...
5429,samples_000016_1_0_1.png,synthetic_MVTec,screw,,./Defect_Spectrum/synthetic_MVTec/screw/images...,./Defect_Spectrum/synthetic_MVTec/screw/masks/...
5430,samples_000016_1_1_1.png,synthetic_MVTec,screw,,./Defect_Spectrum/synthetic_MVTec/screw/images...,./Defect_Spectrum/synthetic_MVTec/screw/masks/...
5431,samples_000017_0_0_1.png,synthetic_MVTec,screw,,./Defect_Spectrum/synthetic_MVTec/screw/images...,./Defect_Spectrum/synthetic_MVTec/screw/masks/...
5432,samples_000017_0_1_1.png,synthetic_MVTec,screw,,./Defect_Spectrum/synthetic_MVTec/screw/images...,./Defect_Spectrum/synthetic_MVTec/screw/masks/...


In [6]:
# Drop file_name, keep the two path columns plus the grouping keys, and give
# mask / variant / part their final column names.
kept_columns = ["image", "mask", "dataset", "variant", "part"]
final_names = {"mask": "mask_image", "variant": "object", "part": "defect"}
result = result[kept_columns].rename(columns=final_names)
len(result)

5434

In [7]:
# Rebuild the frame from plain lists so it starts with a fresh default index
# and carries nothing over from the pivot besides the values themselves.
new_df = pd.DataFrame(
    {
        column: result[column].tolist()
        for column in ("image", "mask_image", "dataset", "object", "defect")
    }
)
# File name of the image, used as a merge key against the captions table.
# os.path.basename matches the separator os.path.join used when the paths
# were built, so this also works where that separator is not "/".
# str() keeps missing paths (NaN) from raising; they become "nan".
new_df["name"] = new_df["image"].apply(lambda p: os.path.basename(str(p)))

In [8]:
# Load the caption sheet and split its "Path" column on "/" into three
# pieces, named object / defect / name, so it can be joined on those keys.
captions = pd.read_excel("./Defect_Spectrum/DS-MVTec/captions.xlsx")
path_parts = captions["Path"].str.split("/", expand=True)
path_parts.columns = ["object", "defect", "name"]
captions = pd.concat([captions, path_parts], axis=1)

# Keep the join keys and the two description columns, renamed to snake_case.
wanted = ["name", "object", "defect", "object description", "defect description"]
captions = captions[wanted].rename(
    columns={
        "object description": "object_description",
        "defect description": "defect_description",
    }
)

  warn("""Cannot parse header or footer so it will be ignored""")


In [9]:
# Join captions onto the image/mask table. The outer join keeps unmatched
# rows from both sides; rows without an image path or without a mask path
# are then discarded and the index is renumbered.
merged = pd.merge(new_df, captions, how='outer', on=['defect', 'object', 'name'])
final_df = (
    merged.drop(columns='name')
    .dropna(subset=['image', 'mask_image'])
    .reset_index(drop=True)
)

In [10]:
from datasets import Dataset

# First conversion of final_df; no explicit features are passed here.
# NOTE(review): hf_dataset is reassigned further down by a from_pandas call
# that passes explicit features, so this conversion looks redundant —
# confirm before removing.
hf_dataset = Dataset.from_pandas(final_df)

In [11]:
from datasets import Dataset, Features, Image, Value

# Explicit schema for the conversion: the two path columns are declared as
# Image features (loaded from the stored path), every other column as a
# plain string.
text_columns = (
    "dataset",
    "object",
    "defect",
    "object_description",
    "defect_description",
)
features = Features(
    {
        "image": Image(),
        "mask_image": Image(),
        **{column: Value("string") for column in text_columns},
    }
)

hf_dataset = Dataset.from_pandas(final_df, features=features)


In [12]:
# Row count of the dataset built from final_df.
len(hf_dataset)

3416

In [13]:
import os

from huggingface_hub import login

# Take the access token from the HF_TOKEN environment variable instead of
# writing it into the notebook, where it would be saved and shared with the
# file. When the variable is unset, token is None and login() is documented
# to prompt for the token instead.
login(token=os.environ.get("HF_TOKEN"))

In [14]:

# Destination dataset repository on the Hub ("<namespace>/<name>").
repo_name = "hamedrahimi/Defect_Spectrum_cleaned"

# Upload hf_dataset to that repository.
hf_dataset.push_to_hub(repo_id=repo_name)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/3416 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/35 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/hamedrahimi/Defect_Spectrum_cleaned/commit/0af4b9b32a3239c1c87fdfdb687e73359fa1dcfc', commit_message='Upload dataset', commit_description='', oid='0af4b9b32a3239c1c87fdfdb687e73359fa1dcfc', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/hamedrahimi/Defect_Spectrum_cleaned', endpoint='https://huggingface.co', repo_type='dataset', repo_id='hamedrahimi/Defect_Spectrum_cleaned'), pr_revision=None, pr_num=None)