# Imports

In [None]:
import os
from pathlib import Path
import datetime
import numpy as np
import pandas
import ramses2
from pathlib import Path

# Dataset format
The data must be organized as follows:

**- An annotations.csv file with:**

**label**: (int) label of the object in the mask image </br>
**baseimg**: (string) base image name e.g. "IM0001", </br>
**x0,y0,x1,y1**: (int) coordinates of the bounding box corners (in pixels, format x0y0x1y1)   </br>
**class**: (string) class of the object </br>
**res**: (float) resolution in pixels/mm </br>
**mass**: (float) mass in g </br>
**gt_mass**: True if ground truth mass (individual measure), False if estimated mass </br>
**height, width**: (int) image dims </br>

**- The images and labeled masks**

The files must be in the /images and /labels folders. An image and its corresponding labeled mask must have the same basename.


**- metadata.json**

The metadata.json file contains other information as well as the mass for each batch of aggregates.
It contains a list of images and a list of image batches with their mass.

```json
{"images":[{
                "file_name": "IMG1.jpg",
                "res": 28.7,
                "height": 4096,
                "width": 6144,
                "camera": "JAI SW-8000Q",
                "date": "2024-10-09",
                "tag": "CONVEYOR"
            },],
    "batches":[{"image_names":["IMG1.jpg", "IMG2.jpg",...],
                "mass":{"CLASS1":111,"CLASS2":222}},
                {"image_names":["IMG3.jpg", "IMG4.jpg",...],
                "mass":{"CLASS3":111}},]
    }
```

# DatasetManager class
The `ramses2.DatasetManager` class can be used to generate a train/test split using one or more dataset folders.

Given one or several annotations.csv files, it generates a new CSV file with the dataset id and other information, as well as a JSON file containing the filenames of the train/valid split.

# How to create a new split

## Define Class ids
It must be consistent with the network's parameters

**Important** : 0 is the reserved index for the background class, do not use it !

In [2]:
# Class name -> class index. The indices must match the ones the network was
# configured with. Index 0 always means background, so it is left out of the
# table and the smallest index used here is 1.
cls_to_idx = {
    "Ra": 1, "Rc": 2,
    "Rb01": 3, "Rb02": 4,
    "Rcu01": 5,
    "Ru01": 6, "Ru02": 7, "Ru04": 8, "Ru05": 9, "Ru06": 10,
    "X01": 11,
    "Coin": 12,
    "X02": 13, "X03": 14,
    "Rg": 15,
}

# Reverse lookup: class index -> class name.
idx_to_cls = dict((idx, name) for name, idx in cls_to_idx.items())

# Classes folded into another class when the dataset is created
# (source class -> target class).
# Further entries can be added the same way, e.g. mapping "X02", "X03", "X04",
# "Pl", "SHELLS", "Rg" or "UNKNOWN" to a common "Other" class.
merging_policy = {"Ru03": "Ru02"}

print(cls_to_idx)
print(len(cls_to_idx))


{'Ra': 1, 'Rc': 2, 'Rb01': 3, 'Rb02': 4, 'Rcu01': 5, 'Ru01': 6, 'Ru02': 7, 'Ru04': 8, 'Ru05': 9, 'Ru06': 10, 'X01': 11, 'Coin': 12, 'X02': 13, 'X03': 14, 'Rg': 15}
15


## Model parameters
They are needed to generate the dataset and to filter the images.

In [3]:
# Model parameters, passed to the DatasetManager when the split is built.
# NOTE(review): mask_stride is presumably the downsampling factor of the
# network's mask output; both input dimensions are multiples of it — confirm.
mask_stride = 8
# NOTE(review): presumably (height, width) in pixels — confirm against the model.
input_shape = (3072, 4608)

## Create new Dataset split
Here we create a new DatasetManager instance using one or several database folders. </br>
We add a column "_id_" to tag each dataset and a column "_folder_" to indicate where the dataset is stored on disk

In [None]:
# Root folder of every dataset to combine; each one must contain an
# annotations.csv file.
DS_paths = [
    Path("PATH/TO/DATASET1"),
    Path("PATH/TO/DATASET2"),
]

# Read each annotation table and tag its rows with their origin:
#   "folder" -> where the dataset is stored on disk
#   "id"     -> dataset tag (the folder's basename)
frames = []
for ds_dir in DS_paths:
    ds_id = os.path.basename(ds_dir)
    annotations = pandas.read_csv(ds_dir / "annotations.csv", engine="python")
    print(f"Appending {ds_id} dataset with {len(annotations)} instances")
    # "folder" is always (re)written, even when the CSV already has that column.
    annotations["folder"] = str(ds_dir)
    annotations["id"] = ds_id
    frames.append(annotations)

# Stack all datasets into one table. The original row labels are kept, so the
# index is not unique across datasets.
df = pandas.concat(frames)

# Apply the class merges one after the other, in dictionary order.
for key, target in merging_policy.items():
    print(f"merging class {key} and {target}")
    df["class"] = np.where(df["class"] == key, target, df["class"])
print(f"Found {len(df)} instances")

# Alphabetically sorted list of the classes present after merging.
all_cls_names = sorted(df["class"].unique().tolist())

print(df["class"].value_counts())

# Building the manager also reports the images it skips (see the cell output).
# NOTE(review): mininst/maxinst/minres look like per-image filters (instance
# count bounds, minimum resolution) — confirm in the ramses2 documentation.
dataloader = ramses2.DatasetManager(
    df,
    input_shape=input_shape,
    cls_to_idx=cls_to_idx,
    mininst=1,
    maxinst=1000,
    minres=0,
    mask_stride=mask_stride,
    augmentation_func=None,
    shuffle=True,
)

Appending RASET dataset with 89600 instances
Appending SYNTH15CLS dataset with 17875 instances
merging class Ru03 and Ru02
Found 107475 instances
class
Rc         27659
Ru01       15059
Ru02       12095
Rb01        8568
Ra          8177
Ru05        7673
Ru04        7302
Rb02        5337
Ru06        4394
X01         3095
X02         2199
X03         2128
Rg          1808
Rcu01       1288
Coin         429
X04          133
Pl            55
SHELLS        48
UNKNOWN       28
Name: count, dtype: int64
Skipping images []
skipping image P20220317_00188 containing box outside the cropped area
skipping image P20220322_00010 containing box outside the cropped area
skipping image P20220322_00011 containing box outside the cropped area
skipping image P20220504_00050 containing box outside the cropped area
skipping image P20220504_00053 containing box outside the cropped area


## Creating train and valid sets
Here we first create a 'valid' dataset using some of the imported datasets:

- the argument `constraint='in'` indicates that the images come from the datasets tagged by `dataset_name=("id", ["DATASET1"])`
- the classes in `exclude_from_valid` are not included in the dataset
- images can be used only once (`max_reuse=0`)
- the instances must have a mass value (`mass=True`)
- we aim to create a balanced dataset containing 500 instances of each class that is not in the excluded classes. The class "Coin" is not subject to this constraint

Then we create a 'train' dataset in two steps:
- First we use `dataset_name=("id", ["DATASET1"])` with a possible oversampling (`max_reuse=2`)
- Then we append images from `["DATASET2"]` with no oversampling.
- Instances without mass data are included in the dataset

In [None]:
# Classes that must not appear in the respective subset.
exclude_from_valid = ['UNKNOWN', 'Rcu01', 'Coin', "X02", "X03", "X04", "Pl", "SHELLS", "Rg"]
exclude_from_training = ['UNKNOWN', "X04", "Pl", "SHELLS"]

dataloader.seed = 4

# One entry per create_set call, executed in this order. Only the arguments
# that differ between the calls are listed here.
set_requests = [
    # 1) Validation set: instances with mass data only, no image reuse.
    #    (A further valid request with append=True, n=300, mass=False on the
    #    "SYNTH2" dataset could be added here to extend the set.)
    dict(subset="valid", n=500, exclude=exclude_from_valid,
         max_reuse=0, append=False,
         dataset_name=("id", ["DATASET1"]), mass=True),
    # 2) Training set: images may be reused (max_reuse=2) to balance the classes.
    dict(subset="train", n=20000, exclude=exclude_from_training,
         max_reuse=2, append=False,
         dataset_name=("id", ["DATASET1"]), mass=False),
    # 3) Append images from the second dataset to the training set, no reuse.
    dict(subset="train", n=10000, exclude=exclude_from_training,
         max_reuse=0, append=True,
         dataset_name=("id", ["DATASET2"]), mass=False),
]

for request in set_requests:
    # A fresh ["Coin"] list is passed on every call.
    dataloader.create_set(not_counting=["Coin"], seed=None, constraint="in", **request)

print("\nNumber of images:", len(dataloader.train_basenames))

Creating valid set with an objective of 500 training intances in ['RASET']
Using 1733 images
Adding images, iteration 1     
Ended in 1 iterations.
 Added 273 images. Instances per class: {'Coin': 0, 'Pl': 0, 'Ra': 500, 'Rb01': 500, 'Rb02': 500, 'Rc': 500, 'Rcu01': 0, 'Rg': 0, 'Ru01': 500, 'Ru02': 500, 'Ru04': 500, 'Ru05': 500, 'Ru06': 500, 'SHELLS': 0, 'UNKNOWN': 0, 'X01': 500, 'X02': 0, 'X03': 0, 'X04': 0} 

Creating train set with an objective of 20000 training intances in ['RASET']
Using 2145 images
Adding images, iteration 1     
Adding images, iteration 2     
Ended in 2 iterations.
 Added 3215 images. Instances per class: {'Coin': 550, 'Pl': 0, 'Ra': 13547, 'Rb01': 14378, 'Rb02': 6359, 'Rc': 20000, 'Rcu01': 1085, 'Rg': 271, 'Ru01': 20000, 'Ru02': 19988, 'Ru04': 10207, 'Ru05': 10885, 'Ru06': 4733, 'SHELLS': 0, 'UNKNOWN': 0, 'X01': 2512, 'X02': 1152, 'X03': 1492, 'X04': 0} 

Creating train set with an objective of 10000 training intances in ['SYNTH15CLS']
Using 200 images
Adding i

## Save dataset split and annotations
It generates two files:
- a myfilename.csv annotations file with additional columns
- a myfilename.json file with two lists containing the images in the train and valid subsets

In [None]:
# Destination folder and a timestamped base name for the new split; the
# manager is given the base path without extension.
dspath = Path("/PATH/TO/NEW/DATASET")
now = f"{datetime.datetime.now():%Y%m%d-%H%M%S}"
name = "DATASET_NAME_" + now
dataloader.save(str(dspath / name))

## Statistics

In [6]:
# --- Whole annotation table: instance count and relative frequency per class ---
print("Full Annotations stats")
print("Indexes:", dataloader.cls_to_idx)
stats = dataloader.annotations["class"].value_counts().to_frame()
print("Number of instances:", len(dataloader.annotations))
total_instances = stats["count"].sum()
stats["freq."] = 100 * stats["count"] / total_instances
print(stats)

# --- Per-subset summary; a subset with no images is skipped entirely ---
subset_reports = (
    ("\nTrain dataset stats",
     "number of images in training set:",
     "number of unique images in training set:",
     dataloader.train_basenames,
     dataloader.train_class_counts),
    ("Valid dataset stats",
     "number of images in valid set:",
     "number of unique images in valid set:",
     dataloader.valid_basenames,
     dataloader.valid_class_counts),
)
for header, total_label, unique_label, basenames, class_counts in subset_reports:
    if len(basenames) == 0:
        continue
    print(header)
    print(total_label, len(basenames))
    # The two counts differ when an image is listed more than once.
    print(unique_label, np.unique(basenames).size)
    print("number of instances per class")
    print(class_counts)


Full Annotations stats
Indexes: {'Ra': 1, 'Rc': 2, 'Rb01': 3, 'Rb02': 4, 'Rcu01': 5, 'Ru01': 6, 'Ru02': 7, 'Ru04': 8, 'Ru05': 9, 'Ru06': 10, 'X01': 11, 'Coin': 12, 'X02': 13, 'X03': 14, 'Rg': 15}
Number of instances: 107475
         count      freq.
class                    
Rc       27659  25.735287
Ru01     15059  14.011631
Ru02     12095  11.253780
Rb01      8568   7.972087
Ra        8177   7.608281
Ru05      7673   7.139335
Ru04      7302   6.794138
Rb02      5337   4.965806
Ru06      4394   4.088393
X01       3095   2.879739
X02       2199   2.046057
X03       2128   1.979995
Rg        1808   1.682252
Rcu01     1288   1.198418
Coin       429   0.399163
X04        133   0.123750
Pl          55   0.051175
SHELLS      48   0.044662
UNKNOWN     28   0.026053

Train dataset stats
number of images in training set: 3472
number of unique images in training set: 2194
number of instances per class
{'Coin': 668, 'Pl': 0, 'Ra': 14416, 'Rb01': 15145, 'Rb02': 7952, 'Rc': 20749, 'Rcu01': 1825, '

In [1]:
# Print, for each subset, every class that has at least one instance together
# with its instance count and its share of the subset total.
for title, class_counts in (
    ("Train set statistics", dataloader.train_class_counts),
    ("\nTest set statistics", dataloader.valid_class_counts),
):
    print(title)
    ntot = np.sum(list(class_counts.values()))
    for c, n in class_counts.items():
        if n <= 0:
            continue  # classes without instances are not listed
        print(f"{c:8s} {n:6d}  {100*n/ntot:4.2f}%")

Train set statistics


NameError: name 'np' is not defined

# Load an existing dataset split and annotations

In [3]:
def _report_subset(header, total_label, unique_label, basenames, class_counts):
    """Print image counts and per-class instance counts of one subset.

    Nothing is printed when the subset contains no images.
    """
    if len(basenames) == 0:
        return
    print(header)
    print(total_label, len(basenames))
    print(unique_label, np.unique(basenames).size)
    print("number of instances per class")
    print(class_counts)


# Folder and base name (without extension) of a previously saved split.
ds_path = Path("../datasets")
ds_name = "15CLS_20250723-173206_MASS_ONLY"

# The split is restored from its two files: <name>.csv (annotations) and
# <name>.json (train/valid image lists).
dataloader = ramses2.DatasetManager.from_file(
    annfile=ds_path / f"{ds_name}.csv",
    filename=ds_path / f"{ds_name}.json",
)

_report_subset(
    "\nTrain dataset stats",
    "number of images in training set:",
    "number of unique images in training set:",
    dataloader.train_basenames,
    dataloader.train_class_counts,
)
_report_subset(
    "Valid dataset stats",
    "number of images in valid set:",
    "number of unique images in valid set:",
    dataloader.valid_basenames,
    dataloader.valid_class_counts,
)


Train dataset stats
number of images in training set: 2897
number of unique images in training set: 1750
number of instances per class
{'Coin': 549, 'Pl': 0, 'Ra': 13539, 'Rb01': 14270, 'Rb02': 6348, 'Rc': 19749, 'Rcu01': 1056, 'Rg': 271, 'Ru01': 19843, 'Ru02': 14246, 'Ru04': 10193, 'Ru05': 10876, 'Ru06': 4734, 'SHELLS': 0, 'UNKNOWN': 0, 'X01': 2506, 'X02': 1148, 'X03': 1492, 'X04': 0}
Valid dataset stats
number of images in valid set: 258
number of unique images in valid set: 258
number of instances per class
{'Coin': 0, 'Pl': 0, 'Ra': 500, 'Rb01': 500, 'Rb02': 499, 'Rc': 500, 'Rcu01': 0, 'Rg': 0, 'Ru01': 500, 'Ru02': 500, 'Ru04': 500, 'Ru05': 500, 'Ru06': 499, 'SHELLS': 0, 'UNKNOWN': 0, 'X01': 502, 'X02': 0, 'X03': 0, 'X04': 0}


## Example of filtering: keep only instances with mass data in train/test splits

In [None]:
# Restrict the current train/valid subsets to the instances that have mass data.
# NOTE(review): the return value is discarded, so filter_set presumably updates
# `dataloader` in place — confirm in the ramses2 documentation.
dataloader.filter_set(hasmass=True)

## Save Filtered Dataset

In [None]:
# Write the filtered split next to the other datasets.
# NOTE(review): this base name is identical to the `ds_name` loaded above, so
# the files that were just read are presumably overwritten — confirm this is
# intended, or pick a different name.
dataloader.save(str(ds_path / "15CLS_20250723-173206_MASS_ONLY"))