# Data Preprocessing - Normalization Values CLEF16

Compute the mean and standard deviation of the CLEF16 training images. These are the normalization values used in the data transformation steps before training the image classifier.

In [1]:
# Make the project's ../src directory importable from this notebook.
# The path is resolved relative to the current working directory.
import os
import sys
module_path = os.path.abspath('../src')
if module_path not in sys.path:
    sys.path.append(module_path)

import pandas as pd    
from utils.calc_mean import calc_dataset_mean_std
from dataset.TrainDataLoader import TrainDataLoader

We can map the original train folder structure to the new class structure (e.g., the high-level modality) using a reference to the original folder and a CSV file with the image-to-class mapping. The base directory gives the custom Dataset class access to the image files.

In [2]:
# Root of the original CLEF16 training set; its entries are listed below as the
# original classes.
base_dir = '/mnt/clef/imageclef_2016/train'
print("Original classes from the CLEF16 dataset")
# os.listdir returns entries in arbitrary order; sort them so the printed list
# is the same on every run and filesystem.
print(sorted(os.listdir(base_dir)))

Original classes from the CLEF16 dataset
['D3DR', 'DMEL', 'DMFL', 'DMLI', 'DMTR', 'DRAN', 'DRCO', 'DRCT', 'DRMR', 'DRPE', 'DRUS', 'DRXR', 'DSEC', 'DSEE', 'DSEM', 'DVDM', 'DVEN', 'DVOR', 'GCHE', 'GFIG', 'GFLO', 'GGEL', 'GGEN', 'GHDR', 'GMAT', 'GNCP', 'GPLI', 'GSCR', 'GSYS', 'GTAB']


In [3]:
# CSV that maps each training figure to its original modality and its high-level modality
csv_path = '../labels/clef16_train.csv'
df = pd.read_csv(filepath_or_buffer=csv_path)
# Preview the first five rows
df.head(5)

Unnamed: 0,ID,FIG_NAME,MODALITY,HIGH_MODALITY
0,0,11373_2007_9226_Fig1_HTML-10,DMFL,MICROSCOPY
1,1,11373_2007_9226_Fig1_HTML-11,DMFL,MICROSCOPY
2,2,11373_2007_9226_Fig1_HTML-12,DMFL,MICROSCOPY
3,3,11373_2007_9226_Fig1_HTML-13,DMFL,MICROSCOPY
4,4,11373_2007_9226_Fig1_HTML-14,DMFL,MICROSCOPY


In [4]:
# Distinct high-level modalities found in the CSV; these are the new target classes
classes = df['HIGH_MODALITY'].unique().tolist()
print("New classes:", classes, sep="\n")

train_data_loader = TrainDataLoader(base_dir, csv_path, classes)
# Request the raw (un-normalized) datasets. With `normalized=True` the loader normalizes the
# data with already calculated mean and std values; here we assume those values do not exist yet.
raw_kwargs = dict(normalized=False)
train_dataset = train_data_loader.get_train_dataset(**raw_kwargs)
validation_dataset = train_data_loader.get_val_dataset(**raw_kwargs)

New classes:
['MICROSCOPY', 'GRAPHICS', 'ORGANISMS', 'MOLECULAR', 'EXPERIMENTAL', 'OTHER']


In [5]:
# Number of samples in the training and validation datasets
n_train, n_val = len(train_dataset), len(validation_dataset)
(n_train, n_val)

(5389, 1348)

In [6]:
# Compute the normalization statistics from the training dataset only
# (the validation dataset is not passed in).
stats_loader_kwargs = dict(batch_size=1024, num_workers=16)
mean, std = calc_dataset_mean_std(train_dataset, **stats_loader_kwargs)

In [7]:
# Mean returned by calc_dataset_mean_std for the training dataset.
# NOTE(review): the recorded output has three values, presumably one per RGB channel — confirm.
mean

tensor([0.7364, 0.7319, 0.7295])

In [8]:
# Standard deviation returned by calc_dataset_mean_std for the training dataset.
# NOTE(review): the recorded output has three values, presumably one per RGB channel — confirm.
std

tensor([0.3538, 0.3543, 0.3593])