# Image preparation

The original image size is 55x90 pixels with three color channels (RGB).
The code below resizes the images from an input directory (`Input_dir`) to the target size (`target_size_x` x `target_size_y`, configured below as 32x32 pixels) and writes them to an output directory (`Output_dir`). If the pictures are stored in subdirectories according to their labeling (0 ... 9 + NaN), note that the code below only reads the `*.jpg` files located directly in `Input_dir`.
Any other image converter can be used as well.

### Prerequisite
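The Pillow library must be installed in the Python environment (`pip install pillow`); the code below imports `PIL.Image` and does not use OpenCV.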

In [1]:
import glob
import os
from PIL import Image 

# Directory the original images are read from (matched as Input_dir + '/*.jpg').
Input_dir = 'data_raw_all'
# Directory the resized copies are written to; the cleanup cell below removes
# the *.jpg files it already contains.
Output_dir= 'data_resize_all'

# Width (x) and height (y) in pixels passed to Image.resize() for every image.
target_size_x = 32
target_size_y = 32

In [2]:
# Clear out the JPEGs left in the output directory by an earlier run, so that
# it afterwards only holds images written by the current run.
files = glob.glob(f"{Output_dir}/*.jpg")
for stale_file in files:
    os.remove(stale_file)
print(f"{len(files)} files have been deleted.")

0 files have been deleted.


In [4]:
import hashlib

# Resize every raw JPEG into Output_dir and, while each image is open, record
# a SHA-256 digest of its decoded pixel data. `hashes` maps each digest to the
# list of input files sharing it, so files with identical content can be
# found afterwards.
files = glob.glob(Input_dir + '/*.jpg')
# Make sure the target directory exists so saving works on a fresh checkout.
os.makedirs(Output_dir, exist_ok=True)
hashes = {}
for i, aktfile in enumerate(files):
    if i % 500 == 0:
        # Progress indicator: one line every 500 files.
        print(i, aktfile)
    # The context manager releases the underlying file handle as soon as the
    # pixel data has been read and the resized copy has been created.
    with Image.open(aktfile) as source_image:
        # Named `digest` (not `hash`) so the builtin hash() is not shadowed.
        digest = hashlib.sha256(source_image.tobytes()).hexdigest()
        test_image = source_image.resize((target_size_x, target_size_y), Image.NEAREST)
    # Group the input paths by digest; groups with more than one entry are duplicates.
    hashes.setdefault(digest, []).append(aktfile)
    # The resized copy keeps the original file name.
    base = os.path.basename(aktfile)
    save_name = Output_dir + '/' + base
    test_image.save(save_name, "JPEG")

0 data_raw_all/9.4_bdeab982fbac878494bf669d45b91020.jpg


  test_image = test_image.resize((target_size_x, target_size_y), Image.NEAREST)


500 data_raw_all/2.8_1364_zeiger3_2020-06-15_13-04-32.jpg
1000 data_raw_all/4.9_2469_zeiger3_2019-11-19_13-42-04.jpg
1500 data_raw_all/7.9_3933_zeiger2_2019-11-19_08-07-03.jpg
2000 data_raw_all/7.5_PRODUCED_ANALOG.jpg
2500 data_raw_all/8.9_4366_zeiger1_2019-06-02T235009.jpg
3000 data_raw_all/8.7_4341_analog2_20200813-032607.jpg
3500 data_raw_all/9.1_a51fa9ec434cfbffec87c59e1908b26c.jpg
4000 data_raw_all/2.0_name_20230216-201846.jpg
4500 data_raw_all/6.4_3194_zeiger1_2020-04-29_14-27-02.jpg
5000 data_raw_all/8.3_4151_zeiger3_2019-06-01T193014.jpg
5500 data_raw_all/7.7_3910_zeiger3_2019-06-06T003009.jpg
6000 data_raw_all/2.5_1201_zeiger3_2020-04-29_12-44-02.jpg
6500 data_raw_all/1.7_0734_zeiger2_2019-11-19_17-57-04.jpg


# Removing duplicate files

In [None]:
# duplicate files are a risk to the metrics, they pollute the validation dataset
# `hashes` maps a pixel-data digest to all input files with that content.
for paths in hashes.values():
    if len(paths) > 1:
        print(paths)
        # Keep the first file of each group and drop all the others.
        for duplicate in paths[1:]:
            # The resized copy of the duplicate has already been written to
            # Output_dir under the same file name; remove it as well, otherwise
            # the duplicate still ends up in the prepared dataset.
            resized_copy = Output_dir + '/' + os.path.basename(duplicate)
            for stale in (duplicate, resized_copy):
                try:
                    os.remove(stale)
                except FileNotFoundError:
                    # Already removed (e.g. this cell was executed before).
                    pass