# Classification - Create dataset

In [1]:
import yaml
import os
import shutil
import subprocess
import datetime
import numpy
import random
import sklearn.model_selection
import tensorflow

# Report the TensorFlow version in use and how many GPU devices it can see
gpuDevices = tensorflow.config.experimental.list_physical_devices('GPU')
for message in ("INFO> TensorFlow version : %s" % tensorflow.__version__,
                "INFO> # of GPUs available: %d" % len(gpuDevices)):
    print(message)

INFO> TensorFlow version : 2.1.0
INFO> # of GPUs available: 1


In [2]:
# Read parameters from the local config.yaml file into the yamlData dictionary
currentDir = os.getcwd()
print(f"INFO> Current folder : {currentDir}")

# Open the file through a context manager so the handle is closed even if parsing fails.
# NOTE(review): yaml.Loader is able to construct arbitrary Python objects; that is only
# acceptable while config.yaml is a trusted, local file - consider yaml.safe_load otherwise.
with open('config.yaml','r') as yamlFile:
    yamlData = yaml.load(yamlFile,Loader=yaml.Loader)

# Echo every parameter, sorted by name, so the settings of this run are visible in the output
for key in sorted(yamlData):
    print("YAML> %-15s: %s" % (key,yamlData[key]))

INFO> Current folder : /raid5/disk1/mlproj11/classification
YAML> batchSize      : 16
YAML> checkDataset   : True
YAML> checkpointDir  : /home/jmv/data/mlproj11/tmp/
YAML> createDataset  : True
YAML> datasetDir     : /home/jmv/data/mlproj11/dataset
YAML> goldenDataset  : /home/jmv/data/mlproj11/SignifyGolden20191212/dataset
YAML> imageHeight    : 720
YAML> imageWidth     : 1280
YAML> learningRate   : 1e-6
YAML> logDir         : /home/jmv/data/mlproj11/log/
YAML> nEpochs        : 1000
YAML> trnDir         : /home/jmv/data/mlproj11/dataset/trn
YAML> tstDir         : /home/jmv/data/mlproj11/dataset/tst
YAML> valDir         : /home/jmv/data/mlproj11/dataset/val


In [3]:
# Optionally create dataset from golden data set & check the newly created dataset
if yamlData['createDataset']:
    # Start from scratch: remove datasetDir (when it exists) and create it again.
    # Removal errors are deliberately NOT ignored: a partially deleted tree is reported
    # at its cause instead of surfacing as a confusing FileExistsError on re-creation.
    # os.makedirs also creates missing parent folders, which os.mkdir would not.
    if os.path.isdir(yamlData['datasetDir']):
        shutil.rmtree(yamlData['datasetDir'])
    os.makedirs(yamlData['datasetDir'])

    # Collect every file found below goldenDataset. The list is sorted because os.walk
    # yields entries in arbitrary file-system order; without a stable order the split
    # below would differ between runs/machines despite the fixed random_state.
    listOfImages = sorted(os.path.join(dirPath,fileName)
                          for dirPath,_,fileNames in os.walk(yamlData['goldenDataset'])
                          for fileName in fileNames)
    # The label is the third path component from the end, i.e. the <class> part of
    # .../<class>/<light>/<image> (same decomposition as in the copy loop below).
    # NOTE(review): files stored at another depth would get a wrong label - this assumes
    # goldenDataset only contains files laid out that way; confirm.
    listOfLabels = [image.split(os.sep)[-3] for image in listOfImages]

    # Split dataset: 60% training set, 20% validation & 20% test sets
    # (20% is held out for test first, then 25% of the remaining 80% becomes validation)
    trnvalImages, tstImages, trnvalLabels, tstLabels = sklearn.model_selection.train_test_split(
        listOfImages, listOfLabels, test_size=0.2, stratify=listOfLabels, random_state=42)
    trnImages, valImages, trnLabels, valLabels = sklearn.model_selection.train_test_split(
        trnvalImages, trnvalLabels, test_size=0.25, stratify=trnvalLabels, random_state=42)

    # Copy images in datasetDir: one sub-folder per class inside each split folder.
    # The <light> folder name is prefixed to the file name so that images with the same
    # name coming from different <light> folders do not overwrite each other.
    for dstImages,dstDir in zip([trnImages,valImages,tstImages],[yamlData['trnDir'],yamlData['valDir'],yamlData['tstDir']]):
        for dstImage in dstImages:
            currentClass,currentLight,currentImage = dstImage.split(os.sep)[-3:]
            os.makedirs(os.path.join(dstDir,currentClass),exist_ok=True)
            shutil.copyfile(dstImage,os.path.join(dstDir,currentClass,currentLight+"_"+currentImage))

if yamlData['checkDataset']:
    # Report the number of files per class for every split folder found in datasetDir.
    # Entries that are not folders are skipped so a stray file does not abort the check.
    for subDir in sorted(os.listdir(yamlData['datasetDir'])):
        subDirPath = os.path.join(yamlData['datasetDir'], subDir)
        if not os.path.isdir(subDirPath):
            continue
        print(f"INFO> Processing folder: {subDir}")
        for myClass in sorted(os.listdir(subDirPath)):
            classPath = os.path.join(subDirPath, myClass)
            if not os.path.isdir(classPath):
                continue
            nImages = len(os.listdir(classPath))
            print(f"INFO>     Number of samples in class {myClass}: {nImages}")

INFO> Processing folder: trn
INFO>     Number of samples in class defect-01: 613
INFO>     Number of samples in class defect-02: 51
INFO>     Number of samples in class defect-03: 307
INFO>     Number of samples in class defect-04: 227
INFO>     Number of samples in class defect-05: 230
INFO>     Number of samples in class defect-06: 115
INFO>     Number of samples in class defect-07: 152
INFO>     Number of samples in class defect-08: 244
INFO>     Number of samples in class defect-09: 202
INFO>     Number of samples in class defect-10: 259
INFO>     Number of samples in class defect-11: 229
INFO>     Number of samples in class ok: 24888
INFO> Processing folder: tst
INFO>     Number of samples in class defect-01: 205
INFO>     Number of samples in class defect-02: 17
INFO>     Number of samples in class defect-03: 103
INFO>     Number of samples in class defect-04: 76
INFO>     Number of samples in class defect-05: 77
INFO>     Number of samples in class defect-06: 38
INFO>     Number