# Generate dataset from scratch (version 04)

Version 04 is derived from version 03. Version 04 was coded on Feb. 27, 2020.

It still uses the undersampling & oversampling concepts explained in https://www.kdnuggets.com/2018/12/handling-imbalanced-datasets-deep-learning.html so that every class ends up with the same number of images in each bucket, as imposed by nTrnSamples, nValSamples and nTstSamples.

This version works as follows:
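- For each class, the images are shuffled and split into four buckets: trn (training), val (validation), tst (test) and rem (the remaining images that are not needed to reach the requested sample counts).
- Then, for each class and each bucket (trn, val, tst), we perform oversampling within that bucket until it reaches its target size; rem is not oversampled.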

In [None]:
import os
import random
import shutil
import numpy
import yaml

In [None]:
# Read parameters from local config.yaml file, and update corresponding Python variables
currentDir = os.getcwd()
print("INFO> Reading file config.yaml from directory: %s" % currentDir)

# The context manager closes the file handle even if parsing raises.
# NOTE(review): yaml.Loader can construct arbitrary Python objects. This is only
# acceptable because config.yaml is a local, trusted file; consider yaml.safe_load
# if the configuration could ever come from an untrusted source.
with open('config.yaml', 'r') as yamlFile:
    yamlData = yaml.load(yamlFile, Loader=yaml.Loader)

# Echo every parameter so that the run log records the configuration used
for key in yamlData:
    print("INFO> %-15s: %s" % (key, yamlData[key]))

# Image geometry
imgWidth, imgHeight = yamlData['imageWidth'], yamlData['imageHeight']

# Source directory (tmpDir) and destination directories for each bucket
tmpDir = yamlData['tmpDir']
trnDir = yamlData['trnDir']
valDir = yamlData['valDir']
tstDir = yamlData['tstDir']
remDir = yamlData['remDir']

# Target number of images per class in each bucket
nTrnSamples = yamlData['nTrnSamples']
nValSamples = yamlData['nValSamples']
nTstSamples = yamlData['nTstSamples']

# Remaining parameters are read here but not used by the dataset generation below
nEpochs = yamlData['nEpochs']
batchSize = yamlData['batchSize']
# NOTE(review): the name 'batchlearningRate' looks like an accidental merge of
# 'batchSize' and 'learningRate'; kept unchanged in case other cells rely on it.
# float() is applied explicitly -- presumably because the YAML value may be parsed
# as a string (e.g. '1e-3'); TODO confirm.
batchlearningRate = float(yamlData['learningRate'])
checkpointDir = yamlData['checkpointDir']
logDir = yamlData['logDir']
createDataset = yamlData['createDataset']
checkDataset = yamlData['checkDataset']

In [None]:
def overSample(myList,actualLength,desiredLength):
    """Grow myList to desiredLength entries by duplicating randomly chosen entries.

    Args:
        myList: list or numpy array of items (image file names in this notebook).
        actualLength: number of entries currently in myList.
        desiredLength: number of entries wanted in the result.

    Returns:
        myList itself (same object, not shuffled) when actualLength equals
        desiredLength. Otherwise a numpy array of desiredLength entries in random
        order; when actualLength matches len(myList) it contains every original
        entry plus (desiredLength - actualLength) randomly drawn duplicates.

    Raises:
        ValueError: if actualLength is larger than desiredLength, or if myList is
            empty while entries are requested (nothing to duplicate).
    """
    if actualLength == desiredLength:
        return myList
    if actualLength > desiredLength:
        raise ValueError(
            "overSample cannot shrink a list: actualLength=%d > desiredLength=%d"
            % (actualLength, desiredLength))
    if len(myList) == 0:
        # numpy.random.choice would fail here with a far less explicit message
        raise ValueError(
            "overSample received an empty list but %d entries are requested"
            % desiredLength)
    # Draw the missing entries (with replacement) among the existing ones
    duplicates = numpy.random.choice(myList, desiredLength - actualLength)
    newList = numpy.append(myList, duplicates)
    # Sampling without replacement returns the entries in random order
    return numpy.random.choice(newList, desiredLength, replace=False)
    
def copyImages(tmpDir, subDir, listOfImages, dstDir):
    """Copy the listed images from tmpDir/subDir to dstDir/subDir.

    A file name may appear several times in listOfImages (result of oversampling).
    Its first occurrence is copied under its own name; the k-th occurrence (k >= 2)
    is copied as <name>_copy_<k><extension> so that every occurrence produces a
    distinct file.

    Args:
        tmpDir: root directory holding the source images.
        subDir: sub-directory name, used under both tmpDir and dstDir.
        listOfImages: iterable of image file names located in tmpDir/subDir.
        dstDir: root directory receiving the copies.
    """
    # shutil.copyfile does not create missing parent directories
    os.makedirs(os.path.join(dstDir, subDir), exist_ok=True)

    cntDict = {}  # image name -> number of occurrences seen so far
    for image in listOfImages:
        cntDict[image] = cntDict.get(image, 0) + 1
        if cntDict[image] == 1:
            dstImage = image
        else:
            fileName, fileExtension = os.path.splitext(image)
            dstImage = fileName + "_copy_" + str(cntDict[image]) + fileExtension
        shutil.copyfile(os.path.join(tmpDir, subDir, image),
                        os.path.join(dstDir, subDir, dstImage))
        
# Seed BOTH generators: the shuffle below uses Python's `random`, whereas overSample
# draws from `numpy.random`. Seeding only `random` leaves the oversampling step
# non-reproducible from one run to the next.
random.seed(42)
numpy.random.seed(42)

# os.walk yields (dirpath, dirnames, filenames) top-down, so the first tuple already
# holds the immediate sub-directories of tmpDir (one per class); there is no need to
# walk the whole tree.
if not os.path.isdir(tmpDir):
    raise NotADirectoryError("tmpDir does not exist or is not a directory: %s" % tmpDir)
listOfSubDir = next(os.walk(tmpDir))[1]

# Total number of images requested per class (does not depend on the class)
nTotalSamples = nTrnSamples + nValSamples + nTstSamples

for subDir in listOfSubDir:
    print("Processing subDir: %s" % subDir)

    # NOTE(review): os.listdir returns entries in arbitrary order, so the shuffle is
    # only reproducible as long as the listing order itself is stable.
    initialList=os.listdir(os.path.join(tmpDir,subDir))
    nInitialList = len(initialList)
    randomInitialList = random.sample(initialList,k=nInitialList)

    # Split randomized list of images according to the training, validation & test ratio.
    # trnN, valN and tstN are cumulative split indices; each one is capped so that a
    # bucket never receives more images than requested.
    trnN = min(nTrnSamples,int(nInitialList*nTrnSamples/nTotalSamples))
    valN = min(nTrnSamples+nValSamples,int(nInitialList*(nTrnSamples+nValSamples)/nTotalSamples))
    tstN = min(nTotalSamples,nInitialList)
    trn, val, tst, rem = numpy.split(randomInitialList,[trnN, valN, tstN])

    # Oversampling scheme: bring each bucket up to its requested size
    trnLen = trnN
    valLen = valN-trnN
    tstLen = tstN-valN
    fullTrn = overSample(trn,trnLen,nTrnSamples)
    fullVal = overSample(val,valLen,nValSamples)
    fullTst = overSample(tst,tstLen,nTstSamples)
    fullRem = rem # No oversampling for rem

    # Copy images in folders
    copyImages(tmpDir, subDir, fullTrn, trnDir)
    copyImages(tmpDir, subDir, fullVal, valDir)
    copyImages(tmpDir, subDir, fullTst, tstDir)
    copyImages(tmpDir, subDir, fullRem, remDir)

# Testbench: synthetic file names and small targets to exercise the split logic
N = 10
initialList = ['a' + str(idx) + '.jpg' for idx in range(1, N + 1)]
nInitialList = len(initialList)
print(f"DBG> nInitialList={nInitialList}")
print(f"DBG> initialList={initialList}")
randomInitialList = random.sample(initialList, k=nInitialList)
print(f"DBG> randomInitialist={randomInitialList}")
nTrnSamples, nValSamples, nTstSamples = 12, 4, 4
# End testbench

nTotalSamples = sum((nTrnSamples, nValSamples, nTstSamples))

# Cumulative split indices, each capped by the number of samples requested so far
trnN = min(nTrnSamples, int(nInitialList*nTrnSamples/nTotalSamples))
valN = min(nTrnSamples + nValSamples,
           int(nInitialList*(nTrnSamples+nValSamples)/nTotalSamples))
tstN = min(nTotalSamples, nInitialList)

# Size of each bucket, derived from consecutive split indices
trnLen, valLen, tstLen = trnN, valN - trnN, tstN - valN

print(f"DBG> trnN={trnN}")
print(f"DBG> valN={valN}")
print(f"DBG> tstN={tstN}")

print(f"DBG> trnLen={trnLen}")
print(f"DBG> valLen={valLen}")
print(f"DBG> tstLen={tstLen}")

# Four buckets: training, validation, test and whatever remains beyond tstN
trn, val, tst, rem = numpy.split(randomInitialList, [trnN, valN, tstN])

print(f"DBG> trn={trn}")
print(f"DBG> val={val}")
print(f"DBG> tst={tst}")
print(f"DBG> rem={rem}")

def copyImages(tmpDir, subDir, listOfImages, dstDir):
    """Dry-run version of the copy step: print the copies instead of performing them.

    Prints the number of entries in listOfImages, then one "cp <source> <destination>"
    line per entry. The first occurrence of a file name keeps its name; the k-th
    occurrence (k >= 2) gets the destination name <name>_copy_<k><extension>.
    """
    print(len(listOfImages))
    occurrences = {}  # image name -> how many times it has been seen so far
    for image in listOfImages:
        seen = occurrences.get(image, 0) + 1
        occurrences[image] = seen
        dstImage = image
        if seen != 1:
            stem, extension = os.path.splitext(image)
            dstImage = stem + "_copy_" + str(seen) + extension
        print("cp ", os.path.join(tmpDir,subDir,image), " ",os.path.join(dstDir,subDir,dstImage))
        
def overSample(myList,actualLength,desiredLength):
    """Return myList extended to desiredLength entries with random duplicates.

    Args:
        myList: list or numpy array of items to oversample.
        actualLength: number of entries currently in myList.
        desiredLength: number of entries wanted in the result.

    Returns:
        myList unchanged (same object) when actualLength equals desiredLength;
        otherwise a numpy array of desiredLength entries in random order, built
        from myList plus (desiredLength - actualLength) entries drawn from it
        with replacement.

    Raises:
        ValueError: if actualLength exceeds desiredLength, or if myList is empty
            while entries are requested.
    """
    if actualLength == desiredLength:
        return myList
    if actualLength > desiredLength:
        raise ValueError(
            "overSample cannot shrink a list: actualLength=%d > desiredLength=%d"
            % (actualLength, desiredLength))
    if len(myList) == 0:
        # Fail early with an explicit message instead of an obscure numpy error
        raise ValueError(
            "overSample received an empty list but %d entries are requested"
            % desiredLength)
    extraItems = numpy.random.choice(myList, desiredLength - actualLength)
    newList = numpy.append(myList, extraItems)
    # Drawing without replacement yields the entries in random order
    return numpy.random.choice(newList, desiredLength, replace=False)

# Bring the three sampled buckets up to their target size (same order as before:
# trn, val, tst); the remainder bucket is passed through untouched.
fullTrn, fullVal, fullTst = (
    overSample(bucket, currentLen, targetLen)
    for bucket, currentLen, targetLen in (
        (trn, trnLen, nTrnSamples),
        (val, valLen, nValSamples),
        (tst, tstLen, nTstSamples),
    )
)
fullRem = rem

print(f"DBG> fullTrn={fullTrn}")
print(f"DBG> fullVal={fullVal}")
print(f"DBG> fullTst={fullTst}")
print(f"DBG> fullRem={fullRem}")

# Run the copy step for every bucket against a fixed class name
subDir = "ok"
for bucketImages, bucketDir in (
    (fullTrn, trnDir),
    (fullVal, valDir),
    (fullTst, tstDir),
    (fullRem, remDir),
):
    copyImages(tmpDir, subDir, bucketImages, bucketDir)

