# Training V05
To be filled later

In [7]:
import yaml
import os
import shutil
import subprocess
import datetime
import numpy
import random
import sklearn.metrics
import tensorflow

# Report the TensorFlow build in use and how many GPUs it can see
gpuDevices = tensorflow.config.experimental.list_physical_devices('GPU')
print("INFO> TensorFlow version: %s" % tensorflow.__version__)
print("INFO> Num GPUs Available: ", len(gpuDevices))

INFO> TensorFlow version: 1.14.0
INFO> Num GPUs Available:  0


In [8]:
# Read parameters from the local config.yaml file and bind them to Python variables
currentDir = os.getcwd()
print("INFO> Reading file config.yaml from directory: %s" % currentDir)

# The context manager releases the file handle even when parsing fails.
# NOTE(review): yaml.Loader can instantiate arbitrary Python objects. That is acceptable
# for a trusted local config file; switch to yaml.safe_load if the file can come from elsewhere.
with open('config.yaml', 'r') as yamlFile:
    yamlData = yaml.load(yamlFile, Loader=yaml.Loader)

# Echo every setting so the run is self-documenting in the notebook output
for key in sorted(yamlData):
    print("INFO> %-15s: %s" % (key, yamlData[key]))

batchSize = yamlData['batchSize']
checkDataset = yamlData['checkDataset']
checkpointDir = yamlData['checkpointDir']
createDataset = yamlData['createDataset']
datasetDir = yamlData['datasetDir']
goldenDataset = yamlData['goldenDataset']
imageHeight = yamlData['imageHeight']
imageWidth = yamlData['imageWidth']
# float(): presumably the value is written like "1e-6", which PyYAML reads as a string
# when there is no decimal point - TODO confirm against config.yaml
learningRate = float(yamlData['learningRate'])
logDir = yamlData['logDir']
nEpochs = yamlData['nEpochs']
nTrnSamples = yamlData['nTrnSamples']
nTstSamples = yamlData['nTstSamples']
nValSamples = yamlData['nValSamples']

# The per-split folders are resolved against datasetDir
remDir = os.path.join(yamlData['datasetDir'], yamlData['remDir'])
tmpDir = os.path.join(yamlData['datasetDir'], yamlData['tmpDir'])
trnDir = os.path.join(yamlData['datasetDir'], yamlData['trnDir'])
tstDir = os.path.join(yamlData['datasetDir'], yamlData['tstDir'])
valDir = os.path.join(yamlData['datasetDir'], yamlData['valDir'])

INFO> Reading file config.yam from directory: /raid5/disk1/mlproj10/classification
INFO> batchSize      : 16
INFO> checkDataset   : True
INFO> checkpointDir  : /home/jmv/data/mlproj10/tmp/
INFO> createDataset  : True
INFO> datasetDir     : /home/jmv/data/mlproj10/dataset
INFO> goldenDataset  : /home/jmv/data/mlproj10/LeryPosesGolden/dataset
INFO> imageHeight    : 720
INFO> imageWidth     : 1280
INFO> learningRate   : 1e-6
INFO> logDir         : /home/jmv/data/mlproj10/log/
INFO> nEpochs        : 1000
INFO> nTrnSamples    : 2000
INFO> nTstSamples    : 671
INFO> nValSamples    : 671
INFO> remDir         : /home/jmv/data/mlproj10/dataset/rem
INFO> tmpDir         : /home/jmv/data/mlproj10/dataset/tmp
INFO> trnDir         : /home/jmv/data/mlproj10/dataset/trn
INFO> tstDir         : /home/jmv/data/mlproj10/dataset/tst
INFO> valDir         : /home/jmv/data/mlproj10/dataset/val


In [9]:
# Optionally run scripts to create dataset from golden data set & check the newly created dataset
if createDataset:
    # Fail before anything is deleted if the source of truth is missing
    if not os.path.isdir(goldenDataset):
        raise FileNotFoundError("Golden dataset folder not found: %s" % goldenDataset)

    # Remove datasetDir folder & sub-folders - and then create it again to start from scratch.
    # makedirs (rather than mkdir) also creates missing parent folders.
    shutil.rmtree(datasetDir, ignore_errors=True)
    os.makedirs(datasetDir)

    # For each class from goldenDataset:
    #  - create a corresponding subfolder in tmpDir
    #  - copy all images from the corresponding class in goldenDataset to the corresponding class in tmpDir
    #  - create the (still empty) class subfolder in trnDir, valDir, tstDir and remDir
    # next(os.walk(...)) reads only the first directory level instead of walking the whole tree;
    # sorted() makes the processing order deterministic.
    listOfSubDirs = sorted(next(os.walk(goldenDataset))[1])
    for myClass in listOfSubDirs:
        print(f"Processing folder (1st pass): {myClass}")
        goldenClassDir = os.path.join(goldenDataset, myClass)
        tmpClassDir = os.path.join(tmpDir, myClass)
        os.makedirs(tmpClassDir)

        # Only the files sitting directly in the class folder are copied
        for myImage in next(os.walk(goldenClassDir))[2]:
            shutil.copyfile(os.path.join(goldenClassDir, myImage), os.path.join(tmpClassDir, myImage))

        for splitDir in (trnDir, valDir, tstDir, remDir):
            os.makedirs(os.path.join(splitDir, myClass))

    # Oversampling section
    def overSample(myList,actualLength,desiredLength):
        """Pad myList with randomly repeated elements until desiredLength items are available.

        myList        -- sequence of items (file names in this notebook)
        actualLength  -- number of items the caller says myList holds
        desiredLength -- number of items wanted in the result

        Returns myList itself when the two lengths are equal. Otherwise draws
        (desiredLength - actualLength) extra items from myList with numpy.random.choice,
        appends them, and returns desiredLength items drawn without replacement from
        the padded array (a numpy array in random order).
        """
        nMissing = desiredLength - actualLength
        if nMissing == 0:
            return myList

        extraDraws = numpy.random.choice(myList, nMissing)
        paddedList = numpy.append(myList, extraDraws)
        return numpy.random.choice(paddedList, desiredLength, replace=False)
    
    def copyImages(tmpDir, subDir, listOfImages, dstDir):
        """Copy every file named in listOfImages from tmpDir/subDir to dstDir/subDir.

        A name may occur several times in listOfImages. Its first occurrence keeps the
        original file name; the n-th occurrence (n >= 2) is written as
        <name>_copy_<n><extension> so that repeated entries do not overwrite each other.
        """
        seenCount = {}
        srcFolder = os.path.join(tmpDir, subDir)
        dstFolder = os.path.join(dstDir, subDir)

        for image in listOfImages:
            occurrence = seenCount.get(image, 0) + 1
            seenCount[image] = occurrence

            dstImage = image
            if occurrence > 1:
                stem, extension = os.path.splitext(image)
                dstImage = stem + "_copy_" + str(occurrence) + extension

            shutil.copyfile(os.path.join(srcFolder, image), os.path.join(dstFolder, dstImage))
        
    # Seed BOTH generators: `random` drives the shuffle below, numpy.random drives overSample().
    random.seed(42)
    numpy.random.seed(42)

    # Requested split sizes do not depend on the class, so compute the total once
    nTotalSamples = nTrnSamples + nValSamples + nTstSamples

    # Only the first level of tmpDir is needed (one sub-folder per class). sorted() matters:
    # the OS returns directory entries in arbitrary order, which would make the seeded
    # shuffle give different splits from one machine to the next.
    listOfSubDir = sorted(next(os.walk(tmpDir), (tmpDir, [], []))[1])
    for subDir in listOfSubDir:
        print("Processing folder (2nd pass): %s" % subDir)

        initialList = sorted(os.listdir(os.path.join(tmpDir,subDir)))
        nInitialList = len(initialList)
        # random.sample over the full length is a seeded shuffle that leaves initialList intact
        randomInitialList = random.sample(initialList,k=nInitialList)

        # Split randomized list of images according to the training, validation & test ratio.
        # trnN / valN / tstN are the cumulative cut positions handed to numpy.split; each is
        # capped so that no split receives more than its requested number of images.
        trnN = min(nTrnSamples,int(nInitialList*nTrnSamples/nTotalSamples))
        valN = min(nTrnSamples+nValSamples,int(nInitialList*(nTrnSamples+nValSamples)/nTotalSamples))
        tstN = min(nTotalSamples,nInitialList)
        trn, val, tst, rem = numpy.split(randomInitialList,[trnN, valN, tstN])

        # Oversampling scheme
        trnLen = trnN
        valLen = valN-trnN
        tstLen = tstN-valN

        # An empty split cannot be oversampled; report it clearly instead of letting
        # numpy.random.choice fail on an empty array.
        for splitName, splitLen, wanted in (("trn", trnLen, nTrnSamples),
                                            ("val", valLen, nValSamples),
                                            ("tst", tstLen, nTstSamples)):
            if splitLen == 0 and wanted > 0:
                raise ValueError("Class %s has too few images (%d) to fill the %s split"
                                 % (subDir, nInitialList, splitName))

        fullTrn = overSample(trn,trnLen,nTrnSamples)
        fullVal = overSample(val,valLen,nValSamples)
        fullTst = overSample(tst,tstLen,nTstSamples)
        fullRem = rem # No oversampling for rem

        # Copy images in folders
        copyImages(tmpDir, subDir, fullTrn, trnDir)
        copyImages(tmpDir, subDir, fullVal, valDir)
        copyImages(tmpDir, subDir, fullTst, tstDir)
        copyImages(tmpDir, subDir, fullRem, remDir)
    
if checkDataset:
    # For every folder directly under datasetDir, print how many entries each of its
    # class sub-folders contains
    for subDir in sorted(os.listdir(datasetDir)):
        print(f"INFO> Processing folder: {subDir}")
        splitPath = os.path.join(datasetDir, subDir)
        for myClass in sorted(os.listdir(splitPath)):
            classEntries = os.listdir(os.path.join(splitPath, myClass))
            nImages = len(classEntries)
            print(f"INFO>     Number of samples in class {myClass}: {nImages}")

Processing folder (1st pass): 0windsurf
Processing folder (1st pass): 1windsurf
Processing folder (2nd pass): 0windsurf
Processing folder (2nd pass): 1windsurf
INFO> Processing folder: rem
INFO>     Number of samples in class 0windsurf: 414
INFO>     Number of samples in class 1windsurf: 16
INFO> Processing folder: tmp
INFO>     Number of samples in class 0windsurf: 3756
INFO>     Number of samples in class 1windsurf: 3358
INFO> Processing folder: trn
INFO>     Number of samples in class 0windsurf: 2000
INFO>     Number of samples in class 1windsurf: 2000
INFO> Processing folder: tst
INFO>     Number of samples in class 0windsurf: 671
INFO>     Number of samples in class 1windsurf: 671
INFO> Processing folder: val
INFO>     Number of samples in class 0windsurf: 671
INFO>     Number of samples in class 1windsurf: 671


In [10]:
# CNN binary classifier: six Conv2D -> BatchNormalization -> MaxPooling2D stages followed by
# a small dense head ending in a single sigmoid unit.
myModelInput = tensorflow.keras.layers.Input(shape=(imageHeight,imageWidth,3))

# Number of 3x3 filters of each convolutional stage, in order
x = myModelInput
for nFilters in (64, 128, 192, 192, 192, 128):
    x = tensorflow.keras.layers.Conv2D(nFilters, (3,3), activation="relu")(x)
    x = tensorflow.keras.layers.BatchNormalization()(x)
    x = tensorflow.keras.layers.MaxPooling2D((2,2))(x)

x = tensorflow.keras.layers.Flatten()(x)
x = tensorflow.keras.layers.Dense(128, activation="relu")(x)
x = tensorflow.keras.layers.BatchNormalization()(x)

x = tensorflow.keras.layers.Dropout(0.25)(x)
myModelOutput = tensorflow.keras.layers.Dense(1, activation="sigmoid")(x)

model = tensorflow.keras.models.Model(inputs=myModelInput, outputs=myModelOutput)

model.summary()

# Hand the optimizer INSTANCE to compile(). Passing the string 'rmsprop' makes Keras build
# its own RMSprop with the default learning rate, silently ignoring learningRate from the config.
# (`lr` is the TF1.x spelling; newer Keras versions call this argument `learning_rate`.)
optimizer = tensorflow.keras.optimizers.RMSprop(lr=learningRate)

model.compile(loss=tensorflow.keras.losses.BinaryCrossentropy(),
              optimizer=optimizer,
              metrics=['acc']) # should be accuracy in TF2.0

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 720, 1280, 3)]    0         
_________________________________________________________________
conv2d (Conv2D)              (None, 718, 1278, 64)     1792      
_________________________________________________________________
batch_normalization (BatchNo (None, 718, 1278, 64)     256       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 359, 639, 64)      0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 357, 637, 128)     73856     
_________________________________________________________________
batch_normalization_1 (Batch (None, 357, 637, 128)     512       
_________

In [None]:
# Training images are rescaled and augmented on the fly (shear, zoom, horizontal flip);
# validation images are only rescaled.
trnDataGen = tensorflow.keras.preprocessing.image.ImageDataGenerator(
    rescale=1. / 255,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True)

valDataGen = tensorflow.keras.preprocessing.image.ImageDataGenerator(rescale=1. / 255)

trnGenerator = trnDataGen.flow_from_directory(
    trnDir,
    target_size=(imageHeight,imageWidth),
    batch_size=batchSize,
    class_mode='binary')

valGenerator = valDataGen.flow_from_directory(
    valDir,
    target_size=(imageHeight,imageWidth),
    batch_size=batchSize,
    class_mode='binary')

# One timestamped sub-folder per run, shared by the checkpoints and the TensorBoard logs
timeNow = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
fullCheckpointDir = os.path.join(checkpointDir,timeNow)
# makedirs also creates checkpointDir itself when it does not exist yet
os.makedirs(fullCheckpointDir, exist_ok=True)

# need to replace acc by accuracy below when moving to TF2.0
# One .h5 file per epoch; the file name records epoch number, loss/acc and val_loss/val_acc
filePath = os.path.join(fullCheckpointDir,"{epoch:05d}_{loss:.6f}_{acc:.6f}_{val_loss:.6f}_{val_acc:.6f}.h5")
checkpoint = tensorflow.keras.callbacks.ModelCheckpoint(filePath, monitor='val_loss', verbose=0, save_best_only=False, save_weights_only=False, mode='auto', save_freq='epoch')

# profile_batch=0 required to solve a bug w/ Tensorboard according to
#   https://github.com/tensorflow/tensorboard/issues/2412
fullLogDir = os.path.join(logDir, timeNow)
tensorboardCallback = tensorflow.keras.callbacks.TensorBoard(log_dir=fullLogDir,profile_batch=0)

# nTrnSamples / nValSamples are PER-CLASS counts, whereas each generator iterates over the
# images of all classes. Deriving the step counts from them would cover only part of the data
# per epoch and would validate on a different random subset every epoch. len(generator) is the
# number of batches needed to see every image exactly once.
history = model.fit_generator(
    trnGenerator,
    steps_per_epoch=len(trnGenerator),
    epochs=nEpochs,
    validation_data=valGenerator,
    validation_steps=len(valGenerator),
    callbacks=[tensorboardCallback,checkpoint])

Found 4000 images belonging to 2 classes.
Found 1342 images belonging to 2 classes.
Epoch 1/1000
Epoch 2/1000

In [None]:
# Best model candidate, picked by hand from the tmp directory based on train/val loss & accuracy
# (the checkpoint file name encodes epoch, loss, acc, val_loss and val_acc).
bestModelPath = '/home/jmv/data/mlproj1_new/tmp/20200304-232855/00062_0.104636_0.969000_0.155657_0.948171.h5'

# compile=False was added for TF1.14; it is not needed for TF2
model = tensorflow.keras.models.load_model(bestModelPath, compile=False)

In [None]:
# Evaluate the loaded model on the test split: confusion matrix + list of misclassified images
tstDataGen = tensorflow.keras.preprocessing.image.ImageDataGenerator(rescale=1. / 255)

# batch_size=1 and shuffle=False keep predictions aligned with .filenames / .classes
tstGenerator = tstDataGen.flow_from_directory(
    directory=tstDir,
    target_size=(imageHeight,imageWidth),
    batch_size=1,
    class_mode=None,
    shuffle=False)

# Confusion matrix
# len(generator) is the number of batches needed to cover every sample once
# (numpy.math, used here before, no longer exists in NumPy 2.x)
predictions = model.predict_generator(tstGenerator,len(tstGenerator))
images = tstGenerator.filenames
trueClasses = tstGenerator.classes

# A model ending in a single sigmoid unit returns shape (nSamples, 1): argmax over axis 1
# would then always be 0, so threshold the probability instead. argmax is kept for models
# with one output per class.
if predictions.shape[-1] == 1:
    predictedClasses = (predictions[:, 0] > 0.5).astype(int)
else:
    predictedClasses = numpy.argmax(predictions, axis=1)

report = sklearn.metrics.confusion_matrix(trueClasses, predictedClasses)

print(tstGenerator.class_indices)
print(report)

# List images which have a different predicted class vs. true class
for image, trueClass, predictedClass in zip(images,trueClasses,predictedClasses):
    if trueClass!=predictedClass:
        print("Image: %s, True Class: %d, Predicted Class: %d" % (image, trueClass, predictedClass))

In [None]:
# Evaluate the loaded model on the "rem" split (images left over after the trn/val/tst split)
remDataGen = tensorflow.keras.preprocessing.image.ImageDataGenerator(rescale=1. / 255)

# batch_size=1 and shuffle=False keep predictions aligned with .filenames / .classes
remGenerator = remDataGen.flow_from_directory(
    directory=remDir,
    target_size=(imageHeight,imageWidth),
    batch_size=1,
    class_mode=None,
    shuffle=False)

# Confusion matrix
# len(generator) is the number of batches needed to cover every sample once
# (numpy.math, used here before, no longer exists in NumPy 2.x)
predictions = model.predict_generator(remGenerator,len(remGenerator))
images = remGenerator.filenames
trueClasses = remGenerator.classes

# A model ending in a single sigmoid unit returns shape (nSamples, 1): argmax over axis 1
# would then always be 0, so threshold the probability instead. argmax is kept for models
# with one output per class.
if predictions.shape[-1] == 1:
    predictedClasses = (predictions[:, 0] > 0.5).astype(int)
else:
    predictedClasses = numpy.argmax(predictions, axis=1)

report = sklearn.metrics.confusion_matrix(trueClasses, predictedClasses)

print(remGenerator.class_indices)
print(report)

# List images which have a different predicted class vs. true class
for image, trueClass, predictedClass in zip(images,trueClasses,predictedClasses):
    if trueClass!=predictedClass:
        print("Image: %s, True Class: %d, Predicted Class: %d" % (image, trueClass, predictedClass))

In [None]:
# From book "Deep Learning w/ Python" by François Chollet
# From https://stackoverflow.com/questions/58322147/how-to-generate-cnn-heatmaps-using-built-in-keras-in-tf2-0-tf-keras
import matplotlib.pyplot
import cv2
import PIL

def plot_activation(imagePath, convLayerName=None, outputPath='/home/jmv/data/mlproj8/myresultingimage.jpg'):
    """Build a class-activation heat map for one image and save it blended over that image.

    Uses the global `model`, `imageHeight` and `imageWidth`.

    imagePath     -- path of the image to analyse
    convLayerName -- name of the convolutional layer whose output is weighted by the gradients.
                     When None, "block5_conv3" is used if the model has such a layer,
                     otherwise the last Conv2D layer of the model.
    outputPath    -- where the blended image is written (default: the historical fixed path)

    Side effects: opens a matplotlib figure showing the raw heat map (matshow) and writes
    the blended image to outputPath.

    Raises ValueError when no suitable convolutional layer is found, and IOError when
    OpenCV cannot read imagePath.
    """
    # Loads an image into PIL format
    myImage = tensorflow.keras.preprocessing.image.load_img(imagePath,target_size=(imageHeight,imageWidth))
    # Converts the PIL image into a Numpy array
    myImageAsArray = tensorflow.keras.preprocessing.image.img_to_array(myImage)
    # Creates a batch containing a single image
    myImageAsArray = numpy.expand_dims(myImageAsArray,axis=0)
    # Scales the image in the same way as what we did before the training
    myImageAsArray /= 255.0
    # Gets the result of the model
    myPrediction = model.predict(myImageAsArray)
    # Index of the largest output; for a model with a single output unit this is always 0
    myPredictedClass = numpy.argmax(myPrediction, axis=1)

    # Select the convolutional layer to visualise
    try:
        convLayer = model.get_layer(convLayerName or "block5_conv3")
    except ValueError:
        if convLayerName is not None:
            # The caller asked for a specific layer: do not silently pick another one
            raise
        convLayers = [layer for layer in model.layers
                      if isinstance(layer, tensorflow.keras.layers.Conv2D)]
        if not convLayers:
            raise ValueError("plot_activation: the model has no Conv2D layer to visualise")
        convLayer = convLayers[-1]

    modelOutput = model.output[:,myPredictedClass[0]]
    # Gradient of the selected output with respect to the feature maps of convLayer
    grads = tensorflow.keras.backend.gradients(modelOutput,convLayer.output)[0]
    # One averaged gradient per channel
    pooledGrads = tensorflow.keras.backend.mean(grads,axis=(0,1,2))
    iterate = tensorflow.keras.backend.function([model.input],[pooledGrads,convLayer.output[0]])
    pooledGradsValue, convLayerOutputValue = iterate([myImageAsArray])
    # Weight every channel by its pooled gradient (broadcast over the last axis)
    convLayerOutputValue = convLayerOutputValue * pooledGradsValue
    heatMap = numpy.mean(convLayerOutputValue, axis=-1)
    heatMap = numpy.maximum(heatMap,0)
    # Normalise to [0, 1]; an all-zero map is left untouched to avoid dividing by zero
    maxHeat = numpy.max(heatMap)
    if maxHeat > 0:
        heatMap /= maxHeat
    matplotlib.pyplot.matshow(heatMap)

    # Blend the colour-mapped heat map over the original image
    img = cv2.imread(imagePath)
    if img is None:
        raise IOError("plot_activation: OpenCV could not read image %s" % imagePath)
    heatMap = cv2.resize(heatMap,(img.shape[1],img.shape[0]))
    heatMap = numpy.uint8(255*heatMap)
    heatMap = cv2.applyColorMap(heatMap,cv2.COLORMAP_JET)
    superImposedImg = heatMap*0.4+img
    cv2.imwrite(outputPath,superImposedImg)

In [None]:
# Show the activation heat map of the first few original (non-duplicated) images of each class
desiredNumberOfImagesToDisplay = 10
dirToDisplayFrom = tstDir
print(f"DBG> Desired number of images: {desiredNumberOfImagesToDisplay}")
print(f"DBG> Directory to display images from: {dirToDisplayFrom}")

for myClass in sorted(os.listdir(dirToDisplayFrom)):
    print(f"DBG> Class={myClass}")
    # Read from dirToDisplayFrom everywhere, so that changing it above really takes effect
    classDir = os.path.join(dirToDisplayFrom,myClass)
    # Files whose name contains "copy" are the duplicates written by the oversampling step
    listOfImages = [image for image in sorted(os.listdir(classDir)) if "copy" not in image]
    actualNumberOfImagesToDisplay = len(listOfImages)
    print(f"DBG> Actual number of images: {actualNumberOfImagesToDisplay}")
    # Slicing already clamps to the length of the list
    for image in listOfImages[:desiredNumberOfImagesToDisplay]:
        print(f"DBG> Image: {image}")
        selectImage = os.path.join(classDir,image)
        plot_activation(selectImage)
        # plot_activation writes its blended result to this fixed path
        pil_img = PIL.Image.open('/home/jmv/data/mlproj8/myresultingimage.jpg')
        myImShow = matplotlib.pyplot.imshow(pil_img)
        matplotlib.pyplot.title(selectImage,pad=30)