**Sources:**   
- Dataset: 
*http://patreo.dcc.ufmg.br/2017/11/12/brazilian-coffee-scenes-dataset/*   
- Book: *Deep Learning with Python - Chollet*    
- Tutorial to load and preprocess images (image_dataset_from_directory)
*https://www.tensorflow.org/tutorials/load_data/images*    

# Load data images from disk

In [1]:
import os
from keras.preprocessing import image

# Root folder of the dataset on disk
base_dir = '/tf/data'

# One sub-folder per split ...
train_dir, test_dir = (
    os.path.join(base_dir, split) for split in ('train', 'test')
)

# ... and, inside each split, one sub-folder per class
train_coffee_dir, train_noncoffee_dir = (
    os.path.join(train_dir, label) for label in ('coffee', 'noncoffee')
)
test_coffee_dir, test_noncoffee_dir = (
    os.path.join(test_dir, label) for label in ('coffee', 'noncoffee')
)

# Load one sample picture (whichever file os.listdir returns first for the
# training coffee folder); it is used later to read the image dimensions.
img = image.load_img(
    os.path.join(train_coffee_dir, os.listdir(train_coffee_dir)[0])
)

# Generate a ```Dataset```

In [2]:
import tensorflow as tf

In [3]:
# PIL reports an image's size as (width, height), whereas the `image_size`
# argument of image_dataset_from_directory is given as (height, width).
image_size = (img.height, img.width)
# Number of images per batch
batch_size = 32

Use the function ```tf.keras.utils.image_dataset_from_directory```: Generates a ```tf.data.Dataset``` from image files in a directory

```train_ds = tf.keras.utils.image_dataset_from_directory(```   
&nbsp;&nbsp;&nbsp;&nbsp; *Directory where data is located. Since it contains subdirectories, labels are inferred*   
&nbsp;&nbsp;&nbsp;&nbsp; ```train_dir,```   
&nbsp;&nbsp;&nbsp;&nbsp; *Fraction of data to reserve for validation*    
&nbsp;&nbsp;&nbsp;&nbsp; ```validation_split=0.2,```    
&nbsp;&nbsp;&nbsp;&nbsp; *Subset of the data to return (**training** or **validation**)*    
&nbsp;&nbsp;&nbsp;&nbsp; ```subset="training",```    
&nbsp;&nbsp;&nbsp;&nbsp; *Shuffles the data. If false, sorts the data in alphanumeric order*    
&nbsp;&nbsp;&nbsp;&nbsp; ```shuffle=True,```    
&nbsp;&nbsp;&nbsp;&nbsp; *Optional random seed for shuffling and transformations*    
&nbsp;&nbsp;&nbsp;&nbsp; ```seed=123,```   
&nbsp;&nbsp;&nbsp;&nbsp; *Size, given as (height, width), to resize images to after they are read from disk. Since the pipeline processes batches of images that must all have the same size, this must be provided*    
&nbsp;&nbsp;&nbsp;&nbsp; ```image_size=image_size,```    
&nbsp;&nbsp;&nbsp;&nbsp; *Size of the batches of data*    
&nbsp;&nbsp;&nbsp;&nbsp; ```batch_size=batch_size,```    
```)```    

# Build model
### Standardize the data
The images have 3 color channels (RGB), and the channel values are in the [0, 255] range; they have to be rescaled to the [0, 1] range for the neural network.    
```normalization_layer = tf.keras.layers.Rescaling(1./255)```

In [4]:
from keras import layers
from keras import models
from tensorflow.keras import optimizers

In [5]:
# Architecture of each supported model size: the number of filters of every
# Conv2D + MaxPooling2D stage, and the width of the hidden Dense layer.
_MODEL_CONFIGS = {
    'bigger': {'conv_filters': (32, 64, 128, 128), 'dense_units': 512},
    'smaller': {'conv_filters': (32, 64, 128), 'dense_units': 128},
}


def build_model(model_size):
    """Build and compile a small convnet for binary image classification.

    The network rescales pixel values by 1/255, applies a stack of
    Conv2D(3x3, relu) + MaxPooling2D(2x2) stages, flattens, and ends with a
    hidden Dense(relu) layer followed by a single sigmoid unit.

    Args:
        model_size: ``'bigger'`` (4 conv stages, Dense(512)) or
            ``'smaller'`` (3 conv stages, Dense(128)).

    Returns:
        The model, compiled with RMSprop (learning rate 1e-4), binary
        cross-entropy loss and the accuracy metric.

    Raises:
        ValueError: If ``model_size`` is not one of the supported sizes
            (previously this silently produced a model containing only the
            rescaling layer).
    """
    if model_size not in _MODEL_CONFIGS:
        raise ValueError(
            f"Unknown model_size {model_size!r}; "
            f"expected one of {sorted(_MODEL_CONFIGS)}"
        )
    config = _MODEL_CONFIGS[model_size]

    # PIL reports size as (width, height); Keras expects
    # (height, width, channels). The channel count is taken from the image
    # bands rather than from `img.layers`, which not every PIL image has.
    width, height = img.size
    input_shape = (height, width, len(img.getbands()))

    model = models.Sequential()

    # Standardize the data
    model.add(tf.keras.layers.Rescaling(1./255))

    for stage, filters in enumerate(config['conv_filters']):
        # As in the original code, the input shape is only declared on the
        # first convolution.
        # NOTE(review): this layer is not the first one of the model, so
        # Keras may not use this shape to build the model eagerly - confirm.
        extra = {'input_shape': input_shape} if stage == 0 else {}
        model.add(layers.Conv2D(filters, (3, 3), activation='relu', **extra))
        model.add(layers.MaxPooling2D((2, 2)))

    model.add(layers.Flatten())

    model.add(layers.Dense(config['dense_units'], activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))

    model.compile(optimizer=optimizers.RMSprop(learning_rate=1e-4),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    return model

# Tensorboard

In [6]:
from tensorflow.keras.callbacks import TensorBoard

# Disabled: presumably meant to build a unique, timestamped run name - it is
# not used by the callback below.
#import time
#NAME = "coffee-{}".format(int(time.time()))

# TensorBoard callback that writes its logs to the 'summaries' directory.
tensorboard = TensorBoard(log_dir='summaries')

# Train model + MCCV

In [7]:
def _load_subset(subset, seed):
    """Load the 'training' or 'validation' 80/20 subset of `train_dir`."""
    return tf.keras.utils.image_dataset_from_directory(
        train_dir,
        validation_split=0.2,
        subset=subset,
        shuffle=True,
        seed=seed,
        image_size=image_size,
        batch_size=batch_size,
    )


def train_mccv(folds, model_size, epochs=50, seed=123):
    """Train a fresh model on `folds` random train/validation splits (MCCV).

    For every fold a new 80/20 split of `train_dir` is drawn, a new model is
    built with ``build_model(model_size)`` and trained for `epochs` epochs,
    and the metrics of the last epoch are recorded and printed.

    Args:
        folds: Number of repetitions. ``0`` returns four empty lists.
        model_size: Forwarded to ``build_model``.
        epochs: Number of training epochs per fold.
        seed: Base random seed; fold ``i`` (0-based) uses ``seed + i``.

    Returns:
        A tuple ``(train_acc, train_loss, val_acc, val_loss)`` of lists with
        one entry per fold. Accuracies are percentages (fraction * 100);
        losses are the raw loss values.
    """
    train_acc_per_fold = []
    train_loss_per_fold = []
    val_acc_per_fold = []
    val_loss_per_fold = []

    for i in range(folds):
        # A different seed per fold gives a different random split for each
        # repetition; with one fixed seed every fold used the same split.
        # Both subsets of a fold must share the seed so they do not overlap.
        fold_seed = seed + i
        train_ds = _load_subset("training", fold_seed)
        val_ds = _load_subset("validation", fold_seed)

        model = build_model(model_size)

        # NOTE(review): the same `tensorboard` callback (a single log_dir) is
        # reused by every fold, so the folds' logs presumably end up in the
        # same directory - confirm whether per-fold log dirs are wanted.
        history = model.fit(
            train_ds,
            epochs=epochs,
            validation_data=val_ds,
            callbacks=[tensorboard],
            verbose=0)

        # Per-epoch metric curves; only the last epoch is reported.
        acc = history.history['accuracy']
        val_acc = history.history['val_accuracy']
        loss = history.history['loss']
        val_loss = history.history['val_loss']

        # The validation loss is not a percentage, so no trailing '%'.
        print(f'Score for fold {i+1}:\nTraining   -> acc of {acc[-1]};'
              f'  loss of {loss[-1]};'
              f'\nValidation -> acc of {val_acc[-1]};'
              f'  loss of {val_loss[-1]}\n')

        train_acc_per_fold.append(acc[-1] * 100)
        train_loss_per_fold.append(loss[-1])
        val_acc_per_fold.append(val_acc[-1] * 100)
        val_loss_per_fold.append(val_loss[-1])

    return train_acc_per_fold, train_loss_per_fold, val_acc_per_fold, val_loss_per_fold

In [8]:
# Run 5 train/validation repetitions with the 'bigger' architecture and keep
# the per-fold metrics for the summary below.
train_acc, train_loss, val_acc, val_loss = train_mccv(5, 'bigger')

Found 2400 files belonging to 2 classes.
Using 1920 files for training.
Found 2400 files belonging to 2 classes.
Using 480 files for validation.
Score for fold 1:
Training   -> acc of 0.9114583134651184;  loss of 0.21681948006153107;
Validation -> acc of 0.9125000238418579;  loss of 0.22581185400485992%

Found 2400 files belonging to 2 classes.
Using 1920 files for training.
Found 2400 files belonging to 2 classes.
Using 480 files for validation.
Score for fold 2:
Training   -> acc of 0.9098958373069763;  loss of 0.211153045296669;
Validation -> acc of 0.90625;  loss of 0.22005322575569153%

Found 2400 files belonging to 2 classes.
Using 1920 files for training.
Found 2400 files belonging to 2 classes.
Using 480 files for validation.
Score for fold 3:
Training   -> acc of 0.9145833253860474;  loss of 0.20176509022712708;
Validation -> acc of 0.893750011920929;  loss of 0.24789485335350037%

Found 2400 files belonging to 2 classes.
Using 1920 files for training.
Found 2400 files belongi

In [9]:
import numpy as np
# Report the metrics averaged over all folds (one output line per argument).
print(
    "Results for bigger version of convnet model",
    "\nOverall training accuracy: " + str(np.average(train_acc)),
    "Overall training loss: " + str(np.average(train_loss)),
    "\nOverall validation accuracy: " + str(np.average(val_acc)),
    "Overall validation loss: " + str(np.average(val_loss)),
    sep="\n",
)

Results for bigger version of convnet model

Overall training accuracy: 91.28124952316284
Overall training loss: 0.2081107974052429

Overall validation accuracy: 90.50000071525574
Overall validation loss: 0.2299742728471756


# ===========================================================
# Second basic model

In [10]:
# Same experiment with the 'smaller' architecture; this overwrites the
# per-fold metrics of the previous run.
train_acc, train_loss, val_acc, val_loss = train_mccv(5, 'smaller')

Found 2400 files belonging to 2 classes.
Using 1920 files for training.
Found 2400 files belonging to 2 classes.
Using 480 files for validation.
Score for fold 1:
Training   -> acc of 0.9145833253860474;  loss of 0.20692692697048187;
Validation -> acc of 0.8999999761581421;  loss of 0.22861512005329132%

Found 2400 files belonging to 2 classes.
Using 1920 files for training.
Found 2400 files belonging to 2 classes.
Using 480 files for validation.
Score for fold 2:
Training   -> acc of 0.9104166626930237;  loss of 0.22033710777759552;
Validation -> acc of 0.8812500238418579;  loss of 0.26706770062446594%

Found 2400 files belonging to 2 classes.
Using 1920 files for training.
Found 2400 files belonging to 2 classes.
Using 480 files for validation.
Score for fold 3:
Training   -> acc of 0.9166666865348816;  loss of 0.20904116332530975;
Validation -> acc of 0.9020833373069763;  loss of 0.23435507714748383%

Found 2400 files belonging to 2 classes.
Using 1920 files for training.
Found 2400

In [11]:
# Report the metrics of the smaller model averaged over all folds.
# (A stray copy-pasted "Results for bigger version ..." header was removed.)
print("Results for smaller version of convnet model")
print("\nOverall training accuracy: " + str(np.average(train_acc)))
print("Overall training loss: " + str(np.average(train_loss)))
print("\nOverall validation accuracy: " + str(np.average(val_acc)))
print("Overall validation loss: " + str(np.average(val_loss)))

Results for smaller version of convnet model
Results for bigger version of convnet model

Overall training accuracy: 91.59375071525574
Overall training loss: 0.20934475660324098

Overall validation accuracy: 89.70833420753479
Overall validation loss: 0.23957162499427795
