# Imports

In [1]:
# Install the notebook's third-party dependencies, one %pip invocation per package.
# NOTE(review): versions are not pinned, so a fresh environment may resolve to
# different releases than the ones these results were produced with — consider pinning.
%pip install matplotlib
%pip install scikit-learn
%pip install scipy
%pip install tensorflow
%pip install numpy
%pip install pandas
%pip install setuptools

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import tensorflow as tf

from tensorflow.image import resize
from tensorflow.keras.backend import clear_session
from tensorflow import keras
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from keras.metrics import  Recall, CategoricalAccuracy
from IPython.display import clear_output

from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from numpy import concatenate as concat
from scipy.stats import entropy
import os

from helpers.help import *


# Seed the random number generators so a Restart & Run All is repeatable.
# NumPy's global RNG is used by the shuffling / hold-out sampling below;
# TensorFlow's global seed covers random operations executed by TensorFlow/Keras,
# which seeding NumPy alone does not affect.
np.random.seed(0)
tf.random.set_seed(0)

# Start

In [3]:
# Open Diabetic Retinopathy dataset (expected in ./gaussian_ds, one sub-folder per grade)
path = os.path.join(os.getcwd(),'gaussian_ds')
# Collapse the grades into a binary target: any retinopathy grade -> 1, No_DR -> 0.
label_dict={'Mild':1,'Moderate':1,'Proliferate_DR':1,'Severe':1,'No_DR':0}

# Keep only the class sub-folders. Hidden entries such as macOS' '.DS_Store' are
# filtered out rather than removed unconditionally: list.remove() raises
# ValueError when the entry does not exist (e.g. on Linux or Windows).
folders = [
    entry for entry in os.listdir(path)
    if not entry.startswith('.') and os.path.isdir(os.path.join(path, entry))
]

# get all the samples as [file path, label] rows
array = []
for i in folders:
    detailPath = os.path.join(path,i)
    # An unknown folder name raises KeyError on purpose: it signals an unexpected dataset layout.
    label = label_dict[i.split('.')[0]]
    for j in os.listdir(detailPath):
        # Skip hidden files (e.g. a nested '.DS_Store') so they are not treated as images.
        if j.startswith('.'):
            continue
        array.append([os.path.join(detailPath,j),label])

# transforms the array into nparray; the mixed (str, int) rows become an array
# of strings, so the labels have to be cast back to int before use.
dataset=np.array(array)

# A bare expression in the middle of a cell is not displayed, so print the count explicitly.
print(f"Total samples: {np.size(dataset,0)}")

dataset[0:3]

array([['/Users/leonardosousa/Desktop/mestrado/IA/Project/RetinopathyAlgorithm/gaussian_ds/Mild/2d7666b8884f.png',
        '1'],
       ['/Users/leonardosousa/Desktop/mestrado/IA/Project/RetinopathyAlgorithm/gaussian_ds/Mild/50840c36f0b4.png',
        '1'],
       ['/Users/leonardosousa/Desktop/mestrado/IA/Project/RetinopathyAlgorithm/gaussian_ds/Mild/30cab14951ac.png',
        '1']], dtype='<U113')

# Pre-Processing

In [4]:
# Separate the file paths (features) from the labels, then one-hot encode the
# labels after casting them from strings back to integers.
X = dataset[:, 0]
y = to_categorical(dataset[:, 1].astype(int))

# Shuffle features and labels with one shared permutation (to make an unbiased model)
shuffle_order = np.random.permutation(len(X))
X = X[shuffle_order]
y = y[shuffle_order]

# Hold out 10% of the samples, drawn without replacement, as the test set
n_test = int(0.1 * len(X))
test_idxs = np.random.choice(len(X), size=n_test, replace=False)
x_test = X[test_idxs]
y_test = y[test_idxs]

# Drop the held-out samples from X / y so they cannot reach train or validation
X = np.delete(X, test_idxs)
y = np.delete(y, test_idxs, axis=0)

# Usual train/validation split. 11% of the remaining 90% is used so the
# validation set ends up roughly the same size as the test set.
x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.11)

# Separate the Seed and Pools
- **Seed**: the small labelled subset of the training data used to start the training
- **Pool**: the remaining training samples, from which uncertainty sampling draws new samples

In [5]:
# Number of training samples the active-learning run starts from.
initial_seed = 100

# The first `initial_seed` training samples form the seed set; everything
# after them is the pool that later iterations sample from.
x_seed, y_seed = x_train[:initial_seed], y_train[:initial_seed]
x_pool, y_pool = x_train[initial_seed:], y_train[initial_seed:]

print(f"Samples in Seed set: {len(x_seed)}")
print(f"Samples in Pool: {len(x_pool)}")
print(f"Samples in Validation set: {len(x_val)}")
print(f"Samples in Test set: {len(x_test)}")

Samples in Seed set: 100
Samples in Pool: 2833
Samples in Validation set: 363
Samples in Test set: 366


# Converts into a dataset compatible with the model

In [11]:
# build_dataset is a custom helper that returns tensor batches
# (presumably provided by the `helpers.help` star import — confirm).

# Validation, test and pool data are read once (no repeat) in batches of 256.
eval_options = dict(repeat=False, batch=256)
val_dataset = build_dataset(x_val, y_val, **eval_options)
test_dataset = build_dataset(x_test, y_test, **eval_options)
# The pool is built with shuffle=False: predictions on it are later used to
# index x_pool / y_pool, so the order has to match.
pool_dataset = build_dataset(x_pool, y_pool, shuffle=False, **eval_options)

BATCH_SIZE = 16

# NOTE(review): the step count is derived from the full training split, while
# the dataset trained on below only holds the seed samples — confirm this is
# intended (it relies on the training dataset repeating).
STEPS_PER_EPOCH = len(x_train) / BATCH_SIZE

train_dataset = build_dataset(x_seed, y_seed, batch=BATCH_SIZE)
# Input shape for the model: feature spec without its leading dimension
# (presumably the batch axis).
input_shape = train_dataset.element_spec[0].shape[1:]

# Model definition

In [12]:
# Build the network for the dataset's input shape, then configure training:
# categorical cross-entropy on the one-hot labels, Adam with default settings,
# and categorical accuracy as the reported metric.
model = simple_model(input_shape)

compile_options = {
    "loss": "categorical_crossentropy",
    "optimizer": Adam(),
    "metrics": [CategoricalAccuracy()],
}
model.compile(**compile_options)
model.summary()

In [13]:
# Make sure the output folders exist before any callback writes into them;
# otherwise training aborts as soon as the CSV log file cannot be opened.
os.makedirs('model', exist_ok=True)
os.makedirs('logger', exist_ok=True)

# Saves the best model: the file is only overwritten when val_loss improves
checkpoint=ModelCheckpoint(filepath='model/model_al.keras',
                           monitor='val_loss',save_best_only=True,verbose=1)

# Logs the progress of training (append=False starts a fresh log file)
csv_logger=keras.callbacks.CSVLogger('logger/trainlog_al.csv',
                                     separator=',',append=False)

# prevent overfitting: stop when val_loss has not improved by at least
# min_delta for `patience` consecutive epochs, and restore the best weights
early_stopper=keras.callbacks.EarlyStopping(monitor='val_loss',
                                            min_delta=0.001,
                                            restore_best_weights=True,
                                            patience=10)

callbacks_list=[checkpoint,early_stopper,csv_logger]

# Model training
- **categorical_accuracy**: measures how accurately the model is predicting the right class for each sample across multiple classes
- **loss**: how well the model fits the training data; lower values are better.
- **val_categorical_accuracy**: the model's accuracy on the validation set; a large gap to the training accuracy indicates potential overfitting.
- **val_loss**: On the validation set, the loss.

In [14]:
# Train on the seed set only; the validation set is evaluated in full after
# every epoch (validation_steps=None) and drives the callbacks.
EPOCHS = 10

fit_options = dict(
    steps_per_epoch=int(STEPS_PER_EPOCH),
    epochs=EPOCHS,
    validation_data=val_dataset,
    validation_steps=None,
    callbacks=callbacks_list,
)
model.fit(train_dataset, **fit_options)

Epoch 1/5
[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - categorical_accuracy: 0.9058 - loss: 0.2637
Epoch 1: val_loss improved from inf to 5.59065, saving model to model/model_al.keras
[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 48ms/step - categorical_accuracy: 0.9061 - loss: 0.2630 - val_categorical_accuracy: 0.4904 - val_loss: 5.5907
Epoch 2/5
[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - categorical_accuracy: 0.9820 - loss: 0.0546
Epoch 2: val_loss improved from 5.59065 to 3.16553, saving model to model/model_al.keras
[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 48ms/step - categorical_accuracy: 0.9820 - loss: 0.0546 - val_categorical_accuracy: 0.4904 - val_loss: 3.1655
Epoch 3/5
[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - categorical_accuracy: 0.9900 - loss: 0.0334
Epoch 3: val_loss improved from 3.16553 to 1.94728, saving model to model/mod

<keras.src.callbacks.history.History at 0x15989a060>

# Initial Model Evaluation on Test Dataset (Smaller Dataset / Seeded)

In [15]:
# Reload the checkpoint written during training and score it on the held-out test set.
model = keras.models.load_model("model/model_al.keras")
print("-" * 100)
seed_test_metrics = model.evaluate(test_dataset, verbose=0, return_dict=True)
print(seed_test_metrics)

----------------------------------------------------------------------------------------------------
{'categorical_accuracy': 0.8442623019218445, 'loss': 0.4430781602859497}


In [16]:
# One record per active-learning iteration: [test loss, test accuracy, seed size, pool size].
al_history = list()

# Re-create the CSV logger in append mode so the active-learning rounds are
# added to the existing training log instead of overwriting it.
csv_logger = keras.callbacks.CSVLogger(
    'logger/trainlog_al.csv',
    separator=',',
    append=True,
)
callbacks_list = [checkpoint, early_stopper, csv_logger]


# Include baseline (full train) model

In [17]:
# Load the baseline model (trained on the full training set in
# 01_Training_Full.ipynb) and record its test accuracy as the reference value.
BASELINE_MODEL_PATH = "model/model_baseline.keras"

# Check for the file explicitly instead of relying on the exception type
# raised by load_model for a missing file.
if os.path.exists(BASELINE_MODEL_PATH):
    model_full = keras.models.load_model(BASELINE_MODEL_PATH)

    # Evaluate once and reuse the result for both the reference accuracy and the printout.
    baseline_metrics = model_full.evaluate(test_dataset, verbose=0, return_dict=True)
    acc_baseline = baseline_metrics['categorical_accuracy']

    print("-" * 100)
    print(baseline_metrics)
else:
    # Define the name even when the baseline is unavailable, so later cells
    # can detect the missing reference instead of hitting a NameError.
    acc_baseline = None
    print("model file model_baseline.keras not found. Make sure to run 01_Training_Full.ipynb entirely")


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step - categorical_accuracy: 0.9193 - loss: 0.1980
----------------------------------------------------------------------------------------------------
{'categorical_accuracy': 0.9180327653884888, 'loss': 0.19902613759040833}


# Entering AL Loop
We will now iteratively query the pool for samples and add them to the seed set. In every iteration we pick the `sampling_size` points from the pool with the largest prediction entropy.

Loop: 
- **Prediction on pool**: Uses the current model to predict in the pool dataset;
- **Uncertainty Sampling**: Selects *sampling_size* samples with the highest entropy (least confident predictions);
- **Acquire Samples**: Moves high-entropy samples from x_pool to x_seed to continue training;
- **Stopping Condition**: If the accuracy is close to the one in the baseline, the training stops;
- **Re-train**: Updates the model and continues training with the new samples in the seed set;

In [18]:
# Active-learning loop: evaluate, pick the most uncertain pool samples, move
# them to the seed set, retrain, reload the best checkpoint, repeat.

sampling_size=10          # samples moved from the pool to the seed set per iteration
ACC_TOLERANCE = 0.0025    # half-width of the "matches the baseline" accuracy band

# Iteration budget: a tenth of the rounds needed to exhaust the pool
# (artificial stopper), reduced by a further 20 steps. Clamped at zero so a
# small pool yields "no iterations" rather than a negative count.
num_iterations = max(0, (x_pool.shape[0] // sampling_size) // 10 - 20)

print(sampling_size)

# The baseline accuracy may be missing (baseline model not trained yet); in
# that case the loop simply runs for the whole iteration budget.
acc_baseline = globals().get('acc_baseline')
if acc_baseline is not None:
    acc_baseline = round(acc_baseline,4)

print(num_iterations)
for iteration in range(num_iterations):

    #Step_1
    #Score the current model on the test set and log it with the set sizes
    loss, acc = model.evaluate(test_dataset, verbose=0)
    print(f"Test Set Accuracy after {iteration} iteration {acc}")
    al_history.append([loss, acc, x_seed.shape[0], x_pool.shape[0]])

    #Stop when the accuracy falls inside the tolerance band around the baseline.
    # NOTE(review): because of the upper bound, an accuracy that exceeds the
    # baseline by more than ACC_TOLERANCE does NOT stop the loop — confirm
    # whether "at least baseline" was intended.
    if acc_baseline is not None and acc_baseline - ACC_TOLERANCE < acc <= acc_baseline + ACC_TOLERANCE:
        print("Terminating Training")
        break

    #Step_2
    #Use the current model to predict the pool dataset
    print("Predicting pool dataset")
    y_pool_proba = model.predict(pool_dataset)

    #Pick the index of the top entropy samples in pool
    #(entropy works column-wise, hence the transpose to get one value per sample)
    pool_max_ents = np.argsort(entropy(y_pool_proba.T))[-sampling_size:]

    #Acquire those samples from pool
    x_sample = x_pool[pool_max_ents]
    y_sample = y_pool[pool_max_ents]

    #Add these samples to the seed dataset
    y_seed = concat((y_seed,y_sample),axis=0)
    x_seed = concat((x_seed,x_sample),axis=0)

    #Delete the acquired samples from pool
    x_pool = np.delete(x_pool, pool_max_ents, 0 )
    y_pool = np.delete(y_pool, pool_max_ents, 0 )

    #Build the tensorflow dataset object for this iteration
    pool_dataset = build_dataset(x_pool,y_pool,repeat=False,batch=256,
                                 shuffle = False)
    train_dataset = build_dataset(x_seed,y_seed,batch=BATCH_SIZE)

    print(f"Samples in seed dataset {x_seed.shape[0]} , in pool dataset {x_pool.shape[0]}")
    print("-" * 100)

    #Step_3
    #Recompile with a fresh optimizer. The loss matches the one used for the
    #initial training so the val_loss values compared by the checkpoint and
    #early-stopping callbacks stay on the same scale across iterations.
    model.compile(
        loss = "categorical_crossentropy",
        optimizer = Adam(),
        metrics=[CategoricalAccuracy()]
    )

    history = model.fit(train_dataset,steps_per_epoch=int(STEPS_PER_EPOCH),epochs=10,
          validation_data=val_dataset,validation_steps=None,
          callbacks=callbacks_list)

    #If the fit method generated a new best model , load it for
    #the next iteration (the checkpoint file is only rewritten on improvement)
    model = keras.models.load_model("model/model_al.keras")
    clear_output()
    clear_session()

In [19]:
# Tabulate the per-iteration records collected during the active-learning loop.
al_columns = ['Test Loss', 'Test Accuracy', 'Seed', 'Pool']
df = pd.DataFrame(data=al_history, columns=al_columns)
df

Unnamed: 0,Test Loss,Test Accuracy,Seed,Pool
0,0.443078,0.844262,100,2833
1,0.324692,0.92623,110,2823
2,0.324692,0.92623,120,2813
3,0.324692,0.92623,130,2803
4,0.328707,0.923497,140,2793
5,0.198282,0.934426,150,2783
6,0.216336,0.937158,160,2773
7,0.216336,0.937158,170,2763


In [21]:
# Persist the active-learning history table as CSV, without the DataFrame index column.
df.to_csv('logger/AL_tracking.csv', index = False)