# Tutorial 01: Validating models

In this notebook, we can see how to validate models. Note that it requires to have run the TUTORIAL_00 notebook first so we precompute the features that will be used in this notebook.

In [3]:
import os
import random
import argparse
import json
from time import time
import pandas as pd
import tensorflow as tf
from tensorflow.data import Dataset
from tensorflow.errors import ResourceExhaustedError

from damage.data import DataStream
from damage.models import CNN, RandomSearch, CNNPreTrained

In [4]:
os.environ['CUDA_VISIBLE_DEVICES'] = '5'
RESULTS_PATH = '../logs/experiments'
FEATURES_PATH = '../logs/features'
features_file_name = 'example.p'

In [7]:
features = pd.read_pickle('{}/{}'.format(FEATURES_PATH, features_file_name)).dropna(subset=['destroyed', 'image'])
features.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,annotation_date,destroyed,latitude,longitude,image
city,patch_id,date,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
raqqa,10016-2208,2015-02-02,NaT,0.0,35.965631,38.987906,"[[[115, 121, 123, 156, 142, 132], [115, 121, 1..."
raqqa,10016-2272,2015-02-02,NaT,0.0,35.965287,38.987906,"[[[41, 40, 74, 66, 53, 66], [41, 40, 66, 66, 5..."
raqqa,10016-2336,2015-02-02,NaT,0.0,35.964944,38.987906,"[[[74, 81, 90, 82, 97, 99], [99, 105, 115, 107..."
raqqa,10016-2400,2015-02-02,NaT,0.0,35.964601,38.987906,"[[[74, 77, 99, 197, 194, 197], [74, 85, 99, 18..."
raqqa,10016-2464,2015-02-02,NaT,0.0,35.964257,38.987906,"[[[165, 166, 181, 247, 247, 255], [165, 170, 1..."


In [8]:
features.index.get_level_values('date').unique()
# import rasterio
# data  = rasterio.open('../data/city_rasters/daraa_2017_02_07_zoom_19.tif')
# data.shape

DatetimeIndex(['2015-02-02'], dtype='datetime64[ns]', name='date', freq=None)

In [9]:
len(features)

22091

We will make use of three custom classes: __RandomSearch__, __CNN__ and __DataStream__. __RandomSearch__ is a class that samples hyperparameters for ML models. As of may 2019, only the space for cnn's has been implemented. __CNN__ is a class that defines a Convolutional Neural Network model and follows the standards of Sklearn and Keras APIs, containing methods called fit, predict, fit_generator, predict_generator and validate_generator. In this case, we make use of the validate_generator method, which takes a generator of data as required by Keras's fit_generator method: each batch yields a tuple of (features, target). We use the __DataStream__ object to create these generators, first by generating the indices with the split_by_path_id method, which follows the standards of Sklearn splitters and then with the get_data_generator_from_index method that turns those indices into data generators.

In [11]:
# Modelling
sampler = RandomSearch()
models = {
    CNN:sampler.sample_cnn,
    CNNPreTrained: sampler.sample_cnn_pretrained,
}
Model = random.choice([CNN])
sample_func = models[Model]
spaces = sample_func(1)
# # Do splits
# class_proportion = {
#     1: 0.3,
# }
batch_size = spaces[0]['batch_size']
test_batch_size = batch_size
train_proportion = 0.7
data_stream = DataStream(
    batch_size=batch_size,
    train_proportion=train_proportion,
    test_batch_size=test_batch_size
)
unique_patches = features.index.get_level_values('patch_id').unique().tolist()
train_patches = random.sample(unique_patches, round(len(unique_patches)*train_proportion))
train_data = features.loc[features.index.get_level_values('patch_id').isin(train_patches)]
# if train_data['destroyed'].mean() > class_proportion[1]:
#     train_data_upsampled = train_data.copy()
# else:
#     train_data_upsampled = data_stream._upsample_class_proportion(train_data, class_proportion).sample(frac=1)
test_patches = list(set(unique_patches) - set(train_patches))
test_data = features.loc[features.index.get_level_values('patch_id').isin(test_patches)]

train_indices = data_stream._get_index_generator(train_data, batch_size)
test_indices = data_stream._get_index_generator(test_data, test_batch_size)
train_generator = data_stream.get_train_data_generator_from_index(
    [train_data['image'], train_data['destroyed']],
    train_indices,
    augment_flip=False,
    augment_brightness=False,
)
test_generator = data_stream.get_train_data_generator_from_index(
    [test_data['image'], test_data['destroyed']], test_indices)
train_dataset = Dataset.from_generator(lambda: train_generator, (tf.float32, tf.int32))
test_dataset = Dataset.from_generator(lambda: test_generator, (tf.float32, tf.int32))
num_batches = len(train_indices)
num_batches_test = len(test_indices)
print(spaces[0])

TypeError: get_train_data_generator_from_index() missing 2 required positional arguments: 'augment_flip' and 'augment_brightness'

In [24]:
#Validate
for space in spaces:
    print('Now validating:\n')
    print(space)
    try:
        model = Model(**space)
        losses = model.validate_generator(train_dataset, test_dataset,
                                          steps_per_epoch=num_batches,
                                          validation_steps=num_batches_test,
                                          **space)
    except Exception as e:
        losses = {'log': str(e)}

    losses['model'] = str(Model)
    losses['space'] = space
    losses['features'] = features_file_name
    losses['num_batches_train'] = num_batches
    losses['num_batches_test'] = num_batches_test
    identifier = round(time())
   

Now validating:

{'dense_units': 256, 'batch_size': 28, 'convolutional_layers': [{'kernel_size': [9, 9], 'pool_size': [8, 8], 'filters': 32, 'dropout': 0.17777777777777778, 'activation': 'relu'}, {'kernel_size': [9, 9], 'pool_size': [8, 8], 'filters': 64, 'dropout': 0.17777777777777778, 'activation': 'relu'}, {'kernel_size': [9, 9], 'pool_size': [8, 8], 'filters': 128, 'dropout': 0.17777777777777778, 'activation': 'relu'}, {'kernel_size': [9, 9], 'pool_size': [8, 8], 'filters': 256, 'dropout': 0.17777777777777778, 'activation': 'relu'}, {'kernel_size': [9, 9], 'pool_size': [8, 8], 'filters': 512, 'dropout': 0.17777777777777778, 'activation': 'relu'}], 'epochs': 6, 'layer_type': 'vgg', 'class_weight': 1.15, 'learning_rate': 0.025595479226995357}
Epoch 1/6

KeyboardInterrupt: 

In [15]:
with open('{}/experiment_{}.json'.format(RESULTS_PATH, identifier), 'w') as f:
        json.dump(str(losses), f)
if 'val_recall_positives' in losses.keys():
    if losses['val_recall_positives'][-1] > 0.4 and losses['val_precision_positives'][-1] > 0.1:
        model.save('../logs/models/model_{}.h5'.format(identifier))

In [16]:
losses

{'loss': [6.438632979860815,
  6.363179270864149,
  6.035386882427432,
  6.399254071322653,
  6.310763664787279,
  6.308132805635514,
  6.247735060430845,
  6.187669580036104,
  6.274610529668155,
  6.135942042181701,
  6.164134275072002,
  6.155083573418517],
 'accuracy': [0.5628227,
  0.5748709,
  0.59380376,
  0.5748709,
  0.58003443,
  0.5697074,
  0.57831323,
  0.5834768,
  0.5886403,
  0.5834768,
  0.5869191,
  0.58003443],
 'recall_positives': [0.49122804,
  0.47368416,
  0.5,
  0.48245612,
  0.47368416,
  0.49122804,
  0.47368416,
  0.46491227,
  0.48245612,
  0.4649122,
  0.47368422,
  0.44736847],
 'recall_negatives': [0.58,
  0.59938586,
  0.6164035,
  0.59719294,
  0.6059649,
  0.588772,
  0.6035965,
  0.6120176,
  0.6142983,
  0.612193,
  0.6142983,
  0.6120176],
 'precision_positives': [0.22428645,
  0.22784775,
  0.24540545,
  0.22996561,
  0.22892281,
  0.23117366,
  0.23320274,
  0.23414735,
  0.24406579,
  0.23490594,
  0.23788176,
  0.22662668],
 'precision_negatives