# Tutorial 02: Making predictions

In this notebook we learn how to make predictions using the components explained in the previous tutorials.

In [1]:
from math import ceil
from time import time
import pandas as pd
import logging
from functools import reduce
from sklearn.model_selection import KFold
import tensorflow as tf
from tensorflow.data import Dataset
from tensorflow.keras.models import load_model


from damage.models import CNN
from damage.data import DataStream, load_experiment_results

First, we load the features generated in the first notebook and the experiment results generated when validating the models.

In [2]:
# Load the pickled feature table and keep only the rows that have a
# `destroyed` value (rows where it is missing are dropped).
features = (
    pd.read_pickle('../logs/features/example_daraa.p')
    .dropna(subset=['destroyed'])
)
features.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,annotation_date,damage_num,destroyed,latitude,longitude,image
city,patch_id,date,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
daraa,10080-8224,2017-02-07,2016-04-19,2.0,0.0,32.616861,36.122191,"[[[99, 85, 74, 90, 65, 58], [99, 81, 74, 90, 6..."
daraa,10208-8288,2017-02-07,2016-04-19,1.0,0.0,32.616517,36.122878,"[[[123, 125, 123, 90, 61, 58], [107, 113, 115,..."
daraa,10272-4768,2017-02-07,2016-04-19,1.0,0.0,32.6354,36.123221,"[[[99, 73, 66, 41, 40, 41], [99, 73, 66, 41, 4..."
daraa,10336-7904,2017-02-07,2016-04-19,3.0,1.0,32.618577,36.123565,"[[[181, 190, 197, 132, 89, 74], [181, 186, 189..."
daraa,10400-8160,2017-02-07,2016-04-19,2.0,0.0,32.617204,36.123908,"[[[99, 97, 99, 49, 45, 41], [82, 85, 82, 49, 4..."


In [3]:
# Directory that holds the experiment result files produced during model validation.
EXPERIMENTS_PATH = '../logs/experiments/'
# Gather the stored experiment results into a single table
# (a DataFrame, judging by the `.head()` / `.loc` usage in this notebook).
experiment_results = load_experiment_results(EXPERIMENTS_PATH)
experiment_results.head()

Unnamed: 0,accuracy,false_negatives,false_positives,features,id,loss,model,name,negatives,num_batches_test,...,val_false_positives,val_loss,val_negatives,val_positives,val_precision_negatives,val_precision_positives,val_recall_negatives,val_recall_positives,val_true_negatives,val_true_positives
0,"[0.4729064, 0.47947454, 0.500821, 0.5123153, 0...","[3.125, 2.9166667, 2.75, 2.8333333, 2.7083333,...","[10.25, 10.291667, 9.916667, 9.541667, 10.2083...",test_daraa.p,1563893402,"[3.2779477617423525, 3.1167680101441633, 3.070...",<class 'damage.models.cnn.CNN'>,experiment_1563893402.json,"[19.541666, 19.541666, 19.541666, 19.541666, 1...",10,...,"[21.1, 20.7, 8.1, 21.1, 19.7, 9.0, 0.2]","[12.996690273284912, 12.751161861419678, 6.048...","[21.1, 21.1, 21.1, 21.1, 21.1, 21.1, 21.1]","[3.8, 3.8, 3.8, 3.8, 3.8, 3.8, 3.8]","[0.0, 0.39999995, 0.8420224, 0.0, 0.6833333, 0...","[0.15238461, 0.1548768, 0.13761905, 0.15238461...","[0.0, 0.018831167, 0.61601734, 0.0, 0.06623377...","[1.0, 1.0, 0.35833332, 1.0, 0.925, 0.43333334,...","[0.0, 0.4, 13.0, 0.0, 1.4, 12.1, 20.9]","[3.8, 3.8, 1.3, 3.8, 3.5, 1.6, 0.0]"
1,"[0.4664372, 0.4698795, 0.5043029, 0.4870912, 0...","[2.8333333, 3.0555556, 3.1666667, 2.8333333, 2...","[14.388889, 14.055555, 12.833333, 13.722222, 1...",test_daraa.p,1563900369,"[7.747854771999812, 8.02643360738705, 7.339660...",<class 'damage.models.cnn.CNN'>,experiment_1563900369.json,"[26.944445, 26.944445, 26.944445, 26.944445, 2...",8,...,"[24.375, 24.125, 0.0, 24.375, 24.375, 21.25]","[12.010393381118774, 11.885567784309387, 3.342...","[24.375, 24.375, 24.375, 24.375, 24.375, 24.375]","[6.75, 6.75, 6.75, 6.75, 6.75, 6.75]","[0.0, 0.24999997, 0.78329134, 0.0, 0.0, 0.6500...","[0.21670869, 0.21851161, 0.0, 0.21033268, 0.21...","[0.0, 0.010416667, 1.0, 0.0, 0.0, 0.12770835]","[1.0, 1.0, 0.0, 0.96428573, 1.0, 0.77678573]","[0.0, 0.25, 24.375, 0.0, 0.0, 3.125]","[6.75, 6.75, 0.0, 6.5, 6.75, 5.25]"


In [4]:
# List every column available in the results table.
# The `val_`-prefixed columns presumably hold the validation metrics — TODO confirm.
experiment_results.columns

Index(['accuracy', 'false_negatives', 'false_positives', 'features', 'id',
       'loss', 'model', 'name', 'negatives', 'num_batches_test',
       'num_batches_train', 'positives', 'precision_negatives',
       'precision_positives', 'recall_negatives', 'recall_positives', 'space',
       'true_negatives', 'true_positives', 'val_accuracy',
       'val_false_negatives', 'val_false_positives', 'val_loss',
       'val_negatives', 'val_positives', 'val_precision_negatives',
       'val_precision_positives', 'val_recall_negatives',
       'val_recall_positives', 'val_true_negatives', 'val_true_positives'],
      dtype='object')

In [5]:
# Inspect a single experiment (row label 1): its `val_recall_positives` entry is a
# list of values, presumably one per training epoch — TODO confirm.
experiment_results.loc[1,'val_recall_positives']

[1.0, 1.0, 0.0, 0.96428573, 1.0, 0.77678573]

Now we choose which experiment to use according to some logic (e.g. best results, last experiment...). In this case, we simply take the most recent experiment, which we can find using the experiment `id` column (the timestamp at which the experiment was generated).

In [6]:
# Model class whose experiments we want to reuse.
Model = CNN

# The `model` column is compared against the string form of the class,
# so the same conversion is applied to `Model` here.
is_selected_model = experiment_results['model'] == str(Model)
experiment_results_single_model = experiment_results.loc[is_selected_model]

# The row with the largest `id` is taken as the experiment to use;
# read the hyperparameter space stored for it.
latest_experiment_label = experiment_results_single_model['id'].idxmax()
space = experiment_results_single_model.loc[latest_experiment_label, 'space']
space

{'dense_units': 128,
 'batch_size': 33,
 'convolutional_layers': [{'kernel_size': [7, 7],
   'pool_size': [6, 6],
   'filters': 32,
   'dropout': 0.33333333333333337,
   'activation': 'relu'},
  {'kernel_size': [7, 7],
   'pool_size': [6, 6],
   'filters': 64,
   'dropout': 0.33333333333333337,
   'activation': 'relu'},
  {'kernel_size': [7, 7],
   'pool_size': [6, 6],
   'filters': 128,
   'dropout': 0.33333333333333337,
   'activation': 'relu'}],
 'epochs': 6,
 'layer_type': 'cnn',
 'class_weight': 1.15,
 'learning_rate': 0.0017575106248547913}

In [7]:
# Id of the same experiment (largest `id` among the selected model's runs);
# it is used below to locate the saved model file.
row_label_of_latest = experiment_results_single_model['id'].idxmax()
identifier = experiment_results_single_model.loc[row_label_of_latest, 'id']

In [8]:
# Load the Keras model that was saved for the selected experiment.
# The file is expected at ../logs/models/model_<identifier>.h5.
print('Loading model {}'.format(identifier))
print('With space {}'.format(space))
try:
    # Only the load itself is guarded, so unrelated errors are not relabelled.
    model = load_model('../logs/models/model_{}.h5'.format(identifier))
except Exception as e:
    # `e` is an exception *instance* and cannot be called, so `raise e('...')`
    # would itself fail with a TypeError and hide the real problem.
    # Raise a descriptive error chained to the root cause instead.
    raise RuntimeError('Error loading model') from e
print('Model loaded')

Loading model 1563900369
With space {'dense_units': 128, 'batch_size': 33, 'convolutional_layers': [{'kernel_size': [7, 7], 'pool_size': [6, 6], 'filters': 32, 'dropout': 0.33333333333333337, 'activation': 'relu'}, {'kernel_size': [7, 7], 'pool_size': [6, 6], 'filters': 64, 'dropout': 0.33333333333333337, 'activation': 'relu'}, {'kernel_size': [7, 7], 'pool_size': [6, 6], 'filters': 128, 'dropout': 0.33333333333333337, 'activation': 'relu'}], 'epochs': 6, 'layer_type': 'cnn', 'class_weight': 1.15, 'learning_rate': 0.0017575106248547913}


W0723 19:37:16.603396 4583605696 hdf5_format.py:266] Sequential models without an `input_shape` passed to the first layer cannot reload their optimizer state. As a result, your model isstarting with a freshly initialized optimizer.


Model loaded


In [9]:
# Build the batched input pipeline used for prediction.
# NOTE(review): `_get_index_generator` is a private DataStream helper; it is called
# here with the feature table, the batch size from `space`, and the KFold splitter class.
test_index_batches = DataStream._get_index_generator(features, space['batch_size'], KFold)

# One prediction step per index batch.
num_batches_test = len(test_index_batches)

# Turn the index batches into batches of images only (no labels are passed).
test_generator = DataStream.get_test_data_generator_from_index(
    features['image'],
    test_index_batches,
)

# `from_generator` needs a callable. The lambda hands back the same object every time,
# so, assuming it is a plain Python generator, the dataset can only be consumed once;
# re-run this cell before predicting again.
test_dataset = Dataset.from_generator(lambda: test_generator, tf.float32)

W0723 19:37:21.614492 4583605696 deprecation.py:323] From /Users/jordi/anaconda3/lib/python3.7/site-packages/tensorflow/python/data/ops/dataset_ops.py:410: py_func (from tensorflow.python.ops.script_ops) is deprecated and will be removed in a future version.
Instructions for updating:
tf.py_func is deprecated in TF V2. Instead, there are two
    options available in V2.
    - tf.py_function takes a python function which manipulates tf eager
    tensors instead of numpy arrays. It's easy to convert a tf eager tensor to
    an ndarray (just call tensor.numpy()) but having access to eager tensors
    means `tf.py_function`s can use accelerators such as GPUs as well as
    being differentiable using a gradient tape.
    - tf.numpy_function maintains the semantics of the deprecated tf.py_func
    (it is not differentiable, and manipulates numpy arrays). It drops the
    stateful argument making all functions stateful.
    


In [12]:
# Predict
print('Generating predictions')
raw_predictions = model.predict_generator(test_dataset, steps=num_batches_test)

# Flatten the model output to one value per row and label it with the feature index.
# Assumes the batches come back in the same order as the rows of `features` — TODO confirm.
predictions = pd.DataFrame(
    {'prediction': raw_predictions.reshape(-1)},
    index=features.index,
)
predictions.head()

Generating predictions


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,prediction
city,patch_id,date,Unnamed: 3_level_1
daraa,10080-8224,2017-02-07,3.09752
daraa,10208-8288,2017-02-07,7.96073
daraa,10272-4768,2017-02-07,7.992837
daraa,10336-7904,2017-02-07,0.0
daraa,10400-8160,2017-02-07,1.887757


In [13]:
# Persist the predictions under a file name that carries the current Unix time
# (rounded to whole seconds), so successive runs do not overwrite each other.
RESULTS_PATH = '../logs/predictions'
timestamp = round(time())
file_name = '{}/prediction_{}.p'.format(RESULTS_PATH, timestamp)
predictions.to_pickle(file_name)
print('Store predictions on file: {}'.format(file_name))

Store predictions on file: ../logs/predictions/prediction_1563904454.p


In [12]:
# NOTE(review): disabled cell kept for reference — every line below is commented out.
# It sketches an alternative flow: split the data by patch id, fit a fresh
# `Model(**space)`, then predict on the held-out indices.
# `data_stream` is not defined in the cells visible in this notebook, so it would
# have to be created before this cell could be re-enabled.
# train_index_generator, test_index_generator = data_stream.split_by_patch_id(features['image'], features['destroyed'])
# train_generator = data_stream.get_data_generator_from_index([features['image'], features['destroyed']],
#                                                             train_index_generator)
# test_indices = list(test_index_generator)
# test_generator = data_stream.get_data_generator_from_index([features['image']], test_indices)

# num_batches = ceil(len(features) / space['batch_size'])
# model = Model(**space)
# model.fit_generator(train_generator,
#                     steps_per_epoch=num_batches,
#                     validation_steps=1,
#                     **space)

# predictions = model.predict_generator(test_generator, steps=len(test_indices))
# predictions = pd.DataFrame({
#     'prediction': predictions[:, 1],
# }, index=reduce(lambda l, r: l.union(r), test_indices))

In [13]:
# NOTE(review): disabled cell kept for reference — every line below is commented out.
# It would write `predictions` to a fixed file name (prediction_test.p) instead of a
# timestamped one; presumably paired with the disabled training cell — TODO confirm.
# RESULTS_PATH = '../logs/predictions'
# file_name = '{}/prediction_test.p'.format(RESULTS_PATH)
# predictions.to_pickle(file_name)
# print('Stored predictions on file: {}'.format(file_name))