# Predict for different cities

In this notebook we will use the best network trained in the city of Aleppo to compute predictions of destruction for other cities.

In [1]:
from math import ceil
from time import time
import pandas as pd
import logging
from functools import reduce
from sklearn.model_selection import KFold
import tensorflow as tf
from tensorflow.data import Dataset
from tensorflow.keras.models import load_model
import os
from sklearn.metrics import classification_report
import numpy as np
import glob

from damage.models import CNN
from damage.data import DataStream, load_experiment_results

In [5]:
cities = ['deir']

# Load experiments
EXPERIMENTS_PATH = '../logs/experiments/'
experiment_results = load_experiment_results(EXPERIMENTS_PATH)
    
# Choose best model
Model = CNN
available_models = [m.split('_')[1].split('.')[0] for m in os.listdir('../logs/models')]
experiment_results = experiment_results.loc[
    (experiment_results['model'] == str(Model))
    & (experiment_results['id'].isin(available_models))
]
experiment_results['val_precision_positives_last_epoch'] = experiment_results['val_precision_positives']\
    .apply(lambda x: np.nan if isinstance(x, float) else x[-1])
experiment_results['val_recall_positives_last_epoch'] = experiment_results['val_recall_positives']\
    .apply(lambda x: np.nan if isinstance(x, float) else x[-1])
experiment_results = experiment_results.loc[
    experiment_results['val_recall_positives_last_epoch'] > 0.5
]
best_experiment = experiment_results.loc[experiment_results['val_precision_positives_last_epoch'].idxmax()]
space, identifier = best_experiment['space'], best_experiment['id']
# Try to load best model
try:
    print('Loading model {}'.format(identifier))
    model = load_model('../logs/models/model_{}.h5'.format(identifier))
    print('Model loaded')
except Exception as e:
    raise e('Error loading model')
for city in cities:
    # Load features
    features = pd.read_pickle('../logs/features/features_{}.p'.format(city)).dropna(subset=['destroyed'])
    
    # Generate test dataset
    test_generator = DataStream._get_index_generator(features, space['batch_size'], KFold)
    num_batches_test = len(test_generator)
    test_generator = DataStream.get_test_data_generator_from_index(features['image'], test_generator)
    test_dataset = Dataset.from_generator(lambda: test_generator, tf.float32)
    # Predict
    print('Generating predictions')
    predictions = model.predict_generator(test_dataset, steps=num_batches_test)
    predictions = pd.DataFrame({
        'prediction': predictions.reshape(-1),
    }, index=features.index)
    file_name = '{}/prediction_{}_{}.p'.format('../logs/predictions',city, round(time()))
    predictions.to_pickle(file_name)
    print('Store predictions on file: {}'.format(file_name))

Loading model 1563900369


W0731 16:29:03.274266 4505814464 hdf5_format.py:266] Sequential models without an `input_shape` passed to the first layer cannot reload their optimizer state. As a result, your model isstarting with a freshly initialized optimizer.


Model loaded


ValueError: k-fold cross-validation requires at least one train/test split by setting n_splits=2 or more, got n_splits=0.

In [8]:
features = pd.read_pickle('../logs/features/features_{}.p'.format(city))
features

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,annotation_date,damage_num,destroyed,is_in_no_analysis?,latitude,longitude,image
city,patch_id,date,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
deir-ez-zor,32-32,2016-05-25,,,,False,35.360696,40.103019,"[[[148, 150, 156, 222, 231, 239], [156, 154, 1..."
deir-ez-zor,32-96,2016-05-25,,,,False,35.360353,40.103019,"[[[33, 24, 25, 41, 57, 49], [49, 36, 33, 49, 6..."
deir-ez-zor,32-160,2016-05-25,,,,True,35.360009,40.103019,"[[[165, 158, 156, 148, 134, 132], [165, 158, 1..."
deir-ez-zor,32-224,2016-05-25,,,,True,35.359666,40.103019,"[[[165, 162, 165, 99, 101, 99], [148, 146, 148..."
deir-ez-zor,32-288,2016-05-25,,,,True,35.359323,40.103019,"[[[156, 150, 156, 33, 36, 33], [156, 154, 156,..."
deir-ez-zor,32-352,2016-05-25,,,,True,35.358979,40.103019,"[[[99, 81, 90, 0, 0, 0], [99, 81, 90, 0, 0, 0]..."
deir-ez-zor,32-416,2016-05-25,,,,True,35.358636,40.103019,"[[[90, 73, 82, 115, 113, 99], [82, 65, 74, 123..."
deir-ez-zor,32-480,2016-05-25,,,,True,35.358293,40.103019,"[[[107, 93, 99, 41, 36, 33], [99, 89, 99, 49, ..."
deir-ez-zor,32-544,2016-05-25,,,,True,35.357949,40.103019,"[[[148, 138, 140, 41, 53, 33], [148, 142, 140,..."
deir-ez-zor,32-608,2016-05-25,,,,True,35.357606,40.103019,"[[[132, 121, 123, 82, 77, 82], [140, 134, 132,..."


In [None]:
for city in cities:
    # Load predictions (last)
    file_path = glob.glob('../logs/predictions/prediction_{}*'.format(city))[-1]
    results = pd.read_pickle(file_path)
    # Load target
    target_file_path = glob.glob('../logs/features/target_features_{}*'.format(city))[-1]
    target = pd.read_pickle(target_file_path).dropna(subset=['destroyed'])

    # Compute best threshold
    differences = []
    for elem in np.arange(0, 30):
        binary_predictions = (results['prediction'] > elem) * 1
        difference = binary_predictions.mean() - target['destroyed'].mean()
        differences.append(np.abs(difference))

    best_threshold = np.arange(0,30)[np.argmin(differences)]
    # Add prediction column (binary format)
    results['prediction_binary'] = (results['prediction'] > best_threshold) * 1
    print(city)
    print(classification_report(target['destroyed'], results['prediction_binary']))
    print('\n')