# TensorFlow modeling experiments
---

Notebook for initial experiments on modeling deforestation with TensorFlow, using the [Planet: Understanding the Amazon from Space](https://www.kaggle.com/c/planet-understanding-the-amazon-from-space/) dataset.

## Setup

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow_addons.metrics import FBetaScore
from tensorflow.keras import losses, optimizers, metrics
from tensorflow.data.experimental import AUTOTUNE
from tqdm.keras import TqdmCallback
from ipywidgets import interact
from functools import partial

In [None]:
# Make the project's helper modules importable from this notebook.
# `sys` must be imported here: the setup cell above does not import it.
import sys

for module_dir in ('../data/', '../modeling/'):
    # Skip directories already registered, so re-running the cell does not duplicate entries
    if module_dir not in sys.path:
        sys.path.append(module_dir)
import data_utils
from models import ResNet, data_augmentation

In [None]:
# Default configuration; the selection widget defined below may overwrite
# model_type, task and pretrain_dataset
model_type = 'resnet50'
task = 'orig_labels'
# Only the pretrained variant has a pretraining dataset, mirroring the rule in choose_model_and_task
pretrain_dataset = 'bigearthnet' if model_type == 'pretrained_resnet50' else None
batch_size = 32

In [None]:
@interact
def choose_model_and_task(chosen_model_type=['resnet50', 'pretrained_resnet50'], chosen_task=['orig_labels', 'deforestation']):
    """Copy the selected model type and task into the notebook-level globals.

    The list defaults are the options handed to `interact`. Besides
    `model_type` and `task`, the global `pretrain_dataset` is updated:
    'bigearthnet' for the pretrained model, `None` otherwise.
    """
    global model_type, task, pretrain_dataset
    model_type = chosen_model_type
    task = chosen_task
    # Alternative pretraining source kept for reference: 'imagenet'
    wants_pretrained = chosen_model_type == 'pretrained_resnet50'
    pretrain_dataset = 'bigearthnet' if wants_pretrained else None

## Load the data

### Create a dataset

In [None]:
# Read the labels CSV and pass it through the project's tag-encoding helper
labels_csv_path = data_utils.DATA_PATH + data_utils.LABELS_PATH
raw_labels_df = pd.read_csv(labels_csv_path)
labels_df = data_utils.encode_tags(raw_labels_df, drop_tags_col=True)
if task == 'deforestation':
    # Binary task: keep only the image name and the added 'deforestation' column
    with_target_df = data_utils.add_deforestation_label(labels_df)
    labels_df = with_target_df[['image_name', 'deforestation']]
labels_df

In [None]:
# Zero-argument wrapper around the sample generator: the dataframe is taken
# from the notebook's `labels_df`, so that the generator has no required arguments
def data_gen():
    """Yield every sample produced by `data_utils.get_amazon_sample(labels_df)`."""
    yield from data_utils.get_amazon_sample(labels_df)

In [None]:
# Length of the label vector: 1 for the binary deforestation task, one entry per tag otherwise
labels_shape = 1 if task == 'deforestation' else len(data_utils.TAGS)
# Wrap the Python generator in a tf.data pipeline with an explicit element signature
dataset = tf.data.Dataset.from_generator(
    data_gen,
    output_signature=(
        tf.TensorSpec(shape=(256, 256, 3), dtype=tf.float16),
        # The trailing comma makes this an explicit 1-D shape; `(labels_shape)` is just a parenthesised int
        tf.TensorSpec(shape=(labels_shape,), dtype=tf.uint8),
    ),
)

In [None]:
# Pull a single element from the dataset to check that the generator and its signature work
next(iter(dataset))

Alternative dataset creation that follows the typical TensorFlow image-loading approach, but is slower:

In [None]:
# file_names = [f"{data_utils.DATA_PATH}{data_utils.IMG_PATH}{image_name}.jpg" for image_name in labels_df.image_name.tolist()]
# if task == 'deforestation':
#     labels = labels_df['deforestation'].tolist()
# else:
#     labels = labels_df[data_utils.TAGS].values

In [None]:
# file_names[0]

In [None]:
# dataset = tf.data.Dataset.from_tensor_slices((file_names, labels)).map(data_utils.decode_img, num_parallel_calls=AUTOTUNE)

In [None]:
# next(iter(dataset))

### Split into train, validation and test sets

In [None]:
# Total number of labelled samples (one row per image in the labels table)
n_samples = labels_df.shape[0]
n_samples

In [None]:
# Sizes of the leading (train + validation) portion and of the validation portion
n_train_val = int(0.9 * n_samples)
n_val = int(0.1 * n_samples)
# The last 10% of the samples form the test set
test_set = dataset.skip(n_train_val)
train_val_set = dataset.take(n_train_val)
# Within the leading 90%, the first `n_val` samples are for validation and the rest for training
val_set = train_val_set.take(n_val)
train_set = train_val_set.skip(n_val)

In [None]:
def _prepare_split(split):
    """Cache, shuffle (buffer of 1000), batch and prefetch one dataset split."""
    cached = split.cache()
    shuffled = cached.shuffle(buffer_size=1000)
    batched = shuffled.batch(batch_size)
    return batched.prefetch(AUTOTUNE)


# Every split goes through the same input pipeline
train_set = _prepare_split(train_set)
val_set = _prepare_split(val_set)
test_set = _prepare_split(test_set)

In [None]:
# Cache the taken batch: `train_set` is shuffled, so without the cache every
# pass over `take(1)` could draw a different first batch instead of repeating
# the same one, which defeats the single-batch overfitting check
single_batch = train_set.take(1).cache()

## Train models

### Set the modeling configuration

In [None]:
# Build the ResNet for the selected task and pretraining dataset, with max pooling
resnet_config = {
    'pretrain_dataset': pretrain_dataset,
    'pooling': 'max',
    'task': task,
}
model = ResNet(**resnet_config)

In [None]:
# model.build(input_shape=(None, 256, 256, 3))
# model.summary()

In [None]:
# Freeze the core network whenever a pretraining dataset is set; otherwise leave it untouched
has_pretrained_core = pretrain_dataset is not None
if has_pretrained_core:
    model.core.trainable = False

In [None]:
# Optimiser: Adam with a fixed learning rate
lr = 0.003
opt = optimizers.Adam(learning_rate=lr)
# The same string loss is used for both tasks.
# Alternative kept for reference: losses.CategoricalCrossentropy() for 'orig_labels'
# and losses.BinaryCrossentropy() otherwise (optionally with from_logits=True).
loss = 'binary_crossentropy'
# NOTE(review): no `threshold` is passed, so FBetaScore presumably binarises
# predictions via argmax — confirm that this suits the multi-label task
f2_score = FBetaScore(num_classes=model.n_outputs, average='macro', beta=2.0)
model_metrics = ['accuracy', f2_score]

### Test a model

Overfit the model on a single batch of the classification task to confirm that it is able to learn.

Train on a single batch:

In [None]:
# Attach the optimiser, loss and metrics configured above to the model
model.compile(optimizer=opt, loss=loss, metrics=model_metrics)

In [None]:
# Train for 100 epochs on `single_batch`; Keras' own logging is turned off
# (verbose=0) and a TqdmCallback is attached instead
model.fit(single_batch, epochs=100, verbose=0, callbacks=[TqdmCallback()])

Manually test each step of the model:

In [None]:
# Fetch one batch from `single_batch` for the manual walk-through below
batch_data = next(iter(single_batch))

In [None]:
# Display the fetched batch (a pair of image and label tensors, per the dataset signature)
batch_data

In [None]:
# First element of the batch: the images
x = batch_data[0]

In [None]:
# Display the batch inputs
x

In [None]:
# Second element of the batch: the labels
y = batch_data[1]

In [None]:
# Display the batch labels
y

In [None]:
# Forward pass of the whole model on the batch inputs
y_pred = model(x)

In [None]:
# Display the model outputs for the batch
y_pred

In [None]:
# Sum of the predicted scores along axis 1, i.e. one total per sample
# (presumably to check whether the outputs are normalised — confirm)
np.sum(y_pred, axis=1)

In [None]:
# Binarise the predictions: the cut-off is 1 / (number of tags) for the
# original labels task and 0.5 otherwise
cutoff = (1 / len(data_utils.TAGS)) if task == 'orig_labels' else 0.5
y_ohe = tf.cast(y_pred > cutoff, tf.uint8)
y_ohe

In [None]:
# Element-wise comparison between the binarised predictions and the labels
y_ohe == y

In [None]:
# Run only the model's input preprocessing step on the batch inputs
x_proc = model.preprocess_input(x)
x_proc

In [None]:
# Minimum value of the preprocessed inputs
np.min(x_proc)

In [None]:
# Maximum value of the preprocessed inputs
np.max(x_proc)

In [None]:
# Mean of the preprocessed inputs
np.mean(x_proc)

In [None]:
# Standard deviation of the preprocessed inputs
np.std(x_proc)

In [None]:
# Apply the `data_augmentation` object imported from `models` to the preprocessed inputs
x_aug = data_augmentation(x_proc)
x_aug

In [None]:
# Feed the augmented inputs through the model's core network
x_core = model.core(x_aug)
x_core

In [None]:
# Sum of the core outputs along axis 1, i.e. one total per sample
np.sum(x_core, axis=1)

In [None]:
# Pass the core outputs through the model's classifier; this overwrites the earlier `y_pred`
y_pred = model.classifier(x_core)
y_pred

In [None]:
# model.fit(x, y, epochs=100, verbose=0, callbacks=[TqdmCallback()])