# Whales and Dolphin ID




In [None]:
import gc
import glob
import os
from numpy import save, load
from random import seed
from os import listdir
from shutil import copyfile
from random import seed
from random import random

import joblib
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from contextlib import contextmanager
from multiprocessing.pool import ThreadPool, Pool
from joblib import Parallel, delayed
import time

from skimage.io import imread
from skimage.transform import resize
from skimage.color import rgb2gray, gray2rgb
import matplotlib.pyplot as plt

seed(2022)
CPUS = 4 # kaggle default - https://www.kaggle.com/product-feedback/64106
HAPPYWHALE_INPUT_DIR = "/kaggle/input/happy-whale-and-dolphin"
# These are parameters to determine how to sample the initial training dataset.
# Environment variables always arrive as strings, so they are converted to int
# here; otherwise the later `count > threshold` comparison and `[:limit]` slice
# would fail as soon as one of the variables is actually set.
_max_individuals_env = os.environ.get('MAX_INDIVIDUALS_PER_SPECIES')
MAX_INDIVIDUALS_PER_SPECIES = int(_max_individuals_env) if _max_individuals_env else None # none to include all
MAX_IMAGES_PER_INDIVIDUAL_THRESHOLD = int(os.environ.get('MAX_IMAGES_PER_INDIVIDUAL_THRESHOLD', 0)) # 0 to include all
MAX_SAMPLE_DATA_SIZE = 37000 # change this to determine max images to include in training
WORKING_DIR = "/kaggle/working"
HEIGHT = 256
WIDTH = 256
VALIDATION_RATIO = .25 # used to split training and validation dataset
IMARRAY_SIZE = (HEIGHT,WIDTH)

EPOCHS = 10

%matplotlib inline

In [None]:


def convert_image_to_ndarray(img_path, output_size=IMARRAY_SIZE, color='normalize_gray'):
    '''
    Reads `img_path` and converts it to a numpy array of size `output_size`.

    Supported options for `color`:
    - 'normalize_gray': convert to grayscale and divide by the maximum pixel value
    - 'raw_rgb': return the resized image without any further processing

    Returns
    1. the resized (and, for 'normalize_gray', normalized) image
    2. the aspect ratio (width / height) of the original image

    If `output_size` is None, the image is returned exactly as read and on its
    own (no aspect ratio); `color` is ignored in that case.

    Raises ValueError for an unsupported `color`, or when the resized array is
    neither 2- nor 3-dimensional.
    '''
    # read input
    im = imread(img_path)
    if output_size is None:
        return im
    if color not in ('normalize_gray', 'raw_rgb'):
        # previously an unknown option silently returned None
        raise ValueError(
            "Unsupported color option {!r}; expected 'normalize_gray' or 'raw_rgb'".format(color))

    aspect_ratio = float(im.shape[1] / im.shape[0])
    resized_im = resize(im, output_size)
    if color == 'raw_rgb':
        return resized_im, aspect_ratio

    if resized_im.ndim == 2:
        # a 2-dimensional array has no channel axis: it is already grayscale
        gray_img = resized_im
    elif resized_im.ndim == 3:
        # a 3-dimensional array carries a channel axis and needs converting
        gray_img = rgb2gray(resized_im)
    else:
        raise ValueError(
            'Cannot convert an image with {} dimensions to grayscale: {}'.format(
                resized_im.ndim, img_path))

    peak = np.max(gray_img)
    if peak == 0:
        # an all-black image would otherwise turn into NaNs (0 / 0)
        return gray_img, aspect_ratio
    return gray_img / peak, aspect_ratio
        
def convert_ndarray_img_to_maintain_aspect_ratio(ndarray_img, ar):
    '''
    Resize `ndarray_img` so that its width becomes `ar` times its current
    width while its height is kept.

    `ar` is the aspect ratio, i.e. width / height.
    Returns the resized array.
    '''
    # Array dimensions must be whole numbers, so round the scaled width
    # (and never let it collapse to zero columns).
    new_width = max(1, int(round(ndarray_img.shape[1] * ar)))
    new_ar_shape = (ndarray_img.shape[0], new_width)
    return resize(ndarray_img, new_ar_shape)

@contextmanager
def timer(name):
    '''
    Context manager that prints how long the wrapped block took.
    `name` is used as the prefix of the printed message.
    '''
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    print('{0} done in {1:.3f} seconds.'.format(name, elapsed))

def split_df(df, num_splits, log=False):
    '''
    Cut `df` row-wise into `num_splits` consecutive, near-equal parts.
    Returns the parts as a list of dataframes, in row order.
    '''
    boundaries = np.linspace(0, df.shape[0], num_splits+1).astype(int)
    if log:
        print('Split into {} parts'.format(num_splits))
        print('Row splits:\n{}'.format(boundaries))

    # pair every boundary with the next one to get the [start, stop) row ranges
    return [
        df.iloc[start:stop]
        for start, stop in zip(boundaries[:-1], boundaries[1:])
    ]

def split_arr(arr, num_splits, log=False):
    '''
    Cut the sequence `arr` into `num_splits` consecutive, near-equal slices.
    Returns the slices as a list, in order.
    '''
    # Size the splits from `arr` itself (the previous version read an
    # unrelated `df` name that is not a parameter of this function).
    rows_splits = np.linspace(0, len(arr), num_splits+1).astype(int)
    if log:
        print('Split into {} parts'.format(num_splits))
        print('Row splits:\n{}'.format(rows_splits))

    return [
        arr[start:stop]
        for start, stop in zip(rows_splits[:-1], rows_splits[1:])
    ]

# Helper to construct the full path for a given image.
# This is intended to be used on the `image` column in train.csv
def get_img_path(img):
    '''Return `img` joined onto the competition input directory.'''
    return os.path.join(HAPPYWHALE_INPUT_DIR, img)


## Sample data

Sample (select) the images listed in the challenge’s `train.csv` that will be used for training.

In [None]:

train_csv_fn = "train.csv"
# order by image jpg file and reset index to have deterministic index
train_df = pd.read_csv(os.path.join(HAPPYWHALE_INPUT_DIR, train_csv_fn)).sort_values('image').reset_index()

# Map misspelled / alternate species labels onto a single label each.
# The result is assigned back to the column: an in-place replace on the
# attribute-accessed column (`train_df.species.replace(..., inplace=True)`)
# is a chained update that is not guaranteed to write through to `train_df`.
train_df['species'] = train_df['species'].replace({"globis": "short_finned_pilot_whale",
                                                   "pilot_whale": "short_finned_pilot_whale",
                                                   "kiler_whale": "killer_whale",
                                                   "bottlenose_dolpin": "bottlenose_dolphin"})

In [None]:

with timer('Sampling data: '):
    # number of images for every (species, individual_id) pair
    pair_counts = train_df[['species', 'individual_id']].value_counts()
    individual_ids_by_species = pd.DataFrame(pair_counts).rename({0:'count'}, axis=1)
    # keep only the pairs with more images than the threshold
    above_threshold = individual_ids_by_species['count']>MAX_IMAGES_PER_INDIVIDUAL_THRESHOLD
    subsetted_individuals = individual_ids_by_species[above_threshold].reset_index()
    # for each kept individual take its first MAX_INDIVIDUALS_PER_SPECIES rows (all rows when None)
    sampled_data = [
        train_df[train_df['individual_id']==iid][:MAX_INDIVIDUALS_PER_SPECIES]
        for iid in subsetted_individuals.individual_id
    ]
    sampled_data_df = pd.concat(sampled_data)

In [None]:
# number of distinct individuals that made it into the sample
len(sampled_data_df.individual_id.unique())

In [None]:
import seaborn as sns

In [None]:
# histogram of the number of images per (species, individual_id) pair
sns.histplot(
    pd.DataFrame(
        train_df[['species', 'individual_id']].value_counts()
    ).rename(columns={0: 'count'})
)

In [None]:
# image counts per (species, individual_id) pair as a flat table
c = (
    pd.DataFrame(train_df[['species', 'individual_id']].value_counts())
    .rename(columns={0: 'count'})
    .reset_index()
)

In [None]:
# sns.barplot(c['individual_id'], c['count'])
c['count'].describe()
c
# NOTE(review): 15587 is a hard-coded row count - presumably the number of
# individuals in the full dataset; confirm it still matches len(c).
first_quarter_counts = c['count'][:int(.25*15587)]
sns.histplot(first_quarter_counts)

## Set up directories and datasets

Set up the working directory such that folder names correspond to species class and contents are the images of that species.

In [None]:
def setup_directories(unique_species, log=False):
    '''
    Create one folder per species under both the `train` and the `test`
    subdirectory of WORKING_DIR. Folders that already exist are left alone.
    '''
    for split_name in ('train', 'test'):
        for species_name in unique_species:
            # label subdirectory for this dataset split
            target_dir = "{}/{}/{}".format(WORKING_DIR, split_name, species_name)
            if log:
                print('Creating: ', target_dir)
            os.makedirs(target_dir, exist_ok=True)

def setup_datasets(df, val_ratio=VALIDATION_RATIO, file_type=".jpg", log=False):
    '''
    Set up the working directory such that
    folder names correspond to species class and
    contents are the images of that species.

    Every row of `df` (columns `species` and `image`) is written to either
    the `test` folder (with probability `val_ratio`) or the `train` folder.

    `file_type` selects the output format:
    - '.jpg': the source image is copied unchanged
    - '.npy': the image is converted to a resized RGB array and saved with numpy

    Images that fail to convert or copy are reported and skipped.
    Returns the number of files that were written.
    Raises ValueError for an unsupported `file_type`.
    '''
    if file_type not in ('.npy', '.jpg'):
        # previously nothing was written but every row was still counted as saved
        raise ValueError(
            "Unsupported file_type {!r}; expected '.npy' or '.jpg'".format(file_type))

    num_saved_files = 0
    for _, row in df.iterrows():
        label = row['species']
        image = row['image']
        src_img_path = get_img_path('train_images/{}'.format(image))
        # strip only the final extension so dotted file names stay distinct
        output_fn = os.path.splitext(image)[0] + file_type
        dataset = 'test' if random() < val_ratio else 'train'
        dst_img_path = "{working_dir}/{dataset}/{label}/{filename}".format(working_dir=WORKING_DIR,
                                                                              dataset=dataset,
                                                                              label=label,
                                                                              filename=output_fn,
                                                                             )
        try:
            if log:
                # Log the output format: the array does not exist yet at this
                # point, and reading it here used to make every image get skipped.
                print('Saving {} to {} as {}'.format(src_img_path,
                                                     dst_img_path,
                                                     file_type))
            if file_type == '.npy':
                arr, _ = convert_image_to_ndarray(src_img_path, color='raw_rgb')
                save(dst_img_path, arr)
            else:
                copyfile(src_img_path, dst_img_path)

            num_saved_files += 1
        except Exception as e:
            # best effort: one unreadable image must not abort the whole run
            print(type(e))
            print(e)
            print('Skipping: ', src_img_path)
    return num_saved_files


def pool_setup_datasets(df, timer_str, CPUS=4, log=False, max_images_to_include=MAX_SAMPLE_DATA_SIZE):
    '''
    Run `setup_datasets` over the first `max_images_to_include` rows of `df`,
    split into `CPUS` parts that are handled by a pool of `CPUS` processes.
    The elapsed time is printed with `timer_str` as prefix.
    Returns the total number of files saved by all workers.
    '''
    capped_df = df[:max_images_to_include]
    chunks = split_df(capped_df, num_splits=CPUS, log=log)
    with timer(timer_str), Pool(processes=CPUS) as workers:
        # workers run setup_datasets with its default arguments
        saved_per_chunk = workers.map(setup_datasets, chunks)
    total_saved = sum(saved_per_chunk)
    if log:
        print('Saved {} image arrays or image files'.format(total_saved))
    return total_saved




In [None]:
# NOTE: despite its name, `unique_ids` holds the sorted species names (the class labels)
unique_ids = sorted(sampled_data_df.species.unique())

# create the per-species folders, then fill them with the sampled images
setup_directories(unique_species=unique_ids, log=False)
pool_setup_datasets(df=sampled_data_df, timer_str='setting up datasets', log=True)


## Train the Xception CNN model.


References:

- https://arxiv.org/pdf/1610.02357.pdf
- https://github.com/otenim/Xception-with-Your-Own-Dataset


In [None]:
import math
import os
import argparse
import matplotlib
import imghdr
import pickle as pkl
import numpy as np
import matplotlib.pyplot as plt
from keras.applications.xception import Xception, preprocess_input
from tensorflow.keras.optimizers import Adam
from keras.preprocessing import image
from keras.losses import categorical_crossentropy
from keras.layers import Dense, GlobalAveragePooling2D
from keras.models import Model
from tensorflow.keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint

# Select the 'Agg' matplotlib backend for the plots produced by the training code below.
# NOTE(review): the first cell enables `%matplotlib inline`; confirm that switching
# the backend here is intended, since it may stop figures from showing in the notebook.
matplotlib.use('Agg')

def generate_from_paths_and_labels(
        input_paths, labels, batch_size, input_size=(299, 299)):
    '''
    Endless generator of `(inputs, labels)` batches.

    `input_paths` and `labels` are index-aligned arrays. They are reshuffled
    together at the start of every pass, then walked through in steps of
    `batch_size` (the last batch of a pass may be smaller). Each image is
    loaded at `input_size` and the batch is passed through `preprocess_input`.
    '''
    num_samples = len(input_paths)
    while True:
        # apply the same permutation to paths and labels to keep them aligned
        order = np.random.permutation(num_samples)
        input_paths = input_paths[order]
        labels = labels[order]
        for start in range(0, num_samples, batch_size):
            stop = start + batch_size
            loaded = [
                image.load_img(img_path, target_size=input_size)
                for img_path in input_paths[start:stop]
            ]
            batch = np.array([image.img_to_array(img) for img in loaded])
            yield (preprocess_input(batch), labels[start:stop])


def _collect_paths_and_labels(dataset_root, classes):
    '''Return (image paths, class ids) for every image in `dataset_root/<class_name>/`.'''
    input_paths, labels = [], []
    for class_name in os.listdir(dataset_root):
        class_root = os.path.join(dataset_root, class_name)
        if not os.path.isdir(class_root):
            # stray files next to the class folders carry no label
            continue
        try:
            class_id = classes.index(class_name)
        except ValueError as err:
            raise ValueError(
                'Folder {!r} in {} is not one of the given classes'.format(
                    class_name, dataset_root)) from err
        for file_name in os.listdir(class_root):
            path = os.path.join(class_root, file_name)
            if imghdr.what(path) is None:
                # this is not an image file
                continue
            input_paths.append(path)
            labels.append(class_id)
    return input_paths, labels


def _train_phase(model, train_set, val_set, batch_size, epochs, learning_rate,
                 checkpoint_path, snapshot_period):
    '''Compile `model`, train it for `epochs` epochs and return the fit history.'''
    train_paths, train_labels = train_set
    val_paths, val_labels = val_set
    # images must be loaded at the size the network was built with
    input_size = (HEIGHT, WIDTH)

    model.compile(
        loss=categorical_crossentropy,
        optimizer=Adam(learning_rate=learning_rate),
        metrics=['accuracy'],
    )
    return model.fit(
        generate_from_paths_and_labels(
            input_paths=train_paths,
            labels=train_labels,
            batch_size=batch_size,
            input_size=input_size,
        ),
        steps_per_epoch=math.ceil(len(train_paths) / batch_size),
        epochs=epochs,
        validation_data=generate_from_paths_and_labels(
            input_paths=val_paths,
            labels=val_labels,
            batch_size=batch_size,
            input_size=input_size,
        ),
        validation_steps=math.ceil(len(val_paths) / batch_size),
        verbose=1,
        callbacks=[
            # NOTE: `period` is kept on purpose - the checkpoint file name uses
            # val_loss, which is only known at the end of an epoch.
            ModelCheckpoint(
                filepath=checkpoint_path,
                period=snapshot_period,
            ),
        ],
    )


def _save_curve(result_root, metric, train_values, val_values):
    '''Plot one training/validation curve and save it as `<metric>.png`.'''
    epoch_axis = range(len(train_values))
    plt.plot(epoch_axis, train_values, marker='.', label=metric)
    plt.plot(epoch_axis, val_values, marker='.', label='val_' + metric)
    plt.legend(loc='best')
    plt.grid()
    plt.xlabel('epoch')
    plt.ylabel(metric)
    plt.savefig(os.path.join(result_root, metric + '.png'))
    plt.clf()


def fine_tune(dataset_root,
                classes,
                result_root,
                epochs_pre=5,
                epochs_fine=EPOCHS,
                batch_size_pre=32,
                batch_size_fine=16,
                lr_pre=1e-3,
                lr_fine=1e-4,
                snapshot_period_pre=1,
                snapshot_period_fine=1,
                split=0.8,):
    '''
    Fine-tune an ImageNet-pretrained Xception on the images in `dataset_root`.

    Reference https://github.com/otenim/Xception-with-Your-Own-Dataset

    Training runs in two phases: first only the new top classifier is trained
    (`*_pre` parameters) with the Xception body frozen, then all layers are
    unfrozen and trained together (`*_fine` parameters).

    Parameters:
    - dataset_root: directory containing one sub-folder of images per class
    - classes: sequence of class names; a folder's position in it is its label
    - result_root: directory that receives checkpoints, final models,
      `accuracy.png`, `loss.png` and `plot.dump` (created if missing)
    - epochs_pre / epochs_fine: number of epochs for each phase
    - batch_size_pre / batch_size_fine: batch size for each phase
    - lr_pre / lr_fine: Adam learning rate for each phase
    - snapshot_period_pre / snapshot_period_fine: epochs between checkpoints
    - split: fraction of the (shuffled) images used for training; the rest is
      used for validation

    Returns None; all results are written to `result_root`.
    Raises ValueError if a class folder is not listed in `classes` or if no
    image is found.
    '''

    # ====================================================
    # Preparation
    # ====================================================
    dataset_root = os.path.expanduser(dataset_root)
    result_root = os.path.expanduser(result_root)
    classes = list(classes)
    num_classes = len(classes)

    # make input_paths and labels
    input_paths, labels = _collect_paths_and_labels(dataset_root, classes)
    if not input_paths:
        raise ValueError('No images found under {}'.format(dataset_root))

    # convert to one-hot-vector format
    labels = to_categorical(labels, num_classes=num_classes)

    # convert to numpy array
    input_paths = np.array(input_paths)

    # shuffle dataset
    perm = np.random.permutation(len(input_paths))
    labels = labels[perm]
    input_paths = input_paths[perm]

    # split dataset for training and validation
    border = int(len(input_paths) * split)
    train_set = (input_paths[:border], labels[:border])
    val_set = (input_paths[border:], labels[border:])
    print("Training on %d images and labels" % (len(train_set[0])))
    print("Validation on %d images and labels" % (len(val_set[0])))

    # create a directory where results will be saved (if necessary)
    os.makedirs(result_root, exist_ok=True)

    # ====================================================
    # Build a custom Xception
    # ====================================================
    # instantiate pre-trained Xception model at this notebook's image size
    # NOTE: the top classifier is not included
    base_model = Xception(
        include_top=False,
        weights='imagenet',
        input_shape=(HEIGHT, WIDTH, 3))

    # create a custom top classifier
    x = base_model.output
    x = GlobalAveragePooling2D()(x)
    x = Dense(1024, activation='relu')(x)
    predictions = Dense(num_classes, activation='softmax')(x)
    model = Model(inputs=base_model.inputs, outputs=predictions)

    # ====================================================
    # Train only the top classifier
    # ====================================================
    # freeze the body layers
    for layer in base_model.layers:
        layer.trainable = False

    hist_pre = _train_phase(
        model, train_set, val_set,
        batch_size=batch_size_pre,
        epochs=epochs_pre,
        learning_rate=lr_pre,
        checkpoint_path=os.path.join(
            result_root,
            'model_pre_ep{epoch}_valloss{val_loss:.3f}.h5'),
        snapshot_period=snapshot_period_pre,
    )
    model.save(os.path.join(result_root, 'model_pre_final.h5'))

    # ====================================================
    # Train the whole model
    # ====================================================
    # set all the layers to be trainable (the helper recompiles the model)
    for layer in model.layers:
        layer.trainable = True

    hist_fine = _train_phase(
        model, train_set, val_set,
        batch_size=batch_size_fine,
        epochs=epochs_fine,
        learning_rate=lr_fine,
        checkpoint_path=os.path.join(
            result_root,
            'model_fine_ep{epoch}_valloss{val_loss:.3f}.h5'),
        snapshot_period=snapshot_period_fine,
    )
    model.save(os.path.join(result_root, 'model_fine_final.h5'))

    # ====================================================
    # Create & save result graphs
    # ====================================================
    # concatenate the plot data of both phases (into new lists, so the
    # history objects are left untouched)
    plot = {
        key: list(hist_pre.history[key]) + list(hist_fine.history[key])
        for key in ('accuracy', 'val_accuracy', 'loss', 'val_loss')
    }

    # save graph images
    _save_curve(result_root, 'accuracy', plot['accuracy'], plot['val_accuracy'])
    _save_curve(result_root, 'loss', plot['loss'], plot['val_loss'])

    # save plot data as pickle file
    with open(os.path.join(result_root, 'plot.dump'), 'wb') as f:
        pkl.dump(plot, f)



In [None]:
fine_tune('{}/train/'.format(WORKING_DIR), unique_ids, WORKING_DIR)