# Setup

In [1]:
import gc
import glob
import os
from numpy import save, load
from random import seed
from os import listdir
from shutil import copyfile
from random import seed
from random import random

import joblib
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from contextlib import contextmanager
from multiprocessing.pool import ThreadPool, Pool
from joblib import Parallel, delayed
import time

from skimage.io import imread
from skimage.transform import resize
from skimage.color import rgb2gray, gray2rgb
import matplotlib.pyplot as plt

def env_int(name, default):
    '''
    Read the environment variable `name` as an integer.

    Returns `default` unchanged when the variable is unset or blank.
    os.environ values are always strings, so without this conversion a limit
    supplied through the environment would be a str rather than a number.
    Raises ValueError when the variable is set to something that is not an integer.
    '''
    raw_value = os.environ.get(name)
    if raw_value is None or raw_value.strip() == '':
        return default
    return int(raw_value)


seed(2022)
CPUS = 4 # kaggle default - https://www.kaggle.com/product-feedback/64106
HAPPYWHALE_INPUT_DIR = "/kaggle/input/happy-whale-and-dolphin"
SPECIES_MODEL_INPUT_DIR = "/kaggle/input/cnn-species-sample6750-ep50"
BELUGA_MODEL_INPUT_DIR = '/kaggle/input/belugaxception'
XCEPTION_PREDICTIONS = '/kaggle/input/xceptionspeciespredicted'
# these are parameters to determine how to sample the initial training dataset
MAX_INDIVIDUALS_PER_SPECIES = env_int('MAX_INDIVIDUALS_PER_SPECIES', None) # none to include all
MIN_IMAGES_PER_INDIVIDUAL_THRESHOLD = env_int('MIN_IMAGES_PER_INDIVIDUAL_THRESHOLD', 0) # 0 to include all
MAX_SAMPLE_DATA_SIZE = 10000 # change this to determine max images to include in training
WORKING_DIR = "/kaggle/working"
HEIGHT = 256
WIDTH = 256
VALIDATION_RATIO = .25 # used to split training and validation dataset
IMARRAY_SIZE = (HEIGHT,WIDTH)

EPOCHS = 10

%matplotlib inline

In [2]:


def convert_image_to_ndarray(img_path, output_size=IMARRAY_SIZE, color='normalize_gray'):
    '''
    reads the `img_path` and converts it to a numpy array of size `output_size`
    Supported Options for `color`:
    - 'normalize_gray': grayscale image divided by its own maximum value
    - 'raw_rgb': the resized image exactly as returned by `resize`

    Returns
    - when `output_size` is None: the image as read, as a single array (NOT a tuple)
    - otherwise a tuple of
      1. the resized image (max-normalized grayscale for 'normalize_gray')
      2. aspect ratio (width / height) of the original image

    Raises ValueError for an unsupported `color`, or for a resized image that is
    neither 2-D nor 3-D (previously both cases silently returned None).
    '''
    # read input
    im = imread(img_path)
    if output_size is None:
        return im
    aspect_ratio = float(im.shape[1] / im.shape[0])
    resized_im = resize(im, output_size)
    if color == 'raw_rgb':
        return resized_im, aspect_ratio
    if color != 'normalize_gray':
        raise ValueError("Unsupported color option: {!r}".format(color))

    if resized_im.ndim == 2:
        # a 2-D array has no channel axis, so it's already grayscale
        gray_img = resized_im
    elif resized_im.ndim == 3:
        # a 3-D array carries a channel axis and needs to be converted to grayscale
        gray_img = rgb2gray(resized_im)
    else:
        raise ValueError("Unsupported image shape: {}".format(resized_im.shape))

    # normalize by the maximum; an all-black image is returned as-is to avoid 0/0 -> NaN
    peak = np.max(gray_img)
    if peak > 0:
        gray_img = gray_img / peak
    return gray_img, aspect_ratio
        
def convert_ndarray_img_to_maintain_aspect_ratio(ndarray_img, ar):
    '''
    Resize `ndarray_img` so that its width / height equals `ar`, keeping the height.

    aspect ratio is width / height, so the target width is height * ar.
    For a square input this is the same value as width * ar; for a non-square
    input only height * ar produces the requested ratio.
    The width is rounded to a whole number of pixels (at least 1) so that an
    integer shape is always passed to `resize`.
    '''
    height = ndarray_img.shape[0]
    new_width = max(1, int(round(height * ar)))
    new_ar_shape = (height, new_width)
    return resize(ndarray_img, new_ar_shape)

@contextmanager
def timer(name):
    '''
    Context manager that reports the wall-clock duration of the wrapped block.

    Prints "<name> done in <seconds> seconds." once the block finishes.
    There is no try/finally, so nothing is printed if the block raises.
    '''
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    print('{0} done in {1:.3f} seconds.'.format(name, elapsed))

def split_df(df, num_splits, log=False):
    '''
    Cut `df` row-wise into `num_splits` consecutive pieces of near-equal size.

    Boundaries are `num_splits + 1` evenly spaced positions between 0 and the
    row count, truncated to integers. Returns a list of `df.iloc` slices.
    When `log` is True the number of parts and the boundaries are printed.
    '''
    boundaries = np.linspace(0, df.shape[0], num_splits+1).astype(int)
    if log:
        print('Split into {} parts'.format(num_splits))
        print('Row splits:\n{}'.format(boundaries))

    # pair every boundary with its successor to get (start, stop) row ranges
    pieces = [df.iloc[start:stop] for start, stop in zip(boundaries[:-1], boundaries[1:])]
    return pieces[:num_splits]

def split_arr(arr, num_splits, log=False):
    '''
    Cut the sequence `arr` into `num_splits` consecutive pieces of near-equal size.

    Boundaries are `num_splits + 1` evenly spaced positions between 0 and
    `len(arr)`, truncated to integers. Returns a list of slices of `arr`.
    When `log` is True the number of parts and the boundaries are printed.
    '''
    # size the splits from `arr` itself (this used to read an unrelated global `df`)
    rows_splits = np.linspace(0, len(arr), num_splits+1).astype(int)
    if log:
        print('Split into {} parts'.format(num_splits))
        print('Row splits:\n{}'.format(rows_splits))

    r_list = [arr[start:stop] for start, stop in zip(rows_splits[:-1], rows_splits[1:])]
    return r_list[:num_splits]

# helper to construct the full path for a given image.
# this is intended to be used on the `image` column in train.csv
def get_img_path(img):
    '''Return `img` joined onto HAPPYWHALE_INPUT_DIR.'''
    return os.path.join(HAPPYWHALE_INPUT_DIR, img)


# Load training data

In [3]:

train_csv_fn = "train.csv"
# order by image jpg file and reset index to have deterministic index
# NOTE: reset_index() is called without drop=True, so the pre-sort index is kept as an `index` column
train_df = pd.read_csv(os.path.join(HAPPYWHALE_INPUT_DIR, train_csv_fn)).sort_values('image').reset_index()

# Merge misspelled / alternative species labels into one canonical label each.
# Assign the result back instead of calling replace(..., inplace=True) on
# `train_df.species`: that chained in-place call acts on an intermediate Series
# and does not update `train_df` when pandas Copy-on-Write is active.
train_df['species'] = train_df['species'].replace({"globis": "short_finned_pilot_whale",
                                                   "pilot_whale": "short_finned_pilot_whale",
                                                   "kiler_whale": "killer_whale",
                                                   "bottlenose_dolpin": "bottlenose_dolphin"})

## Load P1_Xception Species Model

This section also defines the `predict_species` function, which runs the loaded `p1_xception_species_model` on an image.

In [4]:
from tensorflow import keras
# build the path to the saved species classifier, then time how long loading takes
species_model_path = "{}/{}".format(SPECIES_MODEL_INPUT_DIR, "model_fine_ep14_valloss0.388.h5")
with timer('Loading p1_xception_species_model'):
    p1_xception_species_model = keras.models.load_model(species_model_path)

2022-04-15 18:30:35.173895: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


Loading p1_xception_species_model done in 8.269 seconds.


In [5]:
import numpy as np
from keras.applications.xception import preprocess_input
from keras.preprocessing import image
from keras.models import load_model

def infer(model, classes, image_path, top_n=5, log=False):
    '''
    Run `model` on the image at `image_path` and return its most probable classes.

    Reference https://github.com/otenim/Xception-with-Your-Own-Dataset/blob/master/inference.py

    Parameters
    - model: object whose `predict` is called on a single-image batch
    - classes: sequence of class names, indexed in the same order as the model outputs
    - image_path: path of the image to classify
    - top_n: maximum number of results to return
    - log: when True, print each returned class with its probability

    Returns a list of (class_name, probability_in_percent) tuples sorted by
    descending probability. It holds at most `top_n` entries, and fewer when the
    model outputs fewer than `top_n` classes (this used to raise IndexError).
    '''
    # load an input image
    img = image.load_img(image_path, target_size=(HEIGHT, WIDTH))
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)  # add the batch axis expected by predict
    x = preprocess_input(x)

    # predict
    pred = model.predict(x)[0]
    result = [(classes[i], float(pred[i]) * 100.0) for i in range(len(pred))]
    result.sort(reverse=True, key=lambda entry: entry[1])

    top_results = result[:top_n]
    if log:
        for rank, (class_name, prob) in enumerate(top_results, start=1):
            print("Top %d ====================" % rank)
            print("Class name: %s" % (class_name))
            print("Probability: %.2f%%" % (prob))
    return top_results


def predict_species(model, image_jpg, log=False):
    '''
    Classify the species shown in the training image `image_jpg` using `model`.

    Class names are the sorted entries of the `train` folder inside
    SPECIES_MODEL_INPUT_DIR. Returns whatever `infer` returns for that image.
    '''
    # classes must be sorted bc thats how the model was trained
    species_classes = os.listdir("{}/train".format(SPECIES_MODEL_INPUT_DIR))
    species_classes.sort()
    image_path = get_img_path('train_images/{}'.format(image_jpg))
    return infer(model, classes=species_classes, image_path=image_path, log=log)

def predict_beluga_individual_id(model, classes, image_jpg, log=False):
    '''
    Run `model` on the training image `image_jpg`, labelling outputs with `classes`.

    Returns whatever `infer` returns for that image.
    '''
    image_path = get_img_path('train_images/{}'.format(image_jpg))
    return infer(model, classes=classes, image_path=image_path, log=log)

def top_5_beluga_individual_id(df, model, classes, log=False):
    '''
    Add the top-5 predicted individual ids and their probabilities to `df`.

    Every value in `df.image` is passed to `predict_beluga_individual_id`. For each
    rank k in 1..5 the columns `predicted_id_k` and `predicted_prob_k` are written.
    `df` is modified in place and also returned.

    If a prediction holds fewer than 5 entries, the missing ranks are filled with
    None (id) and NaN (probability) instead of raising IndexError, so the output
    columns are always the same.
    '''
    top_k = 5
    predictions = [predict_beluga_individual_id(model, classes, img, log=log) for img in df.image]
    for rank in range(top_k):
        df['predicted_id_{}'.format(rank+1)] = [
            pred[rank][0] if rank < len(pred) else None for pred in predictions
        ]
        df['predicted_prob_{}'.format(rank+1)] = [
            pred[rank][1] if rank < len(pred) else float('nan') for pred in predictions
        ]
    return df

def batch_predict(df, model, classes=None, log=False, target='species'):
    '''
    Run `model` over every image listed in `df.image` and store the predictions in `df`.

    Parameters
    - df: DataFrame with an `image` column; it is modified in place
    - model: model handed to the per-image prediction helpers
    - classes: class names, only used when target is 'id'
    - log: forwarded to the per-image helpers (it used to be accepted but ignored)
    - target: 'species' writes the top-1 class to `predicted_species`;
              'id' writes the `predicted_id_k` / `predicted_prob_k` columns

    Returns the updated DataFrame.
    Raises ValueError for any other `target` (it used to return `df` untouched).
    '''
    if target == 'species':
        df['predicted_species'] = [predict_species(model, img, log=log)[0][0] for img in df.image]
    elif target == 'id':
        df = top_5_beluga_individual_id(df, model, classes, log=log)
    else:
        raise ValueError("Unsupported target: {!r} (expected 'species' or 'id')".format(target))
    return df

## Predicting belugas and subsetting only beluga whales for individual id prediction

In [6]:
# load the saved species predictions and separate the rows the model got right from the rest
predicted_species_df = pd.read_csv('{}/p1_predicted_species.csv'.format(XCEPTION_PREDICTIONS))
species_matches = predicted_species_df['predicted_species'] == predicted_species_df['species']
correct_predictions = predicted_species_df[species_matches]
incorrect_predictions = predicted_species_df[~species_matches]

In [7]:
# share of rows whose predicted species equals the labelled species
n_correct = len(correct_predictions)
n_total = len(predicted_species_df)
species_accuracy = n_correct / n_total
print('Overall species accuracy: ', species_accuracy)

Overall species accuracy:  0.8301883095252092


In [8]:
# rows labelled as beluga, rows predicted as beluga, and their de-duplicated union
is_labelled_beluga = predicted_species_df['species'] == 'beluga'
is_predicted_beluga = predicted_species_df['predicted_species'] == 'beluga'
belugas_only = predicted_species_df[is_labelled_beluga]
predicted_belugas = predicted_species_df[is_predicted_beluga]
# labelled rows come first; rows present in both subsets are kept once
all_belugas = pd.concat([belugas_only, predicted_belugas]).drop_duplicates()

In [9]:
# build the path to the saved beluga individual-id model, then time how long loading takes
belugas_model_path = "{}/{}".format(BELUGA_MODEL_INPUT_DIR, "model_fine_ep10_valloss9.626.h5")
with timer('Loading belugas_id_model'):
    belugas_id_model = keras.models.load_model(belugas_model_path)

Loading belugas_id_model done in 5.923 seconds.


In [10]:
# candidate individual ids: the distinct ids of rows labelled beluga, in sorted order
beluga_individual_ids = belugas_only.loc[belugas_only['species'] == 'beluga', 'individual_id']
classes = sorted(beluga_individual_ids.unique())
with timer('P2 beluga individual id predict'):
    r = batch_predict(all_belugas, belugas_id_model, target='id', classes=classes)
# save the predictions; the relative path resolves against the current working directory
r.to_csv('p2_predicted_individual_beluga_id.csv')

2022-04-15 18:30:48.474356: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


P2 beluga individual id predict done in 2067.549 seconds.


In [11]:
# last expression of the cell: render the prediction table `r`
r

Unnamed: 0.1,Unnamed: 0,index,image,species,individual_id,predicted_species,predicted_id_1,predicted_prob_1,predicted_id_2,predicted_prob_2,predicted_id_3,predicted_prob_3,predicted_id_4,predicted_prob_4,predicted_id_5,predicted_prob_5
6,6,6,000be9acf46619.jpg,beluga,afb9b3978217,beluga,6f47542558a1,55.070007,afb9b3978217,37.499511,d155445f3213,3.615727,cf0fd38d30ae,1.127177,2e268c8dbd31,1.052630
8,8,8,000c3d63069748.jpg,beluga,df94b15285b9,beluga,df94b15285b9,91.257328,191e293af2b5,8.136679,aeae6f5bf5cd,0.344812,29623de1f9a5,0.099005,5fc809d9e819,0.035668
13,13,13,00167e8375c967.jpg,beluga,0ad50d0d9b06,beluga,d155445f3213,34.409979,985e7d6a8052,16.406746,aeae6f5bf5cd,6.650720,5ac053677ed1,6.385757,3d5a4dd24d21,6.084830
25,25,25,00243d4b15bafc.jpg,beluga,3cbfd98eea8f,beluga,84502c5db6bc,27.763629,ace276611cba,13.899772,5cdd8c0d46d9,11.623245,9336b7a2ae28,7.346926,06ef73efe924,7.319721
34,34,34,00354cd9244e28.jpg,beluga,a03830613b45,beluga,d9517eb16bd3,15.714386,9a149b8ff660,14.098999,84502c5db6bc,11.948583,262f464ee602,9.034760,a03830613b45,8.565432
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50894,50894,50894,ff4d68370bdec9.jpg,humpback_whale,031816fd9d9d,beluga,84a261c0e5cf,23.520933,247e4c6de07e,20.279676,cf0fd38d30ae,10.385996,edf59f267081,9.496196,d155445f3213,7.901943
50948,50948,50948,ff96d5f69628ad.jpg,humpback_whale,2e4e27f72c73,beluga,5185e90ddcb1,61.026078,8ca74f512aa5,26.382202,0ad50d0d9b06,6.376754,8274ddd12a43,2.435383,cd720f8127f5,0.784566
50966,50966,50966,ffa82c7b455a9b.jpg,gray_whale,416c48277ab4,beluga,5fc809d9e819,32.759878,cf0fd38d30ae,20.333934,d3f7b6ee4291,13.025984,191e293af2b5,7.076839,91970ecd7d55,5.711379
51022,51022,51022,fff219829b3c68.jpg,long_finned_pilot_whale,ada9e2afe2bd,beluga,3a055fad1478,83.204055,212a788c210d,9.935261,0a67aa3bfe4e,0.952378,44bd3fec6ad6,0.922049,863b52580868,0.729540
