In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
import glob
from tqdm import tqdm
import random
import os

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Mapping the Directories
train_img_dir = '/kaggle/input/happy-whale-and-dolphin/train_images'
test_img_dir = '/kaggle/input/happy-whale-and-dolphin/test_images'
sub_path = '/kaggle/input/happy-whale-and-dolphin/sample_submission.csv'
train_path = '/kaggle/input/happy-whale-and-dolphin/train.csv'

# KRP: Get the training data set
# train_csv is kept as a second name for the same file so that code using
# either train_path or train_csv keeps working.
train_csv = train_path
train_df = pd.read_csv(train_csv)

# KRP: Adding the edits to fix known issues in the dataset suggested by Aleksey Alekseev in the challenge discussions
#   see: https://www.kaggle.com/c/happy-whale-and-dolphin/discussion/305574
# The result is assigned back to the column rather than calling
# `train_df.species.replace(..., inplace=True)`: an in-place replace on a
# column pulled out of the frame is a chained update, which pandas warns about
# and which leaves the frame unchanged when Copy-on-Write is enabled.
train_df["species"] = train_df["species"].replace({
    "globis": "short_finned_pilot_whale",
    "pilot_whale": "short_finned_pilot_whale",
    "kiler_whale": "killer_whale",
    "bottlenose_dolpin": "bottlenose_dolphin",
})

# KRP: Print first to understand what is here
print(train_df.iloc[1])

# KRP: limit the size of train_df to speed up debugging, remove when ready to go
# .copy() makes the subset an independent frame so the column assignment below
# does not operate on a slice of the full frame.
train_df = train_df.head(300).copy()


# KRP: Get number of unique species
species = train_df["species"].unique()
print(species)
outputSize = len(species)  # Number of unique species to identify
print(outputSize)

# KRP replace species with number for training
# Each species name gets the index of its first appearance in `species`
# (0 .. outputSize - 1), applied in one pass instead of one replace per species.
species_to_index = {name: index for index, name in enumerate(species)}
train_df["species"] = train_df["species"].map(species_to_index)

species = train_df["species"].unique()
print(species)

# KRP: Convert to arrays for labels and images
train_array = train_df.to_numpy()

# MJI: function to resize each image
# KRP: TODO: see if we need to add a rescaling layer to the input (each item in the image has a value from 0 to 255)
def resize_images(path, n_w, n_h):
    """Read a JPEG file and resize it to ``n_w`` x ``n_h`` pixels.

    Args:
        path: Path of the JPEG file to read.
        n_w: New width in pixels.
        n_h: New height in pixels.

    Returns:
        The decoded 3-channel image resized to height ``n_h`` and width
        ``n_w``, as returned by ``tf.image.resize`` (float32 with its default
        resize method, per the TensorFlow documentation).
    """
    img = tf.io.read_file(path)
    img = tf.image.decode_jpeg(img, channels=3)  # can decode to another format
    # tf.image.resize takes size as [new_height, new_width], so the height
    # goes first; passing [n_w, n_h] would swap the two for non-square sizes.
    img = tf.image.resize(img, [n_h, n_w])
    return img

# KRP: create an array of all the image names
train_images = train_array[:, 0]
print("Length Image List")
print(len(train_images))

# KRP: create a label array with each of the species
train_labels = train_array[:, 1]
print("Length Label List")
print(len(train_labels))

# KRP: make the image array, images of size 128*128*3
# Note: the resize image is being called against specific image names in the image name array
#   this is to ensure that the images are loaded in the same order as the labels
N = len(train_images)
image_size = 128
image_train = np.empty((N, image_size, image_size, 3), dtype=np.uint8)
# Loop through the image ids from the label table, load the corresponding
# image from disk, resize it, and store it in the in-memory array.
for i, image_name in enumerate(train_images):
    re_img = resize_images(os.path.join(train_img_dir, image_name), image_size, image_size)

    # Store the resized image in slot i. Without this assignment image_train
    # keeps the uninitialised contents left by np.empty. The resized pixel
    # values are cast to the array's uint8 dtype on assignment.
    image_train[i, :, :, :] = re_img
    print(i)

# KRP: TODO
#   - split up the data to give a valid set
#   - randomize the data order
#   - tweak how the model is getting trained
#   - split into individual whale identification (create datasets for each species)
    
# KRP: Reference https://keras.io/guides/transfer_learning/
# KRP: Load model
# KRP: define new input
new_input_tensor = tf.keras.Input(shape=(image_size, image_size, 3))

whale_model = keras.models.Sequential()

# Get the Base Model
base_model = tf.keras.applications.NASNetLarge(
    include_top=False,
    input_tensor=new_input_tensor,
    weights='imagenet',
)

# Freeze the Base model so only the new classification head is trained
base_model.trainable = False

whale_model.add(base_model)

whale_model.add(keras.layers.Flatten())
whale_model.add(keras.layers.Dense(512, activation='relu'))
# One output unit per species found in the training labels; a fixed 25 breaks
# as soon as the number of species in train_df differs from 25.
whale_model.add(keras.layers.Dense(outputSize, activation='softmax'))

# summary() prints the table itself and returns None, so it is not wrapped in print().
whale_model.summary()

# The labels are integer class indices and the output is a softmax over the
# species, so this is single-label multi-class classification: use the sparse
# categorical loss/accuracy instead of the binary ones. FalseNegatives is
# dropped because it compares predictions against binary targets, which does
# not match integer class labels.
whale_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
)

# Make sure that the inputs have numeric dtypes Keras can consume.
# astype() returns a new array, so the result has to be assigned back.
print(np.shape(image_train))
# Convert to float32 and apply the NASNet input preprocessing that the Keras
# Applications documentation asks for before feeding the pretrained base model.
image_train = tf.keras.applications.nasnet.preprocess_input(
    np.asarray(image_train).astype('float32')
)
print(np.shape(train_labels))
# train_labels is sliced from a mixed-type array, so it needs an explicit
# integer dtype to be usable as class indices.
train_labels = np.asarray(train_labels).astype('int32')

# KRP: Train the model
whale_model.fit(x=image_train, y=train_labels, epochs=100, batch_size=10)

Deprecated Demo Code

In [None]:
# # KRP: Deprecated demo code, could be useful again in the future

# KRP: Original filesystem walk; commented out because it takes a really long time
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# # KRP: doesn't work in console, should let you show an image on the dataset
# example_image = image_train[1, :, :, :]
# pil_img = tf.keras.preprocessing.image.array_to_img(example_image)
# pil_img.show()


# # KRP: Check the whale type label list
# train_labels_whaletype = train_array[:,1]
# print("random label")
# print(train_labels_whaletype[3])

# # KRP: Let's look at the images; this was earlier demo code, taking it out now
# list_train_images = glob.glob(train_img_dir+"/*")
# image_count = len(list_train_images)
# print("Image Count")
# print(image_count)

# # KRP: demo to show we can load a single image using its file name from train_df
# print(train_df.iloc[1].image)
# image = tf.keras.preprocessing.image.load_img(train_img_dir+"/"+train_df.iloc[1].image)
# input_arr = tf.keras.preprocessing.image.img_to_array(image)