# Import libraries

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved
# as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of 
# the current session
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files
# under the input directory
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import tensorflow as tf
import glob
from tqdm import tqdm
import random
import os
import tensorflow.keras.layers as L
from tensorflow.keras.models import Sequential
import tensorflow.keras.applications.efficientnet as efn
from tensorflow.keras.applications.resnet50 import ResNet50
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing import image
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn import svm

In [2]:
# Target image height and width in pixels; used when loading images and as the
# model input shape further down.
image_h = 224
image_w = 224
# Row cap for the training frame; in the visible cells it is only referenced
# from commented-out code.
sample_size = 12000

In [3]:
# Choose the tf.distribute strategy used later when building the model:
# TPUStrategy when a TPU cluster can be resolved, MirroredStrategy otherwise.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
    print("Device:", tpu.master())
    strategy = tf.distribute.TPUStrategy(tpu)
except ValueError:
    # A ValueError from the resolver is treated here as "no TPU attached".
    print("Not connected to a TPU runtime. Using CPU/GPU strategy")
    strategy = tf.distribute.MirroredStrategy()

In [5]:
# Stack of random image transforms (rotation, translation, flip, contrast).
# build_binary_classification_model() applies it directly to the model input.
img_augmentation = Sequential(
    [
        L.RandomRotation(factor=0.15),
        L.RandomTranslation(height_factor=0.1, width_factor=0.1),
        L.RandomFlip(),
        L.RandomContrast(factor=0.1),
    ],
    name="img_augmentation",
)

# Read Directories

In [6]:
# Optional: list every file under the Kaggle input directory (left disabled).
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename)) #end printing all file names

# Locations of the competition data inside the Kaggle input directory.
train_img_dir = '/kaggle/input/happy-whale-and-dolphin/train_images'
test_img_dir = '/kaggle/input/happy-whale-and-dolphin/test_images'
sub_path = '/kaggle/input/happy-whale-and-dolphin/sample_submission.csv'
train_path = '/kaggle/input/happy-whale-and-dolphin/train.csv'

# Label Inspection

In [7]:
# Load the training labels, normalise the species names, derive a coarse
# whale/dolphin label, then keep a 10% sample split 80/20 into train/held-out.
train_df = pd.read_csv(train_path)

# Misspelled or alternative species names mapped onto one canonical spelling
# (exact-value matches only).
species_fixes = {
    "bottlenose_dolpin": "bottlenose_dolphin",
    "kiler_whale": "killer_whale",
    "globis": "short_finned_pilot_whale",
    "pilot_whale": "short_finned_pilot_whale",
    "beluga": "beluga_whale",
}
train_df["species"] = train_df["species"].replace(species_fixes)

# Coarse label from the species name. na=False makes a missing species simply
# not match, which is what the former `== True` comparison achieved.
# "dolphin" is assigned last, so it wins if a name contains both words.
train_df.loc[train_df.species.str.contains("whale", na=False), "label"] = "whale"
train_df.loc[train_df.species.str.contains("dolphin", na=False), "label"] = "dolphin"
# train_df_all = train_df

# Fixed seeds so that Restart & Run All yields the same sample and split
# (42 matches the random_state used by the later train/validation splits).
train_df = train_df.sample(frac=.10, random_state=42).reset_index(drop=True)
train_df, test_df = train_test_split(train_df, test_size=0.2, random_state=42)

# Later cells add an 'image_path' column to both frames; work on independent
# copies so those assignments do not trigger SettingWithCopyWarning.
train_df = train_df.copy()
test_df = test_df.copy()
# train_df = train_df.head(sample_size)
print(len(train_df['species'].unique()))

In [8]:
# Number of rows in the held-out split (shown as the cell output).
len(test_df)

# Add image url

In [9]:
# Full path of every training image: <train_img_dir>/<image file name>.
train_df['image_path'] = train_img_dir+'/'+train_df['image']
# Every entry found directly under the competition test image directory.
list_of_test_image_paths = glob.glob(test_img_dir+'/*')

# String labels to numeric labels

In [10]:
# Give every species present in the training split an integer id, numbered in
# order of first appearance.
list_of_species = train_df['species'].unique()
print(len(list_of_species))
print(train_df['species'].unique())
species_to_neumeric = {name: idx for idx, name in enumerate(list_of_species)}

In [11]:
# Give every individual_id present in the training split an integer id,
# numbered in order of first appearance.
list_of_ids = train_df['individual_id'].unique()
print(len(list_of_ids))
ids_to_neumeric = {individual: idx for idx, individual in enumerate(list_of_ids)}

In [12]:
# Give every coarse label (whale / dolphin) present in the training split an
# integer id, numbered in order of first appearance.
list_of_labels = train_df['label'].unique()
print(len(list_of_labels))
labels_to_neumeric = {name: idx for idx, name in enumerate(list_of_labels)}

# Image Pre-Processing
1. Resizing the image
2. Possibly converting the color image to a grayscale image (not decided yet)

In [13]:
#function to resize each image 
def resize_images(path,image_w,image_h):
    """Read a JPEG file and resize it to ``image_h`` rows by ``image_w`` columns.

    Args:
        path: Path of the JPEG file to read.
        image_w: Target width in pixels.
        image_h: Target height in pixels.

    Returns:
        The decoded 3-channel image after ``tf.image.resize``.
    """
    raw_bytes = tf.io.read_file(path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    # tf.image.resize expects size as (new_height, new_width). The previous
    # (width, height) order only worked because both values are equal here.
    return tf.image.resize(decoded, [image_h, image_w])
#concern After resizing it what we can do should we store the image to a new directory 

# Iterate the images and resize and store in a numpy array

# Train Test data generation function 

In [14]:
from random import sample

def train_data_genration(train_df):#takes data frame input
    """Load every image referenced by ``train_df`` into a single array.

    Each image is loaded as RGB, resized to the notebook-level
    ``image_h`` x ``image_w``, converted to an array and divided by 255.

    Args:
        train_df: DataFrame with an ``image_path`` column.

    Returns:
        ``np.ndarray`` built from the per-image arrays, in row order.
    """
    train_image_list = []
    # Iterating the column directly avoids building a Series per row with
    # iterrows(), and lets tqdm know the total for its progress bar.
    for image_path in tqdm(train_df['image_path']):
        # target_size is (height, width); color_mode="rgb" replaces the
        # deprecated grayscale=False keyword.
        img = tf.keras.preprocessing.image.load_img(image_path,
                                                    target_size=(image_h, image_w),
                                                    color_mode="rgb")
        img = tf.keras.preprocessing.image.img_to_array(img)
        train_image_list.append(img / 255)
    return np.array(train_image_list)
def test_data_generation(list_of_test_image_paths,f_t_num): #takes the image directory array and number of images
    """Load the first ``f_t_num`` images of a path list into a single array.

    Each image is loaded as RGB, resized to the notebook-level
    ``image_h`` x ``image_w``, converted to an array and divided by 255.

    Args:
        list_of_test_image_paths: Sequence of image file paths.
        f_t_num: How many leading paths to load. Values larger than the list
            are clamped to its length; values below zero load nothing.

    Returns:
        ``np.ndarray`` built from the per-image arrays, in list order.
    """
    # Clamp so that asking for more images than exist no longer raises
    # IndexError, and a negative count behaves like the old empty range().
    count = max(0, min(f_t_num, len(list_of_test_image_paths)))
    test_image_list = []
    for image_path in tqdm(list_of_test_image_paths[:count]):
        # target_size is (height, width); color_mode="rgb" replaces the
        # deprecated grayscale=False keyword.
        img = tf.keras.preprocessing.image.load_img(image_path,
                                                    target_size=(image_h, image_w),
                                                    color_mode="rgb")
        img = tf.keras.preprocessing.image.img_to_array(img)
        test_image_list.append(img / 255)
    return np.array(test_image_list)

In [15]:
# Load all training-split images into memory as one array.
X_train = train_data_genration(train_df)

In [16]:
# The held-out rows come from train.csv, so their images also live in the
# training image directory.
test_df['image_path'] = train_img_dir+'/'+test_df['image']

# Load the held-out images with the same loader used for the training split.
X_test = train_data_genration(test_df)

In [None]:
# Draw a small random subset (1/1000th) of the competition test images.
# NOTE(review): this rebinds X_test, which the cell above filled with the
# labelled held-out images; the later cell that walks test_df next to the
# predictions assumes the held-out version - confirm the intended run order.
# NOTE(review): sample() is unseeded, so the subset differs between runs.
t_num = len(list_of_test_image_paths) #27956 number of images 
f_t_num = int(t_num/1000) # trying to test for small number 
# for t_path in list_of_test_image_paths:#when all images will be tested 
temporary_list = sample(list_of_test_image_paths,f_t_num)
X_test = test_data_generation(temporary_list,f_t_num)

# Identifying whale and dolphin

In [17]:
def build_binary_classification_model():
    """Build and compile the whale-vs-dolphin classifier on a frozen EfficientNetB0.

    The model takes ``(image_h, image_w, 3)`` images scaled to [0, 1], as
    produced by the loaders in this notebook, and outputs a 2-way softmax
    meant to be trained on one-hot labels.

    Returns:
        A compiled ``tf.keras.Model``.
    """
    inputs = L.Input(shape=(image_h, image_w, 3))
    x = img_augmentation(inputs)
    # The loaders in this notebook divide pixels by 255, but the Keras
    # EfficientNet models expect raw 0-255 pixel values (they rescale
    # internally). Undo the scaling so the pretrained weights see the input
    # range they were trained on.
    x = L.Rescaling(255.0, name="to_0_255")(x)
    backbone = efn.EfficientNetB0(include_top=False, input_tensor=x, weights="imagenet")

    # Freeze the pretrained weights; only the new head is trained.
    backbone.trainable = False

    # Rebuild top
    x = L.GlobalAveragePooling2D(name="avg_pool")(backbone.output)
    x = L.BatchNormalization()(x)

    top_dropout_rate = 0.1
    x = L.Dropout(top_dropout_rate, name="top_dropout")(x)
    # The two classes are mutually exclusive and the labels are one-hot, so use
    # softmax + categorical cross-entropy instead of two independent sigmoids.
    outputs = L.Dense(2, activation="softmax", name="pred")(x)

    # Compile
    model = tf.keras.Model(inputs, outputs, name="EfficientNet")
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-2)
    model.compile(
        optimizer=optimizer, loss="categorical_crossentropy", metrics=["accuracy"]
    )
    return model

def plot_hist(hist):
    """Plot per-epoch training and validation accuracy from a Keras History.

    Args:
        hist: Object whose ``history`` mapping has ``"accuracy"`` and
            ``"val_accuracy"`` entries.
    """
    # Training curve first, validation second, matching the legend order.
    for metric_name in ("accuracy", "val_accuracy"):
        plt.plot(hist.history[metric_name])
    plt.title("model accuracy")
    plt.xlabel("epoch")
    plt.ylabel("accuracy")
    plt.legend(["train", "validation"], loc="upper left")
    plt.show()

In [None]:
# Alternative whale-vs-dolphin classifier: a trainable EfficientNetB0 backbone
# with a small dense head, one output per coarse label.
IMAGE_SIZE = [image_h, image_w]
dense_layer_size = len(labels_to_neumeric)
model_l = tf.keras.Sequential([
    L.Input(shape=(*IMAGE_SIZE, 3)),
    # The loaders in this notebook divide pixels by 255, but Keras EfficientNet
    # expects raw 0-255 values (it rescales internally), so undo the scaling.
    L.Rescaling(255.0),
    # `classes` is dropped: it only applies when include_top=True and must be
    # an int, not a dict view.
    efn.EfficientNetB0(input_shape=(*IMAGE_SIZE, 3),
                       weights='imagenet',
                       include_top=False),
    # Global pooling already yields a flat vector, so no Flatten is needed.
    L.GlobalAveragePooling2D(),
    L.BatchNormalization(),
    L.Dropout(0.20, name="top_dropout"),
    L.Dense(512, activation='relu'),
    # Mutually exclusive classes with one-hot targets: softmax output.
    L.Dense(dense_layer_size, activation='softmax'),
])
model_l.compile(optimizer='adam',
                loss='categorical_crossentropy',
                metrics=['accuracy'])
model_l.summary()

# Model For Identifying species

In [None]:
# Species classifier: a trainable EfficientNetB0 backbone with a small dense
# head, one softmax output per species seen in the training split.
IMAGE_SIZE = [image_h, image_w]
dense_layer_size = len(species_to_neumeric)
model = tf.keras.Sequential([
    L.Input(shape=(*IMAGE_SIZE, 3)),
    # The loaders in this notebook divide pixels by 255, but Keras EfficientNet
    # expects raw 0-255 values (it rescales internally), so undo the scaling.
    L.Rescaling(255.0),
    # `classes` is dropped: it only applies when include_top=True and must be
    # an int, not a dict view.
    efn.EfficientNetB0(input_shape=(*IMAGE_SIZE, 3),
                       weights='imagenet',
                       include_top=False),
    # Global pooling already yields a flat vector, so no Flatten is needed.
    L.GlobalAveragePooling2D(),
    L.Dense(512, activation='relu'),
    L.Dense(dense_layer_size, activation='softmax'),
])
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

# Model for individual Detection

In [None]:
# Individual-identification classifier: a trainable EfficientNetB0 backbone
# with a small dense head, one softmax output per individual_id seen in the
# training split.
IMAGE_SIZE = [image_h, image_w]
dense_layer_size = len(ids_to_neumeric)
model_ind = tf.keras.Sequential([
    L.Input(shape=(*IMAGE_SIZE, 3)),
    # The loaders in this notebook divide pixels by 255, but Keras EfficientNet
    # expects raw 0-255 values (it rescales internally), so undo the scaling.
    L.Rescaling(255.0),
    # `classes` is dropped: it only applies when include_top=True and must be
    # an int, not a dict view.
    efn.EfficientNetB0(input_shape=(*IMAGE_SIZE, 3),
                       weights='imagenet',
                       include_top=False),
    # Global pooling already yields a flat vector, so no Flatten is needed.
    L.GlobalAveragePooling2D(),
    L.Dense(512, activation='relu'),
    L.Dense(dense_layer_size, activation='softmax'),
])
model_ind.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
model_ind.summary()

# Numerical label to categorical for whale and dolphin detection

In [18]:
# Encode the coarse whale/dolphin label of every training row as its integer
# id, report how many rows fall in class 0 versus the rest, then one-hot encode.
labels = train_df['label'].values
neumeric_label_list = [labels_to_neumeric[name] for name in labels]
tr_lables = np.array(neumeric_label_list)

# Class balance: rows with id 0, and everything else.
count_0 = int(np.count_nonzero(tr_lables == 0))
count_1 = len(tr_lables) - count_0
print(count_0," ",count_1," ",len(tr_lables))

tr_one_hot_label = to_categorical(tr_lables)
print(tr_one_hot_label)


# Split for training whale and dolphin detection

In [19]:
# Hold out 20% of the training images (with their one-hot whale/dolphin labels)
# for validation; the fixed random_state keeps the split repeatable.
X_train_wd, X_valid_wd, y_train_wd, y_valid_wd = train_test_split(X_train, tr_one_hot_label, random_state=42, test_size=0.2)

In [20]:
# Build the whale/dolphin classifier under the selected distribution strategy,
# train it, and plot the accuracy curves.
# NOTE(review): this rebinds `model`, which the species-classifier cell also
# assigns; on a top-to-bottom run the later species training cell would fit
# this 2-output model instead - confirm and consider a distinct name.
with strategy.scope():
    model = build_binary_classification_model()
# NOTE(review): 500 is outside the 8-80 range declared in the slider annotation.
epochs = 500  # @param {type: "slider", min:8, max:80}
hist = model.fit(X_train_wd, y_train_wd, batch_size = 32, epochs=epochs, validation_data=(X_valid_wd,y_valid_wd), verbose=2)
plot_hist(hist)

In [None]:
# Train the alternative whale/dolphin classifier (model_l) on the same split.
model_history_l = model_l.fit(X_train_wd, y_train_wd, batch_size = 32,epochs=10,validation_data=(X_valid_wd, y_valid_wd)) #the batch size should be changed 


In [28]:
# Per-class scores for whatever X_test currently holds.
probabilities = model.predict(X_test)

In [29]:
# Turn the per-class scores into predicted class indices and build the reverse
# lookup from class index back to the whale/dolphin label string.
# (The former `for i in lsit: pass` loop did nothing and has been removed.)
print(len(probabilities))
lsit = np.argmax(probabilities,axis = 1)
inverse_labels_to_neumeric = {idx: name for name, idx in labels_to_neumeric.items()}

In [34]:
# Print ground truth next to the prediction for each held-out row.
# NOTE(review): assumes `lsit` holds one prediction per test_df row, in row
# order - confirm X_test was built from test_df when this runs.
for position, (_, record) in enumerate(test_df.iterrows()):
    print("actual Lable: ",record.label,' Predicted as:', inverse_labels_to_neumeric[lsit[position]])

In [None]:
# Show each sampled test image together with its predicted whale/dolphin label.
for i in range(0,f_t_num):
    re_img = resize_images(temporary_list[i],image_w,image_h)
    # Named display_img rather than `image` so the keras `image` module
    # imported at the top of the notebook is not shadowed.
    display_img = tf.cast(re_img, np.uint8)
    print(temporary_list[i],' as', inverse_labels_to_neumeric[lsit[i]])
    plt.imshow(display_img)
    plt.show()

# Numerical label to categorical for species detection

In [None]:
# Encode the species of every training row as its integer id, then one-hot
# encode; the cell output is the number of encoded rows.
species_as_label = train_df['species'].values
species_label_list = [species_to_neumeric[name] for name in species_as_label]
tr_sp_lables = np.array(species_label_list)
tr_sp_one_hot_label = to_categorical(tr_sp_lables)
len(tr_sp_one_hot_label)

# Numerical label to categorical for individual detection

In [None]:
# Encode the individual_id of every training row as its integer id, then
# one-hot encode; the cell output is the resulting matrix.
individual_as_label = train_df['individual_id'].values
individual_label_list = [ids_to_neumeric[ind_id] for ind_id in individual_as_label]
tr_id_lables = np.array(individual_label_list)
tr_id_one_hot_label = to_categorical(tr_id_lables)
tr_id_one_hot_label

# Divide into Train and validation set  for Species detection

In [None]:
# Hold out 20% of the training images (with one-hot species labels) for
# validation; the fixed random_state keeps the split repeatable.
X_train_SD, X_valid_SD, y_train_SD, y_valid_SD = train_test_split(X_train, tr_sp_one_hot_label, random_state=42, test_size=0.2)

# Divide into Train and validation set  for individual detection

In [None]:
# Hold out 20% of the training images (with one-hot individual labels) for
# validation; the fixed random_state keeps the split repeatable.
X_train_ID, X_valid_ID, y_train_ID, y_valid_ID = train_test_split(X_train, tr_id_one_hot_label, random_state=42, test_size=0.2)

# Train the model for Species detection

In [None]:
# Fit `model` on the species split for 10 epochs.
# NOTE(review): `model` is assigned both by the species-classifier cell and by
# the whale/dolphin training cell; confirm which one is bound when this runs.
model_history = model.fit(X_train_SD, y_train_SD, batch_size = 32,epochs=10,validation_data=(X_valid_SD, y_valid_SD)) #the batch size should be changed 
# Author's note: choose the batch size relative to the number of training
# images; a batch size close to the whole (small) training set was seen to
# cause an OOM (out of memory) error.

In [None]:
# Persist the current `model` to an HDF5 file in the working directory and
# load it straight back into `model_h`.
model.save('whale_detection_model.h5') 
model_h = tf.keras.models.load_model('whale_detection_model.h5')

In [None]:
# print(model.history['val_accuracy'])
# summarize history for accuracy
# Plot accuracy and then loss for the species training run, one figure each.
for metric_name in ("accuracy", "loss"):
    plt.plot(model_history.history[metric_name])
    plt.plot(model_history.history['val_' + metric_name])
    plt.title('model ' + metric_name)
    plt.ylabel(metric_name)
    plt.xlabel('epoch')
    # The second curve comes from the validation_data passed to fit(), so it
    # is labelled 'validation' rather than 'test'.
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

In [None]:
# Load the first 1/100th of the competition test images, in list order.
# NOTE(review): this rebinds f_t_num and X_test set by earlier cells.
t_num = len(list_of_test_image_paths) #27956 number of images 
f_t_num = int(t_num/100) # trying to test for small number 
# for t_path in list_of_test_image_paths:#when all images will be tested 
X_test = test_data_generation(list_of_test_image_paths,f_t_num)

In [None]:
# Per-class scores from `model` for the test images loaded above.
probabilities = model.predict(X_test)

In [None]:
# Turn the per-class scores into predicted class indices and build the reverse
# lookup from class index back to the species name.
# (The former `for i in lsit: pass` loop did nothing and has been removed.)
print(len(probabilities))
lsit = np.argmax(probabilities,axis = 1)
inverse_species_to_neumeric = {idx: name for name, idx in species_to_neumeric.items()}

In [None]:
# Show each test image together with its predicted species.
for i in range(0,f_t_num):
    # The preceding cells predict on the first f_t_num entries of
    # list_of_test_image_paths, so display that same file. This previously
    # read from temporary_list, an unrelated random sample, which showed a
    # different image from the path and prediction being printed.
    re_img = resize_images(list_of_test_image_paths[i],image_w,image_h)
    # Named display_img rather than `image` so the keras `image` module
    # imported at the top of the notebook is not shadowed.
    display_img = tf.cast(re_img, np.uint8)
    print(list_of_test_image_paths[i], ' as', inverse_species_to_neumeric[lsit[i]])
    plt.imshow(display_img)
    plt.show()

# Train the model for Individual detection

In [None]:
# Train the individual-identification classifier for 100 epochs.
model_history_ind = model_ind.fit(X_train_ID, y_train_ID, batch_size = 32,epochs=100,validation_data=(X_valid_ID, y_valid_ID)) #the batch size should be changed 

In [None]:
# Per-individual scores for whatever X_test currently holds.
probabilities = model_ind.predict(X_test)

In [None]:
# Turn the scores into predicted class indices, build the reverse lookup from
# class index to individual_id, and print the predicted id for every image.
print(len(probabilities))
lsit = np.argmax(probabilities,axis = 1)
inverse_ids_to_neumeric = {idx: ind_id for ind_id, idx in ids_to_neumeric.items()}
for predicted_idx in lsit:
    print(inverse_ids_to_neumeric[predicted_idx])

In [None]:
# train_df[train_df['individual_id'] == "3cf81d69cc5911"]

In [None]:
# Display the first f_t_num test images along with their file paths.
for i in range(0,f_t_num):
    re_img = resize_images(list_of_test_image_paths[i],image_w,image_h)
    # Named display_img rather than `image` so the keras `image` module
    # imported at the top of the notebook is not shadowed.
    display_img = tf.cast(re_img, np.uint8)
    print(list_of_test_image_paths[i])
    plt.imshow(display_img)
    plt.show()

# Separating the data for the individual training 

In [None]:
# train_df_all['image_path'] = train_img_dir+'/'+train_df_all['image']
# train_data_group_by_id = train_df_all.groupby('individual_id')

# Individuals with fewer images than this go to the validation pool instead of
# getting their own per-individual model.
min_images_per_individual = 5

# Partition the training rows by individual: groups with enough images are kept
# per id in train_groups_array, and the remaining rows are pooled.
train_data_group_by_id = train_df.groupby('individual_id')
train_groups_array = dict()
val_groups_array = []
for g_index,group in train_data_group_by_id:
    if len(group) < min_images_per_individual:
        val_groups_array.append(group)
    else:
        train_groups_array[g_index] = group
# train_group_data_frame = pd.concat(train_groups_array) 

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# with the same columns when every individual has enough images.
if val_groups_array:
    val_group_data_frame = pd.concat(val_groups_array)
else:
    val_group_data_frame = train_df.iloc[0:0]
print(len(val_group_data_frame))

# Extract features by RESNET50

In [None]:
#transfor the image to numpy array for training 
#then fit a model per each individual
# ResNet50 backbones already built, keyed by (image_h, image_w). Building the
# network reloads the ImageNet weights, so it is done once per input size
# instead of on every extract_resnet() call.
_RESNET_BACKBONES = {}

def extract_resnet(X,image_h,image_w):
    """Run images through an ImageNet-pretrained ResNet50 without its top layers.

    Args:
        X: Numpy array of images.
        image_h: Image height the backbone is built for.
        image_w: Image width the backbone is built for.

    Returns:
        The array returned by the backbone's ``predict`` for ``X``.
    """
    # NOTE(review): callers in this notebook pass images scaled by 1/255 and no
    # resnet50.preprocess_input is applied here - confirm that is intended.
    size_key = (image_h, image_w)
    if size_key not in _RESNET_BACKBONES:
        # include_top=False drops the final fully connected prediction layer.
        _RESNET_BACKBONES[size_key] = ResNet50(input_shape=(image_h, image_w, 3),
                                               weights='imagenet',
                                               include_top=False)
    return _RESNET_BACKBONES[size_key].predict(X)

# Incomplete code for individual model and prediction 

In [None]:
# For every individual with enough images: extract ResNet features, scale them,
# reduce them with PCA, fit a one-class SVM, then run that SVM over the
# competition test images in batches.
# NOTE(review): only the predictions of the last batch stay in `oc_svm_preds`;
# this section is marked incomplete, so collecting the results is left open.
model_array = dict()
for key in train_groups_array.keys():
    group_df = train_groups_array[key]
    n_group = len(group_df)

    # One flat feature vector per training image of this individual.
    group_images = train_data_genration(group_df)
    group_features = extract_resnet(group_images,image_h,image_w).reshape(n_group, -1)

    # Local names are used from here on so that the notebook-level X_train and
    # X_test image arrays are no longer overwritten by this loop.
    ss = StandardScaler()
    ss.fit(group_features)
    group_scaled = ss.transform(group_features)

    # PCA cannot keep more components than there are samples, hence n_group.
    pca = PCA(n_components=n_group, whiten=True)
    pca = pca.fit(group_scaled)
    group_reduced = pca.transform(group_scaled)

    oc_svm_clf = svm.OneClassSVM(gamma=0.001, kernel='rbf', nu=0.08)  # Obtained using grid search
    oc_svm_clf.fit(group_reduced)
    model_array[key] = oc_svm_clf

    # Score the test images in batches of n_group paths.
    for start in range(0, len(list_of_test_image_paths), n_group):
        batch = list_of_test_image_paths[start:start + n_group]
        batch_images = test_data_generation(batch,len(batch))
        # Reshape by the actual batch length: the last batch can be shorter
        # than n_group, and reshaping it with n_group rows either fails or
        # yields a feature width that the scaler and PCA reject.
        batch_features = extract_resnet(batch_images,image_h,image_w).reshape(len(batch), -1)
        batch_reduced = pca.transform(ss.transform(batch_features))
        oc_svm_preds = oc_svm_clf.predict(batch_reduced)

# Tried other things not required now

In [None]:
# Apply standard scaler to output from resnet50
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications import resnet50

# Scratch experiment: preprocess a test image with the ResNet50 helper and
# score it with every per-individual classifier. The trailing `break` stops
# after the first image.
# NOTE(review): the classifiers in model_array were fitted on scaled,
# PCA-reduced ResNet features in the previous section, while this cell passes
# flattened preprocessed pixels - confirm the feature dimensions are meant to
# match before relying on this cell.
for i in range(0,100):
    test_image = load_img(list_of_test_image_paths[i], target_size = (image_h, image_w)) 
    test_np_image = img_to_array(test_image) 
    test_image_batch = np.expand_dims(test_np_image, axis = 0) 
    test_processed_image = resnet50.preprocess_input(test_image_batch.copy())
    print(test_processed_image.shape)
    test_processed_image = test_processed_image.reshape(1,image_h*image_w*3)

    for key in model_array.keys():
        # Use the classifier stored for this individual. The loop previously
        # called the last-trained `oc_svm_clf` on every pass, ignoring `key`.
        oc_svm_preds = model_array[key].predict(test_processed_image)
        print(oc_svm_preds)
    break
    