# Humpback whale prediction using ResNet-50
##### Ashish Patel on Kaggle (Data Scientist at Softweb Solution, Ahmedabad, Gujarat, India)

### 1. Import necessary libraries

In [1]:
from collections import defaultdict
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import normalize
from scipy.stats import logistic
from os.path import join
from PIL import Image
from keras import backend as K
from keras.models import Model
from keras.optimizers import Adam
from keras.layers import Input, Dense, Dropout, Lambda, Convolution2D, MaxPooling2D, Flatten
from keras.losses import categorical_crossentropy
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.applications.resnet50 import ResNet50, preprocess_input
import os
import matplotlib.pyplot as plt

Using TensorFlow backend.


### 2. Basic setup

In [2]:
# Hyper-parameters shared by the cells below.
batch_size = 32       # number of triplets assembled per generated batch
embedding_dim = 50    # presumably the embedding vector size; not used in the cells visible here
image_size = 224      # images are resized to image_size x image_size pixels

# All data locations are derived from a single base directory so that moving
# the project only requires changing `path_base`.
path_base = "C:/Python/Whale_playground/"
path_train = join(path_base, 'train')
path_test = join(path_base, 'test')
path_model = join(path_base, 'MyModel.hdf5')
path_csv = join(path_base, 'train.csv')

### 3. Define functions
#### 3.1 sample_gen class

In [3]:
class sample_gen(object):
    """Random triplet sampler over a ``{file name: class label}`` mapping.

    Each call to :meth:`get_sample` returns two files of the same class (the
    positives) and one file of a different class (the negative). Files
    labelled ``other_class`` are never used as positives, but they can be
    drawn as negatives.
    """

    # 1. init function

    def __init__(self, file_class_mapping, other_class = "new_whale"):
        """Index the mapping by class and pre-compute sampling weights.

        Args:
            file_class_mapping: dict mapping an image file name to its class Id.
            other_class: label of the catch-all class that is excluded from
                positive sampling.
        """
        self.file_class_mapping = file_class_mapping
        self.class_to_list_files = defaultdict(list)    # class Id -> files of that class
        self.list_other_class = []                      # files labelled `other_class`
        self.list_all_files = list(file_class_mapping.keys())
        self.range_all_files = list(range(len(self.list_all_files)))

        for file_name, class_ in file_class_mapping.items():
            if class_ == other_class:
                self.list_other_class.append(file_name)
            else:
                self.class_to_list_files[class_].append(file_name)

        # Distinct labels, including `other_class` when present; the set only
        # removes duplicates, so the resulting order is arbitrary.
        self.list_classes = list(set(self.file_class_mapping.values()))
        self.range_list_classes = range(len(self.list_classes))

        # A class is picked with probability proportional to its file count.
        # `other_class` has no entry in class_to_list_files, so its weight is 0.
        counts = np.array(
            [len(self.class_to_list_files[class_]) for class_ in self.list_classes],
            dtype="float64",
        )
        total = counts.sum()
        # Guard the division: with no usable class the weights stay all-zero
        # instead of becoming NaN.
        self.class_weight = counts / total if total > 0 else counts

    # 2. get_sample function

    def get_sample(self):
        """Draw one random triplet.

        Returns:
            Tuple ``(positive_example_1, negative_example, positive_example_2)``
            of file names. Both positives share a class; the negative belongs
            to a different class. The positives are distinct files whenever
            the chosen class has at least two files.

        Raises:
            ValueError: if no class other than ``other_class`` has files, or
                if all files share a single class (no negative exists).
        """
        if not np.any(self.class_weight):
            raise ValueError("no class with files available to draw positive examples from")
        if len(self.list_classes) < 2:
            # Without a second class the rejection loop below could never end.
            raise ValueError("at least two distinct classes are required to draw a negative example")

        class_idx = np.random.choice(len(self.list_classes), p=self.class_weight)
        positive_class = self.list_classes[class_idx]
        candidates = self.class_to_list_files[positive_class]

        # Sample without replacement when possible so the two positives are
        # not the very same file; a single-file class has to repeat its file.
        first, second = np.random.choice(len(candidates), 2, replace=len(candidates) < 2)
        positive_example_1 = candidates[first]
        positive_example_2 = candidates[second]

        # Rejection sampling: draw uniformly from all files until one of a
        # different class turns up.
        while True:
            negative_example = self.list_all_files[np.random.randint(len(self.list_all_files))]
            if self.file_class_mapping[negative_example] != positive_class:
                break
        return positive_example_1, negative_example, positive_example_2

#### 3.2 other functions
##### Preprocessing

In [4]:
# 1. read_and_resize function
    
def read_and_resize(filepath):
    """Load an image file as a square RGB ``float32`` array.

    The image is converted to RGB and resized to ``image_size`` x
    ``image_size`` pixels (module-level setting); the aspect ratio is not
    preserved.

    Args:
        filepath: path of the image file to read.

    Returns:
        ``numpy.ndarray`` of dtype ``float32`` holding the resized pixels
        (expected shape ``(image_size, image_size, 3)``).
    """
    # The context manager closes the underlying file as soon as the pixel
    # data has been copied by convert(), instead of leaving the handle open
    # until garbage collection.
    with Image.open(filepath) as source:
        rgb = source.convert('RGB')
    resized = rgb.resize((image_size, image_size))
    return np.array(resized, dtype="float32")
    
# 2. augment function
    
def augment(im_array):
    """Randomly mirror an image array left-to-right.

    One uniform random number in [0, 1) is drawn per call; the array is
    flipped with ``np.fliplr`` only when that number exceeds 0.9 (roughly one
    call in ten). Otherwise the input is returned unchanged.

    Args:
        im_array: array with at least two dimensions (``np.fliplr`` reverses
            the second axis).

    Returns:
        The flipped view, or ``im_array`` itself when no flip is applied.
    """
    should_flip = np.random.uniform(0, 1) > 0.9
    return np.fliplr(im_array) if should_flip else im_array
    
# 3. gen function
    
def gen(triplet_gen):
    """Endlessly yield batches of image triplets.

    Each batch holds ``batch_size`` triplets drawn from
    ``triplet_gen.get_sample()``. Every file is read from ``path_train`` with
    ``read_and_resize``, passed through ``augment``, stacked into one array
    per role and run through ``preprocess_input``.

    Args:
        triplet_gen: object whose ``get_sample()`` returns
            ``(positive_1, negative, positive_2)`` file names.

    Yields:
        Tuple ``(inputs, None)`` where ``inputs`` is a dict with the keys
        ``'anchor_input'``, ``'positive_input'`` and ``'negative_input'``.
        The label is always ``None``; presumably the loss is computed inside
        the model -- confirm against the model definition.
    """
    def _load(file_name):
        # Resolve the file inside the training directory and read it.
        return read_and_resize(join(path_train, file_name))

    while True:
        anchors, negatives, positives = [], [], []

        for _ in range(batch_size):
            anchor_file, negative_file, positive_file = triplet_gen.get_sample()

            # Read all three images first, then augment them in the order
            # anchor, negative, positive.
            anchor_img = _load(anchor_file)
            negative_img = _load(negative_file)
            positive_img = _load(positive_file)

            anchors.append(augment(anchor_img))
            negatives.append(augment(negative_img))
            positives.append(augment(positive_img))

        inputs = {
            'anchor_input': preprocess_input(np.array(anchors)),
            'positive_input': preprocess_input(np.array(positives)),
            'negative_input': preprocess_input(np.array(negatives)),
        }

        yield (inputs, None)