In [1]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print every directory (but not the individual files) found beneath the Kaggle input root.
for current_dir, _subdirs, _files in os.walk('/kaggle/input'):
    print(current_dir)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input
/kaggle/input/happy-whale-and-dolphin
/kaggle/input/happy-whale-and-dolphin/train_images
/kaggle/input/happy-whale-and-dolphin/test_images


In [2]:
import gc
import glob
import os

import joblib
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from contextlib import contextmanager
from multiprocessing.pool import ThreadPool, Pool
from joblib import Parallel, delayed
import time

%matplotlib inline

In [3]:
CPUS = 4 # kaggle default - https://www.kaggle.com/product-feedback/64106
happywhale_input_dir = "/kaggle/input/happy-whale-and-dolphin"
train_csv_fn = "train.csv"

# Order by image file name and rebuild the index so row positions are deterministic.
# NOTE: reset_index() without drop=True keeps the previous index as an `index` column.
train_df = (
    pd.read_csv(os.path.join(happywhale_input_dir, train_csv_fn))
    .sort_values('image')
    .reset_index()
)

# Merge misspelled / alternate species labels into a single label each.
# The result is assigned back to the column on purpose: calling
# `train_df.species.replace(..., inplace=True)` is a chained in-place update,
# which pandas warns about and which leaves `train_df` untouched when
# Copy-on-Write is enabled.
train_df['species'] = train_df['species'].replace({
    "globis": "short_finned_pilot_whale",
    "pilot_whale": "short_finned_pilot_whale",
    "kiler_whale": "killer_whale",
    "bottlenose_dolpin": "bottlenose_dolphin",
})

In [4]:
# Report how many distinct values each column of train_df holds
# (a missing value, if present, counts as one distinct value).
for column_name in train_df.columns:
    distinct_count = train_df[column_name].nunique(dropna=False)
    print('number of {}: {}'.format(column_name, distinct_count))


number of index: 51033
number of image: 51033
number of species: 26
number of individual_id: 15587


In [5]:
from skimage.io import imread
from skimage.transform import resize
from skimage.color import rgb2gray, gray2rgb
import matplotlib.pyplot as plt

def convert_image_to_ndarray(img_path, output_size=(256,256), color='normalize_gray'):
    '''
    Read the image at `img_path` and convert it to a numpy array of size `output_size`.

    Parameters
    - img_path: path of the image file; handed to `imread`.
    - output_size: target shape handed to `resize`. When `None`, the image is
      returned exactly as read (no resizing, no colour handling, no aspect ratio).
    - color: one of
        - 'normalize_gray': grayscale image divided by its maximum value
        - 'raw_rgb': the resized image without any colour conversion

    Returns
    - when `output_size` is `None`: the image array as read, on its own
    - otherwise a tuple of
        1. the resized (and, for 'normalize_gray', grayscale + max-normalized) image
        2. the aspect ratio (width / height) of the original image

    Raises
    - ValueError: if `color` is not a supported option, or if the resized image
      is neither 2- nor 3-dimensional.
    '''
    supported_colors = ('normalize_gray', 'raw_rgb')
    # Fail fast on a bad option instead of silently returning None after the
    # expensive read + resize. The `output_size=None` shortcut never looked at
    # `color`, so it stays exempt.
    if output_size is not None and color not in supported_colors:
        raise ValueError(
            "Unsupported color option {!r}; expected one of {}".format(color, supported_colors))

    # read input
    im = imread(img_path)
    if output_size is None:
        return im

    # shape is (rows, cols, ...) so width / height is shape[1] / shape[0]
    aspect_ratio = im.shape[1] / im.shape[0]
    resized_im = resize(im, output_size)
    if color == 'raw_rgb':
        return resized_im, aspect_ratio

    # color == 'normalize_gray'
    if resized_im.ndim == 2:
        # a 2-dimensional array has no channel axis: it is already grayscale
        gray_img = resized_im
    elif resized_im.ndim == 3:
        # a 3-dimensional array carries a channel axis and is converted to grayscale
        gray_img = rgb2gray(resized_im)
    else:
        raise ValueError(
            "Expected a 2- or 3-dimensional image, got {} dimensions".format(resized_im.ndim))

    # Normalize by the brightest pixel. An all-black image has a maximum of 0;
    # dividing by it would fill the array with NaN, so it is returned unchanged.
    peak = np.max(gray_img)
    if peak > 0:
        gray_img = gray_img / peak
    return gray_img, aspect_ratio
        
def convert_ndarray_img_to_maintain_aspect_ratio(ndarray_img, ar):
    '''
    Stretch `ndarray_img` horizontally by the factor `ar` and return the resized image.

    Parameters
    - ndarray_img: image array whose first two axes are (rows, cols).
    - ar: aspect ratio, i.e. width / height; must be a positive number.

    Returns
    - the image resized to (rows, round(cols * ar)).

    Raises
    - ValueError: if `ar` is not a positive number.

    NOTE: the new width is derived from the current width, so the result has
    aspect ratio `ar` only when the input is square.
    '''
    if not ar > 0:  # written this way so that NaN is rejected as well
        raise ValueError("aspect ratio must be positive, got {!r}".format(ar))
    # Output shape entries are pixel counts: round to a whole number and keep at
    # least one column instead of handing a float dimension to `resize`.
    new_width = max(1, int(round(ndarray_img.shape[1] * ar)))
    new_ar_shape = (ndarray_img.shape[0], new_width)
    return resize(ndarray_img, new_ar_shape)


def get_img_path(img):
    '''
    Build the full path of an image file below `happywhale_input_dir`.

    `img` is a path relative to the input directory, e.g. 'train_images/' followed
    by a value from the `image` column of train.csv.
    '''
    return os.path.join(happywhale_input_dir, img)

In [6]:
# https://medium.com/codebyte/add-progress-bar-to-your-for-loops-5f0a50500ff3
from tqdm import tqdm

def pixelize_images_in_df(df, show_progress_bar=False, image_dir='train_images', n_pixels=256):
    '''
    Processes the dataframe from `train.csv` or `sample_submission.csv`
    1. converting each image to a normalized grayscale (n_pixels x n_pixels) pixel representation
    2. creating n_pixels*n_pixels + 1 additional features to the original dataframe
        - each feature is 1 pixel from step 1.
        - final feature is the aspect ratio of the original image

    Parameters
    - df: dataframe with an `image` column holding image file names.
    - show_progress_bar: wrap the loop in a tqdm progress bar when True.
    - image_dir: folder, relative to the input directory, that holds the files
      named in `df.image`. Defaults to 'train_images'; pass a different folder
      for frames whose images live elsewhere.
    - n_pixels: side length of the square each image is resized to.

    Returns
    - `df` joined (on its index) with one column per pixel, named 0 .. n_pixels*n_pixels - 1,
      plus an `aspect_ratio` column.
    '''
    images = df.image
    flat_ndarray_images = []
    aspect_ratios = []
    output_shape = (n_pixels, n_pixels)
    items = tqdm(images) if show_progress_bar else images
    for img in items:
        img_path = get_img_path(os.path.join(image_dir, img))
        ndarray, ar = convert_image_to_ndarray(img_path, output_size=output_shape)

        # flatten the 2-d pixel grid into a single feature row
        flat_ndarray_images.append(ndarray.reshape(n_pixels * n_pixels))
        aspect_ratios.append(ar)

    # construct final dataframe to return; reusing df.index keeps the rows aligned for the merge
    pixelized_images_df = pd.DataFrame(flat_ndarray_images, index=df.index)
    pixelized_images_df['aspect_ratio'] = aspect_ratios
    return pd.merge(df, pixelized_images_df, left_index=True, right_index=True)

In [7]:
# https://www.kaggle.com/wrosinski/parallel-data-processing-and-model-training

@contextmanager
def timer(name):
    '''
    Context manager that prints how long the wrapped block ran.

    `name` is the label placed at the start of the printed message. The message
    is printed when the block exits, including when it exits through an
    exception (the exception still propagates to the caller).
    '''
    # perf_counter is a monotonic clock meant for measuring intervals; unlike
    # time.time() it is unaffected by system clock adjustments.
    started = time.perf_counter()
    try:
        yield
    finally:
        print('{0} done in {1:.3f} seconds.'.format(name, time.perf_counter() - started))

def split_df(df, num_splits, log=False):
    '''
    Cut `df` into `num_splits` consecutive row chunks of near-equal size.

    The chunk boundaries are `num_splits + 1` evenly spaced row positions between
    0 and the number of rows, truncated to integers. When `log` is True the number
    of parts and the boundaries are printed.

    Returns a list of `df.iloc` slices, in row order.
    '''
    boundaries = np.linspace(0, df.shape[0], num_splits+1).astype(int)
    if log:
        print('Split into {} parts'.format(num_splits))
        print('Row splits:\n{}'.format(boundaries))

    # pair every boundary with its successor to get the (start, stop) of each chunk
    chunks = [
        df.iloc[start:stop]
        for start, stop in zip(boundaries[:-1], boundaries[1:])
    ]
    return chunks[:num_splits]

In [8]:
def pool_process_and_dump(df, output_path_from_batch_processed, timer_str, CPUS=4, log=False,
                          output_dir='/kaggle/working'):
    '''
    Pixelize the images of `df` in parallel and write the combined result to a CSV file.

    Parameters
    - df: dataframe accepted by `pixelize_images_in_df`.
    - output_path_from_batch_processed: path of the CSV file, relative to the
      `batch_processed` folder inside `output_dir`.
    - timer_str: label passed to `timer` for the processing step.
    - CPUS: number of chunks `df` is split into and number of worker processes.
    - log: forwarded to `split_df` to print the chunk boundaries.
    - output_dir: root folder under which `batch_processed` is created.

    Returns
    - the concatenation of the processed chunks (the same frame that is written to disk).
    '''
    df_splits = split_df(df, CPUS, log=log)
    with timer(timer_str):
        with Pool(processes=CPUS) as pool:
            dfs_proc = pool.map(pixelize_images_in_df, df_splits)
    processed_df = pd.concat(dfs_proc)

    batch_process_fn = os.path.join(output_dir, 'batch_processed',
                                    output_path_from_batch_processed)
    # Create the folder that will hold the file. exist_ok avoids the
    # check-then-create race, and taking the parent of the final path also
    # covers output paths that contain sub-folders.
    os.makedirs(os.path.dirname(batch_process_fn), exist_ok=True)
    processed_df.to_csv(batch_process_fn, index=False)
    return processed_df

What we want in our sampled data set

There is a tradeoff between:
- equal representation of each species, with at least 1 picture for each individual
- at least X pictures for each individual, regardless of species; X becomes the tradeoff variable

if we want all species and individuals,

If we want all individuals in each species to have at least X images, we will not have enough images for every species. For example, Fraser's dolphin.

In [9]:
# Number of images per (species, individual_id) pair, most frequent first,
# stored in a single `count` column.
individual_ids_by_species = (
    train_df[['species', 'individual_id']]
    .value_counts()
    .rename('count')
    .to_frame()
)

In [10]:

# Keep only the (species, individual_id) pairs with more than 5 images and turn
# the two index levels back into ordinary columns.
subsetted_individuals = (
    individual_ids_by_species
    .loc[lambda counts: counts['count'] > 5]
    .reset_index()
)
subsetted_individuals

Unnamed: 0,species,individual_id,count
0,minke_whale,37c7aba965a5,400
1,minke_whale,114207cab555,168
2,bottlenose_dolphin,a6e325d8e924,155
3,minke_whale,19fbb960f07d,154
4,bottlenose_dolphin,c995c043c353,153
...,...,...,...
1345,humpback_whale,4975b214e6dc,6
1346,beluga,cd5c42233201,6
1347,humpback_whale,aabc3fa15abf,6
1348,bottlenose_dolphin,045ca1b5a580,6


In [11]:
# For every selected individual (in the order of subsetted_individuals) take its
# first 5 rows of train_df, then stack the per-individual samples.
sampled_data = [
    train_df.loc[train_df['individual_id'] == iid].head(5)
    for iid in subsetted_individuals.individual_id
]
sampled_data_df = pd.concat(sampled_data)

In [12]:
# Show the sampled rows (at most 5 per selected individual) for a visual check.
sampled_data_df

Unnamed: 0,index,image,species,individual_id
48,48,0046ceef89b3fc.jpg,minke_whale,37c7aba965a5
62,62,005e53b1b6aada.jpg,minke_whale,37c7aba965a5
174,174,0106d276033b78.jpg,minke_whale,37c7aba965a5
226,226,014ba64e8ce8ec.jpg,minke_whale,37c7aba965a5
253,253,01637f0b588ed8.jpg,minke_whale,37c7aba965a5
...,...,...,...,...
8280,8280,29778c02fc2154.jpg,humpback_whale,5fb723f0717e
9576,9576,30355c9b68dde3.jpg,humpback_whale,5fb723f0717e
12059,12059,3cfb490bff6e3b.jpg,humpback_whale,5fb723f0717e
29928,29928,96826b9a212fdd.jpg,humpback_whale,5fb723f0717e


In [13]:
# Pixelize the sampled images in parallel and write them to a CSV file.
# Long-running: the recorded run of this cell took about 3924 seconds.
pool_process_and_dump(
    df=sampled_data_df,
    output_path_from_batch_processed='processed_batch_df_individuals_over_5_pics.csv',
    timer_str='pool pixelize images processing all species df',
)

pool pixelize images processing all species df done in 3924.392 seconds.


Unnamed: 0,index,image,species,individual_id,0,1,2,3,4,5,...,65527,65528,65529,65530,65531,65532,65533,65534,65535,aspect_ratio
48,48,0046ceef89b3fc.jpg,minke_whale,37c7aba965a5,0.577004,0.578429,0.575466,0.566464,0.569012,0.566929,...,0.772681,0.775807,0.781642,0.790128,0.806353,0.829458,0.847465,0.863127,0.873118,1.500208
62,62,005e53b1b6aada.jpg,minke_whale,37c7aba965a5,0.965076,0.968856,0.968865,0.973752,0.973024,0.972196,...,0.681208,0.680034,0.689564,0.685673,0.686724,0.682054,0.680568,0.677896,0.675313,1.500208
174,174,0106d276033b78.jpg,minke_whale,37c7aba965a5,0.830182,0.833412,0.832983,0.835845,0.839813,0.839705,...,0.761637,0.758382,0.760516,0.764138,0.766854,0.769477,0.766741,0.753012,0.740826,1.500208
226,226,014ba64e8ce8ec.jpg,minke_whale,37c7aba965a5,0.941528,0.957661,0.964716,0.966203,0.959400,0.959557,...,0.506930,0.514200,0.539230,0.579651,0.653189,0.702384,0.711095,0.705012,0.693893,1.500208
253,253,01637f0b588ed8.jpg,minke_whale,37c7aba965a5,0.964056,0.966930,0.972216,0.972371,0.973288,0.975568,...,0.701135,0.692862,0.693875,0.693232,0.676420,0.657260,0.645715,0.648181,0.662060,1.500208
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8280,8280,29778c02fc2154.jpg,humpback_whale,5fb723f0717e,0.596735,0.609790,0.628049,0.623233,0.651835,0.654167,...,0.812787,0.773042,0.645310,0.485793,0.542528,0.647493,0.750273,0.745116,0.678813,4.141542
9576,9576,30355c9b68dde3.jpg,humpback_whale,5fb723f0717e,0.383635,0.431643,0.394578,0.349969,0.332534,0.359629,...,0.309050,0.277381,0.249283,0.276766,0.344834,0.298137,0.232686,0.224702,0.190208,2.044463
12059,12059,3cfb490bff6e3b.jpg,humpback_whale,5fb723f0717e,0.384261,0.341843,0.322892,0.311288,0.347934,0.439466,...,0.654368,0.653795,0.641204,0.634613,0.613434,0.611588,0.606349,0.595877,0.591314,4.373178
29928,29928,96826b9a212fdd.jpg,humpback_whale,5fb723f0717e,0.380415,0.353658,0.378988,0.437415,0.497482,0.472076,...,0.530661,0.501390,0.482982,0.483813,0.515366,0.543902,0.540436,0.512015,0.509161,4.916667


In [14]:
# min(train_df.species.value_counts())
# Simple training set will be composed of 14 of each species with pairs of each individual id
# all_species_training_sets = []
# n_each_species = min(train_df.species.value_counts())
# print('Using {} of each species'.format(n_each_species))
# for s in train_df.species.unique():
#     sample = train_df[train_df.species == s].head(n_each_species) # use head to be deterministic for sample
#     all_species_training_sets.append(sample)
    
# all_species_train_df = pd.concat(all_species_training_sets)
# pool_process_and_dump(all_species_train_df,
#                       'processed_batch_df_all_species.csv',
#                       'pool pixelize images processing all species df')

# sample up to 1000 for training set
# remaining_training_set = 1000 - len(all_species_train_df)
# batch_size = 40

# left_off_on_n = 0
# continue_until_n = remaining_training_set # None # changing this to None will use the entire train_df
# small_training_df = all_species_train_df.copy()
# while left_off_on_n < continue_until_n:
#     next_batch_index = left_off_on_n+batch_size
    
#     sampled_df = train_df.sample(batch_size, random_state=left_off_on_n)
#     pool_process_and_dump(sampled_df,
#                       'processed_batch_sampled_df_random_state_{}.csv'.format(left_off_on_n),
#                       'pool pixelize images processing sampled batch')
#     small_training_df = pd.concat([small_training_df, sampled_df])
#     left_off_on_n = next_batch_index


In [15]:
# len(small_training_df['individual_id'].unique()), small_training_df.shape

In [16]:
# from sklearn.model_selection import train_test_split

In [17]:
# x_cols = []

# X_train, X_test, y_train, y_test = train_test_split(small_training_df[x_cols], small_training_df['individual_id'], test_size=.2, random_state=2022)



In [18]:
# # building a linear stack of layers with the sequential model
# model = Sequential()
# # convolutional layer
# model.add(Conv2D(25, kernel_size=(3,3), strides=(1,1), padding='valid', activation='relu', input_shape=(28,28,1)))
# model.add(MaxPool2D(pool_size=(1,1)))
# # flatten output of conv
# model.add(Flatten())
# # hidden layer
# model.add(Dense(100, activation='relu'))
# # output layer
# model.add(Dense(10, activation='softmax'))

# # compiling the sequential model
# model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')

# # training the model for 10 epochs
# model.fit(X_train, Y_train, batch_size=128, epochs=10, validation_data=(X_test, Y_test))