In [None]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print every directory found under the Kaggle input mount.
# Only directory names are shown; the per-file listing is left disabled.
for walk_entry in os.walk('/kaggle/input'):
    print(walk_entry[0])

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import gc
import glob
import os

import joblib
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from contextlib import contextmanager
from multiprocessing.pool import ThreadPool, Pool
from joblib import Parallel, delayed
import time

%matplotlib inline

In [None]:
CPUS = 4 # kaggle default - https://www.kaggle.com/product-feedback/64106
happywhale_input_dir = "/kaggle/input/happy-whale-and-dolphin"
train_csv_fn = "train.csv"
# Sort by image file name and rebuild the index so row positions are deterministic.
# NOTE: reset_index() is called without drop=True, so the pre-sort row labels
# are kept in an extra `index` column.
train_df = pd.read_csv(os.path.join(happywhale_input_dir, train_csv_fn)).sort_values('image').reset_index()

# Map misspelled / alternative species labels onto a single label each.
# The result is assigned back to the column: calling replace(inplace=True) on
# `train_df.species` operates on an intermediate Series, which pandas warns
# about and which does not update `train_df` under Copy-on-Write.
train_df['species'] = train_df['species'].replace({"globis": "short_finned_pilot_whale",
                                                   "pilot_whale": "short_finned_pilot_whale",
                                                   "kiler_whale": "killer_whale",
                                                   "bottlenose_dolpin": "bottlenose_dolphin"})

In [None]:
# Report how many distinct values (NaN counted as a value) each column holds.
for column_name in train_df.columns:
    distinct_values = train_df[column_name].unique()
    print('number of {}: {}'.format(column_name, len(distinct_values)))


In [None]:
from skimage.io import imread
from skimage.transform import resize
from skimage.color import rgb2gray, gray2rgb
import matplotlib.pyplot as plt

def convert_image_to_ndarray(img_path, output_size=(256,256), color='normalize_gray'):
    '''
    Read the image at `img_path` and convert it to a numpy array of size `output_size`.

    Parameters
    ----------
    img_path : path of the image file, handed to `imread`.
    output_size : target shape handed to `resize`, or None to skip all processing.
    color : one of
        - 'normalize_gray': grayscale image divided by its own maximum value
        - 'raw_rgb': the resized image exactly as returned by `resize`

    Returns
    -------
    - If `output_size` is None: the image as read (a single array, NOT a tuple).
    - Otherwise: a tuple `(image, aspect_ratio)` where `aspect_ratio` is
      width / height of the image as read.

    Raises
    ------
    ValueError
        If `color` is not a supported option, or the resized image is neither
        2-D nor 3-D (previously these cases silently returned None).
    '''
    # read input
    im = imread(img_path)
    if output_size is None:
        return im

    # Fail loudly (and before the resize work) instead of falling off the end
    # of the function and returning None.
    if color not in ('raw_rgb', 'normalize_gray'):
        raise ValueError(
            "unsupported color option {!r}; expected 'normalize_gray' or 'raw_rgb'".format(color))

    aspect_ratio = float(im.shape[1] / im.shape[0])
    resized_im = resize(im, output_size)
    if color == 'raw_rgb':
        return resized_im, aspect_ratio

    # color == 'normalize_gray'
    if resized_im.ndim == 2:
        # a 2-D array has no channel axis, so it is already grayscale
        gray_img = resized_im
    elif resized_im.ndim == 3:
        # a 3-D array carries a channel axis and needs converting to grayscale
        # NOTE(review): rgb2gray may reject a channel axis that is not RGB
        # (e.g. RGBA) depending on the installed skimage version - confirm.
        gray_img = rgb2gray(resized_im)
    else:
        raise ValueError(
            'expected a 2-D or 3-D image, got {} dimensions'.format(resized_im.ndim))

    peak = np.max(gray_img)
    if peak == 0:
        # An all-zero image would otherwise be turned into NaNs by 0 / 0.
        return gray_img, aspect_ratio
    return gray_img / peak, aspect_ratio
        
def convert_ndarray_img_to_maintain_aspect_ratio(ndarray_img, ar):
    '''
    Resize `ndarray_img` horizontally so that its width / height equals `ar`.

    Parameters
    ----------
    ndarray_img : image array whose first two axes are (rows, cols).
    ar : target aspect ratio, defined as width / height.

    Returns
    -------
    The output of `resize` with the original number of rows and
    `round(rows * ar)` columns (at least 1).
    '''
    height = ndarray_img.shape[0]
    # width = height * aspect ratio. For a square input this equals the previous
    # `width * ar`; for a non-square input only the height-based form yields `ar`.
    # The value is rounded to an int because an output shape is a pixel count;
    # how `resize` treats a fractional size depends on the skimage version.
    new_width = max(1, int(round(height * ar)))
    new_ar_shape = (height, new_width)
    return resize(ndarray_img, new_ar_shape)


# Build the full path of a file that lives inside the competition input directory.
# Callers pass a path relative to that directory, e.g. a sub-folder plus a value
# taken from the `image` column of train.csv.
def get_img_path(img):
    '''Return `img` joined onto `happywhale_input_dir`.'''
    return os.path.join(happywhale_input_dir, img)

In [None]:
# https://medium.com/codebyte/add-progress-bar-to-your-for-loops-5f0a50500ff3
from tqdm import tqdm

def pixelize_images_in_df(df, show_progress_bar=False, image_dir='train_images', n_pixels=256):
    '''
    Append a flattened grayscale pixel representation of every image to `df`.

    Processes a dataframe that has an `image` column (such as the one read
    from `train.csv`):
    1. each image is converted with `convert_image_to_ndarray` to an
       (n_pixels x n_pixels) array using that function's default color mode
    2. n_pixels*n_pixels + 1 features are added to the original dataframe
        - one column per pixel from step 1 (columns named 0 .. n_pixels*n_pixels - 1)
        - a final `aspect_ratio` column with the aspect ratio of the original image

    Parameters
    ----------
    df : dataframe with an `image` column holding image file names.
    show_progress_bar : wrap the iteration in `tqdm` when True.
    image_dir : folder (relative to the competition input directory) that holds
        the images. Defaults to 'train_images'; pass another folder to process
        a frame whose images live elsewhere.
    n_pixels : side length of the square every image is resized to.

    Returns
    -------
    `df` merged on its index with the pixel and `aspect_ratio` columns.
    '''
    images = df.image
    flat_ndarray_images = []
    aspect_ratios = []
    output_shape = (n_pixels, n_pixels)
    items = tqdm(images) if show_progress_bar else images
    for img in items:
        img_path = get_img_path(os.path.join(image_dir, img))
        ndarray, ar = convert_image_to_ndarray(img_path, output_size=output_shape)

        # one row per image: the 2-D pixel grid flattened to 1-D
        flat_ndarray_images.append(ndarray.reshape(n_pixels*n_pixels))
        aspect_ratios.append(ar)

    # construct final dataframe to return; it shares df's index so the merge
    # below lines every pixel row up with its source row
    pixelized_images_df = pd.DataFrame(flat_ndarray_images, index=df.index)
    pixelized_images_df['aspect_ratio'] = aspect_ratios
    return pd.merge(df, pixelized_images_df, left_index=True, right_index=True)

In [None]:
# https://www.kaggle.com/wrosinski/parallel-data-processing-and-model-training

@contextmanager
def timer(name):
    '''
    Context manager that prints how long the wrapped block took.

    Parameters
    ----------
    name : label placed at the start of the printed message.

    The message is printed when the block exits, including when it exits by
    raising; the exception then propagates unchanged.
    '''
    # perf_counter is monotonic, so the measured interval cannot be distorted
    # by system clock adjustments.
    t0 = time.perf_counter()
    try:
        yield
    finally:
        print('{0} done in {1:.3f} seconds.'.format(name, time.perf_counter() - t0))

def split_df(df, num_splits, log=False):
    '''
    Cut `df` into `num_splits` consecutive row slices of near-equal size.

    The slice boundaries are `num_splits + 1` evenly spaced positions between
    0 and the number of rows, truncated to integers. When `log` is True the
    number of parts and the boundaries are printed.

    Returns a list of dataframes (positional `iloc` slices of `df`).
    '''
    boundaries = np.linspace(0, df.shape[0], num_splits + 1).astype(int)

    if log:
        print('Split into {} parts'.format(num_splits))
        print('Row splits:\n{}'.format(boundaries))

    # pair each boundary with the next one: (b0, b1), (b1, b2), ...
    return [
        df.iloc[start:stop]
        for start, stop in zip(boundaries[:-1], boundaries[1:])
    ]

In [None]:
def pool_process_and_dump(df, output_path_from_batch_processed, timer_str, CPUS=4, log=False,
                          output_dir='/kaggle/working'):
    '''
    Pixelize `df` in parallel, write the result to CSV and return it.

    Parameters
    ----------
    df : dataframe accepted by `pixelize_images_in_df`.
    output_path_from_batch_processed : file name of the CSV, relative to the
        `batch_processed` folder inside `output_dir`.
    timer_str : label printed by `timer` around the parallel step.
    CPUS : number of row splits and of worker processes.
    log : forwarded to `split_df` to print the split boundaries.
    output_dir : directory under which `batch_processed/` is created.

    Returns
    -------
    The concatenation of the per-split results of `pixelize_images_in_df`.
    '''
    df_splits = split_df(df, CPUS, log=log)
    with timer(timer_str):
        with Pool(processes=CPUS) as pool:
            # map() returns results in the order of df_splits, so the
            # concatenation below keeps the row order of `df`
            dfs_proc = pool.map(pixelize_images_in_df, df_splits)
    processed_df = pd.concat(dfs_proc)

    batch_process_dir = os.path.join(output_dir, 'batch_processed')
    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(batch_process_dir, exist_ok=True)
    batch_process_fn = os.path.join(batch_process_dir,
                            output_path_from_batch_processed)
    processed_df.to_csv(batch_process_fn, index=False)
    return processed_df

What we want in our sampled data set

There is a tradeoff between:
- equal representation of each species, with at least one picture of each individual
- at least X pictures of each individual regardless of species, where X is the tradeoff variable

if we want all species and individuals,

If we require every individual in each species to have at least X images, some species will not have enough images left, for example Fraser's dolphin.

In [None]:
# Count the rows (images) for every (species, individual_id) pair; the result
# is indexed by that pair and has a single `count` column.
individual_ids_by_species = (
    train_df[['species', 'individual_id']]
    .value_counts()
    .rename('count')
    .to_frame()
)

In [None]:

# Keep only the (species, individual_id) pairs that have more than 5 images,
# then turn the index levels back into ordinary columns.
subsetted_individuals = (
    individual_ids_by_species
    .loc[lambda counts: counts['count'] > 5]
    .reset_index()
)
subsetted_individuals

In [None]:
# For every retained individual take its first 5 rows of train_df (in train_df
# row order), then stack the per-individual samples in the order the
# individuals appear in subsetted_individuals.
sampled_data = [
    train_df[train_df['individual_id'] == iid].iloc[:5]
    for iid in subsetted_individuals.individual_id
]
sampled_data_df = pd.concat(sampled_data)

In [None]:
# Display the sampled rows (up to 5 per retained individual) for inspection.
sampled_data_df

In [None]:
# Pixelize the sampled images in parallel and dump them to CSV. The result is
# kept in a variable so it is not lost, and the timer label names this run
# (the previous label was left over from the "all species" run).
processed_sampled_df = pool_process_and_dump(sampled_data_df,
                      'processed_batch_df_individuals_over_5_pics.csv',
                      'pool pixelize images processing individuals over 5 pics df')
# Preview only the first rows instead of rendering the whole frame.
processed_sampled_df.head()

In [None]:
# min(train_df.species.value_counts())
# Simple training set will be composed of 14 of each species with pairs of each individual id
# all_species_training_sets = []
# n_each_species = min(train_df.species.value_counts())
# print('Using {} of each species'.format(n_each_species))
# for s in train_df.species.unique():
#     sample = train_df[train_df.species == s].head(n_each_species) # use head to be deterministic for sample
#     all_species_training_sets.append(sample)
    
# all_species_train_df = pd.concat(all_species_training_sets)
# pool_process_and_dump(all_species_train_df,
#                       'processed_batch_df_all_species.csv',
#                       'pool pixelize images processing all species df')

# sample up to 1000 for training set
# remaining_training_set = 1000 - len(all_species_train_df)
# batch_size = 40

# left_off_on_n = 0
# continue_until_n = remaining_training_set # None # changing this to None will use the entire train_df
# small_training_df = all_species_train_df.copy()
# while left_off_on_n < continue_until_n:
#     next_batch_index = left_off_on_n+batch_size
    
#     sampled_df = train_df.sample(batch_size, random_state=left_off_on_n)
#     pool_process_and_dump(sampled_df,
#                       'processed_batch_sampled_df_random_state_{}.csv'.format(left_off_on_n),
#                       'pool pixelize images processing sampled batch')
#     small_training_df = pd.concat([small_training_df, sampled_df])
#     left_off_on_n = next_batch_index


In [None]:
# len(small_training_df['individual_id'].unique()), small_training_df.shape

In [None]:
# from sklearn.model_selection import train_test_split

In [None]:
# x_cols = []

# X_train, X_test, y_train, y_test = train_test_split(small_training_df[x_cols], small_training_df['individual_id'], test_size=.2, random_state=2022)



In [None]:
# # building a linear stack of layers with the sequential model
# model = Sequential()
# # convolutional layer
# model.add(Conv2D(25, kernel_size=(3,3), strides=(1,1), padding='valid', activation='relu', input_shape=(28,28,1)))
# model.add(MaxPool2D(pool_size=(1,1)))
# # flatten output of conv
# model.add(Flatten())
# # hidden layer
# model.add(Dense(100, activation='relu'))
# # output layer
# model.add(Dense(10, activation='softmax'))

# # compiling the sequential model
# model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')

# # training the model for 10 epochs
# model.fit(X_train, Y_train, batch_size=128, epochs=10, validation_data=(X_test, Y_test))