In [1]:
from pathlib import Path
from multiprocessing import Pool
import multiprocessing

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from mpl_toolkits.axes_grid1 import ImageGrid
from PIL import Image, ImageStat
from tqdm.notebook import tqdm
import seaborn as sns

# Top Solutions from Previous Competitions 🏆

The top solutions from the previous Humpback Whale Identification competition are as follows:

* [1st solution(classification) && code](https://www.kaggle.com/c/humpback-whale-identification/discussion/82366)
* [2nd place code, end to end whale Identification model](https://www.kaggle.com/c/humpback-whale-identification/discussion/83885)
* [3rd place solution with code: ArcFace](https://www.kaggle.com/c/humpback-whale-identification/discussion/82484)
* [4th Place Solution: SIFT + Siamese](https://www.kaggle.com/c/humpback-whale-identification/discussion/82356)
* [5th solution blog post + code](https://www.kaggle.com/c/humpback-whale-identification/discussion/82369)

<a id="load-dataset"></a>

# Load and Preprocess Dataset ⌛

In [2]:
train_df = pd.read_csv('../input/happy-whale-and-dolphin/train.csv')
test_df = pd.read_csv('../input/happy-whale-and-dolphin/sample_submission.csv')

In [3]:
SAMPLE = None

In [4]:
# Optionally shrink both frames for quick iteration.
# The sample size is capped at each frame's length because DataFrame.sample
# raises ValueError when asked for more rows than exist (without replacement),
# and the two CSVs need not be the same size. A fixed seed keeps the subset
# stable across re-runs.
if SAMPLE:
    train_df = train_df.sample(min(SAMPLE, len(train_df)), random_state=0)
    test_df = test_df.sample(min(SAMPLE, len(test_df)), random_state=0)

In [5]:
train_df.head()

I'm going to calculate a bunch of stats about the images which will make the analysis quick later on.

The preprocessing is saved to a file so we can load it again later.

In [6]:
from multiprocessing import Pool
from functools import partial

def _do_image(image_id, dataset):
    """Compute size, mode and per-channel pixel statistics for one image.

    Args:
        image_id: file name of the image inside the dataset's image folder.
        dataset: dataset prefix used to build the folder name
            (``'{dataset}_images'``), e.g. ``'train'`` or ``'test'``.

    Returns:
        A tuple ``(image_id, width, height, area, mean, mode,
        min_0, max_0, min_1, max_1, min_2, max_2,
        avg_0, avg_1, avg_2, std_0, std_1, std_2)`` where ``area`` is
        ``width * height`` in millions of pixels and ``mean`` is the average of
        the three channel means.

        For images with fewer than three bands, the first band is used for
        ``min_0``/``max_0`` and for all three ``avg_*``/``std_*`` values, while
        the ``min``/``max`` of channels 1 and 2 are reported as 0. For images
        with more than three bands only the first three are reported.
    """
    image_path = Path(f'../input/happy-whale-and-dolphin/{dataset}_images')/image_id

    # Image.open keeps the underlying file open; the context manager releases
    # it as soon as the statistics have been read, which matters when this
    # runs over the whole dataset inside a worker pool.
    with Image.open(image_path) as image:
        width, height = image.size
        mode = image.mode
        n_bands = len(image.getbands())
        stat = ImageStat.Stat(image)
        channel_means = list(stat.mean)
        channel_stds = list(stat.stddev)
        min_max_channels = image.getextrema()

    # getextrema() returns a flat (min, max) pair for single-band images and
    # one (min, max) pair per band otherwise. Decide on the band count rather
    # than len(extrema): a two-band image also yields a length-2 result.
    if n_bands == 1:
        band_extrema = [min_max_channels]
    else:
        band_extrema = list(min_max_channels)

    if n_bands < 3:
        min_0, max_0 = band_extrema[0]
        min_1 = max_1 = min_2 = max_2 = 0
        avg_0 = avg_1 = avg_2 = channel_means[0]
        std_0 = std_1 = std_2 = channel_stds[0]
    else:
        # Slice to three bands so that images with an extra band do not break
        # the unpacking.
        (min_0, max_0), (min_1, max_1), (min_2, max_2) = band_extrema[:3]
        avg_0, avg_1, avg_2 = channel_means[:3]
        std_0, std_1, std_2 = channel_stds[:3]

    area = (width * height) / 1_000_000
    mean = (avg_0 + avg_1 + avg_2) / 3

    return (
        image_id, width, height, area, mean, mode,
        min_0, max_0, min_1, max_1, min_2, max_2,
        avg_0, avg_1, avg_2, std_0, std_1, std_2
    )

def get_image_stats(image_ids, dataset):
    """Collect per-image statistics for ``image_ids`` using a process pool.

    Args:
        image_ids: sized iterable of image file names.
        dataset: dataset prefix forwarded to ``_do_image``.

    Returns:
        A DataFrame with one row per image, in input order, holding the values
        returned by ``_do_image``.
    """
    stat_columns = [
        'image', 'width', 'height', 'area', 'mean_px', 'mode',
        'min_px_0', 'max_px_0', 'min_px_1', 'max_px_1', 'min_px_2', 'max_px_2',
        'avg_px_0', 'avg_px_1', 'avg_px_2', 'std_px_0', 'std_px_1', 'std_px_2'
    ]
    # Bind the dataset once so the pool only has to ship image ids to workers.
    worker = partial(_do_image, dataset=dataset)

    with Pool(multiprocessing.cpu_count()) as pool:
        # imap yields results in input order, so the progress bar advances as
        # each image completes.
        rows = list(tqdm(pool.imap(worker, image_ids), total=len(image_ids)))

    return pd.DataFrame(rows, columns=stat_columns)

In [None]:
train_image_stats = get_image_stats(train_df.image, 'train')
test_image_stats = get_image_stats(test_df.image, 'test')

In [None]:
train_df_stats = train_df.merge(train_image_stats, on='image')
test_df_stats = test_df.merge(test_image_stats, on='image')

In [None]:
train_df_stats.to_csv('train_stats.csv', index=False)
test_df_stats.to_csv('test_stats.csv', index=False)

In [None]:
train_df_stats.head()

<a id="visualise-examples"></a>

# Visualise Some Examples 👀

In [None]:
def image_grid(images, nrows_ncols, title=None, figsize=(16, 5)):
    """Show ``images`` in a grid of ``nrows_ncols`` cells.

    Args:
        images: iterable of objects accepted by ``Axes.imshow``.
        nrows_ncols: ``(rows, columns)`` of the grid.
        title: optional figure title.
        figsize: figure size passed to ``plt.figure``.

    Images are paired with grid cells in order. If there are more images than
    cells, the surplus images are not drawn.
    """
    fig = plt.figure(figsize=figsize)
    if title:
        # Use a figure-level title: plt.title() on a figure without axes would
        # implicitly create a full-size Axes underneath the grid.
        fig.suptitle(title)

    grid = ImageGrid(fig, 111, nrows_ncols=nrows_ncols, axes_pad=0.1)

    for ax, im in zip(grid, images):
        ax.imshow(im)

    plt.show()


def load_images(image_ids, resize=(128, 128), dataset='train'):
    """Load images by file name from a dataset's image folder.

    Args:
        image_ids: iterable of image file names.
        resize: ``(width, height)`` to resize every image to, or a falsy value
            to keep the original size.
        dataset: dataset prefix used to build the folder name
            (``'{dataset}_images'``). Defaults to ``'train'``.

    Returns:
        A list of in-memory PIL images, in the order of ``image_ids``.
    """
    image_dir = Path(f'../input/happy-whale-and-dolphin/{dataset}_images')

    output = []
    for i in image_ids:
        # Image.open keeps the file open until the pixel data is read.
        # resize() and copy() both load the data and return a new image, so
        # the file can be closed right away instead of leaking one handle per
        # image.
        with Image.open(image_dir/i) as img:
            output.append(img.resize(resize) if resize else img.copy())

    return output

In [None]:
image_ids = list(train_df_stats.sample(15).image)
images = load_images(image_ids)
image_grid(images, nrows_ncols=(3, 5), figsize=(20, 15))

<a id="dataset-size"></a>

# Dataset Sizes 📏

In [None]:
size_df = pd.DataFrame([(len(train_df_stats), 'train'), (len(test_df_stats), 'test')], columns=['size', 'dataset'])
ax = sns.barplot(x=size_df.dataset, y=size_df['size'])
ax.bar_label(ax.containers[0])
plt.show()

In [None]:
print(f'Train set size: {len(train_df_stats)}, Test set size: {len(test_df_stats)}')

# Metadata Columns 🏛️

In [None]:
train_df.columns

<a id="species-column"></a>

# Species Column 🐋

## Unique species before cleaning

In [None]:
train_df_stats.species.unique()

In [None]:
train_df_stats.species.nunique()

<a id="specie-label-cleaning"></a>

## Species label cleaning

[This notebook](https://www.kaggle.com/kwentar/what-about-species) investigated which species labels appear to be misspelled, so I'll build on that work to clean up the species labels.

In [None]:
# Map misspelled / alternative labels onto their canonical species names.
# The result is assigned back to the column explicitly: calling
# replace(..., inplace=True) on `train_df_stats.species` is chained assignment,
# which pandas warns about and which leaves the frame unchanged when
# Copy-on-Write is active.
train_df_stats['species'] = train_df_stats['species'].replace({
    "globis": "short_finned_pilot_whale",
    "pilot_whale": "short_finned_pilot_whale",
    "kiler_whale": "killer_whale",
    "bottlenose_dolpin": "bottlenose_dolphin",
    "beluga": "beluga_whale"
})

## Unique species after cleaning

In [None]:
train_df_stats.species.nunique()

<a id="species-per-class"></a>

## Species Per Class

In another notebook, I trained a 94% accurate model to predict the species of each test image, which gives an estimate of the test set species distribution.

In [None]:
test_species = pd.read_csv('../input/happywhale-what-species-are-in-the-test-set/test_species.csv')

In [None]:
test_species.head()

In [None]:
train_val_count = train_df_stats.species.value_counts()
test_val_count = test_species.species_pred.value_counts()

fig, (ax, ax2) = plt.subplots(ncols=2, figsize=(15,8))

# One (axes, counts, title) entry per panel so both charts are drawn by the
# same code.
panels = (
    (ax, train_val_count, 'Train species dist'),
    (ax2, test_val_count, 'Test species dist (estimate)'),
)
for panel_ax, counts, panel_title in panels:
    sns.barplot(x=counts.index, y=counts.values, ax=panel_ax)
    panel_ax.set_title(panel_title)
    # Rotate the existing tick labels instead of re-setting them as fixed text.
    panel_ax.tick_params(axis='x', labelrotation=90)
plt.show()

I'll add a column that tells us whether it's a whale or dolphin.

Each species name appears to end with its base type (`whale` or `dolphin`), so the suffix can be used as the label.

<a id="base-specie-label"></a>

### Add base species label

In [None]:
# Derive the label from the species column of the same frame. The raw
# `train_df.species` still contains the misspelled 'bottlenose_dolpin', which
# does not end with 'dolphin' and would be labelled as a whale; using one frame
# also avoids relying on two frames sharing the same row order.
train_df_stats['base_species'] = np.where(
    train_df_stats['species'].str.endswith('dolphin'), 'dolphin', 'whale'
)

In [None]:
train_df_stats[train_df_stats['base_species'] == 'whale'].species.unique()

In [None]:
train_df_stats[train_df_stats['base_species'] == 'dolphin'].species.unique()

Apparently False Killer Whale and Killer Whale are actually considered dolphins!

https://en.wikipedia.org/wiki/Orca
https://en.wikipedia.org/wiki/False_killer_whale

For now, I will leave them in the whale class. They certainly look like Whales to me!

<a id="visualise-examples-of-species"></a>

# Visualise examples of species 📷

## Whales

In [None]:
whale_ids = list(train_df_stats.query('base_species == "whale"').sample(10).image)
whale_images = load_images(whale_ids)
image_grid(whale_images, nrows_ncols=(2, 5), figsize=(18, 8), title='Whale images')

## Dolphins

In [None]:
dolphin_ids = list(train_df_stats.query('base_species == "dolphin"').sample(10).image)
dolphin_images = load_images(dolphin_ids)
image_grid(dolphin_images, nrows_ncols=(2, 5), figsize=(18, 8), title='Dolphin images')

## Examples of Top 5 Species

## Beluga Whale

In [None]:
image_ids = list(train_df_stats.query('species == "beluga_whale"').sample(10).image)
images = load_images(image_ids)
image_grid(images, nrows_ncols=(2, 5), figsize=(18, 8), title='Beluga images')

## Bottlenose Dolphin

In [None]:
image_ids = list(train_df_stats.query('species == "bottlenose_dolphin"').sample(10).image)
images = load_images(image_ids)
image_grid(images, nrows_ncols=(2, 5), figsize=(18, 8), title='Bottlenose Dolphin images')

## Humpback Whale

In [None]:
image_ids = list(train_df_stats.query('species == "humpback_whale"').sample(10).image)
images = load_images(image_ids)
image_grid(images, nrows_ncols=(2, 5), figsize=(18, 8), title='Humpback Whale images')

## Blue Whale

In [None]:
image_ids = list(train_df_stats.query('species == "blue_whale"').sample(10).image)
images = load_images(image_ids)
image_grid(images, nrows_ncols=(2, 5), figsize=(18, 8), title='Blue Whale images')

The labels here seem like they might be a leak. Something to investigate!

## Killer Whale

In [None]:
image_ids = list(train_df_stats.query('species == "killer_whale"').sample(10).image)
images = load_images(image_ids)
image_grid(images, nrows_ncols=(2, 5), figsize=(18, 8), title='Killer Whale images')

My takeaways from that:
  * The dataset is mostly fin images.
  * There appear to be a lot of black and white images.
  * Some of the images have text that should be investigated for leaks.

<a id="dolphins-vs-whales"></a>

# How Many Dolphins vs Whales? 🆚

In [None]:
whale_count = np.sum(train_df_stats.base_species == 'whale')
dolphin_count = np.sum(train_df_stats.base_species == 'dolphin')

count_df = pd.DataFrame([(whale_count, 'whale'), (dolphin_count, 'dolphin')], columns=['number', 'species'])
ax = sns.barplot(x=count_df.species, y=count_df.number)
ax.bar_label(ax.containers[0])
plt.show()

<a id="how-many-unique"></a>

# Individual Id Column 😎

## How many unique?

In [None]:
train_df_stats.individual_id.nunique()

<a id="how-many-individual-whales-vs-dolphins"></a>
## How many unique whales vs dolphins?

In [None]:
nunique_whales = train_df_stats[train_df_stats.base_species == 'whale'].individual_id.nunique()
nunique_dolphins = train_df_stats[train_df_stats.base_species == 'dolphin'].individual_id.nunique()

unique_df = pd.DataFrame([(nunique_whales, 'whale'), (nunique_dolphins, 'dolphin')], columns=['unique_number', 'species'])
ax = sns.barplot(x=unique_df.species, y=unique_df.unique_number)
ax.bar_label(ax.containers[0])
plt.show()

<a id="unique-id-distribution"></a>

### What is the unique id distribution?

In [None]:
fig, ax = plt.subplots()
individual_ids = train_df_stats.individual_id
# Count ids within a random subset of at most 1000 rows. The size is capped at
# the number of available rows because Series.sample raises ValueError when
# asked for more rows than exist (e.g. when SAMPLE is small).
most_common = individual_ids.sample(min(1000, len(individual_ids))).value_counts()[:125]
most_common.plot(kind='bar', figsize=(20,8), title='Individual ids', ax=ax)
# Shorten the long ids so the tick labels stay readable.
ax.set_xticklabels([i[:5]+'...' for i in most_common.index])
plt.show()

In [None]:
# Number of images per individual, indexed by id, in a column named
# `individual_id` (the name the following cells filter on). The column is
# named explicitly because pandas >= 2.0 names the value_counts() result
# 'count' and moves the original name to the index.
id_counts = (
    train_df_stats.individual_id.value_counts()
    .rename('individual_id')
    .rename_axis(None)
    .to_frame()
)

Clearly there are many whales/dolphins with only one example. Let's find those.

<a id="unique-id-only-one"></a>

### How many individual ids have only one example in the train set?

In [None]:
len(id_counts[id_counts.individual_id == 1])

Seems like a lot!

Let's see some.

In [None]:
ind_id_set = set(id_counts[id_counts.individual_id == 1].index)
whale_ids = list(train_df_stats[train_df_stats.individual_id.isin(ind_id_set)].sample(10).image)
whale_images = load_images(whale_ids)
image_grid(whale_images, nrows_ncols=(2, 5), figsize=(18, 8))

<a id="how-big"></a>

# Image Sizes ⚖️

## Train vs Test Distribution

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 5))
fig.suptitle('Image sizes across datasets')
ax1.hist(train_df_stats.width, alpha=0.5, label='Train')
ax1.hist(test_df_stats.width, alpha=0.5, label='Test')
ax1.set_title('Width')
ax1.legend(loc='upper left')

ax2.hist(train_df_stats.height, alpha=0.5, label='Train')
ax2.hist(test_df_stats.height, alpha=0.5, label='Test')
ax2.set_title('Height')
ax2.legend(loc='upper left')

plt.show()

The train and test sets appear to have similarly distributed image sizes.

## Image Size Distribution Per Species

The idea of this plot came from [this notebook](https://www.kaggle.com/andradaolteanu/whales-dolphins-effnet-embedding-cos-distance).

### Width by Species

In [None]:
data = train_df_stats[["species", "width"]]

plt.figure(figsize=(15, 5))
sns.violinplot(data=data, x="species", y="width", cut=0)
ax = plt.gca()
ax.set_xlabel("")
ax.set_ylabel("Width", size = 13, weight='bold')
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
plt.show()

### Height by Species

In [None]:
data = train_df_stats[["species", "height"]]

plt.figure(figsize=(15, 5))
sns.violinplot(data=data, x="species", y="height", cut=0)
ax = plt.gca()
ax.set_title("Height", size = 15, weight='bold')
ax.set_xlabel("")
ax.set_ylabel("Height", size = 13, weight='bold')
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
plt.show()

# Largest and Smallest Examples 🤏

## Largest

In [None]:
large_images = train_df_stats[['image', 'species', 'width', 'height', 'area']].sort_values(by='area', ascending=False).head(10)
large_images

In [None]:
image_ids = list(large_images.image)
images = load_images(image_ids, resize=False)
image_grid(images, nrows_ncols=(2, 5), figsize=(40, 18))

## Smallest

There appear to be a few really tiny images! Let's see those.

In [None]:
tiny_images = train_df_stats[['image', 'species', 'width', 'height', 'area']].sort_values(by='area').head(10)
tiny_images

In [None]:
whale_ids = list(tiny_images.image)
whale_images = load_images(whale_ids, resize=False)
image_grid(whale_images, nrows_ncols=(2, 5), figsize=(40, 18))

There appear to be a few grayscale images. Let's see how many.

<a id="rgb-vs-grayscale"></a>

# RGB vs Grayscale? 🎨 

In [None]:
val_count = train_df_stats['mode'].value_counts()
plt.figure(figsize=(15,8))
plt.title('RGB vs Grayscale')
ax = sns.barplot(y=val_count.values, x=val_count.index)
ax.bar_label(ax.containers[0])
plt.show()

# Greyscale Examples ⚪

In [None]:
# Every image whose PIL mode is not 'RGB'. `grey` keeps all of them because
# the next cell counts them per species.
grey = train_df_stats[train_df_stats['mode'] != 'RGB']

# Only load as many full-resolution images as the grid can display.
grid_shape = (4, 5)
image_ids = list(grey.image.head(grid_shape[0] * grid_shape[1]))
images = load_images(image_ids, resize=False)
image_grid(images, nrows_ncols=grid_shape, figsize=(40, 18))

## Grayscale Examples by Species

In [None]:
grey.species.value_counts().plot.bar(title='Species with grayscale examples')
plt.show()

<a id="avg-pixels"></a>
# Pixel Brightness 🔆

## Per Set Distribution

In [None]:
train_color_only = train_df_stats[train_df_stats['mode'] == 'RGB']
test_color_only = test_df_stats[test_df_stats['mode'] == 'RGB']

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 5))
fig.suptitle('Avg pixel values across channels')

# (stats column, legend label, matplotlib colour) for each channel.
channels = (
    ('avg_px_0', 'R', 'r'),
    ('avg_px_1', 'G', 'g'),
    ('avg_px_2', 'B', 'b'),
)

# Draw the same three overlaid histograms for each dataset.
for panel_ax, frame, panel_title in (
    (ax1, train_color_only, 'Train'),
    (ax2, test_color_only, 'Test'),
):
    for column, label, color in channels:
        panel_ax.hist(frame[column], alpha=0.5, label=label, color=color)
    panel_ax.set_title(panel_title)
    panel_ax.legend(loc='upper left')

plt.show()

# Darkest Images ⚫

In [None]:
# The 15 images with the lowest mean pixel value.
filtered = train_df_stats.sort_values(by='mean_px', ascending=True).head(15)
image_ids = list(filtered.image)
images = load_images(image_ids, resize=False)
# 3 x 5 grid so that all 15 selected images are shown; a 2 x 5 grid would
# silently drop the last five.
image_grid(images, nrows_ncols=(3, 5), figsize=(18, 40))

# Lightest Images 💡

In [None]:
# The 15 images with the highest mean pixel value.
filtered = train_df_stats.sort_values(by='mean_px', ascending=False).head(15)
image_ids = list(filtered.image)
images = load_images(image_ids, resize=False)
# 3 x 5 grid so that all 15 selected images are shown; a 2 x 5 grid would
# silently drop the last five.
image_grid(images, nrows_ncols=(3, 5), figsize=(18, 40))

In [None]:
# visualization
# ref: https://www.kaggle.com/sahamed/eda-visualization-augmentation
# `train_df` still holds the raw labels, so the same corrections as in the
# species-cleaning step are applied before taking the suffix; otherwise values
# such as 'dolpin', 'globis' and 'beluga' would show up as classes.
species_clean = train_df['species'].replace({
    "globis": "short_finned_pilot_whale",
    "pilot_whale": "short_finned_pilot_whale",
    "kiler_whale": "killer_whale",
    "bottlenose_dolpin": "bottlenose_dolphin",
    "beluga": "beluga_whale"
})
# The class is the last underscore-separated token of the species name.
train_df['class'] = species_clean.str.split('_').str[-1]
train_df.head()

**Image count of individuals**

In [None]:
# Number of training images for each row's individual. The built-in 'count'
# aggregation avoids calling a Python lambda once per individual.
train_df['count'] = train_df.groupby('individual_id')['individual_id'].transform('count')
train_df.head()

In [None]:
train_df[train_df['count']==1]

In [None]:
# Fraction of training images whose individual has at most 4 images
# (the mean of a boolean mask is the share of True values).
(train_df['count'] <= 4).mean()

### **Image Resolutions**

In [None]:
# The cells below read images through a `path` column; create it from the
# image file names if it does not exist yet.
if "path" not in train_df.columns:
    train_image_dir = Path('../input/happy-whale-and-dolphin/train_images')
    train_df["path"] = [str(train_image_dir / image_id) for image_id in train_df["image"]]

widths, heights = [], []

for path in tqdm(train_df["path"]):
    # Only the header is needed for the size; close the file straight away.
    with Image.open(path) as img:
        width, height = img.size
    widths.append(width)
    heights.append(height)

train_df["width"] = widths
train_df["height"] = heights
train_df["dimension"] = train_df["width"] * train_df["height"]

In [None]:
# color analysis
def is_grey_scale(givenImage):
    """Return True when every pixel has identical R, G and B values.

    Args:
        givenImage: a PIL image. It is converted to RGB first, so any mode
            supported by ``convert('RGB')`` is accepted.

    Returns:
        bool: True if the red, green and blue channels are equal everywhere.
    """
    pixels = np.asarray(givenImage.convert('RGB'))
    red, green, blue = pixels[..., 0], pixels[..., 1], pixels[..., 2]
    # Compare the channels pairwise. A chained `r != g != b` only means
    # `r != g and g != b`, so a pixel such as (1, 1, 2) would wrongly pass as
    # grey.
    return bool(np.array_equal(red, green) and np.array_equal(green, blue))

In [None]:
# get mean intensity for each channel RGB
def get_rgb_men(row):
    """Return the mean red, green and blue intensity of one image.

    Args:
        row: mapping-like object (e.g. a DataFrame row) whose ``'path'`` entry
            is the image file path.

    Returns:
        ``(red, green, blue)`` channel means as floats.

    The image is read with PIL, which this notebook already imports, and
    converted to RGB so greyscale files work too. Means are returned rather
    than sums so values are comparable between images of different sizes;
    ratio comparisons between the channels of one image are unaffected.
    """
    with Image.open(row['path']) as img:
        pixels = np.asarray(img.convert('RGB'))
    # Flatten to (n_pixels, 3) and average each channel.
    red, green, blue = pixels.reshape(-1, 3).mean(axis=0)
    return red, green, blue

# Register `progress_apply` on pandas objects, then compute the channel
# statistics row by row and transpose the per-row triples into three columns.
tqdm.pandas()
train_df['R'], train_df['G'], train_df['B'] = zip(
    *train_df.progress_apply(get_rgb_men, axis=1)
)

In [None]:
def show_color_dist(df, count):
    """Plot randomly chosen images next to their per-channel histograms.

    Args:
        df: DataFrame with a ``'path'`` column of image file paths.
        count: number of rows (images) to draw.

    Each row shows the image on the left and overlaid R/G/B intensity
    histograms on the right, titled with the channel means. Images are sampled
    without replacement when ``df`` has at least ``count`` rows, and with
    replacement otherwise. Nothing is drawn when ``df`` is empty.
    """
    paths = df['path'].to_numpy()
    if len(paths) == 0:
        print('No images to show.')
        return

    # Avoid showing the same image twice unless there are too few to choose from.
    chosen = np.random.choice(paths, count, replace=len(paths) < count)

    # squeeze=False keeps `axr` two-dimensional even when count == 1.
    fig, axr = plt.subplots(count, 2, figsize=(15, 15), squeeze=False)
    channels = (('R', 'red'), ('G', 'green'), ('B', 'blue'))

    for idx, i in enumerate(chosen):
        with Image.open(i) as pil_img:
            img = np.asarray(pil_img.convert('RGB'))
        axr[idx,0].imshow(img)
        axr[idx,0].axis('off')
        axr[idx,1].set_title('R={:.0f}, G={:.0f}, B={:.0f} '.format(np.mean(img[:,:,0]), np.mean(img[:,:,1]), np.mean(img[:,:,2])))
        for channel, (label, color) in enumerate(channels):
            # One bin per possible 8-bit value, with the same bin edges for
            # every channel and image.
            x, y = np.histogram(img[:,:,channel], bins=256, range=(0, 256))
            axr[idx,1].bar(y[:-1], x, label=label, alpha=0.8, color=color)
        axr[idx,1].legend()
        axr[idx,1].axis('off')

In [None]:
# Red-dominant images: rows where R exceeds both B and G by more than 5%.
df = train_df[((train_df['B']*1.05) < train_df['R']) & ((train_df['G']*1.05) < train_df['R'])]
# Show 8 randomly chosen images from that selection with their histograms.
show_color_dist(df, 8)

In [None]:
# Blue-dominant images: rows where B exceeds both R and G by more than 30%.
df = train_df[(train_df['B'] > 1.3*train_df['R']) & (train_df['B'] > 1.3*train_df['G'])]
# Show 8 randomly chosen images from that selection with their histograms.
show_color_dist(df, 8)