In [78]:
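# Prepare the training, validation and test sets from the images that meet the selection conditions below.
# TRAIN images are written to TRAIN_PATH and VALIDATION images to VALIDATION_PATH
# the out-of-sample 20% TEST images are written to TEST_PATH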

In [1]:
import glob, os
import shutil
import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
import math
from numpy import newaxis
from tqdm import tqdm
from sklearn.model_selection import train_test_split
%matplotlib inline

In [2]:
# suffix appended to the train / validation / test output directory names
version_number = '0_1'

In [3]:
# everything lives under one data directory (it also holds train.csv)
CSV_PATH   = 'C:/Users/KaiPin Liao/Documents/kaggle_whales/data/'
# directory the source images are read from
READ_PATH  = CSV_PATH + 'train_black_and_white/'
# per-version output roots for the three splits
TRAIN_PATH = CSV_PATH + 'train_' + version_number + '/'
VALIDATION_PATH = CSV_PATH + 'validation_' + version_number + '/'
TEST_PATH  = CSV_PATH + 'test_' + version_number + '/'

In [4]:
def create_folder(directory, alert = True):
    """Create `directory` (and any missing parents) unless the path already exists.

    directory -- path of the folder to create
    alert     -- when True, print a notice if the path is already present
    """
    if os.path.exists(directory):
        # nothing to create; optionally tell the user
        if alert:
            print(directory, '\talready exists!')
        return
    os.makedirs(directory)

In [27]:
# make sure the three output directories exist (a notice is printed for any that already do)
for split_dir in (TRAIN_PATH, VALIDATION_PATH, TEST_PATH):
    create_folder(split_dir)

In [6]:
# survey the labels: number of pictures, number of distinct whales,
# and how many whales have at least 5 pictures

# load the label csv
df = pd.read_csv(CSV_PATH + 'train.csv')
print('There are ', len(df), 'pictures.')

# distinct whale Ids
whales = df['Id'].drop_duplicates().values
print('There are ', len(whales), 'unique whales.')

# number of pictures per whale, most-photographed whale first
count = (
    df.groupby('Id')
      .count()
      .sort_values('Image', ascending=False)
      .reset_index()
)

# whales with >= 5 pictures (the printed text says "more than 5", the filter is >= 5)
n_frequent_whales = len(count[count['Image']>=5])
print(n_frequent_whales, 'whales have more than 5 images.', n_frequent_whales / len(whales))

There are  25361 pictures.
There are  5005 unique whales.
806 whales have more than 5 images. 0.16103896103896104


In [7]:
# show the first 15 rows of count, i.e. the whale Ids with the most images (count is sorted descending)
count.head(15)

Unnamed: 0,Id,Image
0,new_whale,9664
1,w_23a388d,73
2,w_9b5109b,65
3,w_9c506f6,62
4,w_0369a5c,61
5,w_700ebb4,57
6,w_3de579a,54
7,w_564a34b,51
8,w_fd3e556,50
9,w_88e4537,49


#### Select the whales with at least 48 sample images (the 10 most frequent classes), excluding 'new_whale'

In [8]:
# whales with at least 48 images, excluding the 'new_whale' Id
has_enough_images = count['Image'] >= 48
is_not_new_whale = count['Id'] != 'new_whale'
selected_whales = count.loc[has_enough_images & is_not_new_whale]
print(selected_whales['Image'].sum(), 'images')

# unique Ids of the selected whales
train_whale_list = selected_whales['Id'].unique()

570 images


In [9]:
# label rows that belong to the selected whales
train_image = df[df['Id'].isin(train_whale_list)]
# distinct image names with their last 4 characters (the extension) dropped
train_image_list = [name[:-4] for name in train_image['Image'].unique()]
len(train_image_list)

570

In [23]:
# hold out 20% of the selected images as the out-of-sample test set
X_train, X_test, y_train, y_test = train_test_split(
    train_image['Image'],
    train_image['Id'],
    test_size=0.2,
    random_state=999,
)

In [24]:
# take 25% of the remaining train set as validation, i.e. train:validation = 3:1
# X_train / y_train are overwritten here, so re-running this cell on its own
# would split the already-reduced train set again
X_train, X_validation, y_train, y_validation = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    random_state=111,
)

In [25]:
# image names of each split with their last 4 characters (the extension) dropped
X_train_list, X_validation_list, X_test_list = (
    [name[:-4] for name in split]
    for split in (X_train, X_validation, X_test)
)

In [13]:
# for filename in tqdm(os.listdir(READ_PATH)):
#     if filename[:9] in X_train_list:
#         shutil.copy( READ_PATH + filename, TRAIN_PATH + filename[:9] + '.jpg')
#     elif filename[:9] in X_validation_list:
#         shutil.copy( READ_PATH + filename, VALIDATION_PATH + filename[:9] + '.jpg')
#     elif filename[:9] in X_test_list:
#         shutil.copy( READ_PATH + filename, TEST_PATH + filename[:9] + '.jpg')

100%|██████████████████████████████████████████████████████████████████████████| 25361/25361 [00:04<00:00, 5326.39it/s]


In [52]:
# print(len([name for name in os.listdir(TRAIN_PATH)]))
# print(len([name for name in os.listdir(VALIDATION_PATH)]))
# print(len([name for name in os.listdir(TEST_PATH)]))
# print(len([name for name in os.listdir(TEST_PATH)]) + len([name for name in os.listdir(VALIDATION_PATH)]) + len([name for name in os.listdir(TRAIN_PATH)]))

460
115
575


In [28]:
# put images of each class in its own sub-directory:
# READ_PATH/<file>  ->  <split root>/<whale Id>/<first 9 characters of file>.jpg

# Image file name -> whale Id, built once instead of scanning df for every file.
# For a repeated Image name the last row wins, the same row `.iloc[-1]` would pick.
image_to_class = dict(zip(df['Image'], df['Id']))

# Image stem -> destination root. Later updates overwrite earlier ones, so the
# splits are applied in reverse priority to keep train > validation > test.
split_root_by_stem = {}
for split_root, stems in ((TEST_PATH, X_test_list),
                          (VALIDATION_PATH, X_validation_list),
                          (TRAIN_PATH, X_train_list)):
    split_root_by_stem.update(dict.fromkeys(stems, split_root))

for filename in os.listdir(READ_PATH):
    stem = filename[:9]  # images are matched on the first 9 characters of the file name
    split_root = split_root_by_stem.get(stem)
    if split_root is None:
        # file belongs to no split (unselected whale or stray file): skip it
        # without a label lookup, which used to fail for names missing from df
        continue

    # find class
    cls_string = image_to_class[stem + '.jpg']
    class_dir = split_root + cls_string + '/'
    create_folder(class_dir, False)
    shutil.copy(READ_PATH + filename, class_dir + stem + '.jpg')

In [29]:
# total number of images under TRAIN_PATH: add up the entry counts of its sub-directories
directory = TRAIN_PATH
image_count = sum(len(os.listdir(directory + class_dir)) for class_dir in os.listdir(directory))
print(image_count)

342


In [30]:
# total number of images under VALIDATION_PATH: add up the entry counts of its sub-directories
directory = VALIDATION_PATH
image_count = sum(len(os.listdir(directory + class_dir)) for class_dir in os.listdir(directory))
print(image_count)

114


In [31]:
# total number of images under TEST_PATH: add up the entry counts of its sub-directories
directory = TEST_PATH
image_count = sum(len(os.listdir(directory + class_dir)) for class_dir in os.listdir(directory))
print(image_count)

114
