In [1]:
%load_ext autoreload
%autoreload 2

import utils

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import time

# Looking at the data

In [3]:
# Read the training labels from train.csv under the project's data folder
# and report the table size and how many distinct Ids it contains.
train_csv_path = utils.data_fp / 'train.csv'
labels = pd.read_csv(train_csv_path)
n_unique_ids = len(labels['Id'].unique())
print(labels.shape)
print(f'Number of classes: {n_unique_ids:,}')
labels.head()

(25361, 2)
Number of classes: 5,005


Unnamed: 0,Image,Id
0,0000e88ab.jpg,w_f48451c
1,0001f9222.jpg,w_c3d896a
2,00029d126.jpg,w_20df2c5
3,00050a15a.jpg,new_whale
4,0005c1ef8.jpg,new_whale


## Split data into training and validation set
Because the data is extremely unbalanced (new_whale takes up a large proportion of the images and many classes have only one sample), we cannot use a plain random sampling method. For example:
* Fastai will get an error when there is a class in the validation set and not in training set. [link](https://github.com/fastai/fastai/issues/1273)

In addition, we cannot use StratifiedShuffleSplit because it requires every class to have at least 2 samples (scikit-learn fails with "the minimum number of groups for any class cannot be less than 2"). Instead, we will manually create a train and validation split.

In [4]:
# ## Save code just in case for future use (reference only, nothing in this cell runs).
# ## This is the StratifiedShuffleSplit approach that the text above rules out because
# ## some classes have a single image.
# ## NOTE: the imports visible in this notebook only bring in train_test_split, so
# ## StratifiedShuffleSplit would have to be imported from sklearn.model_selection first.
# sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# ## split is decided by the Id column
# for train_index, val_index in sss.split(labels.index.values, labels.Id):
#     train_idx, val_idx = train_index, val_index  

In [5]:
## Summarise how unbalanced the classes are.
## class_counts holds the number of images per Id, ordered from the rarest
## class to the most common one (it is reused by later cells).
class_counts = labels.Id.value_counts(sort=True, ascending=True)

n_images = class_counts.sum()
n_classes_total = len(class_counts)
## vectorised count of single-image classes (builtin sum() would loop in Python)
n_singleton_classes = (class_counts == 1).sum()
## a single label lookup is already a scalar count; default to 0 if the class is absent
n_new_whale = class_counts.get('new_whale', 0)

print(f'The number of images: {n_images:,}')
print('=' * 20)
print(f'Number of classes with only one image: {n_singleton_classes:,}')
print(f'Percentage of classes with one image: {n_singleton_classes / n_classes_total:.0%}')
print('=' * 20)
print(f'Number of new_whale image: {n_new_whale:,}')
print(f'Percentage of images are new_whale: {n_new_whale / n_images:.0%}')

The number of images: 25,361
Number of classes with only one image: 2,073
Percentage of classes with one image: 41%
Number of new_whate image: 9,664
Percentage of images are new_whale: 38%


In [6]:
# class_counts is sorted ascending, so reverse it and keep the first five rows
# to show the five most common classes, largest first
class_counts.iloc[::-1].head(5)

new_whale    9664
w_23a388d      73
w_9b5109b      65
w_9c506f6      62
w_0369a5c      61
Name: Id, dtype: int64

In [7]:
## How many classes have 1 image, 2 images, 3 images, ...
## (index = images in a class, value = number of classes of that size; most frequent size first)
class_counts.value_counts(sort=True, ascending=False)

1       2073
2       1285
3        568
4        273
5        172
6        136
7         86
8         76
9         62
10        46
11        39
12        26
15        19
17        17
14        16
16        16
13        14
19         8
24         7
20         7
21         7
18         6
22         5
23         4
31         3
37         3
30         3
40         3
45         2
34         2
29         2
25         2
48         1
9664       1
36         1
32         1
51         1
33         1
49         1
57         1
47         1
65         1
73         1
50         1
54         1
62         1
27         1
35         1
61         1
Name: Id, dtype: int64

### Stratified Split
Split the data such that classes with only 1 sample go entirely into the training set, while classes with 2+ samples are split randomly with ~20% going into the validation set.

In [8]:
start_time = time.time()
## Per-class (stratified) split that can also handle classes with a single image:
## a single-image class goes entirely to training, every other class is split
## randomly with test_size=0.2.
## The per-class pieces are collected in lists and concatenated once at the end:
## Series.append was removed in pandas 2.0 and copied the whole Series on every call.
train_parts, val_parts = [], []
for _, group in labels.reset_index()[['index', 'Id']].groupby('Id'):
    ## if a class only has 1 sample, keep it for training (nothing goes to validation)
    if group.shape[0] == 1:
        train_parts.append(group['index'])
    ## split each group randomly and obtain their index
    else:
        train, val = train_test_split(group['index'], test_size=0.2, random_state=284)
        train_parts.append(train)
        val_parts.append(val)

## train_idx / val_idx are Series holding the row labels of `labels` for each side
## of the split; pd.concat raises on an empty list, hence the explicit fallbacks.
train_idx = pd.concat(train_parts) if train_parts else pd.Series(dtype='int64')
val_idx = pd.concat(val_parts) if val_parts else pd.Series(dtype='int64')

print(f'This took {int(time.time() - start_time)} seconds')

This took 2 seconds


In [9]:
## peek at the first five entries of the training row labels
train_idx.iloc[:5]

22474    22474
16267    16267
21988    21988
13940    13940
10178    10178
dtype: int64

The overall training/validation split is off from 80/20 because classes with only a few images cannot be split at that ratio.  
For instance, a class with 2 images is split 50/50 so that at least one image is in training and one is in validation.

In [10]:
## Share of all classes that have exactly one image, then exactly two images
total_classes = len(class_counts)
for images_in_class in (1, 2):
    share_pct = (class_counts == images_in_class).sum() / total_classes * 100
    print(f'Total population of class with only {images_in_class} data point {share_pct:0.0f}%')

Total population of class with only 1 data point 41%
Total population of class with only 2 data point 26%


## Verify that the split was successful

In [11]:
## Flag each row: False when its row label is one of the training labels in
## train_idx, True (validation) for every other row.
labels['validation'] = ~labels.index.isin(train_idx)

In [12]:
## Every class must appear at least once among the training rows;
## compute both class totals once and reuse them for the check and the report.
train_class_total = len(labels.loc[train_idx, 'Id'].unique())
all_class_total = len(labels['Id'].unique())
assert train_class_total == all_class_total
print(f'Number of class {train_class_total}: {all_class_total}')
print(f'Percent of validation split: {labels.validation.mean():.0%}')

Number of class 5005: 5005
Percent of validation split: 24%


In [13]:
# Select classes with at least 10 data points (the filter is >= 10),
# get the share of each class's rows that are flagged as validation,
# then average those per-class shares.
frequent_ids = class_counts[class_counts >= 10].index
per_class_val_share = labels[labels.Id.isin(frequent_ids)].groupby('Id')['validation'].mean()
val_split = per_class_val_share.mean()
print(f'On average for each class, the split to validation is {val_split:0.0%}')

On average for each class, the split to valiation is 23%


The split is random and each class's validation share has to be a whole number of images, so it is normal that the split isn't exactly 20%.

## Export Train and Validation Index

In [14]:
labels.head()

Unnamed: 0,Image,Id,validation
0,0000e88ab.jpg,w_f48451c,False
1,0001f9222.jpg,w_c3d896a,False
2,00029d126.jpg,w_20df2c5,False
3,00050a15a.jpg,new_whale,False
4,0005c1ef8.jpg,new_whale,False


In [15]:
labels.to_csv(utils.data_fp/'train_stratified_split.csv', index=False)