In [62]:
import numpy as np
import pandas as pd
import math

In [63]:
# Locations of the feature and label CSV files.
features_file = 'species_classification/train_features.csv'
labels_file = 'species_classification/train_labels.csv'

# Read both tables in one pass; each path yields its own DataFrame.
features, labels = (
    pd.read_csv(csv_path) for csv_path in (features_file, labels_file)
)

In [64]:
# Preview the first rows of the features table.
features.head()

Unnamed: 0,id,filepath,site
0,ZJ000000,train_features/ZJ000000.jpg,S0120
1,ZJ000001,train_features/ZJ000001.jpg,S0069
2,ZJ000002,train_features/ZJ000002.jpg,S0009
3,ZJ000003,train_features/ZJ000003.jpg,S0008
4,ZJ000004,train_features/ZJ000004.jpg,S0036


In [65]:
# Preview the first rows of the labels table.
labels.head()

Unnamed: 0,id,antelope_duiker,bird,blank,civet_genet,hog,leopard,monkey_prosimian,rodent
0,ZJ000000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ZJ000001,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,ZJ000002,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ZJ000003,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,ZJ000004,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [66]:
def extract_label_name(row):
    """Return the name of the species column that is set to 1.0 in ``row``.

    Parameters
    ----------
    row : mapping-like (e.g. a ``pandas.Series``)
        Must be indexable by the species column names listed below.

    Returns
    -------
    str or None
        The first column, in the fixed order below, whose value equals 1.0;
        ``None`` when no column is set.
    """
    col_names = ['antelope_duiker', 'bird', 'blank', 'civet_genet', 'hog',
                 'leopard', 'monkey_prosimian', 'rodent']
    # next() stops at the first match, so no explicit (and previously
    # unreachable) break is needed; the default makes the no-match case explicit.
    return next((col for col in col_names if row[col] == 1.0), None)

In [67]:
# Collapse the per-species indicator columns into a single 'label' column.
# Guarded so the cell can be re-run: once collapsed, `labels` no longer has
# the species columns that extract_label_name reads, and a second pass would
# raise a KeyError.
if 'label' not in labels.columns:
    labels = labels[['id']].assign(
        label=labels.apply(extract_label_name, axis=1)
    )
labels.head()

Unnamed: 0,id,label
0,ZJ000000,bird
1,ZJ000001,monkey_prosimian
2,ZJ000002,bird
3,ZJ000003,monkey_prosimian
4,ZJ000004,leopard


In [68]:
# Sanity checks before joining: both tables hold the same id at every row
# position, and the number of distinct ids equals the number of feature rows
# (i.e. no id is repeated).
assert (features['id'] == labels['id']).all()
assert features['id'].nunique() == len(features)

In [69]:
# Join features and labels on their shared 'id' column and preview the result.
data_df = features.merge(labels, on='id')
data_df.head()

Unnamed: 0,id,filepath,site,label
0,ZJ000000,train_features/ZJ000000.jpg,S0120,bird
1,ZJ000001,train_features/ZJ000001.jpg,S0069,monkey_prosimian
2,ZJ000002,train_features/ZJ000002.jpg,S0009,bird
3,ZJ000003,train_features/ZJ000003.jpg,S0008,monkey_prosimian
4,ZJ000004,train_features/ZJ000004.jpg,S0036,leopard


In [70]:
from sklearn.model_selection import train_test_split

# Reproducibility seed and target fractions for the three-way split.
SEED = 42
train_p = 0.7
valid_p = 0.15
test_p = 0.15

# Stage 1: hold out the test fraction, stratified by label.
(train_features_all, test_features,
 train_labels_all, test_labels) = train_test_split(
    features,
    labels,
    test_size=test_p,
    random_state=SEED,
    stratify=labels['label'],
)

# Stage 2: split the remainder into train and validation. The validation share
# is rescaled because it is now taken from the train+valid portion only.
(train_features, valid_features,
 train_labels, valid_labels) = train_test_split(
    train_features_all,
    train_labels_all,
    test_size=(valid_p / (train_p + valid_p)),
    random_state=SEED,
    stratify=train_labels_all['label'],
)

In [71]:
# Give every split a fresh 0..n-1 index, discarding the row positions
# inherited from the unsplit tables.
(train_features, train_labels,
 valid_features, valid_labels,
 test_features, test_labels) = (
    frame.reset_index(drop=True)
    for frame in (train_features, train_labels,
                  valid_features, valid_labels,
                  test_features, test_labels)
)

In [72]:
# Number of training rows per label.
train_labels['label'].value_counts()

label
monkey_prosimian    1744
antelope_duiker     1732
civet_genet         1695
leopard             1578
blank               1549
rodent              1409
bird                1149
hog                  684
Name: count, dtype: int64

In [73]:
# Number of validation rows per label.
valid_labels['label'].value_counts()

label
monkey_prosimian    374
antelope_duiker     371
civet_genet         364
leopard             338
blank               332
rodent              302
bird                246
hog                 147
Name: count, dtype: int64

In [74]:
# Number of test rows per label.
test_labels['label'].value_counts()

label
monkey_prosimian    374
antelope_duiker     371
civet_genet         364
leopard             338
blank               332
rodent              302
bird                246
hog                 147
Name: count, dtype: int64

In [75]:
# Share of training rows per label (fractions rather than counts).
train_labels['label'].value_counts(normalize=True)

label
monkey_prosimian    0.151127
antelope_duiker     0.150087
civet_genet         0.146880
leopard             0.136742
blank               0.134229
rodent              0.122097
bird                0.099567
hog                 0.059272
Name: proportion, dtype: float64

In [13]:
# Row counts of the train, validation and test label tables, in that order.
(len(train_labels), len(valid_labels), len(test_labels))

(11540, 2474, 2474)

In [26]:
# Preview the first rows of the training features.
train_features.head()

Unnamed: 0,id,filepath,site
0,ZJ006825,train_features/ZJ006825.jpg,S0085
1,ZJ011463,train_features/ZJ011463.jpg,S0013
2,ZJ010399,train_features/ZJ010399.jpg,S0047
3,ZJ001767,train_features/ZJ001767.jpg,S0127
4,ZJ006294,train_features/ZJ006294.jpg,S0150


In [28]:
# Preview the first rows of the training labels.
train_labels.head()

Unnamed: 0,id,label
0,ZJ006825,leopard
1,ZJ011463,blank
2,ZJ010399,blank
3,ZJ001767,monkey_prosimian
4,ZJ006294,monkey_prosimian


In [29]:
# Re-attach labels to features for each split, joining on the 'id' column.
train_df = train_features.merge(train_labels, on='id')
valid_df = valid_features.merge(valid_labels, on='id')
test_df = test_features.merge(test_labels, on='id')

In [34]:
# Preview the first rows of the merged training table.
train_df.head()

Unnamed: 0,id,filepath,site,label
0,ZJ006825,train_features/ZJ006825.jpg,S0085,leopard
1,ZJ011463,train_features/ZJ011463.jpg,S0013,blank
2,ZJ010399,train_features/ZJ010399.jpg,S0047,blank
3,ZJ001767,train_features/ZJ001767.jpg,S0127,monkey_prosimian
4,ZJ006294,train_features/ZJ006294.jpg,S0150,monkey_prosimian


In [61]:
import os, shutil, pathlib

def remove_prefix(row):
    """Return the bare file name from ``row['filepath']``.

    Parameters
    ----------
    row : mapping-like (e.g. a ``pandas.Series``)
        Must provide a ``'filepath'`` string that uses ``'/'`` as separator.

    Returns
    -------
    str
        Everything after the last ``'/'``; the whole string when it contains
        no ``'/'``.
    """
    # Split from the right so that paths with no directory part, or with more
    # than one directory level, still yield the file name.
    return row['filepath'].rsplit('/', 1)[-1]

def create_data_directories(data_subsets, labels, original_dir, new_base_dir):
    """Copy each subset's images into ``new_base_dir/<subset>/<label>/``.

    Parameters
    ----------
    data_subsets : dict[str, pandas.DataFrame]
        Maps a subset name (used as a directory name) to a frame that has
        ``'filepath'`` and ``'label'`` columns.
    labels : iterable of str
        Label names. One directory is created per subset/label pair, even
        when the subset has no rows for that label.
    original_dir : str or pathlib.Path
        Directory that holds the source images.
    new_base_dir : str or pathlib.Path
        Root of the directory tree to create.

    Raises
    ------
    FileNotFoundError
        Propagated from ``shutil.copyfile`` when a listed image is not
        present in ``original_dir``.
    """
    original_dir = pathlib.Path(original_dir)
    new_base_dir = pathlib.Path(new_base_dir)
    for subset_name, subset in data_subsets.items():
        for label in labels:
            data_dir = new_base_dir / subset_name / label
            # exist_ok lets the function be re-run without FileExistsError.
            os.makedirs(data_dir, exist_ok=True)
            # Select the column directly instead of a row-wise apply: on an
            # empty selection apply hands back the (empty) frame, and
            # iterating that yields column names rather than file names.
            filepaths = subset.loc[subset['label'] == label, 'filepath']
            for filepath in filepaths:
                # Paths in the table use '/', independent of the host OS.
                fname = pathlib.PurePosixPath(filepath).name
                shutil.copyfile(src=original_dir / fname, dst=data_dir / fname)

# Source image folder and the root of the per-subset / per-label tree to build.
original_dir = pathlib.Path('species_classification/train_features')
new_base_dir = pathlib.Path('species_data')

# Directory name -> merged DataFrame for each split.
data_subsets = dict(train=train_df, valid=valid_df, test=test_df)

# NOTE(review): this rebinds `labels`, which earlier cells use as the label
# DataFrame, to a plain list of label names. Re-running those cells after this
# one will fail; consider a distinct name once all downstream uses are checked.
labels = list(train_df['label'].unique())

create_data_directories(data_subsets, labels, original_dir, new_base_dir)