# National Data Science Bowl Plankton

## Preliminary

In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
import tensorflow as tf

## Initialize DataFrames
The DataFrames are named 'labelled' and 'unlabelled' rather than 'train' and 'test' because the train/test split is performed later, on the labelled portion of the data only.
<ul>
    <li>df_labelled:    Labelled training images.</li>
    <li>df_unlabelled:  Test images with predicted label probabilities, initialized to 1.0 / number of possible labels.</li>
</ul>

In [2]:
def _list_file_urls(root_folder):
    """Return the forward-slash path of every file found below `root_folder`.

    The list is sorted so that the row order of the resulting DataFrames does
    not depend on the order in which os.walk happens to enumerate directories.
    """
    urls = []
    for folder_path, _, file_names in os.walk(root_folder):
        folder_url = folder_path.replace('\\', '/')
        urls.extend(folder_url + '/' + file_name for file_name in file_names)
    return sorted(urls)


folder_labelled   = 'data/train'
folder_unlabelled = 'data/test'

# TRAINING SET

# every file below the labelled folder, as 'data/train/<label>/<file>'
list_labelled = _list_file_urls(folder_labelled)
if not list_labelled:
    raise FileNotFoundError(f"No labelled images found under '{folder_labelled}'")

# The label of an image is the name of the folder that directly contains it.
# Deriving it from the path (instead of a fixed split index) keeps this
# working if the data folders are moved to a different depth.
label_names = [os.path.basename(os.path.dirname(url)) for url in list_labelled]

columns_base_labelled = ['file_url', 'file_path', 'file_name', 'label_name', 'label_index']
labels_unique = sorted(set(label_names))
label_to_index = {name: index for index, name in enumerate(labels_unique)}

# list to DataFrame
columns_labelled = columns_base_labelled + labels_unique
df_labelled = pd.DataFrame({
    'file_url':    pd.Series(list_labelled,                                     dtype='string'),
    'file_path':   pd.Series([os.path.dirname(url) for url in list_labelled],   dtype='string'),
    'file_name':   pd.Series([os.path.basename(url) for url in list_labelled],  dtype='string'),
    'label_name':  pd.Series(label_names,                                       dtype='string'),
    'label_index': pd.Series([label_to_index[name] for name in label_names],    dtype='int64'),
})

# one-hot encode the labels: label_index already holds the codes 0..K-1, so it
# can be expanded directly; num_classes pins the width to one column per label
one_hot = tf.keras.utils.to_categorical(
    df_labelled['label_index'].to_numpy(), num_classes=len(labels_unique)
)
df_labelled = pd.concat(
    [df_labelled, pd.DataFrame(one_hot, columns=labels_unique, index=df_labelled.index)],
    axis=1
)

# TEST SET

# every file below the unlabelled folder, as 'data/test/<file>'
list_unlabelled = _list_file_urls(folder_unlabelled)

columns_base_unlabelled = ['file_url', 'file_path', 'file_name']
columns_unlabelled = columns_base_unlabelled + labels_unique

# temporary: uniform probability 1 / number of labels for every class
probabilities_initial = np.full(
    (len(list_unlabelled), len(labels_unique)), 1.0 / len(labels_unique), dtype='float32'
)

# list to DataFrame, with initialized probabilities for each class
df_unlabelled = pd.concat(
    [
        pd.DataFrame({
            'file_url':  pd.Series(list_unlabelled,                                    dtype='string'),
            'file_path': pd.Series([os.path.dirname(url) for url in list_unlabelled],  dtype='string'),
            'file_name': pd.Series([os.path.basename(url) for url in list_unlabelled], dtype='string'),
        }),
        pd.DataFrame(probabilities_initial, columns=labels_unique),
    ],
    axis=1
)

# drop the intermediates so only the DataFrames and column lists stay in the kernel
del folder_labelled, list_labelled, folder_unlabelled, list_unlabelled, probabilities_initial
del label_names, label_to_index, one_hot

In [3]:
# preview the first five labelled rows (paths, label, one-hot columns)
df_labelled.head(n=5)

Unnamed: 0,file_url,file_path,file_name,label_name,label_index,acantharia_protist,acantharia_protist_big_center,acantharia_protist_halo,amphipods,appendicularian_fritillaridae,...,trichodesmium_tuft,trochophore_larvae,tunicate_doliolid,tunicate_doliolid_nurse,tunicate_partial,tunicate_salp,tunicate_salp_chains,unknown_blobs_and_smudges,unknown_sticks,unknown_unclassified
0,data/train/acantharia_protist/100224.jpg,data/train/acantharia_protist,100224.jpg,acantharia_protist,0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,data/train/acantharia_protist/100723.jpg,data/train/acantharia_protist,100723.jpg,acantharia_protist,0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,data/train/acantharia_protist/101165.jpg,data/train/acantharia_protist,101165.jpg,acantharia_protist,0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,data/train/acantharia_protist/101232.jpg,data/train/acantharia_protist,101232.jpg,acantharia_protist,0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,data/train/acantharia_protist/101260.jpg,data/train/acantharia_protist,101260.jpg,acantharia_protist,0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# preview the first five unlabelled rows (paths, initial class probabilities)
df_unlabelled.head(n=5)

Unnamed: 0,file_url,file_path,file_name,acantharia_protist,acantharia_protist_big_center,acantharia_protist_halo,amphipods,appendicularian_fritillaridae,appendicularian_s_shape,appendicularian_slight_curve,...,trichodesmium_tuft,trochophore_larvae,tunicate_doliolid,tunicate_doliolid_nurse,tunicate_partial,tunicate_salp,tunicate_salp_chains,unknown_blobs_and_smudges,unknown_sticks,unknown_unclassified
0,data/test/1.jpg,data/test,1.jpg,0.008264,0.008264,0.008264,0.008264,0.008264,0.008264,0.008264,...,0.008264,0.008264,0.008264,0.008264,0.008264,0.008264,0.008264,0.008264,0.008264,0.008264
1,data/test/10.jpg,data/test,10.jpg,0.008264,0.008264,0.008264,0.008264,0.008264,0.008264,0.008264,...,0.008264,0.008264,0.008264,0.008264,0.008264,0.008264,0.008264,0.008264,0.008264,0.008264
2,data/test/100.jpg,data/test,100.jpg,0.008264,0.008264,0.008264,0.008264,0.008264,0.008264,0.008264,...,0.008264,0.008264,0.008264,0.008264,0.008264,0.008264,0.008264,0.008264,0.008264,0.008264
3,data/test/1000.jpg,data/test,1000.jpg,0.008264,0.008264,0.008264,0.008264,0.008264,0.008264,0.008264,...,0.008264,0.008264,0.008264,0.008264,0.008264,0.008264,0.008264,0.008264,0.008264,0.008264
4,data/test/10000.jpg,data/test,10000.jpg,0.008264,0.008264,0.008264,0.008264,0.008264,0.008264,0.008264,...,0.008264,0.008264,0.008264,0.008264,0.008264,0.008264,0.008264,0.008264,0.008264,0.008264


## Starter neural network
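A minimal baseline convolutional network, intended only to wire the data loading, training, and prediction steps together into a working framework.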

In [15]:
# Fixed seed so the 80/20 split (and every result derived from it) is the same
# on each Restart & Run All.
# NOTE(review): the split is not stratified; consider stratify=df_labelled['label_index']
# if every class has enough samples - confirm against the data.
split_seed = 42

df_X_train, df_X_test, df_Y_train, df_Y_test = train_test_split(
    df_labelled[['file_url']],
    df_labelled[labels_unique],
    test_size=0.2,
    random_state=split_seed
)

# df format image urls + one-hot labels needed for
# ImageDataGenerator().flow_from_dataframe(class_mode='raw')
# (X and Y parts share the original row index, so join re-aligns them)
df_train = df_X_train.join(df_Y_train)
df_test  = df_X_test.join(df_Y_test)

n_classes = len(labels_unique)
image_shape_0 = 40  # images are resized to image_shape_0 x image_shape_0 pixels

In [17]:
# preview the first five rows of the held-out split
df_test.head(n=5)

Unnamed: 0,file_url,acantharia_protist,acantharia_protist_big_center,acantharia_protist_halo,amphipods,appendicularian_fritillaridae,appendicularian_s_shape,appendicularian_slight_curve,appendicularian_straight,artifacts,...,trichodesmium_tuft,trochophore_larvae,tunicate_doliolid,tunicate_doliolid_nurse,tunicate_partial,tunicate_salp,tunicate_salp_chains,unknown_blobs_and_smudges,unknown_sticks,unknown_unclassified
27554,data/train/trichodesmium_tuft/31909.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14526,data/train/echinoderm_larva_seastar_brachiolar...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11439,data/train/detritus_filamentous/149942.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22016,data/train/radiolarian_chain/106839.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9523,data/train/copepod_cyclopoid_oithona_eggs/1350...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
def _flow_images(datagen, dataframe, shuffle):
    """Build a batch iterator over the images listed in `dataframe`.

    Images are read from the 'file_url' column as grayscale, resized to
    image_shape_0 x image_shape_0, and paired with the `labels_unique`
    columns as raw targets, in batches of 32.

    `shuffle` must be False for any iterator whose predictions are written
    back into `dataframe`, otherwise the prediction rows no longer line up
    with the frame rows.
    """
    return datagen.flow_from_dataframe(
        dataframe=dataframe,
        x_col='file_url',
        y_col=labels_unique,
        color_mode='grayscale',
        target_size=(image_shape_0, image_shape_0),
        interpolation='bilinear',
        batch_size=32,
        class_mode='raw',
        shuffle=shuffle
    )


# Training data: augmentation knobs are present but currently switched off.
# NOTE(review): rescale=1.0 leaves pixel values unscaled; 1.0 / 255 is the
# usual choice for image input - confirm before changing.
datagen_train = tf.keras.preprocessing.image.ImageDataGenerator(
    rescale=1.0,
    shear_range=0.0,
    zoom_range=0.0,
    horizontal_flip=False,
    vertical_flip=False
)
generator_train = _flow_images(datagen_train, df_train, shuffle=True)

# Held-out split: no augmentation, and no shuffling so that predictions come
# back in the same row order as df_test.
datagen_test = tf.keras.preprocessing.image.ImageDataGenerator(
    rescale=1.0
)
generator_test = _flow_images(datagen_test, df_test, shuffle=False)

# Unlabelled images: same treatment, again unshuffled so that predictions come
# back in the same row order as df_unlabelled.
datagen_unlabelled = tf.keras.preprocessing.image.ImageDataGenerator(
    rescale=1.0
)
generator_unlabelled = _flow_images(datagen_unlabelled, df_unlabelled, shuffle=False)

Found 24268 validated image filenames.
Found 6068 validated image filenames.


In [28]:
# Starter CNN: three 3x3 convolution layers (the first two followed by 2x2 max
# pooling) over single-channel image_shape_0 x image_shape_0 input, flattened
# into a softmax over the n_classes labels.
model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(image_shape_0, image_shape_0, 1)),
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(n_classes, activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

# Model.fit accepts generators directly; fit_generator is deprecated.
# len(generator) is the exact number of batches per pass, whereas
# n // batch_size + 1 is one batch too many whenever n is an exact multiple
# of the batch size.
history = model.fit(
    x=generator_train,
    steps_per_epoch=len(generator_train),
    validation_data=generator_test,
    validation_steps=len(generator_test),
    epochs=5
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1a3344f5340>

In [46]:
# Predicted class probabilities, written over the label columns of a copy of
# each frame.
#
# steps=len(generator) is the exact number of batches needed to see every
# image once; n // batch_size + 1 asks for one batch too many whenever n is an
# exact multiple of the batch size, so the prediction array would not match
# the frame's row count.
#
# NOTE(review): predictions line up with the frame rows only if the generator
# yields images in frame order, i.e. it was created with shuffle=False.
df_predict_test = df_test.copy(deep=True)
df_predict_test[labels_unique] = model.predict(
    generator_test,
    steps=len(generator_test)
)

df_predict_unlabelled = df_unlabelled.copy(deep=True)
df_predict_unlabelled[labels_unique] = model.predict(
    generator_unlabelled,
    steps=len(generator_unlabelled)
)



In [47]:
# preview the predicted class probabilities for the held-out split
df_predict_test.head(n=5)

Unnamed: 0,file_url,acantharia_protist,acantharia_protist_big_center,acantharia_protist_halo,amphipods,appendicularian_fritillaridae,appendicularian_s_shape,appendicularian_slight_curve,appendicularian_straight,artifacts,...,trichodesmium_tuft,trochophore_larvae,tunicate_doliolid,tunicate_doliolid_nurse,tunicate_partial,tunicate_salp,tunicate_salp_chains,unknown_blobs_and_smudges,unknown_sticks,unknown_unclassified
27554,data/train/trichodesmium_tuft/31909.jpg,3.009745e-06,1.523324e-08,5.41921e-06,8.600298e-05,5.828344e-09,0.000423,0.000401,0.000104,1.491824e-06,...,0.003292931,1.297947e-05,0.004625367,0.054464,1.313979e-06,7.345422e-05,2.432554e-06,0.000388,2.873345e-05,0.036964
14526,data/train/echinoderm_larva_seastar_brachiolar...,0.0004506358,1.625029e-13,7.257492e-11,7.427145e-06,3.6256779999999997e-19,1.2e-05,0.000227,3.3e-05,6.599215e-12,...,8.857865e-05,2.628632e-09,1.328123e-08,1e-06,7.371256e-16,3.65851e-11,6.607284e-13,0.000531,2.174276e-06,2e-06
11439,data/train/detritus_filamentous/149942.jpg,1.696819e-07,1.86126e-11,1.581084e-07,6.266873e-07,1.253383e-08,0.000292,0.000491,0.001479,1.024688e-06,...,0.004364154,2.542425e-06,0.0003006776,0.099812,0.000100706,0.0001524295,0.004737272,0.000371,0.0004360868,0.002682
22016,data/train/radiolarian_chain/106839.jpg,9.431265e-07,5.527375e-11,1.795924e-08,8.832241e-07,0.0009132548,0.003147,0.001078,9.9e-05,2.122184e-06,...,8.245679e-08,6.978747e-05,0.08841804,0.001116,4.057937e-06,9.544814e-05,6.25419e-06,0.002287,7.701811e-09,0.00611
9523,data/train/copepod_cyclopoid_oithona_eggs/1350...,0.0002882558,2.713762e-08,1.711247e-05,4.175699e-09,1.298096e-06,0.001324,0.00232,0.005542,0.0002743278,...,0.0005912459,2.313321e-06,6.130177e-05,0.035257,0.001409457,0.0001210174,0.0002600315,0.000415,0.0003149137,0.025982


In [48]:
# preview the predicted class probabilities for the unlabelled images
df_predict_unlabelled.head(n=5)

Unnamed: 0,file_url,file_path,file_name,acantharia_protist,acantharia_protist_big_center,acantharia_protist_halo,amphipods,appendicularian_fritillaridae,appendicularian_s_shape,appendicularian_slight_curve,...,trichodesmium_tuft,trochophore_larvae,tunicate_doliolid,tunicate_doliolid_nurse,tunicate_partial,tunicate_salp,tunicate_salp_chains,unknown_blobs_and_smudges,unknown_sticks,unknown_unclassified
0,data/test/1.jpg,data/test,1.jpg,2.207199e-18,1.065295e-12,8.668208e-09,1.062977e-09,2.422653e-08,1.008512e-09,8.710981e-10,...,4.679527e-10,1.497866e-08,0.000145,0.000202,0.4409373,0.008728959,0.005073542,3.353478e-11,2.073298e-10,0.00019
1,data/test/10.jpg,data/test,10.jpg,0.05527532,7.374887e-07,7.220593e-06,1.614184e-05,0.0004338064,0.02104851,0.008141055,...,0.123676,5.64322e-07,0.000202,0.003101,3.002046e-09,1.660555e-07,3.415944e-07,0.06538457,0.0004155817,0.008144
2,data/test/100.jpg,data/test,100.jpg,6.310199e-09,1.842242e-20,5.598021e-15,1.157005e-11,1.482516e-08,0.004529967,0.1214416,...,2.400567e-07,5.828068e-10,0.000146,0.000147,5.021666e-06,3.327198e-07,3.208227e-09,3.147073e-06,8.600485e-07,0.001846
3,data/test/1000.jpg,data/test,1000.jpg,3.030438e-05,7.243115e-15,7.435184e-09,3.402766e-08,2.563138e-07,0.06281743,0.6817811,...,0.04492475,7.368654e-07,3.5e-05,0.013224,1.59385e-06,1.788648e-06,0.0002243992,0.004399244,0.000287398,0.000104
4,data/test/10000.jpg,data/test,10000.jpg,9.808105e-07,8.872316e-07,0.0005345467,2.409727e-08,3.315903e-10,2.458803e-05,2.487057e-06,...,0.00367018,0.0002124213,1.3e-05,0.000267,2.475842e-12,8.02031e-07,2.80377e-07,0.0005500565,0.0005047526,0.000121
