# Most of code comes from Jeff Delancy works in the following
[https://www.kaggle.com/jeffd23/the-nature-conservancy-fisheries-monitoring/deep-learning-in-the-deep-blue-lb-1-279](https://www.kaggle.com/jeffd23/the-nature-conservancy-fisheries-monitoring/deep-learning-in-the-deep-blue-lb-1-279)

# Classification by Simple CNN 
In this code, I would simply feed shrinked training images into simple CNN for classification

There would not be any fish focused attention. 

So there are lots of non-fish objects like fishers, boats, sea waters, etc in images


In [18]:
import os, cv2, random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder

from subprocess import check_output

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from keras.models import Sequential
from keras.layers import Dropout, Flatten, Convolution2D, MaxPooling2D, ZeroPadding2D, Dense, Activation
from keras.optimizers import RMSprop, Adam
from keras.callbacks import EarlyStopping
from keras.utils import np_utils
from keras import backend as K

In [19]:
TRAIN_DIR = './train/'
TEST_DIR = './test_stg1/'
FISH_CLASSES = check_output(["ls", "./train/"]).decode("utf8").strip().split('\n')
ROWS = 90
COLS = 160
CHANNELS = 3

In [20]:
FISH_CLASSES

['ALB', 'BET', 'DOL', 'LAG', 'NoF', 'OTHER', 'SHARK', 'YFT']

In [21]:
# Load train images and resize them
X_files = []
y_all = []
for fish_class in FISH_CLASSES:
    fish_dir = TRAIN_DIR + fish_class
    files = [fish_dir+'/'+filename for filename in os.listdir(fish_dir)]
    X_files.extend(files)
    y_all.extend( np.tile(fish_class, len(files)))

X_all = np.ndarray((len(X_files), ROWS, COLS, CHANNELS), dtype=np.uint8)
for i, file_path in enumerate(X_files):
    img_ori = cv2.imread(file_path, cv2.IMREAD_COLOR)
    X_all[i] = cv2.resize(img_ori, (COLS, ROWS), interpolation=cv2.INTER_CUBIC)
    if i%1000 ==0: 
        print('Processed {} of {}'.format(i, len(X_files)))
y_all = np.array(y_all)

print(X_all.shape)
print(y_all.shape)

Processed 0 of 3777
Processed 1000 of 3777
Processed 2000 of 3777
Processed 3000 of 3777
(3777, 90, 160, 3)
(3777,)


In [22]:
# Split into train/validation data set
# one-hot encoded labels
y_all = LabelEncoder().fit_transform(y_all)
y_all = np_utils.to_categorical(y_all)

X_train, X_valid, y_train, y_valid = train_test_split(X_all, y_all, test_size=0.2, random_state=23, stratify=y_all)

print(X_train.shape)
print(X_valid.shape)

(3021, 90, 160, 3)
(756, 90, 160, 3)


In [23]:
# Model : simple CNN

def normalize(x):
    return (x - K.mean(x)) / K.std(x)

model = Sequential()
model.add(Activation(activation=normalize, input_shape=(ROWS,COLS, CHANNELS)))
model.add(Convolution2D(32, 5, 5, border_mode='same', activation='relu', dim_ordering='tf'))
model.add(Convolution2D(32, 5, 5, border_mode='same', activation='relu', dim_ordering='tf'))
model.add(MaxPooling2D(pool_size=(2,2), dim_ordering='tf'))

model.add(Convolution2D(64, 3, 3, border_mode='same', activation='relu', dim_ordering='tf'))
model.add(Convolution2D(64, 3, 3, border_mode='same', activation='relu', dim_ordering='tf'))
model.add(MaxPooling2D(pool_size=(2,2), dim_ordering='tf'))

model.add(Convolution2D(128, 3, 3, border_mode='same', activation='relu', dim_ordering='tf'))
model.add(Convolution2D(128, 3, 3, border_mode='same', activation='relu', dim_ordering='tf'))
model.add(MaxPooling2D(pool_size=(2,2), dim_ordering='tf'))

model.add(Convolution2D(256, 3, 3, border_mode='same', activation='relu', dim_ordering='tf'))
model.add(Convolution2D(256, 3, 3, border_mode='same', activation='relu', dim_ordering='tf'))
model.add(MaxPooling2D(pool_size=(2,2), dim_ordering='tf'))

model.add(Flatten())
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))

model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))

model.add(Dense(len(FISH_CLASSES)))
model.add(Activation('sigmoid'))

model.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=1e-4))



In [24]:
early_stopping = EarlyStopping(monitor='val_loss', patience=4, verbose=1, mode='auto')

model.fit(X_train, y_train, batch_size=64, nb_epoch=1, 
          validation_split=0.2, verbose=1, shuffle=True, callbacks=[early_stopping])

Train on 2416 samples, validate on 605 samples
Epoch 1/1


<keras.callbacks.History at 0x7facf77654e0>

In [25]:
# measure accuracy based on validation set
preds = model.predict(X_valid, verbose=1)
print("Validation Log Loss: {}".format(log_loss(y_valid, preds)))


Validation Log Loss: 1.751053714405292


In [26]:
# Make submission
test_files = [file_name for file_name in os.listdir(TEST_DIR)]
X_test = np.ndarray((len(test_files), ROWS, COLS, CHANNELS), dtype=np.uint8)
for i, file_name in enumerate(test_files):
    file_path = TEST_DIR + file_name
    img_ori = cv2.imread(file_path, cv2.IMREAD_COLOR)
    X_test[i] = cv2.resize(img_ori, (COLS, ROWS), interpolation=cv2.INTER_CUBIC)
test_preds = model.predict(X_test, verbose=1)   






TypeError: __init__() got an unexpected keyword argument 'column'

In [31]:
submission = pd.DataFrame(test_preds, columns=FISH_CLASSES)
submission.insert(0, 'image', test_files)
submission.to_csv('submission.csv', index=False)
submission.head()

Unnamed: 0,image,ALB,BET,DOL,LAG,NoF,OTHER,SHARK,YFT
0,img_00005.jpg,0.892977,0.285425,0.401399,0.20851,0.558435,0.543462,0.286894,0.747706
1,img_00007.jpg,0.868921,0.31083,0.408803,0.240085,0.546238,0.53923,0.310006,0.721155
2,img_00009.jpg,0.840704,0.328983,0.419799,0.260138,0.543532,0.534353,0.331338,0.699787
3,img_00018.jpg,0.898099,0.281932,0.395801,0.205969,0.55486,0.548504,0.28564,0.75008
4,img_00027.jpg,0.879534,0.29895,0.405492,0.223496,0.551615,0.541841,0.302608,0.733403
