In [1]:
from sklearn.datasets import load_files
from keras.utils import np_utils
import numpy as np
import pandas as pd
from glob import glob

Using TensorFlow backend.


In [2]:
#function to load the dataset
def load_dataset(path, num_classes=8):
    """Return image file paths and one-hot encoded targets for a data folder.

    Parameters
    ----------
    path : str
        Container folder handed to ``sklearn.datasets.load_files``.
    num_classes : int, optional
        Width of the one-hot encoding produced by ``to_categorical``.
        Defaults to 8, matching the 8-unit softmax layer used in this
        notebook.

    Returns
    -------
    tuple
        ``(fish_files, fish_target)`` - the array of file names and the
        one-hot encoded target array.
    """
    # Only 'filenames' and 'target' are used below, so skip reading every
    # file's content into memory (load_content defaults to True).
    data = load_files(path, load_content=False)
    fish_files = np.array(data['filenames'])
    fish_target = np_utils.to_categorical(np.array(data['target']), num_classes)
    return fish_files, fish_target

In [3]:
#let's load the training data (file paths and one-hot targets)
train_files, train_targets = load_dataset('data/train')

#let's load the testing data; the targets returned for it are not used
test_files, _ = load_dataset('data/test')

#print the number of samples in the training and test sets
print ("There are %d images in training dataset"%len(train_files))
#this line reports the test set, so label it as such
print ("There are %d images in the test set"%len(test_files))

There are 3777 images in training dataset
There are 13153 images in the training set


In [9]:
from keras.preprocessing import image
from tqdm import tqdm

#converting image to tensor
def path_to_tensor(img_path):
    """Load one image file and return it as a single-item batch.

    The image is loaded resized to 224x224, converted to an array, and
    given a new leading axis of length 1 so that the results for several
    images can be stacked along that axis.
    """
    # load the image at the fixed 224x224 target size
    pil_img = image.load_img(img_path, target_size=(224,224))
    # image -> 3-D array; the notebook's shape printout implies (224, 224, 3)
    pixels = image.img_to_array(pil_img)
    # prepend the batch axis: 3-D array -> 4-D array
    return pixels[np.newaxis, ...]

def paths_to_tensor(img_paths):
    """Convert every path in ``img_paths`` with ``path_to_tensor`` and stack
    the results along the first axis, showing a tqdm progress bar."""
    stacked_inputs = []
    for single_path in tqdm(img_paths):
        stacked_inputs.append(path_to_tensor(single_path))
    return np.vstack(stacked_inputs)

In [10]:
from PIL import ImageFile
# allow PIL to load image files whose data is truncated
ImageFile.LOAD_TRUNCATED_IMAGES = True

#preprocessing the data: cast to float32 and divide every value by 255.
#copy=False plus the in-place divide avoid allocating extra full-size
#copies of this very large array; the resulting values are unchanged.
test_tensors = paths_to_tensor(test_files).astype('float32', copy=False)
test_tensors /= 255

100%|████████████████████████████████████████████████████████████████████████████| 13153/13153 [05:26<00:00, 40.64it/s]


In [11]:
#preprocessing the training data: cast to float32 and divide every value by 255.
#copy=False plus the in-place divide avoid allocating extra full-size
#copies of the array; the resulting values are unchanged.
train_tensors = paths_to_tensor(train_files).astype('float32', copy=False)
train_tensors /= 255

100%|██████████████████████████████████████████████████████████████████████████████| 3777/3777 [01:38<00:00, 38.49it/s]


In [12]:
#shape of the training tensor
print(train_tensors.shape)

(3777, 224, 224, 3)


# Model 4, Building a new model from scratch

In [13]:
from keras.layers import Dense, Conv2D, MaxPooling2D, GlobalAveragePooling2D, Dropout
from keras.models import Sequential

# Three Conv2D -> MaxPooling2D -> Dropout stages (16, 32, 64 filters),
# followed by global average pooling and an 8-unit softmax classifier.
model4 = Sequential([
    # first convolution stage; also declares the 224x224x3 input shape
    Conv2D(filters=16, kernel_size=2, strides=1, activation='relu',
           input_shape=(224,224,3)),
    # max pooling to reduce the spatial dimensions
    MaxPooling2D(pool_size=2, strides=2),
    # dropout with rate 0.2
    Dropout(0.2),
    # second convolution stage
    Conv2D(filters=32, kernel_size=2, strides=1, activation='relu'),
    MaxPooling2D(pool_size=2, strides=2),
    Dropout(0.2),
    # third convolution stage
    Conv2D(filters=64, kernel_size=2, strides=1, activation='relu'),
    MaxPooling2D(pool_size=2, strides=2),
    Dropout(0.2),
    GlobalAveragePooling2D(),
    # fully connected layer with 8 nodes (number of fish classes)
    Dense(8, activation='softmax'),
])
model4.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_4 (Conv2D)            (None, 223, 223, 16)      208       
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 111, 111, 16)      0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 111, 111, 16)      0         
_________________________________________________________________
conv2d_5 (Conv2D)            (None, 110, 110, 32)      2080      
_________________________________________________________________
max_pooling2d_5 (MaxPooling2 (None, 55, 55, 32)        0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 55, 55, 32)        0         
_________________________________________________________________
conv2d_6 (Conv2D)            (None, 54, 54, 64)        8256      
__________

### Compiling Model 4

In [14]:
# Adam optimizer, categorical cross-entropy loss, accuracy reported as a metric
model4.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

### Training Model 4

In [15]:
from keras.callbacks import ModelCheckpoint, EarlyStopping


epochs = 10

# stop as soon as val_loss fails to improve by at least 0.01 (patience=0)
stop_on_plateau = EarlyStopping(monitor='val_loss', min_delta=0.01,
                                patience=0, verbose=1)
# keep only the best weights seen so far on disk
save_best_weights = ModelCheckpoint(
    filepath='saved_models/weights.best.model4.hdf5',
    verbose=1,
    save_best_only=True,
)
checkpointer_4 = [stop_on_plateau, save_best_weights]

# the last 30% of the training arrays is held out for validation
model4.fit(
    train_tensors,
    train_targets,
    batch_size=20,
    epochs=epochs,
    callbacks=checkpointer_4,
    validation_split=0.3,
    verbose=1,
)

Train on 2643 samples, validate on 1134 samples
Epoch 1/10

Epoch 00001: val_loss improved from inf to 1.65566, saving model to saved_models/weights.best.model4.hdf5
Epoch 2/10

Epoch 00002: val_loss improved from 1.65566 to 1.61160, saving model to saved_models/weights.best.model4.hdf5
Epoch 3/10



Epoch 00003: val_loss improved from 1.61160 to 1.57431, saving model to saved_models/weights.best.model4.hdf5
Epoch 4/10

Epoch 00004: val_loss improved from 1.57431 to 1.55226, saving model to saved_models/weights.best.model4.hdf5
Epoch 5/10



Epoch 00005: val_loss improved from 1.55226 to 1.53973, saving model to saved_models/weights.best.model4.hdf5
Epoch 6/10

Epoch 00006: val_loss improved from 1.53973 to 1.47847, saving model to saved_models/weights.best.model4.hdf5
Epoch 7/10

Epoch 00007: val_loss did not improve from 1.47847
Epoch 00007: early stopping


<keras.callbacks.History at 0x2277ab0b9b0>

## Predictions for Model 4

### Loading the best saved weights of Model 4

In [16]:
#restoring the weights written to this path by the ModelCheckpoint callback
best_weights_path = 'saved_models/weights.best.model4.hdf5'
model4.load_weights(best_weights_path)

### Predictions

In [17]:
#making predictions
#Predict over the whole test array in batches instead of issuing one
#predict() call per image, which is far slower for thousands of images.
#The per-image loop yielded one (1, 8) array per image, so a singleton
#axis is re-inserted here to keep the (n_images, 1, 8) layout that the
#later swapaxes / [0] steps in this notebook operate on.
model4_prediction = model4.predict(test_tensors)[:, np.newaxis, :]

### Processing the Predictions

In [18]:
#exchange the first two axes of model4_prediction so that indexing [0]
#afterwards yields one row of class probabilities per image
model4_prediction = np.asarray(model4_prediction).swapaxes(0, 1)

In [19]:
import pandas as pd

#creating a pandas dataframe with model 4's predictions, one column per class
class_columns = ['ALB','BET','DOL','LAG','NoF','OTHER','SHARK','YFT']
df_pred_model4 = pd.DataFrame(model4_prediction[0], columns=class_columns)

In [21]:
#test_files[0]

import os

#extracting the file name of each image from its full path
#(basename does not depend on the directory prefix being exactly 15
#characters long, unlike a fixed slice offset)
image_names = [os.path.basename(file_path) for file_path in test_files]

In [22]:
#adjusting the filename of the image to match the submission guidelines:
#names with '_' at index 5 (the "image_XXXXX.jpg" form) get the
#"test_stg2/" prefix; all other names are left unchanged.
#Iterating over the list itself removes the hard-coded count of 13153, and
#the [5:6] slice cannot raise IndexError on a name shorter than 6 characters.
image_names = [
    "test_stg2/" + name if name[5:6] == '_' else name
    for name in image_names
]

In [23]:
#adding image names to our dataframe (assign the list directly; a length
#mismatch raises instead of being silently index-aligned)
df_pred_model4['image'] = image_names

#moving the 'image' column to the front; plain column selection replaces
#the deprecated DataFrame.reindex_axis, which newer pandas no longer provides
submission_columns = ['image','ALB','BET','DOL','LAG','NoF','OTHER','SHARK','YFT']
df_pred_model4 = df_pred_model4[submission_columns]

  """


In [24]:
#inspect the last 10 rows of the prediction table
df_pred_model4.iloc[-10:]

Unnamed: 0,image,ALB,BET,DOL,LAG,NoF,OTHER,SHARK,YFT
13143,test_stg2/image_05875.jpg,0.427824,0.072496,0.059171,0.011158,0.1804,0.024542,0.006113,0.218295
13144,test_stg2/image_04374.jpg,0.400981,0.087192,0.072842,0.01808,0.147266,0.032463,0.013517,0.22766
13145,test_stg2/image_07892.jpg,0.396636,0.071194,0.039691,0.034079,0.095064,0.113421,0.070739,0.179176
13146,test_stg2/image_09226.jpg,0.364504,0.085395,0.065137,0.024323,0.151262,0.061419,0.028545,0.219416
13147,test_stg2/image_04860.jpg,0.229461,0.060167,0.040366,0.014635,0.472447,0.057282,0.004087,0.121556
13148,img_07578.jpg,0.3284,0.081171,0.088251,0.008882,0.195327,0.01824,0.005119,0.274612
13149,test_stg2/image_03265.jpg,0.389653,0.071608,0.063397,0.010416,0.213824,0.023768,0.005262,0.222071
13150,test_stg2/image_09846.jpg,0.210877,0.058776,0.063438,0.004489,0.419978,0.014641,0.00114,0.226661
13151,test_stg2/image_10800.jpg,0.422967,0.067524,0.036494,0.034155,0.07893,0.105252,0.081145,0.173533
13152,test_stg2/image_02733.jpg,0.247614,0.067075,0.087595,0.003842,0.281914,0.009291,0.001287,0.301382


## Creating the .csv file for submission

In [25]:
#write the predictions to disk without the row-index column
submission_path = 'submission4.csv'
df_pred_model4.to_csv(submission_path, index=False)

-------------------

# Public Score - 1.50787 and Private Score - 1.76167