In [1]:
import os
from glob import glob

import numpy as np
import pandas as pd
from keras.utils import np_utils
from sklearn.datasets import load_files

Using TensorFlow backend.


In [2]:
#function to load the dataset
def load_dataset(path, num_classes=8):
    """Collect image file paths and one-hot encoded labels from a directory tree.

    Args:
        path: Root directory handed to ``sklearn.datasets.load_files``; each
            sub-directory is treated as one category.
        num_classes: Width of the one-hot encoding. Defaults to 8, the value
            this notebook has always used.

    Returns:
        A ``(fish_files, fish_target)`` tuple: a NumPy array of file paths and
        the matching one-hot encoded target matrix.
    """
    # load_content=False: only 'filenames' and 'target' are used below, so there
    # is no reason to read the bytes of every image file into memory.
    data = load_files(path, load_content=False)
    fish_files = np.array(data['filenames'])
    fish_target = np_utils.to_categorical(np.array(data['target']), num_classes)
    return fish_files, fish_target

In [3]:
#load the training data (file paths and one-hot targets)
train_files, train_targets = load_dataset('data/train')

#load the test data; only the file paths are kept, the targets are discarded
test_files, _ = load_dataset('data/test')

#print the number of samples in the training and test sets
print ("There are %d images in training dataset"%len(train_files))
#this count is for test_files, so label it as the test set
print ("There are %d images in the test set"%len(test_files))

There are 3777 images in training dataset
There are 13153 images in the training set


In [5]:
from keras.preprocessing import image
from tqdm import tqdm

#turn one image file into a batch-of-one tensor
def path_to_tensor(img_path):
    """Load the image at ``img_path`` and return it as a 4-D array.

    The image is loaded with a target size of 224x224, converted to an array,
    and given a new leading axis of length 1.
    """
    # load the image, resized to 224x224
    pil_img = image.load_img(img_path, target_size=(224, 224))
    # image -> 3-D array
    tensor_3d = image.img_to_array(pil_img)
    # prepend an axis so the result is 4-D
    return tensor_3d[np.newaxis, ...]

def paths_to_tensor(img_paths):
    """Convert every path in ``img_paths`` with ``path_to_tensor`` and stack the
    results along the first axis, showing a tqdm progress bar while loading."""
    batch = []
    for single_path in tqdm(img_paths):
        batch.append(path_to_tensor(single_path))
    # each entry has a leading axis of length 1, so joining on axis 0 stacks them
    return np.concatenate(batch, axis=0)

In [6]:
from PIL import ImageFile

#let PIL load image files that are truncated
ImageFile.LOAD_TRUNCATED_IMAGES = True

#preprocessing the data: cast the test tensors to float32 and divide every value by 255
test_tensors = np.divide(paths_to_tensor(test_files).astype(np.float32), 255)

100%|███████████████████████████████████████████████████████████████████████████| 13153/13153 [02:06<00:00, 103.80it/s]


In [7]:
#same preprocessing for the training images: float32 cast, then divide by 255
train_tensors = np.divide(paths_to_tensor(train_files).astype(np.float32), 255)

100%|██████████████████████████████████████████████████████████████████████████████| 3777/3777 [01:38<00:00, 41.52it/s]


In [8]:
#dimensions of the stacked training tensor
print(train_tensors.shape)

(3777, 224, 224, 3)


# Benchmark Model

In [9]:
# Let's have a Benchmark model

from keras.layers import Dense, Conv2D, MaxPooling2D, GlobalAveragePooling2D, Dropout
from keras.models import Sequential

# The benchmark network, declared as an ordered list of layers.
benchmark = Sequential([
    # first convolution layer; it also declares the 224x224x3 input shape
    Conv2D(filters=16, kernel_size=2, strides=1, activation='relu', input_shape=(224, 224, 3)),
    # max pooling to reduce the spatial dimensionality
    MaxPooling2D(pool_size=2, strides=2),
    # dropout with rate 0.3
    Dropout(0.3),
    # second convolution layer
    Conv2D(filters=32, kernel_size=2, strides=1, activation='relu'),
    Dropout(0.3),
    # average every feature map down to a single value
    GlobalAveragePooling2D(),
    # fully connected softmax layer with 8 nodes (number of fish classes)
    Dense(8, activation='softmax'),
])
benchmark.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 223, 223, 16)      208       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 111, 111, 16)      0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 111, 111, 16)      0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 110, 110, 32)      2080      
_________________________________________________________________
dropout_2 (Dropout)          (None, 110, 110, 32)      0         
_________________________________________________________________
global_average_pooling2d_1 ( (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 8)                 264       
Total para

In [10]:
#configure training: rmsprop optimizer, categorical cross-entropy loss, accuracy reported as a metric
benchmark.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

In [11]:
from keras.callbacks import ModelCheckpoint, EarlyStopping


epochs = 10

#make sure the checkpoint directory exists, so saving the weights cannot fail
#on a fresh checkout just because 'saved_models/' has not been created yet
os.makedirs('saved_models', exist_ok=True)

#checkpointer saves the best weights.
checkpointer = ModelCheckpoint(filepath='saved_models/weights.best.benchmark.hdf5', verbose=1, save_best_only=True)

#train with a 20% validation split; the checkpointer callback runs during training
benchmark.fit(train_tensors, train_targets, batch_size=20, epochs=epochs, callbacks=[checkpointer], validation_split=0.2, verbose=1)

Train on 3021 samples, validate on 756 samples
Epoch 1/10

Epoch 00001: val_loss improved from inf to 1.61031, saving model to saved_models/weights.best.benchmark.hdf5
Epoch 2/10

Epoch 00002: val_loss did not improve from 1.61031
Epoch 3/10



Epoch 00003: val_loss improved from 1.61031 to 1.59026, saving model to saved_models/weights.best.benchmark.hdf5
Epoch 4/10

Epoch 00004: val_loss improved from 1.59026 to 1.57265, saving model to saved_models/weights.best.benchmark.hdf5
Epoch 5/10



Epoch 00005: val_loss did not improve from 1.57265
Epoch 6/10

Epoch 00006: val_loss improved from 1.57265 to 1.54906, saving model to saved_models/weights.best.benchmark.hdf5
Epoch 7/10



Epoch 00007: val_loss did not improve from 1.54906
Epoch 8/10

Epoch 00008: val_loss improved from 1.54906 to 1.52976, saving model to saved_models/weights.best.benchmark.hdf5
Epoch 9/10



Epoch 00009: val_loss improved from 1.52976 to 1.51633, saving model to saved_models/weights.best.benchmark.hdf5
Epoch 10/10

Epoch 00010: val_loss did not improve from 1.51633


<keras.callbacks.History at 0x157ffa88198>

## Predictions for Benchmark

### Loading the best weights of the benchmark model

In [12]:
#restore the weights stored at the checkpoint path before predicting
benchmark.load_weights('saved_models/weights.best.benchmark.hdf5')

### Predictions

In [13]:
#predict the whole test set in one batched call instead of one predict() call per image;
#the extra middle axis keeps the (n_images, 1, n_classes) layout that the
#per-image version produced, so code indexing this result keeps working
benchmark_model_prediction = benchmark.predict(test_tensors)[:, np.newaxis, :]

### Processing the Predictions

In [14]:
#class labels, used as the column names of the prediction dataframe
class_labels = ['ALB','BET','DOL','LAG','NoF','OTHER','SHARK','YFT']

#bring the predictions into shape (1, n_images, n_classes) for easy handling;
#for the incoming (n_images, 1, n_classes) layout this equals swapping axes 0 and 1,
#and unlike a plain swap it gives the same result when the cell is run again
benchmark_model_prediction = np.asarray(benchmark_model_prediction).reshape(1, -1, len(class_labels))

#visualizing the prediction for the first image
print(benchmark_model_prediction[0, :1])

#creating a pandas dataframe with the benchmark model's predictions
df_pred_model1 = pd.DataFrame(benchmark_model_prediction[0], columns=class_labels)

#first five rows of df_pred_model1 dataframe
print(df_pred_model1[:5])

[[0.42553875 0.05191546 0.03911333 0.01771741 0.06787238 0.06665134
  0.0635898  0.26760146]]
        ALB       BET       DOL       LAG       NoF     OTHER     SHARK  \
0  0.425539  0.051915  0.039113  0.017717  0.067872  0.066651  0.063590   
1  0.479572  0.043038  0.030075  0.020164  0.235390  0.055788  0.014474   
2  0.432765  0.043701  0.026840  0.024427  0.299017  0.066578  0.013751   
3  0.506477  0.047196  0.027785  0.024373  0.201054  0.063204  0.020254   
4  0.464244  0.044821  0.029505  0.022817  0.252187  0.061723  0.015182   

        YFT  
0  0.267601  
1  0.121501  
2  0.092921  
3  0.109657  
4  0.109522  


In [15]:
#extracting the file name of each image from its full path;
#basename does not depend on the length of the directory prefix
image_names = [os.path.basename(file_path) for file_path in test_files]

In [16]:
#adjusting the filename of the image to match the submission guidelines:
#names starting with "image_" get the "test_stg2/" prefix, all others stay unchanged.
#Iterating over the list itself avoids a hard-coded element count, and
#startswith() cannot raise IndexError on a short name.
image_names = [
    "test_stg2/" + name if name.startswith("image_") else name
    for name in image_names
]

In [17]:
#spot-check a single entry of image_names
image_names[1323]

'test_stg2/image_04056.jpg'

In [18]:
#adding image names to our dataframe (a plain list is assigned positionally)
df_pred_model1['image'] = image_names

#reordering the columns so that 'image' comes first;
#reindex(columns=...) replaces reindex_axis, which was deprecated and later removed from pandas
df_pred_model1 = df_pred_model1.reindex(columns=['image','ALB','BET','DOL','LAG','NoF','OTHER','SHARK','YFT'])

#printing the first five rows of dataframe
print(df_pred_model1[:5])

  """


                       image       ALB       BET       DOL       LAG  \
0  test_stg2/image_10973.jpg  0.425539  0.051915  0.039113  0.017717   
1  test_stg2/image_00175.jpg  0.479572  0.043038  0.030075  0.020164   
2  test_stg2/image_09645.jpg  0.432765  0.043701  0.026840  0.024427   
3              img_02920.jpg  0.506477  0.047196  0.027785  0.024373   
4  test_stg2/image_09349.jpg  0.464244  0.044821  0.029505  0.022817   

        NoF     OTHER     SHARK       YFT  
0  0.067872  0.066651  0.063590  0.267601  
1  0.235390  0.055788  0.014474  0.121501  
2  0.299017  0.066578  0.013751  0.092921  
3  0.201054  0.063204  0.020254  0.109657  
4  0.252187  0.061723  0.015182  0.109522  


In [19]:
#show the last ten rows of the prediction dataframe
df_pred_model1.tail(10)

Unnamed: 0,image,ALB,BET,DOL,LAG,NoF,OTHER,SHARK,YFT
13143,test_stg2/image_05875.jpg,0.515315,0.04194,0.032962,0.016914,0.178082,0.047733,0.015448,0.151607
13144,test_stg2/image_04374.jpg,0.540486,0.040701,0.034733,0.013437,0.118196,0.03811,0.019496,0.194842
13145,test_stg2/image_07892.jpg,0.452034,0.052957,0.033044,0.021281,0.106285,0.081277,0.050915,0.202207
13146,test_stg2/image_09226.jpg,0.45002,0.052148,0.036669,0.020324,0.128799,0.070184,0.03701,0.204846
13147,test_stg2/image_04860.jpg,0.276634,0.042274,0.030476,0.024683,0.448714,0.07451,0.009589,0.09312
13148,img_07578.jpg,0.501905,0.041839,0.035838,0.014108,0.145416,0.04261,0.017613,0.200671
13149,test_stg2/image_03265.jpg,0.556458,0.035502,0.025563,0.01357,0.167819,0.042091,0.013229,0.145767
13150,test_stg2/image_09846.jpg,0.437534,0.036371,0.027692,0.015496,0.283582,0.052302,0.009798,0.137225
13151,test_stg2/image_10800.jpg,0.509217,0.04743,0.028931,0.018075,0.084743,0.067471,0.049382,0.194752
13152,test_stg2/image_02733.jpg,0.484187,0.037991,0.033073,0.013554,0.196655,0.043481,0.01185,0.17921


## Writing the .csv file for submission

In [20]:
#write the predictions to a CSV file; index=False leaves the row index out of the file
df_pred_model1.to_csv('submission1-benchmark.csv',index=False)

---