In [1]:
import os
import zipfile
import py7zlib
import shutil
import hashlib
from tqdm import tqdm_notebook
from keras import layers

Using TensorFlow backend.


In [2]:
# Root folder that every archive is unpacked into.
data_base_dir = "data"

# Input archives, given as paths relative to the working directory.
train_path, test_stg1_path, test_stg2_path = (
    "train.zip",
    "test_stg1.zip",
    "test_stg2.7z",
)

In [3]:
# Create the extraction root up front. exist_ok=True replaces the
# check-then-create pair, which is racy, and keeps the cell safe to re-run.
os.makedirs(data_base_dir, exist_ok=True)

In [4]:
def extract_zip(zip_path, out_dir):
    """Extract every member of a zip archive into ``out_dir``.

    Args:
        zip_path: Path of the ``.zip`` file to unpack.
        out_dir: Directory that receives the archive contents.

    Returns:
        ``out_dir`` joined with the archive's file name up to its first dot
        (``train.zip`` -> ``<out_dir>/train``). The path is derived from the
        file name only; it is not checked against the archive contents.

    Raises:
        Whatever ``zipfile.ZipFile`` / ``extractall`` raise for a missing or
        invalid archive.
    """
    name = os.path.basename(zip_path).split('.')[0]
    # The context manager closes the archive even when extraction fails,
    # so the file handle is never leaked.
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(out_dir)
    return os.path.join(out_dir, name)

In [5]:
# Unpack the training archive; extract_zip returns <data_base_dir>/train.
orig_train_dir = extract_zip(zip_path=train_path, out_dir=data_base_dir)

In [6]:
# Unpack the stage-1 test archive beneath <data_base_dir>/test.
test_stg1_dir = extract_zip(
    zip_path=test_stg1_path,
    out_dir=os.path.join(data_base_dir, 'test'),
)

In [7]:
%%time
import subprocess

# Unpack the stage-2 archive with the external 7-Zip command-line tool.
# An argument list (instead of one concatenated command string) keeps paths
# that contain spaces intact, and check_call raises CalledProcessError on a
# non-zero exit status so a failed extraction cannot go unnoticed.
# NOTE(review): the executable path is machine-specific (Windows install
# location) - adjust it when running elsewhere.
subprocess.check_call([
    r'C:\Program Files\7-Zip\7z.exe',
    'x', test_stg2_path,
    '-o' + data_base_dir,
])

Wall time: 1min 35s


In [8]:
# Move every entry of the extracted stage-2 folder into the stage-1 folder,
# so a single directory holds the whole test set.
test_stg2_dir = os.path.join(data_base_dir, 'test_stg2')

for img in os.listdir(test_stg2_dir):
    img_path = os.path.join(test_stg2_dir, img)
    shutil.move(img_path, test_stg1_dir)

In [9]:
# Alternative, pure-Python extraction of the stage-2 test archive
# (test_stg2_path) using py7zlib.
# It is disabled by wrapping it in a string literal because it takes forever
# to run; if you are in a hurry, extract the archive with the 7-Zip software.
# NOTE(review): the cell is a bare string expression, so the notebook echoes
# the whole text back as the cell output.
"""
class SevenZFile(object):
    
    def __init__(self, filepath):
        fp = open(filepath, 'rb')
        self.archive = py7zlib.Archive7z(fp)
        
    def is_7zfile(cls, filepath):
        is7z = False
        fp = None
        try:
            fp = open(filepath, 'rb')
            archive = py7zlib.Archive7z(fp)
            n = len(archive.getnames())
            is7z = True
        finally:
            if fp:
                fp.close()
        return is7z

    def extractall(self, path):
        for name in tqdm_notebook(self.archive.getnames()):
            outfilename = os.path.join(path, name)
            outdir = os.path.dirname(outfilename)
            if not os.path.exists(outdir):
                os.makedirs(outdir)
            outfile = open(outfilename, 'wb')
            outfile.write(self.archive.getmember(name).read())
            outfile.close()
SevenZFile(test_stg2_path).extractall(data_base_dir)
"""

"\nclass SevenZFile(object):\n    \n    def __init__(self, filepath):\n        fp = open(filepath, 'rb')\n        self.archive = py7zlib.Archive7z(fp)\n        \n    def is_7zfile(cls, filepath):\n        is7z = False\n        fp = None\n        try:\n            fp = open(filepath, 'rb')\n            archive = py7zlib.Archive7z(fp)\n            n = len(archive.getnames())\n            is7z = True\n        finally:\n            if fp:\n                fp.close()\n        return is7z\n\n    def extractall(self, path):\n        for name in tqdm_notebook(self.archive.getnames()):\n            outfilename = os.path.join(path, name)\n            outdir = os.path.dirname(outfilename)\n            if not os.path.exists(outdir):\n                os.makedirs(outdir)\n            outfile = open(outfilename, 'wb')\n            outfile.write(self.archive.getmember(name).read())\n            outfile.close()\nSevenZFile(test_stg2_path).extractall(data_base_dir)\n"

## Train validation split

In [10]:
# Both halves of the split live side by side under
# <data_base_dir>/train_val_split.
training_dir, validation_dir = (
    os.path.join(data_base_dir, 'train_val_split', subset)
    for subset in ('training', 'validation')
)

In [11]:
# Every sub-directory of the extracted training folder is treated as one class.
classes = [
    entry
    for entry in os.listdir(orig_train_dir)
    if os.path.isdir(os.path.join(orig_train_dir, entry))
]

In [12]:
# Display the discovered class names.
classes

['ALB', 'BET', 'DOL', 'LAG', 'NoF', 'OTHER', 'SHARK', 'YFT']

In [13]:
# Deterministic train/validation split: each image is assigned by the SHA-1
# hash of its file name, so re-running the cell always sends a given file to
# the same subset.
for class_ in classes:

    class_orig_dir = os.path.join(orig_train_dir, class_)
    class_training_dir = os.path.join(training_dir, class_)
    class_validation_dir = os.path.join(validation_dir, class_)

    # exist_ok=True replaces the racy check-then-create pairs and keeps the
    # cell safe to re-run.
    os.makedirs(class_training_dir, exist_ok=True)
    os.makedirs(class_validation_dir, exist_ok=True)

    img_list = os.listdir(class_orig_dir)

    for img in img_list:
        # UTF-8 produces the same bytes as ASCII for ASCII-only names (so the
        # split is unchanged) but does not raise on non-ASCII file names.
        hash_name = hashlib.sha1(img.encode('utf-8'))
        bucket = int(hash_name.hexdigest(), 16) % 1000
        # Buckets 0..100 (101 of 1000) go to validation, the rest to training.
        if bucket > 100:
            target_dir = class_training_dir
        else:
            target_dir = class_validation_dir
        shutil.copy(os.path.join(class_orig_dir, img), target_dir)

## Fine-tuning InceptionResNetV2 (pretrained on ImageNet)

In [14]:
from keras.applications.inception_resnet_v2 import InceptionResNetV2

In [15]:
#from keras.applications.vgg16 import VGG16

In [16]:
# Convolutional base without its classification head. 'imagenet' is already
# the default for `weights`; it is spelled out here for readability.
conv_base = InceptionResNetV2(weights='imagenet', include_top=False)

In [17]:
from keras import models, optimizers

In [18]:
# Start from an empty Sequential container; the layers are added in the next cell.
model = models.Sequential()

In [19]:
# Stack: pretrained base -> global average pooling -> 8-unit softmax output.
for stage in (
    conv_base,
    layers.GlobalAveragePooling2D(),
    layers.Dense(8, activation='softmax'),
):
    model.add(stage)

In [20]:
# Mark the whole pretrained base as non-trainable before the model is compiled.
conv_base.trainable = False

In [21]:
# First training phase: Adam with lr=1e-4.
model.compile(
    optimizer=optimizers.Adam(lr=1e-4),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [28]:
from keras.callbacks import ModelCheckpoint, EarlyStopping

# Write the model to disk each time val_loss reaches a new minimum.
checkpointer = ModelCheckpoint(
    filepath='quicksign_inception_resnet_512.h5',
    monitor='val_loss',
    save_best_only=True,
    verbose=1,
)
# Stop training once val_loss has not improved for 2 epochs.
earlystopper = EarlyStopping(patience=2, monitor='val_loss')


In [23]:
from keras.preprocessing.image import ImageDataGenerator

In [24]:
# Training images are rescaled and randomly augmented;
# validation images are only rescaled.
train_data_gen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
)
validation_data_gen = ImageDataGenerator(rescale=1./255)

In [25]:
# Stream 512x512 batches with one-hot labels from the two split folders.
train_generator = train_data_gen.flow_from_directory(
    training_dir,
    class_mode='categorical',
    target_size=(512, 512),
    batch_size=16,
)
validation_generator = validation_data_gen.flow_from_directory(
    validation_dir,
    class_mode='categorical',
    target_size=(512, 512),
    batch_size=32,
)

Found 3393 images belonging to 8 classes.
Found 384 images belonging to 8 classes.


In [30]:
# Phase 1: train for up to 20 epochs with checkpointing and early stopping.
model.fit_generator(
    train_generator,
    validation_data=validation_generator,
    epochs=20,
    callbacks=[checkpointer, earlystopper],
    verbose=2,
)

Epoch 1/20
 - 276s - loss: 1.5224 - acc: 0.4575 - val_loss: 1.5335 - val_acc: 0.5078

Epoch 00001: val_loss improved from inf to 1.53354, saving model to quicksign_inception_resnet_512.h5
Epoch 2/20
 - 269s - loss: 1.5041 - acc: 0.4586 - val_loss: 1.4945 - val_acc: 0.5182

Epoch 00002: val_loss improved from 1.53354 to 1.49449, saving model to quicksign_inception_resnet_512.h5
Epoch 3/20
 - 268s - loss: 1.4728 - acc: 0.4736 - val_loss: 1.4541 - val_acc: 0.5286

Epoch 00003: val_loss improved from 1.49449 to 1.45407, saving model to quicksign_inception_resnet_512.h5
Epoch 4/20
 - 267s - loss: 1.4443 - acc: 0.4789 - val_loss: 1.4424 - val_acc: 0.5417

Epoch 00004: val_loss improved from 1.45407 to 1.44240, saving model to quicksign_inception_resnet_512.h5
Epoch 5/20
 - 270s - loss: 1.4214 - acc: 0.4982 - val_loss: 1.4376 - val_acc: 0.5182

Epoch 00005: val_loss improved from 1.44240 to 1.43760, saving model to quicksign_inception_resnet_512.h5
Epoch 6/20
 - 269s - loss: 1.4177 - acc: 0.5

<keras.callbacks.History at 0x28725c1d898>

In [31]:
# Reload the file written by `checkpointer` (saved on val_loss improvement)
# before changing which layers are trainable.
model.load_weights('quicksign_inception_resnet_512.h5')

In [32]:
# Replace the early-stopping callback with a more patient one
# (patience 5 instead of 2) for the next training run.
earlystopper = EarlyStopping(monitor='val_loss', patience=5)



In [33]:
# Unfreeze only the tail of the base: every layer stays frozen until the first
# one whose name contains 'conv_7b'; that layer and all later ones are trainable.
conv_base.trainable = True
set_trainable = False
for layer in conv_base.layers:
    set_trainable = set_trainable or 'conv_7b' in layer.name
    layer.trainable = set_trainable
    

In [34]:
# Recompile after changing the trainable flags, with a 10x smaller learning
# rate (1e-5) than the first phase.
model.compile(
    optimizer=optimizers.Adam(lr=1e-5),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [36]:
# Phase 2: fine-tune for up to 100 epochs, reusing the same checkpoint file.
model.fit_generator(
    train_generator,
    validation_data=validation_generator,
    epochs=100,
    callbacks=[checkpointer, earlystopper],
    verbose=2,
)

Epoch 1/100
 - 292s - loss: 1.2777 - acc: 0.5622 - val_loss: 1.2595 - val_acc: 0.5677

Epoch 00001: val_loss improved from 1.31421 to 1.25952, saving model to quicksign_inception_resnet_512.h5
Epoch 2/100
 - 282s - loss: 1.2359 - acc: 0.5754 - val_loss: 1.2148 - val_acc: 0.5729

Epoch 00002: val_loss improved from 1.25952 to 1.21477, saving model to quicksign_inception_resnet_512.h5
Epoch 3/100
 - 278s - loss: 1.1923 - acc: 0.5857 - val_loss: 1.1703 - val_acc: 0.5885

Epoch 00003: val_loss improved from 1.21477 to 1.17029, saving model to quicksign_inception_resnet_512.h5
Epoch 4/100
 - 283s - loss: 1.1662 - acc: 0.5930 - val_loss: 1.1449 - val_acc: 0.5911

Epoch 00004: val_loss improved from 1.17029 to 1.14490, saving model to quicksign_inception_resnet_512.h5
Epoch 5/100
 - 280s - loss: 1.1250 - acc: 0.5971 - val_loss: 1.1133 - val_acc: 0.6120

Epoch 00005: val_loss improved from 1.14490 to 1.11333, saving model to quicksign_inception_resnet_512.h5
Epoch 6/100
 - 277s - loss: 1.0926 

Epoch 00043: val_loss improved from 0.69689 to 0.69321, saving model to quicksign_inception_resnet_512.h5
Epoch 44/100
 - 252s - loss: 0.6683 - acc: 0.7911 - val_loss: 0.6893 - val_acc: 0.7943

Epoch 00044: val_loss improved from 0.69321 to 0.68932, saving model to quicksign_inception_resnet_512.h5
Epoch 45/100
 - 250s - loss: 0.6594 - acc: 0.7905 - val_loss: 0.6835 - val_acc: 0.7969

Epoch 00045: val_loss improved from 0.68932 to 0.68347, saving model to quicksign_inception_resnet_512.h5
Epoch 46/100
 - 251s - loss: 0.6476 - acc: 0.7923 - val_loss: 0.6781 - val_acc: 0.7891

Epoch 00046: val_loss improved from 0.68347 to 0.67809, saving model to quicksign_inception_resnet_512.h5
Epoch 47/100
 - 251s - loss: 0.6394 - acc: 0.7940 - val_loss: 0.6665 - val_acc: 0.8073

Epoch 00047: val_loss improved from 0.67809 to 0.66650, saving model to quicksign_inception_resnet_512.h5
Epoch 48/100
 - 248s - loss: 0.6370 - acc: 0.7893 - val_loss: 0.6710 - val_acc: 0.7969

Epoch 00048: val_loss did not 

Epoch 00088: val_loss improved from 0.54454 to 0.54193, saving model to quicksign_inception_resnet_512.h5
Epoch 89/100
 - 253s - loss: 0.4992 - acc: 0.8457 - val_loss: 0.5341 - val_acc: 0.8411

Epoch 00089: val_loss improved from 0.54193 to 0.53405, saving model to quicksign_inception_resnet_512.h5
Epoch 90/100
 - 255s - loss: 0.4933 - acc: 0.8421 - val_loss: 0.5341 - val_acc: 0.8411

Epoch 00090: val_loss did not improve from 0.53405
Epoch 91/100
 - 253s - loss: 0.4820 - acc: 0.8551 - val_loss: 0.5408 - val_acc: 0.8385

Epoch 00091: val_loss did not improve from 0.53405
Epoch 92/100
 - 257s - loss: 0.4844 - acc: 0.8498 - val_loss: 0.5346 - val_acc: 0.8411

Epoch 00092: val_loss did not improve from 0.53405
Epoch 93/100
 - 253s - loss: 0.4776 - acc: 0.8489 - val_loss: 0.5294 - val_acc: 0.8490

Epoch 00093: val_loss improved from 0.53405 to 0.52944, saving model to quicksign_inception_resnet_512.h5
Epoch 94/100
 - 258s - loss: 0.4766 - acc: 0.8557 - val_loss: 0.5255 - val_acc: 0.8385

E

<keras.callbacks.History at 0x287262620b8>

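### Retrain on the whole dataset

_TODO: this step is not implemented in the notebook; the predictions below come from the model fitted on the training split only._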

### Predictions and submission

In [37]:
# Test images only need the same rescaling that was applied during training.
test_data_gen = ImageDataGenerator(rescale=1./255)

# Read from the folder the test archives were extracted into, derived from
# data_base_dir instead of a second hard-coded copy of the path.
# class_mode=None: the test set is unlabeled, so the iterator yields image
# batches only, which is what predict_generator consumes.
# shuffle=False keeps the prediction rows in the iterator's file order.
test_generator = test_data_gen.flow_from_directory(
    os.path.join(data_base_dir, 'test'),
    target_size=(512, 512),
    batch_size=64,
    class_mode=None,
    shuffle=False,
)

Found 13153 images belonging to 1 classes.


In [38]:
# After training, the in-memory model holds the weights of the last epoch
# that ran, which need not be the best one. Restore the checkpoint written by
# ModelCheckpoint(save_best_only=True) first - the same reload the notebook
# already performs after the first training phase.
model.load_weights('quicksign_inception_resnet_512.h5')
preds = model.predict_generator(test_generator, verbose=1)



In [39]:
# Sanity check: one row per test image, one column per class.
preds.shape

(13153, 8)

In [41]:
import numpy as np

# Take the file names from the generator itself: with shuffle=False its
# `filenames` list is in the order the images were fed to predict_generator.
# os.listdir returns entries in arbitrary order, so it is not guaranteed to
# line up with the rows of `preds`.
im_names = np.array([os.path.basename(path) for path in test_generator.filenames])

In [42]:
# Names containing 'image' get a 'test_stg2/' prefix; all others stay as they are.
im_names = [
    name if 'image' not in name else 'test_stg2/' + name
    for name in im_names
]

In [43]:
import pandas as pd

In [44]:
# Single-column frame holding the image identifiers.
df_names = pd.DataFrame({'image': im_names})

In [45]:
# One probability column per class.
# NOTE(review): the column order must match the class indices used by the
# training generator - confirm against train_generator.class_indices.
df_preds = pd.DataFrame(
    data=preds,
    columns=['ALB', 'BET', 'DOL', 'LAG', 'NoF', 'OTHER', 'SHARK', 'YFT'],
)

In [46]:
# Place the image names next to their class probabilities (column-wise concat;
# rows are aligned on the two frames' indexes).
df_submission = pd.concat([df_names, df_preds], axis=1)

In [47]:
# Write the submission file without the index column.
df_submission.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows of the submission.
df_submission.head()