In [4]:
import pandas as pd
import numpy as np
import cv2  
seed = 251736
np.random.seed(seed)

from sklearn.model_selection import StratifiedKFold

from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Activation
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Input, concatenate
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard
from keras.layers.normalization import BatchNormalization
from keras.regularizers import l1, l2, l1_l2
from keras.optimizers import Adam

import os

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [11]:
def get_scaled_imgs(df):
    """
    basic function for reshaping and rescaling data as images
    """
    imgs = []

    for i, row in df.iterrows():
        # make 75x75 image
        band_1 = np.array(row['band_1']).reshape(75, 75)
        band_2 = np.array(row['band_2']).reshape(75, 75)
        band_3 = band_1 + band_2  # plus since log(x*y) = log(x) + log(y)

        # Rescale
        a = (band_1 - band_1.mean()) / (band_1.max() - band_1.min())
        b = (band_2 - band_2.mean()) / (band_2.max() - band_2.min())
        c = (band_3 - band_3.mean()) / (band_3.max() - band_3.min())

        imgs.append(np.dstack((a, b,c)))

    return np.array(imgs)

In [12]:
def central_crop(imgs, cropsize):
    """
    central crop for images
    """

    x = imgs.shape[1]
    y = imgs.shape[2]
    startx = x // 2 - (cropsize // 2)
    starty = y // 2 - (cropsize // 2)
    return imgs[:, startx:startx + cropsize, starty:starty + cropsize, :]

In [13]:
def get_more_images(imgs):
    """
    augmentation for more data
    """

    more_images = []
    vert_flip_imgs = []
    hori_flip_imgs = []

    # Central crop
    # imgs = central_crop(imgs, cropsize)

    for i in range(0, imgs.shape[0]):
        a = imgs[i, :, :, 0]
        b = imgs[i, :, :, 1]
        c = imgs[i, :, :, 2]

        av = cv2.flip(a, 1)
        ah = cv2.flip(a, 0)
        bv = cv2.flip(b, 1)
        bh = cv2.flip(b, 0)
        cv = cv2.flip(c, 1)
        ch = cv2.flip(c, 0)

        vert_flip_imgs.append(np.dstack((av, bv,cv)))
        hori_flip_imgs.append(np.dstack((ah, bh,ch)))

    v = np.array(vert_flip_imgs)
    h = np.array(hori_flip_imgs)

    more_images = np.concatenate((imgs, v, h))

    return more_images

In [14]:
def get_model():
    
    """
    Keras Sequential model

    """
    
    model=Sequential()
    
    # Conv block 1
    model.add(Conv2D(64, kernel_size=(3, 3),activation='relu', input_shape=(75, 75, 3)))
    model.add(Conv2D(64, kernel_size=(3, 3), activation='relu' ))
    model.add(Conv2D(64, kernel_size=(3, 3), activation='relu' ))
    model.add(MaxPooling2D(pool_size=(3, 3), strides=(2, 2)))
   
    # Conv block 2
    model.add(Conv2D(128, kernel_size=(3, 3), activation='relu' ))
    model.add(Conv2D(128, kernel_size=(3, 3), activation='relu' ))
    model.add(Conv2D(128, kernel_size=(3, 3), activation='relu' ))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
   
    # Conv block 3
    model.add(Conv2D(128, kernel_size=(3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
   
    #Conv block 4
    model.add(Conv2D(256, kernel_size=(3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
   
    # Flatten before dense
    model.add(Flatten())

    #Dense 1
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(0.4))

    #Dense 2
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.2))

    # Output 
    model.add(Dense(1, activation="sigmoid"))

    optimizer = Adam(lr=0.0001, decay=0.0)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    
    return model

In [15]:
df_train = pd.read_json('./input/train.json')  # this is a dataframe


Xtrain = get_scaled_imgs(df_train)
Ytrain = np.array(df_train['is_iceberg'])
df_train.inc_angle = df_train.inc_angle.replace('na', 0)
idx_tr = np.where(df_train.inc_angle > 0)

Ytrain = Ytrain[idx_tr[0]]
Xtrain = Xtrain[idx_tr[0], ...]
# Xangle = df_train.inc_angle[idx_tr[0]]

Xtr_more = get_more_images(Xtrain)
# Xangle_more = np.concatenate((Xangle, Xangle, Xangle, Xangle))
Ytr_more = np.concatenate((Ytrain, Ytrain, Ytrain))

# Test data
df_test = pd.read_json('./input/test.json')
df_test.inc_angle = df_test.inc_angle.replace('na', 0)
Xtest = (get_scaled_imgs(df_test))
# Xtest = central_crop(Xtest, 50)

In [7]:
expname = 'v7'
folds = 10
batch_size = 32
epochs = 30

In [9]:
dirs = {'logs': f'./logs/{expname}',
        'model': f'./checkpoints/{expname}', 'result': f'./result/{expname}'}
for i in dirs.values():
    if not os.path.exists(i):
        os.makedirs(i)

In [21]:
kfold = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
for fold_n, (train, test) in enumerate(kfold.split(Xtr_more, Ytr_more)):
    print(f"FOLD {fold_n}: ")
    # Since we use augmentation, angle changes on augmented images, so we do not pass angle to model
    model = get_model()
    

    MODEL_FILE = f'{dirs["model"]}/mdl_simple_k{fold_n}_wght.hdf5'

    mcp_save = ModelCheckpoint(
        MODEL_FILE, save_best_only=True, monitor='val_loss', mode='min')
    reduce_lr_loss = ReduceLROnPlateau(
        monitor='val_loss', factor=0.1, patience=15, verbose=1, epsilon=1e-4, mode='min')
    tb = TensorBoard(log_dir=f'{dirs["logs"]}/{fold_n}')
    es = EarlyStopping(monitor='val_loss', patience=10, verbose=1, mode='min')

    model.fit(Xtr_more[train], Ytr_more[train],
              batch_size=batch_size,
              epochs=epochs,
              verbose=1,
              validation_data=(
                  Xtr_more[test], Ytr_more[test]),
              callbacks=[mcp_save, reduce_lr_loss, tb, ])

    model.load_weights(filepath=MODEL_FILE)

    score = model.evaluate(
        Xtr_more[test], Ytr_more[test], verbose=1)
    print('\n Val score:', score[0])
    print('\n Val accuracy:', score[1])

    SUBMISSION = f'{dirs["result"]}/sub_part{fold_n}.csv'

    
    pred_test = model.predict(Xtest)

    submission = pd.DataFrame(
        {'id': df_test["id"], 'is_iceberg': pred_test.reshape((pred_test.shape[0]))})
    print(submission.head(10))

    submission.to_csv(SUBMISSION, index=False)
    print(f"submission part {fold_n} saved")

FOLD 0: 
Train on 3971 samples, validate on 442 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30

 Val score: 0.19934389291845295

 Val accuracy: 0.9298642552815951
         id  is_iceberg
0  5941774d    0.024316
1  4023181e    0.853870
2  b20200e4    0.189898
3  e7f018bb    0.999083
4  4371c8c3    0.012572
5  a8d9b1fd    0.978093
6  29e7727e    0.074941
7  92a51ffb    0.999374
8  c769ac97    0.000017
9  aee0547d    0.000003
submission part 0 saved
FOLD 1: 
Train on 3971 samples, validate on 442 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/3

Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30

 Val score: 0.18527390143720274

 Val accuracy: 0.9321266989902134
         id    is_iceberg
0  5941774d  2.701466e-03
1  4023181e  7.725724e-01
2  b20200e4  9.221743e-02
3  e7f018bb  9.986681e-01
4  4371c8c3  4.435530e-02
5  a8d9b1fd  7.808241e-01
6  29e7727e  1.566338e-02
7  92a51ffb  9.992700e-01
8  c769ac97  1.691815e-06
9  aee0547d  7.307015e-08
submission part 3 saved
FOLD 4: 
Train on 3972 samples, validate on 441 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30

 Val score: 0.1682952735699764

 Val accuracy: 0.9229024918982232
         id    is_iceberg
0 

Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 00029: reducing learning rate to 9.999999747378752e-06.
Epoch 30/30

 Val score: 0.19901894715502688

 Val accuracy: 0.9183673445059328
         id  is_iceberg
0  5941774d    0.026669
1  4023181e    0.687751
2  b20200e4    0.081889
3  e7f018bb    0.994439
4  4371c8c3    0.035597
5  a8d9b1fd    0.966777
6  29e7727e    0.031499
7  92a51ffb    0.992002
8  c769ac97    0.000130
9  aee0547d    0.000034
submission part 7 saved
FOLD 8: 
Train on 3972 samples, validate on 441 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/3


 Val score: 0.16175541364484364

 Val accuracy: 0.9274376405069347
         id  is_iceberg
0  5941774d    0.003964
1  4023181e    0.885740
2  b20200e4    0.946397
3  e7f018bb    0.999570
4  4371c8c3    0.066347
5  a8d9b1fd    0.995320
6  29e7727e    0.050039
7  92a51ffb    0.999578
8  c769ac97    0.000524
9  aee0547d    0.000061
submission part 8 saved
FOLD 9: 
Train on 3973 samples, validate on 440 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30

 Val score: 0.1904411482540044

 Val accuracy: 0.9250000021674416
         id    is_iceberg
0  5941774d  6.953267e-03
1  4023181e  6.958792e-01
2  b20200e4  1.040613e-04
3  e7f018bb  9.990308e-01
4  4371c8c3  1.500640e-01
5  a8d9b1fd  8

In [10]:
stacked = [pd.read_csv(dirs['result'] + f'/sub_part{i}.csv')
           for i in range(folds)]

sub = pd.DataFrame()
sub['id'] = stacked[1]['id']
sub['is_iceberg'] = np.exp(
    np.mean([i['is_iceberg'].apply(lambda x: np.log(x)) for i in stacked], axis=0))

sub.to_csv(dirs['result'] + '/final_ensemble.csv',
           index=False, float_format='%.6f')