# Statoil

In [0]:
# Open Colab's browser upload widget. The following cell copies a file named
# kaggle.json, so that is the file expected to be uploaded here
# (presumably the Kaggle API token - keep it out of any shared copy of the notebook).
from google.colab import files
files.upload()

In [0]:
# Place the uploaded kaggle.json under ~/.kaggle and restrict it to
# owner read/write (mode 600).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [0]:
# Download the files of the statoil-iceberg-classifier-challenge competition with the Kaggle CLI.
!kaggle competitions download -c statoil-iceberg-classifier-challenge

In [0]:
# Extract the three downloaded .7z archives into the working directory.
!7z e sample_submission.csv.7z
!7z e test.json.7z
!7z e train.json.7z

In [7]:
# List the working directory to confirm the extracted files are present.
!ls

kaggle.json  sample_submission.csv     test.json     train.json
sample_data  sample_submission.csv.7z  test.json.7z  train.json.7z


In [0]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import pylab
# Default matplotlib figure size (width, height) for every plot in the notebook.
plt.rcParams['figure.figsize'] = (10, 10)

In [0]:
# Load the extracted competition files into DataFrames.
train, test = (pd.read_json(json_path) for json_path in ('train.json', 'test.json'))

In [12]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,id,band_1,band_2,inc_angle,is_iceberg
0,dfd5f913,"[-27.878360999999998, -27.15416, -28.668615, -...","[-27.154118, -29.537888, -31.0306, -32.190483,...",43.9239,0
1,e25388fd,"[-12.242375, -14.920304999999999, -14.920363, ...","[-31.506321, -27.984554, -26.645678, -23.76760...",38.1562,0
2,58b2aaa0,"[-24.603676, -24.603714, -24.871029, -23.15277...","[-24.870956, -24.092632, -20.653963, -19.41104...",45.2859,1
3,4cfc3a18,"[-22.454607, -23.082819, -23.998013, -23.99805...","[-27.889421, -27.519794, -27.165262, -29.10350...",43.8306,0
4,271f93f4,"[-26.006956, -23.164886, -23.164886, -26.89116...","[-27.206915, -30.259186, -30.259186, -23.16495...",35.6256,0


In [13]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,id,band_1,band_2,inc_angle
0,5941774d,"[-15.863251, -15.201077, -17.887735, -19.17248...","[-21.629612, -21.142353, -23.908337, -28.34524...",34.9664
1,4023181e,"[-26.058969497680664, -26.058969497680664, -26...","[-25.754207611083984, -25.754207611083984, -25...",32.615072
2,b20200e4,"[-14.14109992980957, -15.064241409301758, -17....","[-14.74563980102539, -14.590410232543945, -14....",37.505433
3,e7f018bb,"[-12.167478, -13.706167, -16.54837, -13.572674...","[-24.32222, -26.375538, -24.096739, -23.8769, ...",34.4739
4,4371c8c3,"[-23.37459373474121, -26.02718162536621, -28.1...","[-25.72234344482422, -27.011577606201172, -23....",43.918874


데이터는 600km 상공의 인공위성에서 촬영됐습니다.  

`id` : 이미지 식별번호(id)  
`band_1`, `band_2` : 5625(75*75) 픽셀 이미지로, 일반적인 이미지 파일(0 ~ 255 값의 정수)과 달리 음의 값을 갖는 실수(부동소수점)로 구성돼 있습니다. 또한, 일반 이미지(RGB 3채널)와 달리 HH, HV의 2채널로 구성돼 있습니다.  
`inc_angle` : 인공위성의 입사각  
`is_iceberg` : 목적 변수로 0은 선박, 1은 빙산

In [14]:
# Compare the length of one flattened band with 75*75 to confirm each image is 75x75 pixels.
len(train['band_1'][0]), 75*75

(5625, 5625)

In [17]:
# Total number of inc_angle entries (one per training row).
len(train['inc_angle'])

1604

In [20]:
# Number of distinct inc_angle values; fewer than the row count means some angles repeat.
len(train['inc_angle'].unique())

879

In [29]:
# Count the training rows whose inc_angle is the placeholder string 'na'.
len(train.loc[train['inc_angle'] == 'na', 'inc_angle'])

133

`inc_angle`의 경우 동일한 입사각을 갖는 데이터가 여러 건 존재합니다 (전체 1604건 중 고유값은 879개이며, 그중 133건은 'na'입니다).

In [84]:
# Show every training row that shares one particular inc_angle value
# (the entry at position 5 of the distinct values).
train.loc[train['inc_angle'] == train['inc_angle'].unique()[5]]

Unnamed: 0,id,band_1,band_2,inc_angle,is_iceberg
5,b51d18b5,"[-20.769371, -20.769434, -25.906025, -25.90602...","[-29.288746, -29.712593, -28.884804, -28.88480...",36.9034,1
192,4b58ca02,"[-23.641403, -22.057777, -22.427505, -23.01335...","[-27.723801, -29.662001, -30.577213, -30.57727...",36.9034,1
692,63d60bde,"[-22.433567, -21.709385, -21.040972, -21.04097...","[-28.084499, -25.861551, -26.440887, -26.44088...",36.9034,1
1171,bfd71df1,"[-19.57299, -21.892731, -21.892731, -18.080442...","[-28.848137, -31.087675, -31.087675, -30.59126...",36.9034,1
1187,38b16888,"[-15.226004, -15.226004, -18.002369, -19.21075...","[-27.104845, -27.104845, -23.476507, -23.06287...",36.9034,1
1188,3741117d,"[-18.568018, -19.193235, -20.008198, -18.93787...","[-25.61167, -26.466772, -30.139721, -26.17243,...",36.9034,1
1392,5a869fac,"[-17.668499, -18.463905, -18.463905, -16.45466...","[-27.430832, -27.771561, -27.771561, -23.47474...",36.9034,1


데이터를 살펴본 결과,  
동일한 입사각의 데이터는 모두 동일한 `is_iceberg` 값을 갖는다 (예: 위의 입사각 36.9034인 7건은 모두 `is_iceberg` = 1, 즉 빙산).

In [0]:
# Turn each flattened band (a list of 75*75 values) into a 75x75 float32 image.
X_band_1 = np.array([np.asarray(pixels, dtype=np.float32).reshape(75, 75) for pixels in train["band_1"]])
X_band_2 = np.array([np.asarray(pixels, dtype=np.float32).reshape(75, 75) for pixels in train["band_2"]])

# Build a 3-channel input: band_1, band_2 and their per-pixel mean, stacked on the last axis.
X_train = np.stack([X_band_1, X_band_2, (X_band_1 + X_band_2) / 2], axis=-1)

In [0]:
# Sanity-check the shapes of both band arrays and the stacked 3-channel training tensor.
X_band_1.shape, X_band_2.shape, X_train.shape

((1604, 75, 75), (1604, 75, 75), (1604, 75, 75, 3))

In [0]:
import plotly.offline as py
import plotly.tools as tls
import plotly.graph_objs as go
import plotly.io as pio
# Use Plotly's 'colab' renderer as the default for fig.show().
pio.renderers.default = 'colab'

def plotmy3d(c, name):
    """Show a 2-D array as an interactive Plotly 3-D surface.

    Parameters
    ----------
    c : 2-D array-like
        Values used as the surface height (passed as ``z`` to ``go.Surface``).
    name : str
        Title displayed above the figure.

    Returns
    -------
    None
        The figure is displayed via ``fig.show()``; nothing is returned.
    """
    surface = go.Surface(z=c)
    # Fixed 700x700 canvas (autosize disabled) with explicit margins.
    figure_layout = go.Layout(
        title=name,
        autosize=False,
        width=700,
        height=700,
        margin=dict(l=65, r=50, b=65, t=90),
    )
    go.Figure(data=[surface], layout=figure_layout).show()

In [89]:
# Training row 0 has is_iceberg == 0, i.e. it is a ship, so the title must say 'ship'.
plotmy3d(X_band_1[0, :, :], 'ship')

In [87]:
# Training row 2 has is_iceberg == 1, i.e. it is an iceberg, so the title must say 'iceberg'.
plotmy3d(X_band_1[2, :, :], 'iceberg')

In [0]:
from matplotlib import pyplot
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, Dropout, Input, Flatten, Activation
from tensorflow.keras.layers import GlobalMaxPooling2D
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras import initializers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, Callback, EarlyStopping

In [0]:
def getModel(input_shape=(75, 75, 3), learning_rate=0.001):
    """Build, compile and summarise the CNN used for ship/iceberg classification.

    The network is four Conv2D -> MaxPooling2D -> Dropout(0.2) stages
    (64, 128, 128, 64 filters), followed by Flatten, two ReLU dense layers
    (512 and 256 units, each with Dropout(0.2)) and a single sigmoid unit.

    Parameters
    ----------
    input_shape : tuple of int, optional
        Shape of one input image as (height, width, channels).
        Defaults to (75, 75, 3), the shape used throughout this notebook.
    learning_rate : float, optional
        Adam learning rate. Defaults to 0.001.

    Returns
    -------
    Sequential
        The compiled model (binary cross-entropy loss, accuracy metric).
        ``summary()`` is printed as a side effect before returning.
    """
    gmodel = Sequential()

    # Conv stage 1 - the only stage that uses a 3x3 pooling window.
    gmodel.add(Conv2D(64, kernel_size=(3, 3), activation='relu', input_shape=input_shape))
    gmodel.add(MaxPooling2D(pool_size=(3, 3), strides=(2, 2)))
    gmodel.add(Dropout(0.2))

    # Conv stage 2
    gmodel.add(Conv2D(128, kernel_size=(3, 3), activation='relu'))
    gmodel.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
    gmodel.add(Dropout(0.2))

    # Conv stage 3
    gmodel.add(Conv2D(128, kernel_size=(3, 3), activation='relu'))
    gmodel.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
    gmodel.add(Dropout(0.2))

    # Conv stage 4
    gmodel.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
    gmodel.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
    gmodel.add(Dropout(0.2))

    # Flatten the feature maps for the dense classifier head.
    gmodel.add(Flatten())

    # Dense layer 1
    gmodel.add(Dense(512))
    gmodel.add(Activation('relu'))
    gmodel.add(Dropout(0.2))

    # Dense layer 2
    gmodel.add(Dense(256))
    gmodel.add(Activation('relu'))
    gmodel.add(Dropout(0.2))

    # Single sigmoid unit: the output is the predicted probability of class 1.
    gmodel.add(Dense(1))
    gmodel.add(Activation('sigmoid'))

    # `learning_rate` replaces the deprecated `lr` alias; the former `decay=0.0`
    # was the default (no decay) and is omitted because newer Keras rejects it.
    mypotim = Adam(learning_rate=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
    gmodel.compile(loss='binary_crossentropy',
                   optimizer=mypotim,
                   metrics=['accuracy'])
    gmodel.summary()
    return gmodel


def get_callbacks(filepath, patience = 2):
    """Create the training callbacks: early stopping plus best-model checkpointing.

    Parameters
    ----------
    filepath : str
        Where ``ModelCheckpoint`` writes the model.
    patience : int, optional
        Number of epochs without ``val_loss`` improvement tolerated before
        training is stopped. Defaults to 2.

    Returns
    -------
    list
        ``[EarlyStopping, ModelCheckpoint]`` in that order.
    """
    early_stop = EarlyStopping(monitor='val_loss', patience=patience, mode='min')
    # save_best_only=True: the file is only overwritten when the monitored metric improves.
    checkpoint = ModelCheckpoint(filepath, save_best_only=True)
    return [early_stop, checkpoint]
    
# Checkpoint file for the best weights, and the callbacks used during training
# (early stopping waits 5 epochs here instead of the default 2).
file_path = ".model_weights.hdf5"
callbacks = get_callbacks(file_path, patience=5)

In [0]:
# Labels: the is_iceberg column of the training frame.
target_train = train['is_iceberg']

# Hold out 25% of the training data for validation; random_state fixes the shuffle.
X_train_cv, X_valid, y_train_cv, y_valid = train_test_split(
    X_train,
    target_train,
    train_size=0.75,
    random_state=1,
)

In [94]:
import os
# Build a fresh model and train it; the callbacks stop training early and
# checkpoint the model to file_path while monitoring the validation split.
gmodel = getModel()
gmodel.fit(
    x=X_train_cv,
    y=y_train_cv,
    validation_data=(X_valid, y_valid),
    batch_size=24,
    epochs=50,
    callbacks=callbacks,
    verbose=1,
)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 73, 73, 64)        1792      
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 36, 36, 64)        0         
_________________________________________________________________
dropout (Dropout)            (None, 36, 36, 64)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 34, 34, 128)       73856     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 17, 17, 128)       0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 17, 17, 128)       0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 15, 15, 128)       1

<tensorflow.python.keras.callbacks.History at 0x7fc8093f16d8>

In [98]:
# Restore the checkpointed weights before scoring.
gmodel.load_weights(filepath=file_path)

# NOTE: despite the "Test" labels below, these metrics are computed on the
# validation split (X_valid / y_valid), not on the competition test set.
score = gmodel.evaluate(x=X_valid, y=y_valid, verbose=1)
for label, value in zip(('Test loss : ', 'Test accuracy : '), score):
    print(label, value)

Test loss :  0.2828901708126068
Test accuracy :  0.8927680850028992


In [0]:
# Turn each flattened test band (a list of 75*75 values) into a 75x75 float32 image.
X_band_test_1 = np.array([np.array(band).astype(np.float32).reshape(75, 75) for band in test['band_1']])
X_band_test_2 = np.array([np.array(band).astype(np.float32).reshape(75, 75) for band in test['band_2']])

# Same 3-channel layout as the training tensor: band_1, band_2 and their per-pixel mean.
X_test = np.stack([X_band_test_1,
                   X_band_test_2,
                   (X_band_test_1 + X_band_test_2) / 2], axis=-1)

# The final layer is a sigmoid, so predict() already returns probabilities.
# Sequential.predict_proba() is deprecated and no longer exists in recent TensorFlow releases.
predicted_test = gmodel.predict(X_test)

In [99]:
# Display the predicted probabilities for the test images (one value per image).
predicted_test

array([[4.3599628e-02],
       [6.5314847e-01],
       [9.3002375e-03],
       ...,
       [8.3133712e-02],
       [9.9125963e-01],
       [8.9207806e-06]], dtype=float32)

## Reference
- [Keras Model for Beginners (0.210 on LB)+EDA+R&D](https://www.kaggle.com/devm2024/keras-model-for-beginners-0-210-on-lb-eda-r-d)