In [2]:
import os
import numpy as np 
import pandas as pd 

# Data Load

In [3]:
sub_path = "./statoil-iceberg-submissions"
# os.listdir() returns entries in arbitrary order, and the cells below pick
# models by column position, so sort the listing to make the numbering
# (is_iceberg_0, is_iceberg_1, ...) reproducible. Keep only CSV files so
# stray entries in the folder are not passed to read_csv.
all_files = sorted(f for f in os.listdir(sub_path) if f.lower().endswith(".csv"))
if not all_files:
    raise FileNotFoundError("No .csv submissions found in %r" % sub_path)

# Read every submission (first CSV column becomes the index) and place them
# side by side, one prediction column per file.
outs = [pd.read_csv(os.path.join(sub_path, f), index_col=0) for f in all_files]
concat_sub = pd.concat(outs, axis=1)
cols = ["is_iceberg_" + str(i) for i in range(len(concat_sub.columns))]
concat_sub.columns = cols
# Move the index back to a regular column without mutating in place, so
# re-running the cell always starts from the freshly concatenated frame.
concat_sub = concat_sub.reset_index()
concat_sub.head()


Unnamed: 0,id,is_iceberg_0,is_iceberg_1,is_iceberg_2,is_iceberg_3,is_iceberg_4,is_iceberg_5,is_iceberg_6,is_iceberg_7,is_iceberg_8,is_iceberg_9
0,5941774d,0.013974,0.005586,0.088246,0.0478,0.022661,0.030726,0.048098,0.002674,0.022661,0.01943216
1,4023181e,0.435513,0.145927,0.604716,0.346582,0.281293,0.441007,0.448249,0.125127,0.281293,0.03168809
2,b20200e4,0.001904,1.5e-05,0.009394,0.001091,0.06591,0.000311,0.057416,9.8e-05,0.06591,4e-08
3,e7f018bb,0.999721,0.999914,0.998628,0.998541,0.995712,0.99925,0.99746,0.987586,0.995712,0.9925741
4,4371c8c3,0.024783,0.033843,0.033681,0.009229,0.042113,0.024562,0.010555,0.026264,0.042113,0.02215107


In [4]:
# check correlation
# Drop the string `id` column explicitly: recent pandas versions no longer
# skip non-numeric columns silently in DataFrame.corr() and raise instead.
concat_sub.drop(columns=["id"]).corr()

Unnamed: 0,is_iceberg_0,is_iceberg_1,is_iceberg_2,is_iceberg_3,is_iceberg_4,is_iceberg_5,is_iceberg_6,is_iceberg_7,is_iceberg_8,is_iceberg_9
is_iceberg_0,1.0,0.872093,0.964119,0.965293,0.942633,0.976467,0.976215,0.909198,0.942633,0.862926
is_iceberg_1,0.872093,1.0,0.892673,0.892067,0.91414,0.884691,0.875416,0.914032,0.91414,0.890434
is_iceberg_2,0.964119,0.892673,1.0,0.975933,0.956652,0.97757,0.967446,0.934716,0.956652,0.902715
is_iceberg_3,0.965293,0.892067,0.975933,1.0,0.963686,0.987501,0.957484,0.930742,0.963686,0.905973
is_iceberg_4,0.942633,0.91414,0.956652,0.963686,1.0,0.959968,0.936378,0.941965,1.0,0.95133
is_iceberg_5,0.976467,0.884691,0.97757,0.987501,0.959968,1.0,0.960945,0.919837,0.959968,0.892922
is_iceberg_6,0.976215,0.875416,0.967446,0.957484,0.936378,0.960945,1.0,0.9199,0.936378,0.871502
is_iceberg_7,0.909198,0.914032,0.934716,0.930742,0.941965,0.919837,0.9199,1.0,0.941965,0.936248
is_iceberg_8,0.942633,0.91414,0.956652,0.963686,1.0,0.959968,0.936378,0.941965,1.0,0.95133
is_iceberg_9,0.862926,0.890434,0.902715,0.905973,0.95133,0.892922,0.871502,0.936248,0.95133,1.0


In [5]:
# Row-wise summary statistics consumed by the stacking cells.
# NOTE(review): positions 1..5 select only five prediction columns, while the
# displayed frame has ten (is_iceberg_0 .. is_iceberg_9) -- confirm that
# restricting the blend to the first five is intended.
stack_inputs = concat_sub.iloc[:, 1:6]
concat_sub['is_iceberg_max'] = stack_inputs.max(axis=1)
concat_sub['is_iceberg_min'] = stack_inputs.min(axis=1)
concat_sub['is_iceberg_mean'] = stack_inputs.mean(axis=1)
concat_sub['is_iceberg_median'] = stack_inputs.median(axis=1)

In [6]:
# Cutoff thresholds for the push-out / min-max rules; easy to tweak.
# Naming caveat: the cells that use these test `> cutoff_lo` for the
# confident-high case and `< cutoff_hi` for the confident-low case, so
# cutoff_lo holds the larger number and cutoff_hi the smaller one.
cutoff_lo, cutoff_hi = 0.8, 0.2

# Mean Stacking

In [7]:
# Final prediction = row-wise mean of the stacked models.
concat_sub['is_iceberg'] = concat_sub['is_iceberg_mean']
concat_sub.to_csv('stack_mean.csv', columns=['id', 'is_iceberg'],
                  index=False, float_format='%.6f')

# Median Stacking

In [8]:
# Final prediction = row-wise median of the stacked models.
concat_sub['is_iceberg'] = concat_sub['is_iceberg_median']
concat_sub.to_csv('stack_median.csv', columns=['id', 'is_iceberg'],
                  index=False, float_format='%.6f')

# PushOut + Median Stacking 

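The pushout strategy is fairly aggressive: rows where all compared predictions exceed the upper cutoff are forced to exactly 1, rows where all of them fall below the lower cutoff are forced to exactly 0, and every other row keeps the median.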

In [9]:
# Rows where every compared model is above cutoff_lo are pushed to 1, rows
# where every one is below cutoff_hi are pushed to 0; all remaining rows keep
# the median. The "above" rule is applied last so it takes precedence.
stack_inputs = concat_sub.iloc[:, 1:6]
all_above = (stack_inputs > cutoff_lo).all(axis=1)
all_below = (stack_inputs < cutoff_hi).all(axis=1)

pushed_low = np.where(all_below, 0, concat_sub['is_iceberg_median'])
concat_sub['is_iceberg'] = np.where(all_above, 1, pushed_low)
concat_sub.to_csv('stack_pushout_median.csv', columns=['id', 'is_iceberg'],
                  index=False, float_format='%.6f')

# MinMax + Mean Stacking

MinMax seems gentler, and it outperforms the previous approach according to its performance score.

In [10]:
# When every compared model is above cutoff_lo take the row maximum, when
# every one is below cutoff_hi take the row minimum, otherwise use the mean.
stack_inputs = concat_sub.iloc[:, 1:6]
all_above = (stack_inputs > cutoff_lo).all(axis=1)
all_below = (stack_inputs < cutoff_hi).all(axis=1)

low_or_mean = np.where(all_below,
                       concat_sub['is_iceberg_min'],
                       concat_sub['is_iceberg_mean'])
concat_sub['is_iceberg'] = np.where(all_above,
                                    concat_sub['is_iceberg_max'],
                                    low_or_mean)
concat_sub.to_csv('stack_minmax_mean.csv', columns=['id', 'is_iceberg'],
                  index=False, float_format='%.6f')

# MinMax + Median Stacking 

In [11]:
# When every compared model is above cutoff_lo take the row maximum, when
# every one is below cutoff_hi take the row minimum, otherwise use the median.
stack_inputs = concat_sub.iloc[:, 1:6]
all_above = (stack_inputs > cutoff_lo).all(axis=1)
all_below = (stack_inputs < cutoff_hi).all(axis=1)

low_or_median = np.where(all_below,
                         concat_sub['is_iceberg_min'],
                         concat_sub['is_iceberg_median'])
concat_sub['is_iceberg'] = np.where(all_above,
                                    concat_sub['is_iceberg_max'],
                                    low_or_median)
concat_sub.to_csv('stack_minmax_median.csv', columns=['id', 'is_iceberg'],
                  index=False, float_format='%.6f')

# MinMax + BestBase Stacking

In [13]:
# Load the model with the best base performance.
# Build the path from sub_path instead of repeating the directory literal, so
# the base file is always read from the same folder as the other submissions.
sub_base = pd.read_csv(os.path.join(sub_path, 'VGG16_lee_0.1446.csv'))

In [14]:
# Attach the best base model's predictions matched on `id` rather than on row
# position, so a base file whose rows are ordered differently from concat_sub
# cannot be silently misaligned.
base_by_id = sub_base.set_index('id')['is_iceberg']
concat_sub['is_iceberg_base'] = concat_sub['id'].map(base_by_id)
if concat_sub['is_iceberg_base'].isna().any():
    raise ValueError("Base submission has no prediction for some ids in concat_sub")

# When every compared model is above cutoff_lo take the row maximum, when
# every one is below cutoff_hi take the row minimum, otherwise fall back to
# the best base model's prediction.
stack_inputs = concat_sub.iloc[:, 1:6]
all_above = (stack_inputs > cutoff_lo).all(axis=1)
all_below = (stack_inputs < cutoff_hi).all(axis=1)
concat_sub['is_iceberg'] = np.where(all_above,
                                    concat_sub['is_iceberg_max'],
                                    np.where(all_below,
                                             concat_sub['is_iceberg_min'],
                                             concat_sub['is_iceberg_base']))
concat_sub[['id', 'is_iceberg']].to_csv('stack_minmax_bestbase.csv',
                                        index=False, float_format='%.6f')

# MinMax + BestBase Stacking + Denoising

In [16]:
# Same MinMax + BestBase blend as above: row maximum when every compared
# model is above cutoff_lo, row minimum when every one is below cutoff_hi,
# otherwise the best base model's prediction.
stack_inputs = concat_sub.iloc[:, 1:6]
all_above = (stack_inputs > cutoff_lo).all(axis=1)
all_below = (stack_inputs < cutoff_hi).all(axis=1)
u = np.where(all_above,
             concat_sub['is_iceberg_max'],
             np.where(all_below,
                      concat_sub['is_iceberg_min'],
                      concat_sub['is_iceberg_base']))

# Denoising: snap values below `thres` to 0 and values above `1 - thres` to 1.
# Done as one vectorised expression instead of a Python loop over every row.
thres = 0.01
u = np.where(u < thres, 0.0, np.where(u > 1 - thres, 1.0, u))

concat_sub['is_iceberg'] = u
concat_sub[['id', 'is_iceberg']].to_csv('denoise_%f.csv' % thres, index=False, float_format='%.6f')