In [1]:
from os.path import join

import boto3
from botocore.exceptions import ClientError

from rastervision.common.settings import results_path
from rastervision.common.utils import _makedirs


# Runs that make up "ensemble 5" (see the ensembling cells further down).
rob_run_names = [
    'tagging/7_11_17/dense_ensemble2/0',
    'tagging/7_11_17/dense_ensemble2/1',
    'tagging/7_11_17/dense_ensemble2/2',
    'tagging/7_11_17/dense_ensemble2/3',
    'tagging/7_11_17/dense_ensemble2/4',
    'tagging/7_5_17/ensemble/0',
    'tagging/7_5_17/ensemble/1',
    'tagging/7_5_17/ensemble/2',
    'tagging/7_5_17/ensemble/3',
    'tagging/7_5_17/ensemble/4',
    'tagging/7_5_17/ensemble/avg',
    'tagging/6_30_17/rerun_best-test_aug',
    'tagging/7_3_17/baseline-branch-tiffdrop',
    'tagging/7_3_17/dual_resnet-stage4-moretrain',
    'tagging/7_3_17/dual_resnet-stage5-moretrain',
    'tagging/7_7_17/tif_ensemble_0/0',
    'tagging/7_7_17/tif_ensemble_0/1',
    'tagging/7_7_17/tif_ensemble_0/2',
    'tagging/7_7_17/tif_ensemble_0/3',
    'tagging/7_7_17/jpg_ensemble_0/0',
    'tagging/7_7_17/jpg_ensemble_0/1',
    'tagging/7_7_17/jpg_ensemble_0/2',
    'tagging/7_7_17/jpg_ensemble_0/3',
    'tagging/7_7_17/baseline_cyclic_1',
    'tagging/7_7_17/baseline_cyclic_1200steps',
    'tagging/7_7_17/baseline_cyclic_2',
    'tagging/7_6_17/baseline_cyclic',
    'tagging/7_2_17/IRRGtiff_v5actual_dense121_3x10epochs_0',
    'tagging/6_30_17/dense121_dropout_1',
    'tagging/6_30_17/dense121_dropout_2',
    'tagging/6_30_17/dense121_dropout_3',
    'tagging/6_29_17/dense121_3x10epochs_0',
    'tagging/6_29_17/IRRGBtiff_v5_dense121_3x10epochs_0',
    'tagging/6_28_17/baseline_testrot_0'
]

# The same runs (plus a few extras) grouped by kind. The first group holds
# three five-member ensembles.
best_densenet_run_names = [
    'tagging/7_5_17/ensemble/0',
    'tagging/7_5_17/ensemble/1',
    'tagging/7_5_17/ensemble/2',
    'tagging/7_5_17/ensemble/3',
    'tagging/7_5_17/ensemble/4',
    'tagging/7_11_17/dense_ensemble2/0',
    'tagging/7_11_17/dense_ensemble2/1',
    'tagging/7_11_17/dense_ensemble2/2',
    'tagging/7_11_17/dense_ensemble2/3',
    'tagging/7_11_17/dense_ensemble2/4',
    'tagging/7_7_17/jpg_ensemble_0/0',
    'tagging/7_7_17/jpg_ensemble_0/1',
    'tagging/7_7_17/jpg_ensemble_0/2',
    'tagging/7_7_17/jpg_ensemble_0/3',
    'tagging/7_7_17/jpg_ensemble_0/4'
]

other_densenet_run_names = [
    'tagging/6_30_17/dense121_dropout_1',
    'tagging/6_30_17/dense121_dropout_2',
    'tagging/6_30_17/dense121_dropout_3',
    'tagging/6_29_17/dense121_3x10epochs_0'
]

resnet_run_names = [
    'tagging/7_7_17/baseline_cyclic_1',
    'tagging/7_7_17/baseline_cyclic_1200steps',
    'tagging/7_7_17/baseline_cyclic_2',
    'tagging/7_6_17/baseline_cyclic',
    'tagging/6_28_17/baseline_testrot_0',
    'tagging/7_10_17/cyclic_2400steps'
]

tiff_run_names = [
    'tagging/7_3_17/baseline-branch-tiffdrop',
    'tagging/7_3_17/dual_resnet-stage4-moretrain',
    'tagging/7_3_17/dual_resnet-stage5-moretrain',
    'tagging/7_2_17/dual_resnet_30epoch',
    'tagging/7_7_17/tif_ensemble_0/0',
    'tagging/7_7_17/tif_ensemble_0/1',
    'tagging/7_7_17/tif_ensemble_0/2',
    'tagging/7_7_17/tif_ensemble_0/3',
    'tagging/7_7_17/tif_ensemble_0/4',
    'tagging/7_2_17/IRRGtiff_v5actual_dense121_3x10epochs_0',
    'tagging/6_29_17/IRRGBtiff_v5_dense121_3x10epochs_0'
]

grouped_run_names = best_densenet_run_names + other_densenet_run_names + resnet_run_names + tiff_run_names

# Every distinct run named above. Sorted so that the order -- and in
# particular all_run_names[0], which a later cell uses -- is the same after
# every kernel restart; a bare list(set(...)) of strings follows the
# per-process randomized string hashes.
all_run_names = sorted(set(grouped_run_names) | set(rob_run_names))

# File names inside each run's results directory, and the S3 bucket they are
# downloaded from.
test_pred_fn = 'test_preds.csv'
scores_fn = 'scores.json'
s3_bucket = 'raster-vision'



In [23]:
def s3_download(run_name, file_name, new_file_name=None):
    """Download one result file of a run from S3 into the local results tree.

    The object `results/<run_name>/<file_name>` in bucket `s3_bucket` is
    written to `/opt/data/results/<run_name>/<new_file_name>`.

    # Arguments
        run_name: run identifier, used in both the S3 key and the local path.
        file_name: name of the file under the run's S3 prefix.
        new_file_name: local file name to save as; defaults to `file_name`.

    The local run directory is not created here; callers do that first.
    """
    local_name = file_name if new_file_name is None else new_file_name
    remote_key = 'results/{}/{}'.format(run_name, file_name)
    local_path = join('/opt/data/results/', run_name, local_name)
    bucket = boto3.resource('s3').Bucket(s3_bucket)
    bucket.download_file(remote_key, local_path)
    
def download_run(run_name):
    """Download the prediction and score files of one run from S3.

    Fetches the test predictions, `scores_fn` and 'validation_preds.csv' into
    `/opt/data/results/<run_name>/` and prints the run name once the scores
    have been downloaded.

    # Arguments
        run_name: run identifier, e.g. 'tagging/7_16_17/densenet/0'.

    # Raises
        botocore.exceptions.ClientError: if a file cannot be fetched under
            any of the names tried.
    """
    _makedirs(join('/opt/data/results/', run_name))
    try:
        s3_download(run_name, test_pred_fn)
    except ClientError:
        # Fall back to the alternative file name, saving it locally under the
        # standard name. Only S3 client errors trigger the fallback, so
        # interrupts and programming errors are no longer swallowed.
        s3_download(run_name, 'test_predictions.csv', test_pred_fn)

    s3_download(run_name, scores_fn)
    print(run_name)
    s3_download(run_name, 'validation_preds.csv')
    

In [4]:
import json

# Map each run name to the F2 score stored in its scores file. The score is
# read from the 'f2' key, or from 'f2_samples' when 'f2' is absent.
run_f2 = {}
for run_name in all_run_names:
    run_path = join(results_path, run_name)
    scores_path = join(run_path, scores_fn)
    with open(scores_path, 'r') as scores_file:
        scores = json.load(scores_file)

    if 'f2' in scores:
        f2 = scores['f2']
    elif 'f2_samples' in scores:
        f2 = scores['f2_samples']
    else:
        # Without this branch the previous run's score would be silently
        # reused (or a NameError raised for the very first run).
        raise KeyError(
            "neither 'f2' nor 'f2_samples' found in {}".format(scores_path))
    run_f2[run_name] = f2

In [6]:
# Split the rob_run_names runs by their stored F2 score. Runs scoring exactly
# at the threshold count as decent, so every run lands in exactly one list
# (previously a score of exactly 0.92 fell into neither).
min_decent_f2 = 0.92

rob_f2_list = [(run_name, run_f2[run_name]) for run_name in rob_run_names]
decent_f2_list = [pair for pair in rob_f2_list if pair[1] >= min_decent_f2]
bad_f2_list = [pair for pair in rob_f2_list if pair[1] < min_decent_f2]
decent_run_names = [pair[0] for pair in decent_f2_list]

print(len(rob_run_names))
print(rob_f2_list)
print(len(rob_f2_list))
print(len(decent_f2_list))
print(len(bad_f2_list))
print(bad_f2_list)


34
[('tagging/7_11_17/dense_ensemble2/0', 0.92871), ('tagging/7_11_17/dense_ensemble2/1', 0.93108), ('tagging/7_11_17/dense_ensemble2/2', 0.93043), ('tagging/7_11_17/dense_ensemble2/3', 0.93124), ('tagging/7_11_17/dense_ensemble2/4', 0.92978), ('tagging/7_5_17/ensemble/0', 0.9295), ('tagging/7_5_17/ensemble/1', 0.92978), ('tagging/7_5_17/ensemble/2', 0.93026), ('tagging/7_5_17/ensemble/3', 0.93029), ('tagging/7_5_17/ensemble/4', 0.92986), ('tagging/7_5_17/ensemble/avg', 0.93175), ('tagging/6_30_17/rerun_best-test_aug', 0.92875), ('tagging/7_3_17/baseline-branch-tiffdrop', 0.92515), ('tagging/7_3_17/dual_resnet-stage4-moretrain', 0.92547), ('tagging/7_3_17/dual_resnet-stage5-moretrain', 0.92624), ('tagging/7_7_17/tif_ensemble_0/0', 0.9153), ('tagging/7_7_17/tif_ensemble_0/1', 0.91378), ('tagging/7_7_17/tif_ensemble_0/2', 0.91531), ('tagging/7_7_17/tif_ensemble_0/3', 0.9159), ('tagging/7_7_17/jpg_ensemble_0/0', 0.9285), ('tagging/7_7_17/jpg_ensemble_0/1', 0.929), ('tagging/7_7_17/jpg_ens

In [24]:
import numpy as np
import math

from rastervision.tagging.data.planet_kaggle import TagStore

# Read the test predictions of whichever run comes first in all_run_names,
# only to obtain the list of test file indices. make_test_preds below requests
# every run's predictions for this same list of indices.
run_name = all_run_names[0]
run_path = join(results_path, run_name)
test_pred_path = join(run_path, test_pred_fn)
test_file_inds = list(TagStore(test_pred_path).file_ind_to_tags.keys())

def make_test_preds(run_names, out_path):
    """Write a majority-vote ensemble of the runs' test predictions.

    Each run's `test_pred_fn` file is loaded for the module-level
    `test_file_inds`, the per-run arrays are joined along a new third axis,
    and a tag is predicted when at least ceil(n / 2) of the n listed runs
    predict it. A run listed more than once therefore gets more than one vote.

    # Arguments
        run_names: run identifiers whose predictions are combined.
        out_path: path the ensembled predictions are saved to.
    """
    per_run_preds = [
        np.expand_dims(
            TagStore(join(results_path, name, test_pred_fn))
            .get_tag_array(test_file_inds),
            axis=2)
        for name in run_names
    ]

    stacked_preds = np.concatenate(per_run_preds, axis=2)
    # With an even number of voters a tie counts as a positive prediction.
    min_votes = math.ceil(stacked_preds.shape[2] / 2)
    majority_preds = np.sum(stacked_preds, axis=2) >= min_votes

    ensemble_store = TagStore()
    for row_ind in range(majority_preds.shape[0]):
        ensemble_store.add_tags(
            test_file_inds[row_ind], majority_preds[row_ind, :])

    ensemble_store.save(out_path)
    
# Directory that the ensembling cells below write their prediction files to.
out_path = '/opt/data/results/jupyter_out/'
_makedirs(out_path)

In [9]:
# ensemble 5 is made of rob_run_names
# 0.93217 LB

# ensemble 5 minus models with < 0.92 val f2
# 0.93201 LB
# (decent_run_names is computed in the cell that splits rob_run_names by F2.)
out_path = '/opt/data/results/jupyter_out/'
_makedirs(out_path)
make_test_preds(decent_run_names, join(out_path, 'decent_preds.csv'))

In [58]:
# ensemble 5 plus 5 new densenets
# 0.93202 LB
# NOTE(review): rob_run_names as defined at the top of this notebook already
# lists these five dense_ensemble2 runs, so re-running this cell gives each of
# them two votes -- confirm whether that matches the run that scored above.
out_path = '/opt/data/results/jupyter_out/'
new_dn_run_names = [
    'tagging/7_11_17/dense_ensemble2/0',
    'tagging/7_11_17/dense_ensemble2/1',
    'tagging/7_11_17/dense_ensemble2/2',
    'tagging/7_11_17/dense_ensemble2/3',
    'tagging/7_11_17/dense_ensemble2/4'
]
make_test_preds(rob_run_names + new_dn_run_names, join(out_path, 'all_preds.csv'))

In [59]:
# ensemble 5 minus the 7_5_17 densenet ensemble
# (six names are removed: the five members plus the 'avg' run)
# 0.93221 LB
second_densenet_run_names = [
    'tagging/7_5_17/ensemble/0',
    'tagging/7_5_17/ensemble/1',
    'tagging/7_5_17/ensemble/2',
    'tagging/7_5_17/ensemble/3',
    'tagging/7_5_17/ensemble/4',
    'tagging/7_5_17/ensemble/avg'
]

# The set difference loses the list order, which does not matter for a vote.
make_test_preds(list(set(rob_run_names) - set(second_densenet_run_names)), join(out_path, 'minus_second_dn.csv'))

In [11]:
# ensemble of 3 densenets, 3 densenets with dropout, 3 resnets, 3 tiffs
# 0.93144 LB
# (twelve runs, so a tag needs at least 6 votes)
streamlined_run_names = [
    'tagging/7_7_17/jpg_ensemble_0/0',
    'tagging/7_7_17/jpg_ensemble_0/1',
    'tagging/7_7_17/jpg_ensemble_0/2',
    'tagging/7_7_17/tif_ensemble_0/0',
    'tagging/7_7_17/tif_ensemble_0/1',
    'tagging/7_7_17/tif_ensemble_0/2',
    'tagging/7_7_17/baseline_cyclic_1200steps',
    'tagging/7_7_17/baseline_cyclic_2',
    'tagging/7_10_17/cyclic_2400steps',
    'tagging/6_30_17/dense121_dropout_1',
    'tagging/6_30_17/dense121_dropout_2',
    'tagging/6_30_17/dense121_dropout_3'
]

make_test_preds(streamlined_run_names, join(out_path, 'streamlined_test_preds.csv'))

In [15]:
# majority voting ensemble of 5 DN and 5 TIFF
# 0.92895
# NOTE: rebinds streamlined_run_names from the previous ensembling cell.
streamlined_run_names = [
    'tagging/7_7_17/tif_ensemble_0/0',
    'tagging/7_7_17/tif_ensemble_0/1',
    'tagging/7_7_17/tif_ensemble_0/2',
    'tagging/7_7_17/tif_ensemble_0/3',
    'tagging/7_7_17/tif_ensemble_0/4',
    'tagging/7_7_17/jpg_ensemble_0/0',
    'tagging/7_7_17/jpg_ensemble_0/1',
    'tagging/7_7_17/jpg_ensemble_0/2',
    'tagging/7_7_17/jpg_ensemble_0/3',
    'tagging/7_7_17/jpg_ensemble_0/4'
]

make_test_preds(streamlined_run_names, join(out_path, 'dense_tiff_test_preds.csv'))

In [16]:
# adding tiffs hurt, so get rid of them
# ensemble of 4 densenets, 2 densenets with dropout, 4 resnets
# 0.93197
# NOTE: rebinds streamlined_run_names again; ten runs, so a tag needs 5 votes.
streamlined_run_names = [
    'tagging/7_7_17/jpg_ensemble_0/0',
    'tagging/7_7_17/jpg_ensemble_0/1',
    'tagging/7_7_17/jpg_ensemble_0/2',
    'tagging/7_7_17/jpg_ensemble_0/3',
    'tagging/7_7_17/baseline_cyclic_1200steps',
    'tagging/7_7_17/baseline_cyclic_2',
    'tagging/7_10_17/cyclic_2400steps',
    'tagging/6_28_17/baseline_testrot_0',
    'tagging/6_30_17/dense121_dropout_1',
    'tagging/6_30_17/dense121_dropout_2',
]

make_test_preds(streamlined_run_names, join(out_path, '10_no_tiff_test_preds.csv'))

In [7]:
# Download the five 7_16_17 densenet and five 7_16_17 resnet runs, then write
# their majority-vote ensemble.
# NOTE(review): download_run is defined twice in this notebook; this cell
# needs the variant that fetches test_pred_fn, since make_test_preds reads
# that file -- confirm which definition is live before re-running.
run_names = [
    'tagging/7_16_17/densenet/0',
    'tagging/7_16_17/densenet/1',
    'tagging/7_16_17/densenet/2',
    'tagging/7_16_17/densenet/3',
    'tagging/7_16_17/densenet/4',
    'tagging/7_16_17/resnet/0',
    'tagging/7_16_17/resnet/1',
    'tagging/7_16_17/resnet/2',
    'tagging/7_16_17/resnet/3',
    'tagging/7_16_17/resnet/4'
]

for run_name in run_names:
    download_run(run_name)
make_test_preds(run_names, join(out_path, '5rn_5dn_99.csv'))


tagging/7_16_17/densenet/0
tagging/7_16_17/densenet/1
tagging/7_16_17/densenet/2
tagging/7_16_17/densenet/3
tagging/7_16_17/densenet/4
tagging/7_16_17/resnet/0
tagging/7_16_17/resnet/1
tagging/7_16_17/resnet/2
tagging/7_16_17/resnet/3
tagging/7_16_17/resnet/4


In [8]:
# The ten 7_16_17 runs plus eight older runs (four jpg_ensemble_0 members and
# four others): eighteen voters, so a tag needs at least 9 votes.
run_names = [
    'tagging/7_16_17/densenet/0',
    'tagging/7_16_17/densenet/1',
    'tagging/7_16_17/densenet/2',
    'tagging/7_16_17/densenet/3',
    'tagging/7_16_17/densenet/4',
    'tagging/7_16_17/resnet/0',
    'tagging/7_16_17/resnet/1',
    'tagging/7_16_17/resnet/2',
    'tagging/7_16_17/resnet/3',
    'tagging/7_16_17/resnet/4',
    'tagging/7_7_17/jpg_ensemble_0/0',
    'tagging/7_7_17/jpg_ensemble_0/1',
    'tagging/7_7_17/jpg_ensemble_0/2',
    'tagging/7_7_17/jpg_ensemble_0/3',
    'tagging/7_7_17/baseline_cyclic_1200steps',
    'tagging/7_7_17/baseline_cyclic_2',
    'tagging/7_10_17/cyclic_2400steps',
    'tagging/6_28_17/baseline_testrot_0'
]

make_test_preds(run_names, join(out_path, '9rn_9dn.csv'))

In [25]:
# Download the ten 7_17_17 "transform" runs (five densenet, five resnet).
# NOTE(review): download_run is defined twice in this notebook; which files
# get fetched depends on the definition executed most recently.
run_names = [
    'tagging/7_17_17/densenet_transform/0',
    'tagging/7_17_17/densenet_transform/1',
    'tagging/7_17_17/densenet_transform/2',
    'tagging/7_17_17/densenet_transform/3',
    'tagging/7_17_17/densenet_transform/4',
    'tagging/7_17_17/resnet_transform/0',
    'tagging/7_17_17/resnet_transform/1',
    'tagging/7_17_17/resnet_transform/2',
    'tagging/7_17_17/resnet_transform/3',
    'tagging/7_17_17/resnet_transform/4'
]
for run_name in run_names:
    download_run(run_name)


tagging/7_17_17/densenet_transform/0
tagging/7_17_17/densenet_transform/1
tagging/7_17_17/densenet_transform/2
tagging/7_17_17/densenet_transform/3
tagging/7_17_17/densenet_transform/4
tagging/7_17_17/resnet_transform/0
tagging/7_17_17/resnet_transform/1
tagging/7_17_17/resnet_transform/2
tagging/7_17_17/resnet_transform/3
tagging/7_17_17/resnet_transform/4


In [26]:
# The eighteen runs of the '9rn_9dn.csv' ensemble plus the ten 7_17_17
# transform runs: twenty-eight voters, so a tag needs at least 14 votes.
run_names = [
    'tagging/7_16_17/densenet/0',
    'tagging/7_16_17/densenet/1',
    'tagging/7_16_17/densenet/2',
    'tagging/7_16_17/densenet/3',
    'tagging/7_16_17/densenet/4',
    'tagging/7_16_17/resnet/0',
    'tagging/7_16_17/resnet/1',
    'tagging/7_16_17/resnet/2',
    'tagging/7_16_17/resnet/3',
    'tagging/7_16_17/resnet/4',
    'tagging/7_7_17/jpg_ensemble_0/0',
    'tagging/7_7_17/jpg_ensemble_0/1',
    'tagging/7_7_17/jpg_ensemble_0/2',
    'tagging/7_7_17/jpg_ensemble_0/3',
    'tagging/7_7_17/baseline_cyclic_1200steps',
    'tagging/7_7_17/baseline_cyclic_2',
    'tagging/7_10_17/cyclic_2400steps',
    'tagging/6_28_17/baseline_testrot_0',
    'tagging/7_17_17/densenet_transform/0',
    'tagging/7_17_17/densenet_transform/1',
    'tagging/7_17_17/densenet_transform/2',
    'tagging/7_17_17/densenet_transform/3',
    'tagging/7_17_17/densenet_transform/4',
    'tagging/7_17_17/resnet_transform/0',
    'tagging/7_17_17/resnet_transform/1',
    'tagging/7_17_17/resnet_transform/2',
    'tagging/7_17_17/resnet_transform/3',
    'tagging/7_17_17/resnet_transform/4'
]    

make_test_preds(run_names, join(out_path, '9rn_9dn_add_transform.csv'))

In [19]:
# 0.93193

# make big ensemble without tiff and weight non-standard densenets double, 
# so that densenets and others are balanced
# (the double weight is achieved by listing those seven runs twice: every
# list entry is one vote in make_test_preds)
big_no_tiff_run_names = [
    'tagging/7_11_17/dense_ensemble2/0',
    'tagging/7_11_17/dense_ensemble2/1',
    'tagging/7_11_17/dense_ensemble2/2',
    'tagging/7_11_17/dense_ensemble2/3',
    'tagging/7_11_17/dense_ensemble2/4',
    'tagging/7_5_17/ensemble/0',
    'tagging/7_5_17/ensemble/1',
    'tagging/7_5_17/ensemble/2',
    'tagging/7_5_17/ensemble/3',
    'tagging/7_5_17/ensemble/4',
    'tagging/7_7_17/jpg_ensemble_0/0',
    'tagging/7_7_17/jpg_ensemble_0/1',
    'tagging/7_7_17/jpg_ensemble_0/2',
    'tagging/7_7_17/jpg_ensemble_0/3',
    'tagging/7_7_17/jpg_ensemble_0/4',
    # first copy of the double-weighted runs
    'tagging/7_7_17/baseline_cyclic_1',
    'tagging/7_7_17/baseline_cyclic_1200steps',
    'tagging/7_7_17/baseline_cyclic_2',
    'tagging/6_28_17/baseline_testrot_0',
    'tagging/6_30_17/dense121_dropout_1',
    'tagging/6_30_17/dense121_dropout_2',
    'tagging/6_30_17/dense121_dropout_3',
    # second copy of the same seven runs
    'tagging/7_7_17/baseline_cyclic_1',
    'tagging/7_7_17/baseline_cyclic_1200steps',
    'tagging/7_7_17/baseline_cyclic_2',
    'tagging/6_28_17/baseline_testrot_0',
    'tagging/6_30_17/dense121_dropout_1',
    'tagging/6_30_17/dense121_dropout_2',
    'tagging/6_30_17/dense121_dropout_3'
]
make_test_preds(big_no_tiff_run_names, join(out_path, 'no_tiff_test_preds.csv'))

In [20]:
# 4 runs with same validation set
# compute validation f2
# add weights
# greedy forward selection

In [96]:
# The two five-member densenet ensembles whose train and validation
# predictions are analysed in the cells below.
run_names = [
    'tagging/7_11_17/dense_ensemble2/0',
    'tagging/7_11_17/dense_ensemble2/1',
    'tagging/7_11_17/dense_ensemble2/2',
    'tagging/7_11_17/dense_ensemble2/3',
    'tagging/7_11_17/dense_ensemble2/4',
    'tagging/7_5_17/ensemble/0',
    'tagging/7_5_17/ensemble/1',
    'tagging/7_5_17/ensemble/2',
    'tagging/7_5_17/ensemble/3',
    'tagging/7_5_17/ensemble/4'
]

# Download test, validation and train predictions plus scores for each run.
for run_name in run_names:
    _makedirs(join('/opt/data/results/', run_name))
    try:
        s3_download(run_name, test_pred_fn)
    except ClientError:
        # Fall back to the alternative file name, saved locally under the
        # standard name. Only S3 client errors trigger the fallback, so
        # interrupting the cell is no longer swallowed by a bare except.
        s3_download(run_name, 'test_predictions.csv', test_pred_fn)

    s3_download(run_name, scores_fn)
    print(run_name)
    s3_download(run_name, 'validation_preds.csv')
    s3_download(run_name, 'train_preds.csv')

tagging/7_11_17/dense_ensemble2/0
tagging/7_11_17/dense_ensemble2/1
tagging/7_11_17/dense_ensemble2/2
tagging/7_11_17/dense_ensemble2/3
tagging/7_11_17/dense_ensemble2/4
tagging/7_5_17/ensemble/0
tagging/7_5_17/ensemble/1
tagging/7_5_17/ensemble/2
tagging/7_5_17/ensemble/3
tagging/7_5_17/ensemble/4


In [14]:
# test TTA
def download_run(run_name):
    """Download the six per-augmentation test prediction files of a run.

    Fetches '0_test_preds.csv' through '5_test_preds.csv' into
    `/opt/data/results/<run_name>/`, printing the run name and then each file
    name before it is downloaded.

    NOTE: this shadows the other `download_run` defined in this notebook,
    which downloads the standard prediction and score files instead.
    """
    print(run_name)
    _makedirs(join('/opt/data/results/', run_name))
    tta_file_names = ['{}_test_preds.csv'.format(tta_ind) for tta_ind in range(6)]
    for tta_file_name in tta_file_names:
        print(tta_file_name)
        s3_download(run_name, tta_file_name)
    

# Runs whose per-augmentation test predictions are downloaded here and
# ensembled by make_test_preds_tta.
# NOTE: rebinds the notebook-wide run_names.
run_names = [
    'tagging/7_7_17/jpg_ensemble_0/0',
    'tagging/7_7_17/jpg_ensemble_0/1',
    'tagging/7_7_17/jpg_ensemble_0/2',
    'tagging/7_7_17/jpg_ensemble_0/3',
    'tagging/7_7_17/baseline_cyclic_1200steps',
    'tagging/7_7_17/baseline_cyclic_2',
    'tagging/7_10_17/cyclic_2400steps'
]

for run_name in run_names:
    download_run(run_name)

tagging/7_7_17/jpg_ensemble_0/0
0_test_preds.csv
1_test_preds.csv
2_test_preds.csv
3_test_preds.csv
4_test_preds.csv
5_test_preds.csv
tagging/7_7_17/jpg_ensemble_0/1
0_test_preds.csv
1_test_preds.csv
2_test_preds.csv
3_test_preds.csv
4_test_preds.csv
5_test_preds.csv
tagging/7_7_17/jpg_ensemble_0/2
0_test_preds.csv
1_test_preds.csv
2_test_preds.csv
3_test_preds.csv
4_test_preds.csv
5_test_preds.csv
tagging/7_7_17/jpg_ensemble_0/3
0_test_preds.csv
1_test_preds.csv
2_test_preds.csv
3_test_preds.csv
4_test_preds.csv
5_test_preds.csv
tagging/7_7_17/baseline_cyclic_1200steps
0_test_preds.csv
1_test_preds.csv
2_test_preds.csv
3_test_preds.csv
4_test_preds.csv
5_test_preds.csv
tagging/7_7_17/baseline_cyclic_2
0_test_preds.csv
1_test_preds.csv
2_test_preds.csv
3_test_preds.csv
4_test_preds.csv
5_test_preds.csv
tagging/7_10_17/cyclic_2400steps
0_test_preds.csv
1_test_preds.csv
2_test_preds.csv
3_test_preds.csv
4_test_preds.csv
5_test_preds.csv


In [21]:
def make_test_preds_tta(run_names, out_path, nb_tta=6):
    """Write a majority-vote ensemble over every run and every TTA variant.

    For each run the files '0_test_preds.csv' ... '<nb_tta - 1>_test_preds.csv'
    are loaded for the module-level `test_file_inds`. Each file is one voter,
    and a tag is predicted when at least ceil(n / 2) of the
    n = len(run_names) * nb_tta voters predict it. The shape of the stacked
    prediction array is printed.

    # Arguments
        run_names: run identifiers whose predictions are combined.
        out_path: path the ensembled predictions are saved to.
        nb_tta: number of per-augmentation prediction files per run. Defaults
            to 6, the value that used to be hard-coded.
    """
    test_preds_list = []
    for run_name in run_names:
        # The run directory is the same for every variant of the run.
        run_path = join(results_path, run_name)
        for tta_ind in range(nb_tta):
            test_pred_path = join(run_path, '{}_test_preds.csv'.format(tta_ind))
            test_preds = TagStore(test_pred_path).get_tag_array(test_file_inds)
            test_preds_list.append(np.expand_dims(test_preds, axis=2))

    # Voters are stacked along the third axis.
    test_preds = np.concatenate(test_preds_list, axis=2)
    print(test_preds.shape)
    test_preds_sum = np.sum(test_preds, axis=2)
    weight_sum = test_preds.shape[2]
    # With an even number of voters a tie counts as a positive prediction.
    true_thresh = math.ceil(weight_sum / 2)
    test_preds = test_preds_sum >= true_thresh

    tag_store = TagStore()
    for i in range(test_preds.shape[0]):
        tag_store.add_tags(test_file_inds[i], test_preds[i, :])

    tag_store.save(out_path)

In [20]:
# Ensemble all TTA variants of the runs in run_names. The recorded output
# shape ends in 42, which matches the seven-run TTA list times six variants.
make_test_preds_tta(run_names, join(out_path, 'tta0.csv'))

(61191, 17, 42)


In [97]:
# NOTE: this rebinds results_path, shadowing the value imported from
# rastervision.common.settings in the first cell.
results_path = '/opt/data/results/'
# One TagStore per run, in run_names order, for the train and validation
# predictions.
train_tag_stores = []
val_tag_stores = []
for run_name in run_names:
    val_path = join(results_path, run_name, 'validation_preds.csv')
    val_tag_stores.append(TagStore(val_path))
    
    train_path = join(results_path, run_name, 'train_preds.csv')
    train_tag_stores.append(TagStore(train_path))

# File indices are taken from the first run only; the later cells request the
# same indices from every run's store.
# NOTE(review): assumes all runs share the same train/validation split -- confirm.
train_file_inds = train_tag_stores[0].file_ind_to_tags.keys()
val_file_inds = val_tag_stores[0].file_ind_to_tags.keys()

# Ground-truth tags, loaded from the training CSV.
gt_csv_path = '/opt/data/datasets/planet_kaggle/train_v2.csv'
gt_tag_store = TagStore(gt_csv_path)

In [100]:
from sklearn.metrics import fbeta_score

# Collect every run's validation and train predictions, then join them along
# a new third axis so that the last axis indexes the run.
val_pred_arrays = []
train_pred_arrays = []
for run_ind in range(len(run_names)):
    val_store = val_tag_stores[run_ind]
    val_pred_arrays.append(
        np.expand_dims(val_store.get_tag_array(val_file_inds), axis=2))

    train_store = train_tag_stores[run_ind]
    train_pred_arrays.append(
        np.expand_dims(train_store.get_tag_array(train_file_inds), axis=2))

val_preds = np.concatenate(val_pred_arrays, axis=2)
print(val_preds.shape)

train_preds = np.concatenate(train_pred_arrays, axis=2)
print(train_preds.shape)

(8096, 17, 10)
(32383, 17, 10)


In [None]:
# Disabled scratch cell: `while False` means the loop body never executes, so
# only the two assignments below take effect.
# NOTE(review): before re-enabling, note that the body has no exit condition
# and that `weights` has 2 entries, which only broadcasts against val_preds
# if its last axis has length 1 or 2 (the recorded shape above ends in 10).
weights = np.array([1, 1])
best_f2 = -1
while False:
    print()
    val_preds_sum = np.sum(val_preds * [[weights]], axis=2)
    weight_sum = np.sum(weights)
    true_thresh = math.ceil(weight_sum / 2)
    our_val_preds = val_preds_sum >= true_thresh

    gt_val_preds = gt_tag_store.get_tag_array(val_file_inds)
    f2_samples = fbeta_score(
        gt_val_preds, our_val_preds, beta=2, average='samples')

    print(f2_samples)

In [103]:
def get_f2(w, is_train=True):
    """Score a weighted majority-vote ensemble against the ground truth.

    Each run's predictions are multiplied by its weight and summed over the
    run axis; a tag is predicted when the weighted sum reaches
    ceil(sum(w) / 2). The result is compared with the ground-truth tags using
    the samples-averaged F-beta score with beta=2.

    # Arguments
        w: one weight per run, aligned with the last axis of the module-level
            `train_preds` / `val_preds`.
        is_train: score on the train predictions if True, otherwise on the
            validation predictions.

    # Returns
        The F2 score of the weighted ensemble.
    """
    if is_train:
        preds, file_inds = train_preds, train_file_inds
    else:
        preds, file_inds = val_preds, val_file_inds

    weighted_votes = np.sum(preds * [[w]], axis=2)
    min_votes = math.ceil(np.sum(w) / 2)
    ensemble_preds = weighted_votes >= min_votes

    gt_preds = gt_tag_store.get_tag_array(file_inds)
    return fbeta_score(
        gt_preds, ensemble_preds, beta=2, average='samples')

# Greedy search over binary ensemble weights. Each sweep tries flipping every
# single run's weight (0 <-> 1) starting from the current best weights and
# keeps the single flip with the highest F2, if it beats the best so far.
# NOTE: candidates are scored on the train predictions (is_train=True).
best_weights = np.zeros((len(run_names),))
best_f2 = -1
sweep_best_weights = best_weights.copy()
sweep_best_f2 = best_f2
nb_sweeps = 10

for sweep_ind in range(nb_sweeps):
    for run_ind in range(len(run_names)):
        w = best_weights.copy()
        w[run_ind] = 0 if w[run_ind] == 1 else 1
        f2 = get_f2(w, is_train=True)
        # print(w)
        # print(f2)
        if f2 > sweep_best_f2:
            sweep_best_f2 = f2
            sweep_best_weights = w

    if sweep_best_f2 > best_f2:
        best_f2 = sweep_best_f2
        best_weights = sweep_best_weights
        print('sweep: {}, best_f2: {:0.5f}, best_weights: {}'.format(sweep_ind, best_f2, best_weights))
    else:
        # No flip improved the score. best_weights is unchanged, so every
        # remaining sweep would re-evaluate exactly the same candidates with
        # the same outcome; stop instead of repeating that work.
        break


sweep: 0, best_f2: 0.94149, best_weights: [ 0.  0.  0.  0.  0.  0.  0.  0.  1.  0.]
sweep: 1, best_f2: 0.94165, best_weights: [ 0.  0.  0.  0.  0.  0.  0.  1.  1.  0.]
sweep: 2, best_f2: 0.94278, best_weights: [ 0.  0.  0.  0.  0.  0.  0.  1.  1.  1.]
sweep: 3, best_f2: 0.94279, best_weights: [ 0.  0.  0.  0.  0.  1.  0.  1.  1.  1.]
sweep: 4, best_f2: 0.94286, best_weights: [ 0.  0.  0.  0.  0.  1.  1.  1.  1.  1.]
0.931974666254
0.932528525481


In [107]:
# Baseline: every one of the 10 runs gets weight 1 (train F2, then validation F2).
print(get_f2(np.ones((10,)), is_train=True))
print(get_f2(np.ones((10,)), is_train=False))

# Greedily selected weights (train F2, then validation F2). In the recorded
# output the selected weights score higher on train but lower on validation
# than the all-ones baseline.
print(get_f2(best_weights, is_train=True))
print(get_f2(best_weights, is_train=False))

0.941866842839
0.932528525481
0.942857098009
0.931974666254
