In [19]:
run_names = [
    'tagging/7_11_17/dense_ensemble2/0',
    'tagging/7_11_17/dense_ensemble2/1',
    'tagging/7_11_17/dense_ensemble2/2',
    'tagging/7_11_17/dense_ensemble2/3',
    'tagging/7_11_17/dense_ensemble2/4'
]

train_probs_fn = 'train_probs.npy'
train_preds_fn = 'train_preds.csv'

val_probs_fn = 'validation_probs.npy'
val_preds_fn = 'validation_preds.csv'

s3_bucket = 'raster-vision'

In [20]:
from os.path import join

import boto3
from botocore.exceptions import ClientError

from rastervision.common.settings import results_path
from rastervision.common.utils import _makedirs

def s3_download(run_name, file_name, new_file_name=None):
    if new_file_name is None:
        new_file_name = file_name
    s3_key = 'results/{}/{}'.format(run_name, file_name)
    run_path = join('/opt/data/results/', run_name, new_file_name)
    s3 = boto3.resource('s3')
    s3.Bucket(s3_bucket).download_file(s3_key, run_path)
    
def download_run(run_name):
    s3_download(run_name, train_probs_fn)
    s3_download(run_name, train_preds_fn)
    
    s3_download(run_name, val_probs_fn)
    s3_download(run_name, val_preds_fn)

In [21]:
for run_name in run_names:
    print(run_name)
    download_run(run_name)

In [31]:
import numpy as np

train_probs = []
val_probs = []
for run_name in run_names:
    run_path = join('/opt/data/results/', run_name)
    train_probs_path = join(run_path, train_probs_fn)
    train_probs.append(np.load(train_probs_path))
        
    val_probs_path = join(run_path, val_probs_fn)
    val_probs.append(np.load(val_probs_path))
    
all_train_probs = np.concatenate(train_probs, axis=1)
all_val_probs = np.concatenate(val_probs, axis=1)
print(all_train_probs.shape)
print(all_val_probs.shape)

(32383, 85)
(8096, 85)


In [33]:
from rastervision.tagging.data.planet_kaggle import TagStore

gt_csv_path = '/opt/data/datasets/planet_kaggle/train_v2.csv'
gt_tag_store = TagStore(gt_csv_path)

train_file_inds = TagStore('/opt/data/results/tagging/7_11_17/dense_ensemble2/0/train_preds.csv').file_ind_to_tags.keys()
gt_train_preds = gt_tag_store.get_tag_array(train_file_inds)

val_file_inds = TagStore('/opt/data/results/tagging/7_11_17/dense_ensemble2/0/validation_preds.csv').file_ind_to_tags.keys()
gt_val_preds = gt_tag_store.get_tag_array(val_file_inds)

In [37]:
import numpy as np
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.multioutput import MultiOutputClassifier

In [47]:
# fit model no training data
model = XGBClassifier()
# multi_model = MultiOutputClassifier(model, n_jobs=1)
# eval_set=[(all_val_probs, gt_val_preds)], eval_metric='error', early_stopping_rounds=10,
tag_ind = 16
model.fit(all_train_probs, gt_train_preds[:, tag_ind], eval_set=[(all_val_probs, gt_val_preds[:, tag_ind])], eval_metric='error', early_stopping_rounds=10, verbose=True)

[0]	validation_0-error:0.180089
Will train until validation_0-error hasn't improved in 10 rounds.
[1]	validation_0-error:0.179842
[2]	validation_0-error:0.179471
[3]	validation_0-error:0.179471
[4]	validation_0-error:0.179471
[5]	validation_0-error:0.179471
[6]	validation_0-error:0.179471
[7]	validation_0-error:0.179348
[8]	validation_0-error:0.179348
[9]	validation_0-error:0.179348
[10]	validation_0-error:0.179348
[11]	validation_0-error:0.179348
[12]	validation_0-error:0.179348
[13]	validation_0-error:0.179348
[14]	validation_0-error:0.179348
[15]	validation_0-error:0.179348
[16]	validation_0-error:0.179348
[17]	validation_0-error:0.179348
Stopping. Best iteration:
[7]	validation_0-error:0.179348



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [26]:
# make predictions for test data
y_pred = model.predict(X_test)
# predictions = [round(value) for value in y_pred]
predictions = np.round(y_pred)
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 80.73%
