In [2]:
# Pandas for DataFrames
import pandas as pd
pd.set_option('display.max_column', 100)

# NumPy for numerical computing
import numpy as np
np.random.seed(123)
import random
random.seed(123)

import tensorflow as tf

import os, gc
import threading

# Matplotlib for visualization
from matplotlib import pyplot as plt

# display plots in the notebook
%matplotlib inline

In [3]:
import sys
sys.path.append('./utils')

from data import Data
from models import Models
from tags import Tags
tags = Tags()

Using TensorFlow backend.


In [4]:
# Root directory of the competition data. Resolution order:
#   1. the PLANET_KAGGLE_ROOT environment variable, when set (used as-is);
#   2. '/data/planet-data/' when that directory exists;
#   3. the original hard-coded local path, kept as a last-resort fallback.
PLANET_KAGGLE_ROOT = os.environ.get('PLANET_KAGGLE_ROOT')
if PLANET_KAGGLE_ROOT is None:
    PLANET_KAGGLE_ROOT = '/data/planet-data/'
    if not os.path.exists(PLANET_KAGGLE_ROOT):
        PLANET_KAGGLE_ROOT = '/Users/jiayou/Documents/Kaggle Data/Amazon'

# Number of columns in every prediction array (one column per tag).
N_TAGS = 17
# Number of rows allocated for training-set predictions.
N_TRAIN = 40479
# The test set is counted in two parts; the prediction arrays are sized by
# their sum. NOTE(review): presumably the main and the additional test
# batches -- confirm against the data loader.
N_TEST_T = 40669
N_TEST_F = 20522
N_TEST = N_TEST_T + N_TEST_F

# Predict test data

In [4]:
def predict_on_gpu(batch_size=20, name='', gpu=0):
    """Run `predict` inside a private TensorFlow graph/session pinned to one GPU.

    Each call builds its own `tf.Graph` and `tf.Session`, so several calls can
    run from separate threads without sharing a default graph.

    Args:
        batch_size: forwarded to `predict`.
        name: forwarded to `predict` (selects the weights file and output names).
        gpu: index used to build the '/gpu:<index>' device string.

    Returns:
        Whatever `predict` returns (ignored when used as a thread target).
    """
    g = tf.Graph()
    with g.as_default():
        # Using the session as a context manager installs it as the default
        # session AND closes it on exit, so its resources are released even
        # if prediction raises. The original never closed the session.
        with tf.Session():
            with tf.device('/gpu:{}'.format(gpu)):
                return predict(batch_size=batch_size, name=name)

def predict(batch_size=20, name=''):
    """Predict the test set with one saved model and write the results to disk.

    Loads 'weights-<name>.hdf5', runs the model over the augmented test
    generator, and consolidates every run of 8 consecutive output rows into a
    single row via `Data.consolidate`.

    Files written to the current directory:
        raw_pred.<name>.npy       consolidated predictions, (N_TEST, N_TAGS)
        raw_pred.<name>.tta8.npy  per-augmentation predictions, (N_TEST * 8, N_TAGS)
        submission.<name>.csv     output of `tags.pred_to_output` at threshold 0.2

    Args:
        batch_size: passed to `Data.gen_test_augmented`.
        name: model identifier used in the weights and output file names.

    Returns:
        The object returned by `tags.pred_to_output` (it exposes `to_csv`).

    Raises:
        ValueError: if a batch of model outputs is not a multiple of 8 rows.
    """
    # Rows of model output that are consolidated into one prediction.
    n_aug = 8

    m = Models.load_incnet('weights-{}.hdf5'.format(name))
    print('Model weights loaded')

    d = Data(train=[])

    cnt = 0
    pred = np.zeros((N_TEST, N_TAGS))
    pred8 = np.zeros((N_TEST * n_aug, N_TAGS))
    print('Start predicting..')
    for X_test in d.gen_test_augmented(batch_size):
        y_test = m.predict_on_batch(X_test)
        k = len(y_test) // n_aug
        # Fail loudly instead of silently dropping or broadcasting a
        # partial group of augmented rows.
        if k * n_aug != len(y_test):
            raise ValueError(
                'Expected a multiple of {} prediction rows, got {}'.format(
                    n_aug, len(y_test)))
        pred8[cnt * n_aug:(cnt + k) * n_aug, :] = y_test
        for i in range(k):
            pred[cnt + i, :] = d.consolidate(y_test[n_aug * i:n_aug * (i + 1), :])
        cnt += k
        print('Predicted {} images for {}'.format(cnt, name))
    print('Predicted all {} images for {}'.format(cnt, name))

    print('Saving raw predictions for {}...'.format(name))
    np.save('raw_pred.{}.npy'.format(name), pred)
    np.save('raw_pred.{}.tta8.npy'.format(name), pred8)
    print('Saved')

    # One threshold per tag column; sized from N_TAGS so it always matches
    # the width of `pred`.
    thres = [0.2] * N_TAGS
    result = tags.pred_to_output(pred, thres)
    print('Saving submission file for {}...'.format(name))
    result.to_csv('submission.{}.csv'.format(name), index = None)
    print('Saved')
    return result

In [7]:
# Predict test data in parallel: one worker thread per GPU (0..7), each
# predicting with the model named 'v13-n<gpu index>'.
ts = [
    threading.Thread(
        target=predict_on_gpu,
        kwargs={'name': 'v13-n{}'.format(gpu_id), 'gpu': gpu_id})
    for gpu_id in range(8)
]
for worker in ts:
    worker.start()
# Block until every worker has finished.
for worker in ts:
    worker.join()

In [8]:
# Predict test data sequentially (no threads, default device placement).
# `predict` accepts `batch_size` and `name` only; the previous call passed
# `model_id=i`, which raises TypeError.
# NOTE(review): the 'v13-n<i>' naming is taken from the parallel cell in this
# notebook -- confirm these are the models this cell is meant to run.
for i in [0,1,2,3,4]:
    predict(name='v13-n{}'.format(i))

# Predict training data

In [None]:
# Module-level handles to the most recent training predictions; they are
# (re)assigned by predict_train so the arrays can be inspected afterwards.
pred = None
pred8 = None

def predict_train(toy=None, batch_size=20, model_id=0, d=None):
    """Predict the training set with one saved fold model and save the results.

    Loads 'weights-v9-f<model_id>.hdf5', runs it over the augmented training
    generator and consolidates every run of 8 consecutive output rows into one
    row via `Data.consolidate`. The results are stored in the module globals
    `pred` (N_TRAIN, N_TAGS) and `pred8` (N_TRAIN * 8, N_TAGS), then written to
    'raw_train_pred.v9.f<model_id>.tta.npy' and
    'raw_train_pred.v9.f<model_id>.tta8.npy'.

    Args:
        toy: forwarded to `Data(toy=...)` when no `d` is supplied.
        batch_size: passed to `Data.gen_train_augmented`.
        model_id: fold index used in the weights and output file names.
        d: an existing `Data` instance to reuse; built on demand when None.
    """
    global pred, pred8

    net = Models.load_resnet50('weights-v9-f{}.hdf5'.format(model_id))
    print('Model weights loaded')

    if d is None:
        d = Data(toy=toy)

    # Allocate the globals up front so partial results stay reachable.
    pred = np.zeros((N_TRAIN, N_TAGS))
    pred8 = np.zeros((N_TRAIN * 8, N_TAGS))

    print('Start predicting..')
    done = 0
    for batch in d.gen_train_augmented(batch_size):
        scores = net.predict_on_batch(batch)
        n_images = len(scores) // 8
        stop = done + n_images
        pred8[done * 8:stop * 8, :] = scores
        for offset in range(n_images):
            group = scores[8 * offset:8 * offset + 8, :]
            pred[done + offset, :] = d.consolidate(group)
        done = stop
        print('Predicted {} images'.format(done))
    print('Predicted all {} images'.format(done))

    print('Saving raw predictions...')
    np.save('raw_train_pred.v9.f{}.tta.npy'.format(model_id), pred)
    np.save('raw_train_pred.v9.f{}.tta8.npy'.format(model_id), pred8)
    print('Saved')

In [9]:
# Predict training data with each of the five fold models, sharing one Data
# instance so the data is loaded only once.
toy = None
d = Data(toy=toy)
for i in range(5):
    predict_train(model_id=i, d=d)

Loading data...
Getting 13 training images...
Got 1 images
Done
Loaded fold 0.
Getting 13 training images...
Got 1 images
Done
Loaded fold 1.
Getting 13 training images...
Got 1 images
Done
Loaded fold 2.
Getting 12 training images...
Got 1 images
Done
Loaded fold 3.
Getting 12 training images...
Got 1 images
Done
Loaded fold 4.
Loading done
Model weights loaded
Start predicting..
Predicted 20 images
Predicted 40 images
Predicted 60 images
Predicted 63 images
Predicted all 63 images
Saving raw predictions...
Saved


In [None]:
# Experiment with thresholds: start from a set of per-tag thresholds and snap
# each one to 0.21 (if it was above 0.2) or 0.19 (otherwise) before building
# the submission.
pred = np.load('raw_pred.v9.npy')
thres = [0.23067564, 0.27402788, 0.15499838, 0.18645976, 0.12418672, 0.093219191, 0.14909597, 0.13256209, 0.041971382, 0.17731731, 0.10376091, 0.25468382, 0.090709485, 0.13336645, 0.13344041, 0.10004906, 0.036582272]
thres = [0.21 if value > 0.2 else 0.19 for value in thres[:N_TAGS]] + thres[N_TAGS:]
result = tags.pred_to_output(pred, thres)
result.to_csv('submission.v9-2.csv', index = None)

In [None]:
# Generate output from fine tuned predictions: take the base predictions of
# one fold and overwrite the selected tag columns with the fine-tuned ones.
# Note: `thres` is not defined in this cell; it must already exist in the
# kernel from an earlier cell.
val = 4
select = [2, 4, 7, 12]

base_path = 'raw_pred.v9.f{}.tta.npy'.format(val)
tuned_path = 'raw_pred.v9.f{}-tune2.tta.npy'.format(val)
pred = np.load(base_path)
pred_tune = np.load(tuned_path)
# Copy all selected columns in one assignment.
pred[:, select] = pred_tune[:, select]

result = tags.pred_to_output(pred, thres)
result.to_csv('submission.v9.f{}-tune2.tta.csv'.format(val), index = None)

In [6]:
# Rebuild the submission CSV of every v13 ensemble member from its saved raw
# predictions, using a flat 0.2 threshold for all tags.
ensemble_dir = os.path.join(PLANET_KAGGLE_ROOT, 'ensemble')
for name in ['v13-n{}'.format(i) for i in range(8)]:
    raw_path = os.path.join(ensemble_dir, 'raw_pred.{}.npy'.format(name))
    pred = np.load(raw_path)

    # Rebuilt per iteration so each call receives a fresh list.
    thres = [0.2]*17
    result = tags.pred_to_output(pred, thres)
    print('Saving submission file for {}...'.format(name))
    result.to_csv('submission.{}.csv'.format(name), index = None)
    print('Saved')

Saving submission file for v13-n0...
Saved
Saving submission file for v13-n1...
Saved
Saving submission file for v13-n2...
Saved
Saving submission file for v13-n3...
Saved
Saving submission file for v13-n4...
Saved
Saving submission file for v13-n5...
Saved
Saving submission file for v13-n6...
Saved
Saving submission file for v13-n7...
Saved
