In [1]:
import glob
import os
import re
import pandas as pd
import numpy as np
import SimpleITK as sitk
import matplotlib.pyplot as plt
from tqdm import tqdm
import time
from datetime import timedelta
import sys
import datetime
import tensorflow as tf
import math
from sklearn import cross_validation
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier as RF
import scipy as sp
from sklearn.decomposition import PCA
import sklearn.metrics



In [2]:
OUTPUT_PATH = '/kaggle/dev/data-science-bowl-2017-data/submissions/'
DATA_PATH = '/kaggle/dev/data-science-bowl-2017-data/stage1_features/'
TENSORBOARD_SUMMARIES = '/kaggle/dev/data-science-bowl-2017-data/tensorboard_summaries/'
MODELS = '/kaggle_2/luna/luna16/models/'
MODEL_CHECKPOINTS = '/kaggle/dev/data-science-bowl-2017-data/models/checkpoints/'
MODEL_PATH = '/kaggle_2/luna/luna16/models/e03f0475-091e-4821-862e-ae5303d670c8/'
STAGE1 = '/kaggle/dev/data-science-bowl-2017-data/stage1/'
LABELS = '/kaggle/dev/data-science-bowl-2017-data/stage1_labels.csv'
STAGE1_SUBMISSION = '/kaggle/dev/data-science-bowl-2017-data/stage1_sample_submission.csv'
NAIVE_SUBMISSION = '/kaggle/dev/jovan/data-science-bowl-2017/data-science-bowl-2017/submissions/naive_submission.csv'

In [8]:
def get_inputs():
    labels = pd.read_csv(LABELS)
    input_features = {}

    for features in glob.glob(DATA_PATH + '*_transfer_values.npy'):
        n = re.match('([a-f0-9].*)_transfer_values.npy', os.path.basename(features))
        patient_id = n.group(1)
        predictions = np.array([np.mean(np.load(DATA_PATH + patient_id + '_predictions.npy'), axis=0)])
        transfer_values = np.array(np.load(DATA_PATH + patient_id + '_transfer_values.npy'))
        transfer_values = sp.misc.imresize(transfer_values, (100, 100))
        transfer_values = transfer_values.flatten()
        feature_val = transfer_values
        try:
            label_val = int(labels.loc[labels['id'] == patient_id, 'cancer'])
        except TypeError:
            continue
        input_features[patient_id] = [feature_val, label_val]
        print('Patient {} predictions {} transfer_values {}'.format(patient_id, predictions.shape, transfer_values.shape))

    return input_features

In [9]:
def train_xgboost(trn_x, val_x, trn_y, val_y):

    clf = xgb.XGBRegressor(max_depth=10,
                           gamma=0.5,
                           objective="binary:logistic",
                           n_estimators=1500,
                           min_child_weight=6,
                           learning_rate=0.005,
                           nthread=8,
                           subsample=0.80,
                           colsample_bytree=0.80,
                           seed=79,
                           max_delta_step=1,
                           reg_alpha=0.1,
                           reg_lambda=0.5)
    clf.fit(trn_x, trn_y, eval_set=[(val_x, val_y)], verbose=True, eval_metric='logloss', early_stopping_rounds=50)
    return clf

In [10]:
inputs = get_inputs()

Patient 6ee742b62985570a1f3a142eb7e49188 predictions (1, 7) transfer_values (10000,)
Patient d6d5ed3055d084a6abf0f97af3fe2ff0 predictions (1, 7) transfer_values (10000,)
Patient 645e7f46eb9b834153ecf8e2b2921fe5 predictions (1, 7) transfer_values (10000,)
Patient e6d8b2631843a24e6761f2723ea30788 predictions (1, 7) transfer_values (10000,)
Patient f0f72264cd822301852578cc71288d3c predictions (1, 7) transfer_values (10000,)
Patient a88c585e7d81744eec091a6f0600bd7b predictions (1, 7) transfer_values (10000,)
Patient 39c3a2d2ca67bc7a1a22240ea571d50c predictions (1, 7) transfer_values (10000,)
Patient 74b542d34b61740c1933d2c953403aa6 predictions (1, 7) transfer_values (10000,)
Patient 174a9fc87f54d6def3730954fbafc99d predictions (1, 7) transfer_values (10000,)
Patient 080e6a00e69888fd620894f9fd0714b1 predictions (1, 7) transfer_values (10000,)
Patient 118be21b7e0c3058b29a524686391c66 predictions (1, 7) transfer_values (10000,)
Patient f76143416ee2c8e1251f45f108fed468 predictions (1, 7) trans

In [17]:
x = np.array([inputs[keys][0]for keys in inputs.keys()])
y = (np.array([inputs[keys][1] for keys in inputs.keys()])).reshape(-1,1)
trn_x, val_x, trn_y, val_y = cross_validation.train_test_split(x, y, random_state=42, stratify=y, test_size=0.20)

In [18]:
clf = train_xgboost(trn_x, val_x, trn_y, val_y)

[0]	validation_0-logloss:0.692241
Will train until validation_0-logloss hasn't improved in 50 rounds.
[1]	validation_0-logloss:0.6913
[2]	validation_0-logloss:0.690383
[3]	validation_0-logloss:0.689491
[4]	validation_0-logloss:0.688752
[5]	validation_0-logloss:0.687927
[6]	validation_0-logloss:0.687097
[7]	validation_0-logloss:0.686278
[8]	validation_0-logloss:0.685539
[9]	validation_0-logloss:0.684703
[10]	validation_0-logloss:0.68391
[11]	validation_0-logloss:0.683351
[12]	validation_0-logloss:0.682534
[13]	validation_0-logloss:0.681767
[14]	validation_0-logloss:0.681131
[15]	validation_0-logloss:0.680269
[16]	validation_0-logloss:0.679379
[17]	validation_0-logloss:0.678516
[18]	validation_0-logloss:0.677716
[19]	validation_0-logloss:0.677035
[20]	validation_0-logloss:0.676366
[21]	validation_0-logloss:0.675585
[22]	validation_0-logloss:0.674812
[23]	validation_0-logloss:0.674169
[24]	validation_0-logloss:0.673348
[25]	validation_0-logloss:0.672692
[26]	validation_0-logloss:0.671941


In [69]:
val_y_pred = clf.predict(val_x)

In [70]:
for i in range(val_y.shape[0]):
    if val_y[i] == 1:
        print("val_y:", val_y[i], "val_y_pred:",val_y_pred[i], "delta: ", val_y[i]-val_y_pred[i])

val_y: [1] val_y_pred: 0.222239 delta:  [ 0.77776082]
val_y: [1] val_y_pred: 0.303293 delta:  [ 0.69670656]
val_y: [1] val_y_pred: 0.282054 delta:  [ 0.71794647]
val_y: [1] val_y_pred: 0.249711 delta:  [ 0.75028872]
val_y: [1] val_y_pred: 0.192385 delta:  [ 0.80761459]
val_y: [1] val_y_pred: 0.242492 delta:  [ 0.75750843]
val_y: [1] val_y_pred: 0.233661 delta:  [ 0.76633893]
val_y: [1] val_y_pred: 0.241637 delta:  [ 0.75836256]
val_y: [1] val_y_pred: 0.287786 delta:  [ 0.71221438]
val_y: [1] val_y_pred: 0.214457 delta:  [ 0.78554285]
val_y: [1] val_y_pred: 0.188477 delta:  [ 0.81152284]
val_y: [1] val_y_pred: 0.276844 delta:  [ 0.72315568]
val_y: [1] val_y_pred: 0.243648 delta:  [ 0.75635196]
val_y: [1] val_y_pred: 0.260991 delta:  [ 0.73900861]
val_y: [1] val_y_pred: 0.222122 delta:  [ 0.77787833]
val_y: [1] val_y_pred: 0.185716 delta:  [ 0.81428377]
val_y: [1] val_y_pred: 0.157877 delta:  [ 0.84212282]
val_y: [1] val_y_pred: 0.21681 delta:  [ 0.78319]
val_y: [1] val_y_pred: 0.275285 

In [71]:
np.mean(val_y_pred)

0.26005074

In [66]:
for i in range(val_y.shape[0]):
    if val_y[i] == 0:
        print("val_y:", val_y[i], "val_y_pred:",val_y_pred[i], "delta: ", val_y[i]- val_y_pred[i])

val_y: [0] val_y_pred: 0.233237 delta:  [-0.23323727]
val_y: [0] val_y_pred: 0.20158 delta:  [-0.20158035]
val_y: [0] val_y_pred: 0.23412 delta:  [-0.23412016]
val_y: [0] val_y_pred: 0.187155 delta:  [-0.18715458]
val_y: [0] val_y_pred: 0.304777 delta:  [-0.30477697]
val_y: [0] val_y_pred: 0.270852 delta:  [-0.27085215]
val_y: [0] val_y_pred: 0.322837 delta:  [-0.32283682]
val_y: [0] val_y_pred: 0.210034 delta:  [-0.2100341]
val_y: [0] val_y_pred: 0.185119 delta:  [-0.18511911]
val_y: [0] val_y_pred: 0.370859 delta:  [-0.37085864]
val_y: [0] val_y_pred: 0.407247 delta:  [-0.40724736]
val_y: [0] val_y_pred: 0.181321 delta:  [-0.1813212]
val_y: [0] val_y_pred: 0.257239 delta:  [-0.25723866]
val_y: [0] val_y_pred: 0.300386 delta:  [-0.30038613]
val_y: [0] val_y_pred: 0.287355 delta:  [-0.28735548]
val_y: [0] val_y_pred: 0.288387 delta:  [-0.28838667]
val_y: [0] val_y_pred: 0.330905 delta:  [-0.33090457]
val_y: [0] val_y_pred: 0.289419 delta:  [-0.28941914]
val_y: [0] val_y_pred: 0.258767 

In [67]:
for i in range(val_y.shape[0]):
    if val_y[i] == 0:
        val_y_pred[i] = 0.00001
    print(val_y_pred[i])

        

1e-05
0.222239
1e-05
1e-05
1e-05
1e-05
1e-05
1e-05
0.303293
1e-05
1e-05
0.282054
1e-05
0.249711
1e-05
0.192385
0.242492
1e-05
1e-05
0.233661
1e-05
1e-05
1e-05
1e-05
0.241637
1e-05
0.287786
1e-05
0.214457
1e-05
1e-05
1e-05
1e-05
1e-05
0.188477
1e-05
1e-05
1e-05
0.276844
0.243648
1e-05
1e-05
1e-05
1e-05
1e-05
1e-05
1e-05
1e-05
1e-05
0.260991
1e-05
1e-05
1e-05
1e-05
1e-05
1e-05
1e-05
1e-05
0.222122
1e-05
1e-05
1e-05
0.185716
1e-05
0.157877
1e-05
1e-05
1e-05
1e-05
1e-05
0.21681
1e-05
0.275285
0.365141
1e-05
1e-05
0.20785
1e-05
1e-05
0.22351
1e-05
0.300293
1e-05
1e-05
1e-05
0.34089
1e-05
1e-05
0.193446
1e-05
1e-05
0.253978
1e-05
1e-05
1e-05
0.270661
1e-05
1e-05
1e-05
1e-05
0.203206
1e-05
1e-05
1e-05
0.284965
1e-05
1e-05
1e-05
1e-05
0.268332
1e-05
1e-05
1e-05
1e-05
1e-05
1e-05
0.465966
1e-05
1e-05
1e-05
0.383801
1e-05
1e-05
1e-05
1e-05
1e-05
1e-05
1e-05
1e-05
0.342177
1e-05
0.197544
1e-05
1e-05
1e-05
1e-05
1e-05
1e-05
0.220091
1e-05
1e-05
0.295472
1e-05
1e-05
1e-05
1e-05
1e-05
1e-05
1e-05
1e

In [68]:
sklearn.metrics.log_loss(val_y,val_y_pred,eps=1e-15)

0.35286705710498806