In [1]:
import glob
import os
import re
import pandas as pd
import numpy as np
import SimpleITK as sitk
import matplotlib.pyplot as plt
from tqdm import tqdm
import time
from datetime import timedelta
import sys
import datetime
import tensorflow as tf
import math
import multiprocessing as mp
import random
# Fixes "SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame"
pd.options.mode.chained_assignment = None

import scipy.misc
from mpl_toolkits.mplot3d.art3d import Poly3DCollection

In [2]:
OUTPUT_PATH = '/kaggle/dev/data-science-bowl-2017-data/submissions/'
DATA_PATH = '/kaggle_3/all_stage_features/'
LABELS = '/kaggle/dev/data-science-bowl-2017-data/all_labels.csv'
STAGE1_SUBMISSION = '/kaggle/dev/data-science-bowl-2017-data/stage1_sample_submission.csv'
STAGE2_SUBMISSION = '/kaggle/dev/data-science-bowl-2017-data/stage2_sample_submission.csv'
TENSORBOARD_SUMMARIES = '/kaggle/dev/data-science-bowl-2017-data/tensorboard_summaries/'
MODELS = '/kaggle/dev/data-science-bowl-2017-data/models/'

#globals initializing
FLAGS = tf.app.flags.FLAGS

## Prediction problem specific
tf.app.flags.DEFINE_integer('iteration_analysis', 25000,
                            """Number of steps after which analysis code is executed""")
tf.app.flags.DEFINE_integer('num_classes', 2,
                            """Number of classes to predict.""")
tf.app.flags.DEFINE_integer('num_classes_luna', 2,
                            """Number of classes predicted by LUNA model.""")
tf.app.flags.DEFINE_integer('transfer_values_shape', 1000,
                            'Size of transfer values')
tf.app.flags.DEFINE_integer('batch_size', 128,
                            """Number of items in a batch.""")
tf.app.flags.DEFINE_integer('max_iterations', 200000,
                            """Number of batches to run.""")
tf.app.flags.DEFINE_float('reg_constant', 0.1, 'Regularization constant.')
tf.app.flags.DEFINE_float('dropout', 0.5, 'Dropout')

## Tensorflow specific
tf.app.flags.DEFINE_boolean('log_device_placement', False,
                            """Whether to log device placement.""")
tf.app.flags.DEFINE_boolean('allow_soft_placement', True,
                            """Whether to allow soft placement of calculations by tf.""")
tf.app.flags.DEFINE_boolean('allow_growth', True,
                            """Whether to allow GPU growth by tf.""")

In [46]:
def get_patient_features(patient_ids):
    input_features = []
    input_labels = []
    labels = pd.read_csv(LABELS)

    num_patients = len(patient_ids)
    patient_count = 0
    chunk_count = 0
    for patient_id in patient_ids:
        num_class_1 = 0
        predictions = np.array(np.load(DATA_PATH + patient_id + '_predictions.npy'))
        transfer_values = np.array(np.load(DATA_PATH + patient_id + '_transfer_values.npy'))
        label = int(labels.loc[labels['id'] == patient_id, 'cancer'])
        for i in range(predictions.shape[0]):
            predicted_class = np.argmax(predictions[i])
            if (predicted_class == 1):
                features = np.ndarray(shape=(transfer_values.shape[1] + predictions.shape[1]), dtype=np.float32)
                features[0 : transfer_values.shape[1]] = transfer_values[i]
                features[transfer_values.shape[1] : transfer_values.shape[1] + predictions.shape[1]] = predictions[i]
                input_features.append(features)
                input_labels.append(label)
                num_class_1 += 1
            chunk_count += 1
        
        num_class_0 = 0
        for j in range(predictions.shape[0]):
            predicted_class = np.argmax(predictions[j])
            if(predicted_class == 0) and (num_class_0 < num_class_1):
                features = np.ndarray(shape=(transfer_values.shape[1] + predictions.shape[1]), dtype=np.float32)
                features[0 : transfer_values.shape[1]] = transfer_values[j]
                features[transfer_values.shape[1] : transfer_values.shape[1] + predictions.shape[1]] = predictions[j]
                input_features.append(features)
                input_labels.append(label)
                num_class_0 += 1
            chunk_count += 1
        patient_count = patient_count + 1
        print('Loaded data for patient {}/{}/{} - {}/{}'.format(patient_count, num_patients, chunk_count, num_class_0, num_class_1))

    return input_features, input_labels

In [52]:
# del X, Y, patient_ids
patient_ids = set()

for file_path in glob.glob(DATA_PATH + "*_transfer_values.npy"):
    filename = os.path.basename(file_path)
    patient_id = re.match(r'([a-f0-9].*)_transfer_values.npy', filename).group(1)
    patient_ids.add(patient_id)

sample_submission = pd.read_csv(STAGE2_SUBMISSION)
test_patient_ids = set(sample_submission['id'].tolist())
train_patient_ids = patient_ids.difference(test_patient_ids)

train_patient_ids = list(train_patient_ids)[0:20]
train_inputs, train_labels = get_patient_features(train_patient_ids)
num_patients = len(train_inputs)
X = np.ndarray(shape=(num_patients, FLAGS.transfer_values_shape + FLAGS.num_classes), dtype=np.float32)
Y = np.ndarray(shape=(num_patients), dtype=np.float32)

count = 0
for i in range(len(train_inputs)):
    X[count] = train_inputs[i]
    Y[count] = train_labels[i]
    count = count + 1

print('X.shape: {}'.format(X.shape))
print('Y.shape: {}'.format(Y.shape))

Loaded data for patient 1/20/22264 - 1801/1801
Loaded data for patient 2/20/52008 - 2292/2292
Loaded data for patient 3/20/84456 - 2563/2563
Loaded data for patient 4/20/114200 - 2033/2033
Loaded data for patient 5/20/138534 - 2619/2619
Loaded data for patient 6/20/157734 - 1742/1742
Loaded data for patient 7/20/176256 - 1696/1696
Loaded data for patient 8/20/195660 - 1503/1503
Loaded data for patient 9/20/221348 - 1816/1816
Loaded data for patient 10/20/256500 - 2934/2934
Loaded data for patient 11/20/288948 - 2458/2458
Loaded data for patient 12/20/321204 - 2658/2658
Loaded data for patient 13/20/337962 - 1533/1533
Loaded data for patient 14/20/364212 - 1685/1685
Loaded data for patient 15/20/393012 - 2164/2164
Loaded data for patient 16/20/420660 - 1922/1922
Loaded data for patient 17/20/448160 - 2102/2102
Loaded data for patient 18/20/482656 - 2413/2413
Loaded data for patient 19/20/520288 - 2973/2973
Loaded data for patient 20/20/558974 - 2640/2640
X.shape: (87094, 1002)
Y.shape: 

In [7]:
counter0 = 0
for i in range(X.shape[0]):
    if(X[i,-2] >= 0.0): 
        #print(X[i, 1000:1003])
        counter0 = counter0 + 1
print(counter0)

3508363


In [53]:
counter1 = 0
for i in range(Y.shape[0]):
    if Y[i]==1:
        counter1=counter1+ 1
counter1

22892

In [54]:
22892/87094

0.2628424460927274