## Data output manipulation

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import scipy.ndimage
import matplotlib.pyplot as plt
import glob
import re
from skimage import measure, morphology
from mpl_toolkits.mplot3d.art3d import Poly3DCollection
from sklearn.decomposition import PCA
from time import time
import math
from sklearn import cross_validation



In [50]:
# constants
#
# Filesystem locations used throughout the notebook. Every path that lives
# under the competition data directory is derived from `data`, so moving the
# dataset only requires editing that single line. All directory paths keep
# their trailing slash because later cells build file names by plain string
# concatenation.

# Root of the competition data directory.
data = '/kaggle/dev/data-science-bowl-2017-data/'

stage1 = data + 'stage1/'
labels = data + 'stage1_labels.csv'
stage1_processed = data + 'stage1_processed/'
stage1_features_resnet = data + 'stage1_features_mx/'
stage1_submission = data + 'stage1_sample_submission.csv'
# Lives outside `data`, so it stays a full literal.
naive_submission = '/kaggle/dev/jovan/data-science-bowl-2017/data-science-bowl-2017/submissions/naive_submission.csv'
stage1_processed_pca = data + 'stage1_processed_pca/'
submissions = data + 'submissions/'

cifar10_url = "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz"
cifar_data = data + "CIFAR-10/"
# Directory scanned for the "inception_cifar10_<id>.pkl" feature files.
stage1_features_inception = cifar_data + 'cache/'

In [87]:
# Build the train/validation split from the cached Inception feature files.
#
# Produces: ids (DataFrame of patient ids found on disk), df (labels joined
# with ids), x (1-D object array, one feature array per patient), y (cancer
# labels) and the stratified 80/20 split trn_x / val_x / trn_y / val_y.

# `sklearn.cross_validation` was removed in scikit-learn 0.20; the same
# function lives in `sklearn.model_selection` (available since 0.18).
from sklearn.model_selection import train_test_split

# Accept only files named exactly "inception_cifar10_<id>.pkl". The dot is
# escaped and the pattern anchored at the end; the previous pattern
# ('([a-f0-9].*).pkl') treated every dot as a wildcard and was unanchored.
# NOTE(review): ids are assumed to be lowercase hex, as the original
# character class suggests -- confirm against stage1_labels.csv.
feature_name_re = re.compile(r'inception_cifar10_([a-f0-9]+)\.pkl$')

found_ids = []
for path in glob.glob(stage1_features_inception + "*"):
    hit = feature_name_re.match(os.path.basename(path))
    if hit is None:
        # Unrelated file in the cache directory: skip it rather than fail
        # with AttributeError on `None.group(1)`.
        continue
    found_ids.append(hit.group(1))
ids = pd.DataFrame(found_ids, columns=["id"])

# Keep only the labelled patients that also have a feature file on disk.
df = pd.read_csv(labels)
df = pd.merge(df, ids, how='inner', on=['id'])

# Load one feature array per patient, in the row order of `df`.
# The container is pre-allocated with dtype=object and filled element by
# element because the per-patient arrays differ in shape; recent NumPy
# refuses to build such a ragged array implicitly via np.array([...]).
# allow_pickle=True is required by current NumPy to read pickled files.
# SECURITY: unpickling executes arbitrary code -- only load files that this
# project produced itself.
x = np.empty(len(df), dtype=object)
for row, patient_id in enumerate(df['id'].tolist()):
    feature_path = stage1_features_inception + "inception_cifar10_" + patient_id + ".pkl"
    x[row] = np.load(feature_path, allow_pickle=True)

# `.values` replaces Series.as_matrix(), which was removed in pandas 1.0.
y = df['cancer'].values

# Stratified split with a fixed seed so the class ratio is kept in both
# parts and the split is reproducible.
trn_x, val_x, trn_y, val_y = train_test_split(x, y, random_state=42, stratify=y,
                                              test_size=0.20)

In [88]:
# Reshape the split into the arrays used downstream and report their shapes.
num_classes = 2

# NOTE(review): the recorded output shows these as 1-D object arrays, in
# which case flatten() only makes a copy and does not flatten the
# per-sample feature matrices -- confirm this is intended.
transfer_values_test = val_x.flatten()
transfer_values_train = trn_x.flatten()

# Labels as column vectors of shape (n, 1).
labels_test = val_y.reshape(-1, 1)
labels_train = trn_y.reshape(-1, 1)

# One-hot encoding: compare each label against the class indices
# 0..num_classes-1 via broadcasting, then add 0 to turn booleans into ints.
class_ids = np.arange(num_classes)
cls_train = (trn_y[:, None] == class_ids) + 0
cls_test = (val_y[:, None] == class_ids) + 0

# Shape report, one line per array.
for caption, arr in (
    ("transfer_values_test : ", transfer_values_test),
    ("transfer_values_train : ", transfer_values_train),
    ("labels_test : ", labels_test),
    ("labels_train : ", labels_train),
    ("cls_test : ", cls_test),
    ("cls_train : ", cls_train),
):
    print(caption + str(arr.shape))

transfer_values_test : (280,)
transfer_values_train : (1117,)
labels_test : (280, 1)
labels_train : (1117, 1)
cls_test : (280, 2)
cls_train : (1117, 2)


In [85]:
# Display the validation labels produced by the train/validation split.
val_y

array([0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 1])

In [102]:
# Find the validation sample with the most elements (rows * columns) and
# record both that element count and the sample's row count.
#
# The per-sample product is kept in a local name instead of `x`, because `x`
# is the feature array built by the loading cell; overwriting it with an
# integer here made later re-runs depend on cell execution order.
biggest_0 = 0        # shape[0] of the largest sample seen so far
biggest_total = 0    # shape[0] * shape[1] of that sample
for sample in transfer_values_test:
    n_rows = sample.shape[0]
    n_elements = n_rows * sample.shape[1]
    # Strict '>' keeps the first sample when several tie for the maximum.
    if n_elements > biggest_total:
        biggest_total = n_elements
        biggest_0 = n_rows

print(biggest_0)
print(biggest_total)

428
876544


In [113]:
# Inspect the second validation sample: a one-element slice, flattened.
transfer_values_test[1:2].flatten()

array([ array([[  9.08796191e-01,   0.00000000e+00,   4.99632582e-02, ...,
          4.20329571e-01,   4.69928086e-02,   1.44751236e-01],
       [  9.08796191e-01,   0.00000000e+00,   4.99632582e-02, ...,
          4.20329571e-01,   4.69928086e-02,   1.44751236e-01],
       [  9.08796191e-01,   0.00000000e+00,   4.99632582e-02, ...,
          4.20329571e-01,   4.69928086e-02,   1.44751236e-01],
       ..., 
       [  1.03081763e+00,   4.26401570e-02,   3.77762467e-02, ...,
          5.92050016e-01,   1.62868679e-03,   2.56208718e-01],
       [  1.05185521e+00,   3.84171121e-02,   2.32128575e-02, ...,
          5.98619640e-01,   9.89130582e-04,   4.68121111e-01],
       [  1.05447912e+00,   3.49369906e-02,   8.44418257e-03, ...,
          6.56412423e-01,   3.25394183e-04,   4.36309338e-01]], dtype=float32)], dtype=object)