In [1]:
import pandas as pd
import shutil
import os
from sklearn.utils import shuffle

import tensorflow as tf
import keras
from keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from keras.models import Sequential, Model, load_model

import warnings
warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [2]:
TEST_DATA_DIRECTORY = '../data/test/'
# read the image labels 
df = pd.read_csv('../data/sample_submission.csv')
df.head()

Unnamed: 0,id,label
0,0b2ea2a822ad23fdb1b5dd26653da899fbd2c0d5,0
1,95596b92e5066c5c52466c90b69ff089b39f2737,0
2,248e6738860e2ebcf6258cdc1f32f299e0c76914,0
3,2c35657e312966e9294eac6841726ff3a748febf,0
4,145782eb7caa1c516acbe2eda34d9a3f31c41fd6,0


In [3]:
print('Images in test dir {}' . format(len(os.listdir('../data/test'))))

Images in test dir 57459


In [4]:
df['label'].value_counts()

0    57458
Name: label, dtype: int64

In [5]:
test_directory = 'test_dir'
os.mkdir(test_directory)

# create images directory inside test_dir
images = os.path.join(test_directory, 'images')
os.mkdir(images)

In [6]:
for image in os.listdir('../data/test/'):
    
    file_name = image
    
    # source path to image
    src = os.path.join('../data/test', file_name)
    # destination path to image
    dst = os.path.join(images, file_name)
    # copy the image from the source to the destination
    shutil.copyfile(src, dst)

In [7]:
print('Images in new test dir {}' . format(len(os.listdir(images))))

Images in new test dir 57459


In [8]:
IMAGE_SIZE = 96
batch_size = 10

datagen = ImageDataGenerator(rescale=1.0/255, shear_range=0.2, zoom_range=0.2)
test_datagen = datagen.flow_from_directory(test_directory, target_size=(IMAGE_SIZE,IMAGE_SIZE),
                                                   batch_size=1, class_mode='binary',
                                                  shuffle=False)

Found 57458 images belonging to 1 classes.


In [9]:
from keras.models import load_model

my_model6 = load_model('./models/new_model6.h5')

Instructions for updating:
If using Keras pass *_constraint arguments to layers.

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



In [10]:
number_of_test_images = 57458
predictions = my_model6.predict_generator(test_datagen, steps=number_of_test_images, verbose=1)



In [11]:
df_predictions = pd.DataFrame(predictions, columns=['cancer','normal'])

df_predictions.head()

Unnamed: 0,cancer,normal
0,0.999999,8.252396e-07
1,0.999561,0.0002982884
2,0.562755,0.458792
3,7.8e-05,0.9999337
4,7.1e-05,0.9999253


In [12]:
# As you can see it doesn't have file ids (names)
# The following will add the file names in the sequence in which 
# the test datagenerator processed the test image files.
test_file_IDs = test_datagen.filenames

# add the fileIDs to the dataframe
df_predictions['file_IDs'] = test_file_IDs

df_predictions.head()

Unnamed: 0,cancer,normal,file_IDs
0,0.999999,8.252396e-07,images/00006537328c33e284c973d7b39d340809f7271...
1,0.999561,0.0002982884,images/0000ec92553fda4ce39889f9226ace43cae3364...
2,0.562755,0.458792,images/00024a6dee61f12f7856b0fc6be20bc7a48ba3d...
3,7.8e-05,0.9999337,images/000253dfaa0be9d0d100283b22284ab2f6b643f...
4,7.1e-05,0.9999253,images/000270442cc15af719583a8172c87cd2bd9c774...


In [13]:
# Notice that the file_IDs have names like images/0000653............
# We need to remove the 'images/' and keep just the 000653..........

def clean_file_ID_name(id):
    split_id = id.split('/')
    
    # please note that the name also has .tif suffix which we need to remove
    # before final submission
    suffix_removed_id = split_id[1].split('.')
    
    return suffix_removed_id[0]

df_predictions['ID'] = df_predictions['file_IDs'].apply(clean_file_ID_name)
df_predictions.head()

Unnamed: 0,cancer,normal,file_IDs,ID
0,0.999999,8.252396e-07,images/00006537328c33e284c973d7b39d340809f7271...,00006537328c33e284c973d7b39d340809f7271b
1,0.999561,0.0002982884,images/0000ec92553fda4ce39889f9226ace43cae3364...,0000ec92553fda4ce39889f9226ace43cae3364e
2,0.562755,0.458792,images/00024a6dee61f12f7856b0fc6be20bc7a48ba3d...,00024a6dee61f12f7856b0fc6be20bc7a48ba3d2
3,7.8e-05,0.9999337,images/000253dfaa0be9d0d100283b22284ab2f6b643f...,000253dfaa0be9d0d100283b22284ab2f6b643f6
4,7.1e-05,0.9999253,images/000270442cc15af719583a8172c87cd2bd9c774...,000270442cc15af719583a8172c87cd2bd9c7746


In [14]:
# Now we need just the cancer prediction (i.e. predict the probability of cancer)

cancer_predictions = df_predictions['cancer']

# get the id column
image_ID = df_predictions['ID']

submission_file = pd.DataFrame({'ID':image_ID, 'label':cancer_predictions,}).set_index('ID')

submission_file.to_csv('submission_file.csv', columns=['label']) 