In [66]:
from keras.models import Sequential
from keras.layers import Convolution2D
from keras.layers import MaxPooling2D
from keras.layers import Flatten
from keras.layers import Dense
from keras.preprocessing.image import ImageDataGenerator 
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
import sklearn.metrics as metrics
import PIL

In [32]:
# Parameters
img_width = 101
img_height = 101
batch_size = 32

In [64]:
def plot_roc(labels, prediction_scores):
    fpr, tpr, _ = metrics.roc_curve(labels, prediction_scores, pos_label=1)
    auc = metrics.roc_auc_score(labels, prediction_scores)
    legend_string = 'AUC = {:0.3f}'.format(auc)
   
    plt.plot([0,1],[0,1],'--', color='gray', label='Chance')
    plt.plot(fpr, tpr, label=legend_string)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.grid('on')
    plt.axis('square')
    plt.legend()
    plt.tight_layout()
    
def load_data(dir_data, dir_labels, training=True):
    ''' Load each of the image files into memory 

    While this is feasible with a smaller dataset, for larger datasets,
    not all the images would be able to be loaded into memory

    When training=True, the labels are also loaded
    '''
    labels_pd = pd.read_csv(dir_labels)
    ids       = labels_pd.id.values
    data      = []
    for identifier in ids:
        fname     = dir_data + identifier.astype(str) + '.tif'
        image     = mpl.image.imread(fname)
        data.append(image)
    data = np.array(data) # Convert to Numpy array
    if training:
        labels = labels_pd.label.values
        return data, labels
    else:
        return data, ids

In [166]:
model = Sequential()
# First layer
model.add(Convolution2D(filters = 32, kernel_size = (3, 3), 
                        input_shape = (img_width, img_height, 3), activation = 'relu'))
model.add(MaxPooling2D(pool_size = (2, 2)))

# Second layer
model.add(Convolution2D(32, kernel_size = (3, 3), activation = 'relu'))
model.add(MaxPooling2D(pool_size = (2, 2)))

# Flatten
model.add(Flatten())

# FC
model.add(Dense(units = 5, activation = 'tanh'))

# Output 
model.add(Dense(units = 1, activation = 'sigmoid'))

In [167]:
# Setup optimizer and Compile model
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['binary_accuracy'])

In [168]:
# Get image data
train_data_dir = './data/training'
train_datagen = ImageDataGenerator(rescale = 1./255, shear_range = 0.1, zoom_range = 0.2, horizontal_flip = True,
                                  validation_split = 0.33)
test_datagen = ImageDataGenerator(rescale = 1./255)

In [169]:
# Generate image label dataframe
traindf = pd.read_csv("./data/labels_training.csv",dtype=str)
def append_ext(fn):
    return fn+".tif"
traindf["id"]=traindf["id"].apply(append_ext)
traindf = traindf.sample(frac=1)

train_generator = train_datagen.flow_from_dataframe( 
    dataframe=traindf,
    directory=train_data_dir, 
    x_col="id",
    y_col="label",
    seed=42,
    batch_size = batch_size,
    target_size=(img_width, img_height),
    class_mode='binary',
    subset='training')

validation_generator = train_datagen.flow_from_dataframe( 
    dataframe=traindf,
    directory=train_data_dir, 
    x_col="id",
    y_col="label",
    seed=42,
    batch_size = 32,
    target_size=(img_width, img_height),
    class_mode='binary',
    subset='validation')

Found 1005 validated image filenames belonging to 2 classes.
Found 495 validated image filenames belonging to 2 classes.


In [170]:
# Generate fit generator
model.fit_generator(train_generator, steps_per_epoch = train_generator.samples//batch_size, 
                    validation_data = validation_generator,
                    validation_steps = validation_generator.samples//batch_size,
                    nb_epoch = 20)

  """
  """


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.callbacks.History at 0x14cb82710>

In [172]:
#score = model.predict_generator(validation_generator)
#labels = validation_generator.classes
from sklearn.metrics import classification_report

score = model.predict_generator(train_generator)
labels = train_generator.classes
#plot_roc(labels, score.ravel())
auc = metrics.roc_auc_score(labels, score.ravel())
print(auc)
print (classification_report(labels, score.ravel()>0.5))

AttributeError: 'Sequential' object has no attribute 'predict_class'

In [44]:
# Retrain with all data

# Get image data
train_data_dir = './data/training'
test_data_dir = './data/testing'
train_datagen = ImageDataGenerator(rescale = 1./255, shear_range = 0.1, zoom_range = 0.2, horizontal_flip = True)

test_datagen = ImageDataGenerator(rescale = 1./255)

# Generate image label dataframe
train_generator = train_datagen.flow_from_dataframe( 
    dataframe=traindf,
    directory=train_data_dir, 
    x_col="id",
    y_col="label",
    seed=42,
    batch_size = batch_size,
    target_size=(img_width, img_height),
    class_mode='binary')

test_generator = test_datagen.flow_from_directory(
    directory=test_data_dir,
    target_size=(img_width, img_height),
    color_mode="rgb",
    class_mode=None,
    shuffle=False,
    seed=42,
    batch_size=None
)

Found 1500 validated image filenames belonging to 2 classes.
Found 558 images belonging to 1 classes.


In [45]:
# Reoptimize model on the full training set

# Setup optimizer and Compile model
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

# Generate fit generator
model.fit_generator(train_generator, steps_per_epoch = train_generator.samples//batch_size,
                    nb_epoch = 30)

  
  


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.callbacks.History at 0x145772d10>

In [83]:
'''
Set directory parameters
'''
# Set the directories for the data and the CSV files that contain ids/labels
dir_train_images  = './data/training/'
dir_test_images   = './data/testing/'
dir_train_labels  = './data/labels_training.csv'
dir_test_ids      = './data/sample_submission.csv'

# Load the test data and test the classifier
test_data, ids = load_data(dir_test_images, dir_test_ids, training=False)
test_data = test_data/255
test_scores    = model.predict_proba(test_data)


In [89]:

# Save the predictions to a CSV file for upload to Kaggle
submission_file = pd.DataFrame({'id':    ids,
                                   'score':  test_scores.ravel()})
submission_file.to_csv('submission_CNN.csv',
                           columns=['id','score'],
                           index=False)

In [90]:
peer = pd.read_csv('submission_PCA_SVM_3C.csv')

In [94]:
np.corrcoef(test_scores.ravel(),np.array(peer.score))

array([[1.        , 0.55051583],
       [0.55051583, 1.        ]])

In [134]:
print(score)

[[0.02277895]
 [0.40673482]
 [0.34235457]
 [0.61456823]
 [0.1727706 ]
 [0.08660366]
 [0.126175  ]
 [0.0432112 ]
 [0.61456823]
 [0.61456823]
 [0.04241512]
 [0.06314329]
 [0.14860891]
 [0.10705954]
 [0.02988541]
 [0.04064206]
 [0.61456823]
 [0.06816328]
 [0.39856255]
 [0.0684965 ]
 [0.15086024]
 [0.05097397]
 [0.61456823]
 [0.00461176]
 [0.02062681]
 [0.04107146]
 [0.5419805 ]
 [0.0425634 ]
 [0.39531627]
 [0.12381522]
 [0.02820242]
 [0.03623557]
 [0.61456823]
 [0.61456823]
 [0.03285847]
 [0.61456823]
 [0.16615933]
 [0.01679355]
 [0.61456823]
 [0.61456823]
 [0.4765532 ]
 [0.38803717]
 [0.03237739]
 [0.13267504]
 [0.02393799]
 [0.00575777]
 [0.18776496]
 [0.61456823]
 [0.04360319]
 [0.61456823]
 [0.10414106]
 [0.2081161 ]
 [0.02468728]
 [0.61456823]
 [0.1462597 ]
 [0.09362742]
 [0.61456823]
 [0.05050301]
 [0.11330345]
 [0.04345893]
 [0.13870475]
 [0.04093717]
 [0.22369507]
 [0.0422663 ]
 [0.03117658]
 [0.00977774]
 [0.05777572]
 [0.0952001 ]
 [0.61456823]
 [0.5867135 ]
 [0.61456823]
 [0.51

In [137]:
labels

[1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
