In [75]:
# -*- coding: utf-8 -*-
'''
Import the packages needed for classification
'''
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
import sklearn.metrics as metrics
from sklearn.ensemble import RandomForestClassifier
#Load the VGG model
from keras.models import Sequential, Model, load_model
from keras import applications
from keras import optimizers
from keras.layers import Dropout, Flatten, Dense

# Input geometry handed to the VGG16 backbone: rows, columns and channels.
img_rows = 224
img_cols = 224
img_channel = 3

# VGG16 with ImageNet weights and without its fully connected top
# (include_top=False); a custom head is attached to it later on.
base_model = applications.VGG16(
    include_top=False,
    weights='imagenet',
    input_shape=(img_rows, img_cols, img_channel),
)

Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5


In [20]:
'''
Set directory parameters
'''
# Image folders (training / testing) followed by the CSV files that list the
# sample ids (and, for the training set, the labels).
dir_train_images, dir_test_images = 'data/training/', 'data/testing/'
dir_train_labels, dir_test_ids = 'data/labels_training.csv', 'data/sample_submission.csv'

'''
Include the functions used for loading, preprocessing, features extraction, 
classification, and performance evaluation
'''


'\nInclude the functions used for loading, preprocessing, features extraction, \nclassification, and performance evaluation\n'

In [21]:
def load_data(dir_data, dir_labels, training=True):
    '''Load every image listed in a CSV file into memory.

    While this is feasible with a smaller dataset, for larger datasets,
    not all the images would be able to be loaded into memory.

    dir_data   = directory holding the images, one `<id>.tif` file per id
                 (a trailing path separator is optional)
    dir_labels = path of a CSV file with an `id` column and, when
                 training=True, a `label` column
    training   = when True the labels are returned, otherwise the ids

    Returns (data, labels) when training=True and (data, ids) otherwise,
    where data is the NumPy array built from the list of loaded images.
    '''
    import os  # local import: the notebook's import cell does not provide os

    labels_pd = pd.read_csv(dir_labels)
    ids       = labels_pd.id.values

    # str() handles integer and string ids alike (a plain Python str has no
    # .astype), and os.path.join does not depend on dir_data ending in '/'.
    data = np.array([
        mpl.image.imread(os.path.join(dir_data, str(identifier) + '.tif'))
        for identifier in ids
    ])

    if training:
        return data, labels_pd.label.values
    return data, ids

In [22]:
def preprocess_and_extract_features(data):
    '''Reduce a stack of colour images to two summary features per image.

    Preprocess: the values along axis 3 (the colour channels) are averaged,
    which turns every image into a grayscale matrix.
    Extract features: each grayscale matrix is flattened and described by
    the mean and the standard deviation of its pixels.

    Returns an array with one row per sample and two columns:
    [mean, standard deviation].
    '''
    # Channel average -> one grayscale matrix per sample
    gray = np.mean(data, axis=3)

    # One row of pixels per sample
    flat = gray.reshape(len(gray), -1)

    # Per-sample statistics, placed side by side as feature columns
    per_sample_stats = (flat.mean(axis=1), flat.std(axis=1))
    return np.stack(per_sample_stats, axis=-1)

In [23]:
def set_classifier(n_neighbors=55):
    '''Shared function to select the classifier for both performance evaluation
    and testing

    n_neighbors = number of neighbours used by the k-nearest-neighbours
                  classifier; the default of 55 is the value that used to be
                  hard-coded here, so existing calls behave as before

    Returns a new, unfitted KNeighborsClassifier.
    '''
    return KNeighborsClassifier(n_neighbors=n_neighbors)

In [33]:
def set_classifier_rf(estimator, n_jobs=4, verbose=1, random_state=None):
    '''Build the random forest classifier used for evaluation and testing

    estimator    = number of trees (forwarded as n_estimators)
    n_jobs       = number of parallel jobs (previously hard-coded to 4)
    verbose      = verbosity level (previously hard-coded to 1)
    random_state = optional seed for reproducible forests; the default None
                   keeps the former unseeded behaviour

    Returns a new, unfitted RandomForestClassifier.
    '''
    return RandomForestClassifier(n_estimators=estimator,
                                  n_jobs=n_jobs,
                                  verbose=verbose,
                                  random_state=random_state)

In [25]:
def cv_performance_assessment(X,y,k,clf,random_state=None):
    '''Cross validated performance assessment

    X            = training data
    y            = training labels
    k            = number of folds for cross validation
    clf          = classifier to use; it is (re)fitted once per fold, so the
                   object passed in is left fitted on the last fold
    random_state = optional seed for the shuffled stratified split; the
                   default None keeps the former unseeded behaviour

    Divide the training data into k folds of training and validation data.
    For each fold the classifier will be trained on the training data and
    tested on the validation data. The classifier prediction scores are
    aggregated and output

    Returns a float array with one score per sample: column 1 of
    clf.predict_proba for the fold in which that sample was held out.
    '''
    # Scores are numeric, so keep them in a float array rather than an
    # object array (which is slow and awkward for numeric operations).
    prediction_scores = np.empty(y.shape[0], dtype=float)

    # Establish the k folds
    kf = StratifiedKFold(n_splits=k, shuffle=True, random_state=random_state)
    for train_index, val_index in kf.split(X, y):
        # Extract the training and validation data for this fold
        X_train, X_val   = X[train_index], X[val_index]
        y_train          = y[train_index]

        # Train the classifier
        X_train_features = preprocess_and_extract_features(X_train)
        clf              = clf.fit(X_train_features,y_train)

        # Test the classifier on the validation data for this fold
        X_val_features   = preprocess_and_extract_features(X_val)
        cpred            = clf.predict_proba(X_val_features)

        # Save the predictions for this fold
        prediction_scores[val_index] = cpred[:,1]
    return prediction_scores

In [26]:
def plot_roc(labels, prediction_scores):
    '''Plot the ROC curve of the given scores on the current pyplot figure

    labels            = true labels; 1 is treated as the positive class
    prediction_scores = classifier scores, one per label

    Draws the chance diagonal and the ROC curve, and shows the area under
    the curve in the legend. Nothing is returned.
    '''
    fpr, tpr, _ = metrics.roc_curve(labels, prediction_scores, pos_label=1)
    auc = metrics.roc_auc_score(labels, prediction_scores)
    legend_string = 'AUC = {:0.3f}'.format(auc)

    plt.plot([0,1],[0,1],'--', color='gray', label='Chance')
    plt.plot(fpr, tpr, label=legend_string)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    # Pass a real boolean: a string such as 'on' only works because any
    # non-empty string is truthy (so 'off' would switch the grid on as well).
    plt.grid(True)
    plt.axis('square')
    plt.legend()
    plt.tight_layout()

In [66]:
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.applications.imagenet_utils import decode_predictions
def load_data_cnn(dir_data, dir_labels, training=True, target_size=(224, 224)):
    '''Load every image listed in a CSV file into memory for the CNN.

    While this is feasible with a smaller dataset, for larger datasets,
    not all the images would be able to be loaded into memory.

    dir_data    = directory holding the images, one `<id>.tif` file per id
                  (a trailing path separator is optional)
    dir_labels  = path of a CSV file with an `id` column and, when
                  training=True, a `label` column
    training    = when True the labels are returned, otherwise the ids
    target_size = forwarded to load_img as target_size; the default
                  (224, 224) is the value that used to be hard-coded here

    Returns (data, labels) when training=True and (data, ids) otherwise,
    where data stacks the img_to_array result of every image.
    '''
    import os  # local import: the notebook's import cell does not provide os

    labels_pd = pd.read_csv(dir_labels)
    ids       = labels_pd.id.values

    # str() handles integer and string ids alike (a plain Python str has no
    # .astype), and os.path.join does not depend on dir_data ending in '/'.
    data = np.array([
        img_to_array(load_img(os.path.join(dir_data, str(identifier) + '.tif'),
                              target_size=target_size))
        for identifier in ids
    ])

    if training:
        return data, labels_pd.label.values
    return data, ids

In [52]:
'''
Sample script for cross validated performance
'''
# Number of folds for the cross-validated assessment
num_training_folds = 20

# Read the training images together with their labels
data, labels = load_data(
    dir_data=dir_train_images,
    dir_labels=dir_train_labels,
    training=True,
)

# Quick visual check: show the first training image
plt.imshow(data[0])
plt.show()

# Choose which classifier to use
# clf = set_classifier()


# Perform cross validated performance assessment
# max_auc, best = 0.5, 0
# for estimator in range(50,1000,50):
#     clf = set_classifier_rf(estimator)
#     prediction_scores = cv_performance_assessment(data,labels,num_training_folds,clf)
#     auc = metrics.roc_auc_score(labels, prediction_scores)
#     if auc > max_auc:
#         best = estimator
#         max_auc = auc
# print(best)

# clf = set_classifier_rf(800)
# prediction_scores = cv_performance_assessment(data,labels,num_training_folds,clf)
# Compute and plot the ROC curves
# plot_roc(labels, prediction_scores)

In [78]:
# Load the training images through load_data_cnn (replaces `data`/`labels`)
data, labels = load_data_cnn(
    dir_data=dir_train_images,
    dir_labels=dir_train_labels,
    training=True,
)

In [99]:
# Show the dimensions of the loaded image array
print('data', data.shape)

data (1500, 224, 224, 3)


In [79]:
# The first 20% of the samples are held out; the rest is used for training.
val_split_num = int(round(0.2 * len(labels)))
x_test, x_train = data[:val_split_num], data[val_split_num:]
y_test, y_train = labels[:val_split_num], labels[val_split_num:]

# Report the shape of every subset
print('x_train', x_train.shape)
print('y_train', y_train.shape)
print('x_test', x_test.shape)
print('y_test', y_test.shape)

x_train (1200, 224, 224, 3)
y_train (1200,)
x_test (300, 224, 224, 3)
y_test (300,)


In [80]:
# Convert both subsets to float32 and divide the pixel values by 255.
# The arrays are rebuilt from `data` / `val_split_num` (same slices as the
# split cell) instead of from themselves, so running this cell a second time
# cannot divide the already scaled values by 255 again.
x_train = data[val_split_num:].astype('float32')
x_test = data[:val_split_num].astype('float32')
x_train /= 255
x_test /= 255

In [81]:
# Classification head placed on top of the VGG16 feature maps:
# flatten -> 256-unit ReLU layer -> single sigmoid output.
add_model = Sequential([
    Flatten(input_shape=base_model.output_shape[1:]),
    Dense(256, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Join backbone and head into one model and compile it for binary
# classification with SGD.
model = Model(
    inputs=base_model.input,
    outputs=add_model(base_model.output),
)
model.compile(
    optimizer=optimizers.SGD(lr=1e-4, momentum=0.9),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         (None, 224, 224, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0         
__________

In [84]:
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ModelCheckpoint

batch_size = 32
epochs = 100

# Random augmentation applied on the fly to the training batches:
# rotations, horizontal/vertical shifts and horizontal flips.
train_datagen = ImageDataGenerator(
        rotation_range=30,
        width_shift_range=0.1,
        height_shift_range=0.1,
        horizontal_flip=True)
# train_datagen.fit(x_train) is intentionally not called: fitting is only
# needed for the feature-wise / ZCA options, none of which is enabled here,
# and it would copy the whole training set for nothing.

# Train on augmented batches, validate on the held-out subset, and keep on
# disk only the model with the best monitored validation accuracy.
history = model.fit_generator(
    train_datagen.flow(x_train, y_train, batch_size=batch_size),
    steps_per_epoch=x_train.shape[0] // batch_size,
    epochs=epochs,
    validation_data=(x_test, y_test),
    callbacks=[ModelCheckpoint('VGG16-transferlearning.model', monitor='val_acc', save_best_only=True)]
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [105]:
# Cross validation CNN

# prediction_scores = cv_performance_assessment(data,labels,num_training_folds,model)
# plot_roc(labels, prediction_scores)

In [112]:
'''
Sample script for producing a Kaggle submission
'''

# Set to True when you're ready to create a CNN submission for Kaggle
produce_submission_cnn = True

if produce_submission_cnn:
    # Read the test images and scale them like the training images
    # (float32, divided by 255)
    test_data, ids = load_data_cnn(dir_test_images, dir_test_ids, training=False)
    test_data = test_data.astype('float32') / 255

    # Score the test images and flatten the per-sample rows of the
    # prediction output into one list of scores
    predictions = model.predict(test_data)
    test_scores = [score for row in predictions for score in row]

    # Write the id/score pairs to the CSV file uploaded to Kaggle
    submission_file = pd.DataFrame({'id': ids, 'score': test_scores})
    submission_file.to_csv('submission_cnn.csv', columns=['id', 'score'], index=False)


In [108]:
'''
Sample script for producing a Kaggle submission
'''

# Set to True when you're ready to create a random-forest submission for Kaggle
produce_submission = False

if produce_submission:
    # Fit the 800-tree random forest on the features of all training images
    training_data, training_labels = load_data(
        dir_train_images, dir_train_labels, training=True)
    training_features = preprocess_and_extract_features(training_data)
    clf = set_classifier_rf(800)
    clf.fit(training_features, training_labels)

    # Score the test images: column 1 of predict_proba is used as the score
    test_data, ids = load_data(dir_test_images, dir_test_ids, training=False)
    test_features = preprocess_and_extract_features(test_data)
    test_scores = clf.predict_proba(test_features)[:, 1]

    # Write the id/score pairs to the CSV file uploaded to Kaggle
    submission_file = pd.DataFrame({'id': ids, 'score': test_scores})
    submission_file.to_csv('submission_rf.csv', columns=['id', 'score'], index=False)


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.6s
[Parallel(n_jobs=4)]: Done 800 out of 800 | elapsed:    0.6s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 800 out of 800 | elapsed:    0.2s finished


In [109]:
# Dump the scores produced by the most recently run submission cell
print(test_scores)

[0.38125 0.0625  0.33625 0.75875 0.32375 0.31    0.505   0.5975  0.25
 0.5525  0.03125 0.20625 0.1425  0.35375 0.20875 0.      0.07375 0.18375
 0.1475  0.69875 0.19875 0.33125 0.07625 0.27875 0.49875 0.00625 0.10875
 0.03125 0.1175  0.0625  0.24875 0.4425  0.20625 0.10625 0.14375 0.4725
 0.3     0.0325  0.12875 0.13125 0.4875  0.      0.27875 0.41875 0.615
 0.49125 0.24    0.5525  0.50375 0.7325  0.3875  0.9     0.4525  0.6425
 0.16625 0.17875 0.2325  0.00125 0.27    0.47375 0.115   0.85375 0.24125
 0.34625 0.29    0.86125 0.64375 0.2475  0.48875 0.63    0.135   0.0575
 0.52875 0.245   0.56125 0.525   0.48125 0.39375 0.52625 0.31375 0.085
 0.025   0.69    0.47375 0.73    0.24625 0.35625 0.7575  0.47125 0.33875
 0.0775  0.2125  0.16875 0.57375 0.42125 0.185   0.86375 0.24125 0.36
 0.65125 0.14875 0.28375 0.3275  0.05    0.315   0.25375 0.405   0.0925
 0.265   0.005   0.5325  0.02875 0.46125 0.09875 0.12125 0.41125 0.22
 0.73625 0.26    0.23625 0.52125 0.07375 0.12625 0.38875 0.28875 0.3