In [18]:
from keras.models import Sequential
from keras.layers import Convolution2D, BatchNormalization
from keras.layers import MaxPooling2D
from keras.layers import Flatten
from keras.layers import Dense
from keras.preprocessing.image import ImageDataGenerator 
from keras.optimizers import SGD
import numpy as np
import pandas as pd
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
import sklearn.metrics as metrics
import PIL

import os

import numpy as np

import matplotlib.pyplot as plt
from keras.applications.inception_v3 import InceptionV3
from keras.preprocessing import image
from keras.models import Model
from keras.layers import Dense, GlobalAveragePooling2D
from keras import backend as K
import keras
from keras.callbacks import CSVLogger

In [2]:
# Parameters
img_width = 101
img_height = 101
batch_size = 64
keras = tf.keras

IMG_SHAPE = (img_height, img_width, 3)

In [3]:
def plot_roc(labels, prediction_scores):
    fpr, tpr, _ = metrics.roc_curve(labels, prediction_scores, pos_label=1)
    auc = metrics.roc_auc_score(labels, prediction_scores)
    legend_string = 'AUC = {:0.3f}'.format(auc)
   
    plt.plot([0,1],[0,1],'--', color='gray', label='Chance')
    plt.plot(fpr, tpr, label=legend_string)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.grid('on')
    plt.axis('square')
    plt.legend()
    plt.tight_layout()
    
def load_data(dir_data, dir_labels, training=True):
    ''' Load each of the image files into memory 

    While this is feasible with a smaller dataset, for larger datasets,
    not all the images would be able to be loaded into memory

    When training=True, the labels are also loaded
    '''
    labels_pd = pd.read_csv(dir_labels)
    ids       = labels_pd.id.values
    data      = []
    for identifier in ids:
        fname     = dir_data + identifier.astype(str) + '.tif'
        image     = mpl.image.imread(fname)
        data.append(image)
    data = np.array(data) # Convert to Numpy array
    if training:
        labels = labels_pd.label.values
        return data, labels
    else:
        return data, ids
    
def cv_performance_assessment(X,y,k,clf):
    '''Cross validated performance assessment
    
    X   = training data
    y   = training labels
    k   = number of folds for cross validation
    clf = classifier to use
    
    Divide the training data into k folds of training and validation data. 
    For each fold the classifier will be trained on the training data and
    tested on the validation data. The classifier prediction scores are 
    aggregated and output
    '''
    # Establish the k folds
    prediction_scores = np.empty(y.shape[0],dtype='object')
    kf = StratifiedKFold(n_splits=k, shuffle=True)
    for train_index, val_index in kf.split(X, y):
        # Extract the training and validation data for this fold
        X_train, X_val   = X[train_index], X[val_index]
        y_train          = y[train_index]
        
        # Train the classifier
        X_train_features = X_train
        clf              = clf.fit(X_train_features,y_train)
        
        # Test the classifier on the validation data for this fold
        X_val_features   = X_val
        cpred            = clf.predict_proba(X_val_features)
        
        # Save the predictions for this fold
        prediction_scores[val_index] = cpred[:,1]
    return prediction_scores

In [4]:

'''
Set directory parameters
'''
# Set the directories for the data and the CSV files that contain ids/labels
dir_train_images  = './data/training/'
dir_test_images   = './data/testing/'
dir_train_labels  = './data/labels_training.csv'
dir_test_ids      = './data/sample_submission.csv'
train_datagen = ImageDataGenerator(rescale = 1./255, vertical_flip = True, 
                                   horizontal_flip=True, channel_shift_range=50.0,
                                   rotation_range = 30, shear_range = 10.0,
                                   validation_split = 0.15)
test_datagen = ImageDataGenerator(rescale = 1./255)

In [5]:
# Generate image label dataframe
# Dont run this
traindf = pd.read_csv("./data/labels_training.csv",dtype=str)
def append_ext(fn):
    return fn+".tif"
traindf["id"]=traindf["id"].apply(append_ext)
traindf = traindf.sample(frac=1)

train_generator = train_datagen.flow_from_dataframe( 
    dataframe=traindf,
    directory=dir_train_images, 
    x_col="id",
    y_col="label",
    seed=12,
    batch_size = batch_size,
    target_size=(img_height, img_width),
    shuffle=True,
    class_mode='binary',
    subset='training')

validation_generator = train_datagen.flow_from_dataframe( 
    dataframe=traindf,
    directory=dir_train_images, 
    x_col="id",
    y_col="label",
    seed=12,
    target_size=(img_height, img_width),
    class_mode='binary',
    subset='validation',
    shuffle = False,
    batch_size = 225)

Found 1275 validated image filenames belonging to 2 classes.
Found 225 validated image filenames belonging to 2 classes.


## CNN Vanilla ##

In [23]:
from keras.optimizers import adam
def CNN_mod2(lr, weight, name, epoch = 100):
    # create the base pre-trained model
    
    model = Sequential()
    # First layer
    model.add(Convolution2D(filters = 8, kernel_size = (3, 3), 
                            input_shape = IMG_SHAPE, activation = 'relu'))
    model.add(BatchNormalization())
    model.add(MaxPooling2D(pool_size = (3, 3)))

    # Second layer
    model.add(Convolution2D(12, kernel_size = (3, 3), activation = 'relu'))
    model.add(BatchNormalization())
    #model.add(MaxPooling2D(pool_size = (3, 3)))
    
    # Third layer
    model.add(Convolution2D(16, kernel_size = (3, 3), activation = 'relu', strides = 2))
    model.add(BatchNormalization())
    
    # Fourth layer
    model.add(Convolution2D(24, kernel_size = (3, 3), activation = 'relu'))
    model.add(BatchNormalization())
    
    # Fifth layer
    model.add(Convolution2D(32, kernel_size = (3, 3), activation = 'relu'))
    model.add(BatchNormalization())
    
    # Flatten
    #model.add(GlobalAveragePooling2D())
    model.add(Flatten())
    
    # FC
    # model.add(Dense(units = 16, activation = 'sigmoid'))
    
    # Output 
    model.add(Dense(units = 1, activation = 'sigmoid'))
    
    # compile the model (should be done *after* setting layers to non-trainable)
    model.compile(optimizer=adam(learning_rate=lr, beta_1=0.9, beta_2=0.999), 
                  loss='binary_crossentropy', metrics=[tf.keras.metrics.AUC()])
    
    filename = 'D:\\MIDS\\RedTeam\\' +name+ '.csv'
    csv_logger = CSVLogger(filename, append=True, separator=';')
    
    model.fit_generator(train_generator, validation_data=validation_generator ,
                        epochs=epoch, class_weight=weight)
    
    score = model.predict(train_generator)
    labels = train_generator.classes
    auc = metrics.roc_auc_score(labels, score.ravel())
    return (model, score, labels, auc)

In [24]:
model, s, l, a = CNN_mod2(0.001, {0:1., 1:1.75}, name = 'Try_to_overfit2', epoch = 50)

W0210 16:21:44.724288 18380 deprecation.py:323] From C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\ops\nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
W0210 16:21:46.211500 18380 deprecation_wrapper.py:119] From C:\ProgramData\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:422: The name tf.global_variables is deprecated. Please use tf.compat.v1.global_variables instead.



Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [53]:
model.fit_generator(train_generator, validation_data=validation_generator ,
                        epochs=5, class_weight={0:1., 1:1.7})

from sklearn.metrics import classification_report

score = model.predict(validation_generator)
labels = validation_generator.classes
#plot_roc(labels, score.ravel())
auc = metrics.roc_auc_score(labels, score.ravel())
print(auc)
print (classification_report(labels, score.ravel()>=0.5))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
0.9370588235294117
              precision    recall  f1-score   support

           0       0.81      0.97      0.89       140
           1       0.93      0.64      0.76        85

    accuracy                           0.84       225
   macro avg       0.87      0.80      0.82       225
weighted avg       0.86      0.84      0.84       225



In [17]:
# let's visualize layer names and layer indices to see how many layers
# we should freeze:
for i, layer in enumerate(model.layers):
   print(i, layer.name)


0 input_2
1 conv2d_95
2 batch_normalization_95
3 activation_95
4 conv2d_96
5 batch_normalization_96
6 activation_96
7 conv2d_97
8 batch_normalization_97
9 activation_97
10 max_pooling2d_5
11 conv2d_98
12 batch_normalization_98
13 activation_98
14 conv2d_99
15 batch_normalization_99
16 activation_99
17 max_pooling2d_6
18 conv2d_103
19 batch_normalization_103
20 activation_103
21 conv2d_101
22 conv2d_104
23 batch_normalization_101
24 batch_normalization_104
25 activation_101
26 activation_104
27 average_pooling2d_10
28 conv2d_100
29 conv2d_102
30 conv2d_105
31 conv2d_106
32 batch_normalization_100
33 batch_normalization_102
34 batch_normalization_105
35 batch_normalization_106
36 activation_100
37 activation_102
38 activation_105
39 activation_106
40 mixed0
41 conv2d_110
42 batch_normalization_110
43 activation_110
44 conv2d_108
45 conv2d_111
46 batch_normalization_108
47 batch_normalization_111
48 activation_108
49 activation_111
50 average_pooling2d_11
51 conv2d_107
52 conv2d_109
53 co

In [51]:
# Load the test data and test the classifier
test_data, ids = load_data(dir_test_images, dir_test_ids, training=False)
test_data = test_data/255
test_scores    = model.predict_proba(test_data)

# Save the predictions to a CSV file for upload to Kaggle
submission_file = pd.DataFrame({'id':    ids,
                                   'score':  test_scores.ravel()})
submission_file.to_csv('CNN_vanilla_09695.csv',
                           columns=['id','score'],
                           index=False)

In [52]:
peer = pd.read_csv('submission_PCA_SVM_3C.csv')
peer = pd.read_csv('Inception_SVM.csv')
peer = pd.read_csv('CNN_vanilla3.csv')
peer = pd.read_csv('CNN_vanilla_0981.csv')
np.corrcoef(test_scores.ravel(),np.array(peer.score))

array([[1.        , 0.90180249],
       [0.90180249, 1.        ]])

In [54]:
model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_34 (Conv2D)           (None, 99, 99, 8)         224       
_________________________________________________________________
batch_normalization_30 (Batc (None, 99, 99, 8)         32        
_________________________________________________________________
max_pooling2d_13 (MaxPooling (None, 33, 33, 8)         0         
_________________________________________________________________
conv2d_35 (Conv2D)           (None, 31, 31, 12)        876       
_________________________________________________________________
batch_normalization_31 (Batc (None, 31, 31, 12)        48        
_________________________________________________________________
conv2d_36 (Conv2D)           (None, 15, 15, 16)        1744      
_________________________________________________________________
batch_normalization_32 (Batc (None, 15, 15, 16)       