In [1]:
from keras.models import Sequential
from keras.layers import Lambda
from keras.layers import Convolution2D, BatchNormalization
from keras.layers import MaxPooling2D
from keras.layers import Flatten
from keras.layers import Dense
from keras.layers import Dropout

from keras.preprocessing.image import ImageDataGenerator 
from keras.optimizers import SGD
import numpy as np
import pandas as pd
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
import sklearn.metrics as metrics
import PIL

import os

import numpy as np

import matplotlib.pyplot as plt
from keras.applications.inception_v3 import InceptionV3
from keras.preprocessing import image
from keras.models import Model
from keras.layers import Dense, GlobalAveragePooling2D
from keras import backend as K
import keras
from keras.callbacks import CSVLogger

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Parameters: input geometry and mini-batch size shared by the cells below
img_height, img_width = 101, 101
batch_size = 64

# Rebind the name `keras` to the copy bundled with TensorFlow. Names already
# imported with `from keras... import ...` are not affected by this rebinding.
keras = tf.keras

# (height, width, channels) of the images fed to the network
IMG_SHAPE = (img_height, img_width, 3)

In [3]:
def plot_roc(labels, prediction_scores):
    '''Plot the ROC curve for a set of binary labels and prediction scores

    labels            = true class labels (1 is treated as the positive class)
    prediction_scores = classifier scores, one per label

    The curve is drawn on the current matplotlib axes together with the
    chance diagonal; the legend reports the area under the curve (AUC).
    '''
    fpr, tpr, _ = metrics.roc_curve(labels, prediction_scores, pos_label=1)
    auc = metrics.roc_auc_score(labels, prediction_scores)
    legend_string = 'AUC = {:0.3f}'.format(auc)

    plt.plot([0,1],[0,1],'--', color='gray', label='Chance')
    plt.plot(fpr, tpr, label=legend_string)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    # Pass a real boolean: the string form 'on' is a deprecated way to
    # switch the grid on.
    plt.grid(True)
    plt.axis('square')
    plt.legend()
    plt.tight_layout()
    
def load_data(dir_data, dir_labels, training=True):
    ''' Load each of the image files into memory

    While this is feasible with a smaller dataset, for larger datasets,
    not all the images would be able to be loaded into memory

    dir_data   = path prefix of the image files; it is concatenated directly
                 with '<id>.tif', so it should end with a path separator
    dir_labels = CSV file with an `id` column (and a `label` column when
                 training=True)
    training   = when True return (data, labels), otherwise (data, ids)
    '''
    labels_pd = pd.read_csv(dir_labels)
    ids       = labels_pd.id.values
    data      = []
    for identifier in ids:
        # str() works for both NumPy scalars and plain Python strings,
        # whereas .astype(str) only exists on NumPy values.
        fname = dir_data + str(identifier) + '.tif'
        img   = mpl.image.imread(fname)
        data.append(img)
    data = np.array(data) # Convert to Numpy array
    if training:
        labels = labels_pd.label.values
        return data, labels
    return data, ids
    
def cv_performance_assessment(X,y,k,clf,random_state=None):
    '''Cross validated performance assessment

    X            = training data
    y            = training labels
    k            = number of folds for cross validation
    clf          = classifier to use (must provide fit and predict_proba)
    random_state = optional seed for the shuffled fold assignment; the
                   default (None) keeps the folds random on every call

    Divide the training data into k folds of training and validation data.
    For each fold the classifier will be trained on the training data and
    tested on the validation data. The classifier prediction scores are
    aggregated and output as a float array aligned with y
    '''
    # The scores are probabilities, so store them as floats rather than in
    # an object array. Every index is written exactly once because the
    # validation folds partition the data.
    prediction_scores = np.empty(y.shape[0], dtype=float)
    kf = StratifiedKFold(n_splits=k, shuffle=True, random_state=random_state)
    for train_index, val_index in kf.split(X, y):
        # Train the classifier on this fold's training portion
        clf = clf.fit(X[train_index], y[train_index])

        # Score the held-out portion and keep the second predict_proba column
        fold_scores = clf.predict_proba(X[val_index])
        prediction_scores[val_index] = fold_scores[:,1]
    return prediction_scores

In [4]:

# Directory parameters: where the images live and which CSV files hold the
# ids/labels
dir_train_images = './data/training/'
dir_test_images  = './data/testing/'
dir_train_labels = './data/labels_training.csv'
dir_test_ids     = './data/sample_submission.csv'

# Training-time generator: rescales pixels by 1/255 and applies random
# flips, rotation, shear and channel shifts; 15% of the rows are reserved
# as the validation subset.
train_datagen = ImageDataGenerator(
    rescale=1./255,
    horizontal_flip=True,
    vertical_flip=True,
    rotation_range=30,
    shear_range=10.0,
    channel_shift_range=50.0,
    validation_split=0.15,
)

# Test-time generator: rescaling only
test_datagen = ImageDataGenerator(rescale=1./255)

In [5]:
# Generate image label dataframe
# NOTE(review): this cell used to be marked "Dont run this", but the training
# cells below need `train_generator` and `validation_generator`, so it has to
# be executed.
traindf = pd.read_csv(dir_train_labels, dtype=str)
def append_ext(fn):
    '''Turn an image id into its file name by appending the .tif extension'''
    return fn+".tif"
traindf["id"]=traindf["id"].apply(append_ext)
# Shuffle once with a fixed seed so the training/validation split is the
# same on every run.
traindf = traindf.sample(frac=1, random_state=12)

train_generator = train_datagen.flow_from_dataframe( 
    dataframe=traindf,
    directory=dir_train_images, 
    x_col="id",
    y_col="label",
    seed=12,
    batch_size = batch_size,
    target_size=(img_height, img_width),
    shuffle=True,
    class_mode='binary',
    subset='training')

# Validation images must not be randomly augmented, so they get their own
# rescale-only generator. Its validation_split has to match the one used by
# train_datagen (0.15) so both generators cut `traindf` at the same row.
validation_datagen = ImageDataGenerator(rescale = 1./255,
                                        validation_split = 0.15)

validation_generator = validation_datagen.flow_from_dataframe( 
    dataframe=traindf,
    directory=dir_train_images, 
    x_col="id",
    y_col="label",
    seed=12,
    target_size=(img_height, img_width),
    class_mode='binary',
    subset='validation',
    shuffle = False,
    batch_size = 225)

Found 1275 validated image filenames belonging to 2 classes.
Found 225 validated image filenames belonging to 2 classes.


## CNN Vanilla ##

In [6]:
from keras.optimizers import adam

def hsv_conversion(x):
    '''Convert a tensor of RGB images to HSV with tf.image.rgb_to_hsv

    TensorFlow is imported inside the function body so the function carries
    its own dependency when it is used as a Lambda layer.
    '''
    import tensorflow as tf
    hsv = tf.image.rgb_to_hsv(x)
    return hsv


def CNN_mod2(lr, weight, name, epoch = 100, log_dir = '/Users/ethan/solar-pv-image/'):
    '''Build, train and score a small CNN that works on HSV-converted images

    lr      = learning rate for the Adam optimizer
    weight  = class_weight dictionary passed to training
    name    = base name (without extension) of the per-epoch CSV log
    epoch   = number of training epochs
    log_dir = directory the CSV log is written to; created if missing

    The network is trained on the module-level `train_generator` and
    validated on `validation_generator`.

    Returns (model, score, labels, auc) where score/labels/auc are computed
    on one pass over `train_generator`. That generator still applies its
    random augmentation, so the training scores vary between calls.
    '''
    model = Sequential()
    # Conversion from RGB to HSV
    model.add(Lambda(hsv_conversion, input_shape=IMG_SHAPE))

    # First layer (the input shape is already fixed by the Lambda layer)
    model.add(Convolution2D(filters = 4, kernel_size = (3, 3), activation = 'relu'))
    model.add(BatchNormalization())

    # Second layer
    model.add(Convolution2D(8, kernel_size = (3, 3), activation = 'relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.2))

    # Third layer
    model.add(Convolution2D(16, kernel_size = (3, 3), activation = 'relu'))
    model.add(BatchNormalization())

    # Fourth layer
    model.add(Convolution2D(32, kernel_size = (3, 3), activation = 'relu'))
    model.add(BatchNormalization())

    # Fifth layer
    model.add(Convolution2D(48, kernel_size = (3, 3), activation = 'relu'))
    model.add(BatchNormalization())
    model.add(MaxPooling2D(pool_size = (3, 3)))

    # Flatten
    model.add(Flatten())

    # Output: single sigmoid unit for the binary label
    model.add(Dense(units = 1, activation = 'sigmoid'))

    model.compile(optimizer=adam(learning_rate=lr, beta_1=0.9, beta_2=0.999), 
                  loss='binary_crossentropy', metrics=[tf.keras.metrics.AUC()])

    # Per-epoch metrics are appended to <log_dir>/<name>.csv. The logger only
    # records anything if it is handed to the training call as a callback.
    os.makedirs(log_dir, exist_ok=True)
    filename = os.path.join(log_dir, name + '.csv')
    csv_logger = CSVLogger(filename, append=True, separator=';')

    model.fit_generator(train_generator, validation_data=validation_generator ,
                        epochs=epoch, class_weight=weight,
                        callbacks=[csv_logger])

    # train_generator shuffles its samples, so its `.classes` attribute (which
    # is in dataframe order) does not line up with predictions made on the
    # generator. Walk the batches explicitly and keep each batch's own labels
    # next to its scores instead.
    batch_scores, batch_labels = [], []
    for batch_idx in range(len(train_generator)):
        batch = train_generator[batch_idx]
        batch_scores.append(model.predict(batch[0]))
        batch_labels.append(batch[1])
    score = np.concatenate(batch_scores)
    labels = np.concatenate(batch_labels).astype(int)
    auc = metrics.roc_auc_score(labels, score.ravel())
    return (model, score, labels, auc)

In [None]:
# Train the HSV CNN for 100 epochs, weighting class 1 1.75x relative to class 0
model, s, l, a = CNN_mod2(
    lr=0.001,
    weight={0: 1., 1: 1.75},
    name='Try_to_overfit_Ethan_HSV',
    epoch=100,
)


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100

In [None]:
from sklearn.metrics import classification_report

# Continue training the existing model for five more epochs
model.fit_generator(
    train_generator,
    validation_data=validation_generator,
    epochs=5,
    class_weight={0: 1., 1: 1.7},
)

# Score the validation subset and pair the scores with the generator's
# class list
score = model.predict(validation_generator)
labels = validation_generator.classes

# ROC curve, AUC, and a classification report at a 0.5 decision threshold
auc = metrics.roc_auc_score(labels, score.ravel())
plot_roc(labels, score.ravel())
print("AUC:", auc)
print(classification_report(labels, score.ravel() >= 0.5))

In [None]:
# List every layer of the model as "<index> <name>"
for i, layer in enumerate(model.layers):
    print('{} {}'.format(i, layer.name))


In [None]:
# Load the test data and test the classifier
test_data, ids = load_data(dir_test_images, dir_test_ids, training=False)
# Same 1/255 rescaling that the image generators apply
test_data = test_data/255
# `predict` returns the sigmoid outputs directly; `predict_proba` is a
# deprecated Sequential-only alias for the same call.
test_scores    = model.predict(test_data)

# Save the predictions to a CSV file for upload to Kaggle
submission_file = pd.DataFrame({'id':    ids,
                                   'score':  test_scores.ravel()})
submission_file.to_csv('CNN_HSV_Ethan_3.csv',
                           columns=['id','score'],
                           index=False)

In [None]:
# Correlate this model's test scores with an earlier submission.
# Other submissions that can be swapped in for comparison:
# 'submission_PCA_SVM_3C.csv', 'Inception_SVM.csv', 'CNN_vanilla3.csv'.
# Only the file read last was ever compared, so only that one is read.
peer = pd.read_csv('CNN_vanilla_0981.csv')
np.corrcoef(test_scores.ravel(),np.array(peer.score))

In [None]:
# Print the layer-by-layer summary of the trained model
model.summary()