In [34]:
import pandas as pd
import numpy as np
import os
import random

from tensorflow.keras.preprocessing.image import ImageDataGenerator

#import tensorflow as tf

from keras.layers import Dropout, Dense, Input, Lambda
from keras.models import Sequential, Model
from keras.models import load_model
from keras import backend as K
from keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping
from keras.optimizers import Adam

In [4]:
combos = pd.read_csv("oceane/combinations.csv")

In [7]:
combos.head()

Unnamed: 0.1,Unnamed: 0,image1,image2,label
0,0,1498413993_-02400.jpg,1498413993_-02400.jpg,0.0
1,1,1498413993_-02400.jpg,1498414053_-02340.jpg,0.0
2,2,1498413993_-02400.jpg,1498414113_-02280.jpg,0.0
3,3,1498413993_-02400.jpg,1498414173_-02220.jpg,0.0
4,4,1498413993_-02400.jpg,1498414233_-02160.jpg,0.0


In [5]:
len(combos)

231027

In [39]:
combos.sample(10)

Unnamed: 0.1,Unnamed: 0,image1,image2,label
110959,134002,1571849530.jpg,1571856730.jpg,0.0
144648,167691,1572616812.jpg,1572614472.jpg,0.0
208450,231493,1572713675.jpg,1572716615.jpg,0.0
54715,72050,1563221843_-00960.jpg,1563222863_+00060.jpg,1.0
172478,195521,1572636875.jpg,1572642695.jpg,0.0
53014,70274,1563124927_+02340.jpg,1563123487_+00900.jpg,1.0
12068,15835,1512675724_+01740.jpg,1512674824_+00840.jpg,1.0
26268,34472,1530739045_+00000.jpg,1530739225_+00180.jpg,1.0
69966,89809,1564868559_-00660.jpg,1564868919_-00300.jpg,0.0
34642,45359,1532544985_+00060.jpg,1532545705_+00780.jpg,1.0


In [6]:
sorted(combos.image1.apply(lambda s: s[:6]).unique())

['149841',
 '149954',
 '151267',
 '152875',
 '152899',
 '152900',
 '152901',
 '152902',
 '153073',
 '153074',
 '153089',
 '153090',
 '153254',
 '155916',
 '156295',
 '156296',
 '156312',
 '156313',
 '156322',
 '156468',
 '156486',
 '156487',
 '156572',
 '156573',
 '157142',
 '157143',
 '157184',
 '157185',
 '157261',
 '157262',
 '157263',
 '157264',
 '157271',
 '157272']

In [11]:
train_files = []
val_files = []
random.seed(18)
for folder in os.listdir("data/train_data_set/"):
    if random.random() < 0.8:
        for f in os.listdir("data/train_data_set/" + folder):
            train_files.append("{}/{}".format(folder, f))
    else:
        print(folder)
        for f in os.listdir("data/train_data_set/" + folder):
            val_files.append("{}/{}".format(folder, f))

20190529-94Fire-lp-s-mobo-c
20191102-bm-e-mobo-c-2 
20180614-Bridle-hp-n-mobo-c


In [12]:
len(train_files)

1898

In [13]:
len(val_files)

326

In [17]:
train_files_df = pd.DataFrame(train_files, columns=["file"])
train_files_df["smoke"] = train_files_df.file.str[-10] == "+"

val_files_df = pd.DataFrame(val_files, columns=["file"])
val_files_df["smoke"] = val_files_df.file.str[-10] == "+"

The ImageDataGenerator is useful for preprocessing the images into the right format, and also for applying random transformations to increase the variation in the training data.

However - flow, flow_from_directory, and flow_from_dataframe all seem to be oriented at bringing in one image at a time, with it's label (flow MIGHT be more flexible on this, but needs to take in a numpy array that represents the image, not just an image file name).

The one-shot network takes input from a generator that yields (pair, target) - so need to look into if that pair can be fed into the ImageDataGenerator.flow() method.

Alternatively

In [15]:
train_image_generator = ImageDataGenerator(
    rescale=1./255,
    shear_range=0.2,
    zoom_range=0.2,
    brightness_range=[0.5, 1.5],
    rotation_range=40)

validate_image_generator = ImageDataGenerator(rescale=1./255)

In [19]:
train_image_generator.flow_from_dataframe(train_files_df, "data/train_data_set/", x_col="file", y_col="smoke")

Found 0 images belonging to 2 classes.


<keras_preprocessing.image.DataFrameIterator at 0xb312deda0>

In [16]:
train_image_generator.flow_from_directory("data/train_data_set_2")

Found 2214 images belonging to 2 classes.


<keras_preprocessing.image.DirectoryIterator at 0xb336e2518>

In [1]:
import tensorflow as tf
from keras.applications.vgg16 import VGG16
#create a base model
image_width=224
image_height=224
IMAGE_SHAPE = (image_width, image_height, 3)
base_model = VGG16(input_shape=IMAGE_SHAPE, include_top=False,weights='imagenet')


Using TensorFlow backend.


Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5


In [23]:
# example of loading an image with the Keras API
from keras.preprocessing.image import load_img
# load the image
img = load_img('data/train_data_set/'+train_files[0])
# report details about the image
print(type(img))
print(img.format)
print(img.mode)
print(img.size)
# show the image
img.show()

<class 'PIL.JpegImagePlugin.JpegImageFile'>
JPEG
RGB
(3072, 2048)


In [24]:
# example of converting an image with the Keras API
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.preprocessing.image import array_to_img
# load the image
img = load_img('data/train_data_set/'+train_files[0])
print(type(img))
# convert to numpy array
img_array = img_to_array(img)
print(img_array.dtype)
print(img_array.shape)
# convert back to image
img_pil = array_to_img(img_array)
print(type(img))

<class 'PIL.JpegImagePlugin.JpegImageFile'>
float32
(2048, 3072, 3)
<class 'PIL.JpegImagePlugin.JpegImageFile'>


In [25]:
img2 = img.rotate(90)
img2.show()

In [None]:
#generate batches:
#select a random file
#find the next file in the sqeuence (see if there is a common pattern to do this via the name))
#if there is not a next file, start over
#convert both files to keras images
#apply the same random transformation to both images
#apply a label
#return pairs, labels

#alternatively, draw random samples from a paired and labeled 
#pandas dataframe/numpy array. 
#Oceane's doesn't seem to be sequential, see what Will built, or build own


In [None]:
#adjust
def generate_batches(batch_size, X, y, class_lookup_dict):
  while True:
      pairs, targets = get_batch(batch_size, X, y, class_lookup_dict)
      yield (pairs, targets)

In [None]:
#adjust parameters
train_gen = generate_batches(train_batch_size, Xtrain_abs, ytrain, class_lookup_dict_train)

val_gen = generate_batches(test_batch_size, Xval_abs, yval, class_lookup_dict_val)

In [32]:
#model architecture:

def get_twin_model():
    
    image_width=224
    image_height=224
    IMAGE_SHAPE = (image_width, image_height, 3)
    
    # Define the tensors for the two input documents
    left_input = Input(IMAGE_SHAPE)
    right_input = Input(IMAGE_SHAPE)
    
    # VGG Neural Network
    model = VGG16(input_shape=IMAGE_SHAPE, include_top=False,weights='imagenet')
    
###    model = Sequential()
###    model.add(VGGsomething...)
###    model.add(Dense(2, activation='sigmoid'))
    
    # Generate the encodings (feature vectors) for the two documents
    encoded_l = model(left_input)
    encoded_r = model(right_input)
    
    # Add a customized layer to compute the absolute difference between the encodings
    L1_layer = Lambda(lambda tensors:K.abs(tensors[0] - tensors[1]))
    L1_distance = L1_layer([encoded_l, encoded_r])
    
    # Add a dense layer with a sigmoid unit to generate the similarity score
    prediction = Dense(1,activation='sigmoid')(L1_distance)
    
    # Connect the inputs with the outputs
    twin_net = Model(inputs=[left_input,right_input],outputs=prediction)
    
    # return the model
    return twin_net

In [35]:
model = get_twin_model()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 224, 224, 3)  0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, 224, 224, 3)  0                                            
__________________________________________________________________________________________________
vgg16 (Model)                   (None, 7, 7, 512)    14714688    input_5[0][0]                    
                                                                 input_6[0][0]                    
__________________________________________________________________________________________________
lambda_2 (Lambda)               (None, 7, 7, 512)    0           vgg16[1][0]                      
          

In [36]:
#fix filepaths
optimizer = Adam()
model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=['accuracy'])

stopper = EarlyStopping(monitor='val_acc', patience=5)
checkpointer = ModelCheckpoint("../../dbfs/mnt/databricks-cc/katelyn/ManuscriptMatcher/{}_continued.h5".format(model_name), monitor='val_acc', save_best_only=True, period = 10)

In [None]:
#adjust training parameters
history = model.fit_generator(train_gen, steps_per_epoch = 300, \
                              epochs = 50, verbose = 1, \
                              callbacks=[tensorboard, stopper, checkpointer], \
                              validation_data = val_gen, validation_steps = 100, \
                              use_multiprocessing = True)

In [None]:
#fix filepath
model.save("../../dbfs/mnt/databricks-cc/katelyn/ManuscriptMatcher/{}_final.h5".format(model_name))