In [None]:
# Select the GPU to run on. This is done *before* keras is imported so the
# selection is already in the environment whenever the backend initialises
# CUDA (setting it after the import only works if that happens lazily).
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="2"

import itertools
import math
import pickle

import keras
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# ----------------------------------------------------------------------
# Load the four pickled sequence DataFrames and stack them row-wise.
# ----------------------------------------------------------------------
df1, df2, df3, df4 = map(pd.read_pickle, ('../data/seq_data1.pkl',
                                          '../data/seq_data2.pkl',
                                          '../data/seq_data3.pkl',
                                          '../data/seq_data4.pkl'))

# One row per sequence; each part keeps its own index values.
df_seq = pd.concat([df1, df2, df3, df4])

Description of the dataset columns:<br>

<b>Y1</b> = multiclass label vector (length n_classes) for first image of sequence<br>
<b>Y2</b> = multiclass label vector (length n_classes) for second image of sequence<br>

<b>centroid1</b> = [x,y] of centroid for first image of sequence<br>
<b>centroid2</b> = [x,y] of centroid for second image of sequence

<b>idx1</b> = unique identifier for first image of sequence (index of dataframe with singular images rather than sequences)<br>
<b>idx2</b> = unique identifier for second image of sequence (index of dataframe with singular images rather than sequences)

<b>image1</b> = pixel values for first image of sequence. shape=(height,width,num_channels)<br>
<b>image2</b> = pixel values for second image of sequence. shape=(height,width,num_channels)

<b>page1</b> = page from which first image of sequence originates<br>
<b>page2</b> = page from which second image of sequence originates (should be identical to page1)

<b>roi1</b> = [x1,x2,y1,y2] of the first image of the sequence, relative to the full image<br>
<b>roi2</b> = [x1,x2,y1,y2] of the second image of the sequence, relative to the full image

<b>y1</b> = scalar (1dim) class label for first image of sequence <br>
<b>y2</b> = scalar (1dim) class label for second image of sequence

<font color='red'>Note that most images occur in more than one sequence (e.g. Genus-species and species-author)! This is why every image has a unique identifier (idx1 / idx2). The same image must not end up in both the test set and the train set! Furthermore, count each image only once when evaluating the model!!</font>

In [None]:
###################################
#create the combined model VGG FNN#
###################################

from keras.layers import Dense, Dropout, Flatten, Input
from keras.layers import Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.models import Model
from keras.utils.layer_utils import print_summary

def MLP_CNN():
    """Build and compile the single-image classifier (image branch + centroid branch).

    The image is passed through the module-level ``vgg_conv`` network, then a
    1024-unit dense layer with dropout. The 2-d centroid is passed through a
    4-unit dense layer. Both branches are concatenated and fed to the
    ``n_classes``-way output layer.

    Module-level names that must be defined before calling:
    ``vgg_conv``, ``y_image``, ``x_image``, ``num_channels``, ``n_classes``.

    Model inputs : [image matrix  (n, y_image, x_image, num_channels),
                    centroid matrix (n, 2)]   e.g. [I_train, C_train]
    Model targets: Y (e.g. Y_train) of shape (n, n_classes)

    Side effects: sets ``trainable = False`` on every layer of ``vgg_conv``
    except the last four (in place, on the shared ``vgg_conv`` object), prints
    the trainable status of each of its layers and prints the model summary.

    Returns:
        The compiled ``keras.models.Model``.
    """
    image_input = keras.layers.Input(shape=(y_image, x_image, num_channels),
                                     name='image_input')

    vgg_output = vgg_conv(image_input)

    # Only the last four layers of vgg_conv are fine-tuned.
    for layer in vgg_conv.layers[:-4]:
        layer.trainable = False

    # Check the trainable status of the individual layers
    for layer in vgg_conv.layers:
        print(layer, layer.trainable)

    x1 = keras.layers.Flatten()(vgg_output)
    x1 = keras.layers.Dense(1024, activation='relu', name='dense_1')(x1)
    x1 = keras.layers.Dropout(0.5, name='dropout')(x1)

    coordinate_input = keras.layers.Input(shape=(2,), name='coordinate_input')
    x2 = keras.layers.Dense(4, activation='relu', name='dense_2')(coordinate_input)

    # Keras 2 functional API; replaces the legacy merge(..., mode='concat').
    merged = keras.layers.concatenate([x1, x2])

    # softmax (not sigmoid): every image has exactly one class and the loss is
    # categorical_crossentropy, which expects a distribution over the classes.
    # This also matches the output layer of MLP_CNN_BLSTM.
    out = keras.layers.Dense(n_classes, activation='softmax', name='output')(merged)

    model = keras.models.Model(inputs=[image_input, coordinate_input], outputs=out)

    model.compile(loss='categorical_crossentropy',
                  optimizer=keras.optimizers.Adam(lr=1e-4),
                  metrics=['acc'])
    model.summary()
    return model

In [None]:
def MLP_CNN_BLSTM():
    """Build and compile the sequence classifier (per-step CNN + FFNN, then BLSTM).

    Each time step is encoded by two sub-models that are applied with
    ``TimeDistributed``: the image goes through the module-level ``vgg_conv``
    network plus a 1024-unit dense layer with dropout, the 2-d centroid goes
    through a 4-unit dense layer. The two encodings are concatenated per step
    and fed to a bidirectional LSTM (256 units per direction,
    ``return_sequences=True``), followed by a 1024-unit dense layer and an
    ``n_classes``-way softmax per step.

    Module-level names that must be defined before calling:
    ``vgg_conv``, ``y_image``, ``x_image``, ``num_channels``, ``n_classes``.

    Model inputs : [image matrix  (n, seq_len, y_image, x_image, num_channels),
                    centroid matrix (n, seq_len, 2)]   e.g. [I_train, C_train]
    Model targets: Y (e.g. Y_train) of shape (n, seq_len, n_classes)
    The sequence axis is declared as ``None``, so seq_len is not fixed.

    Side effects: sets ``trainable = False`` on every layer of ``vgg_conv``
    except the last four (in place, on the shared ``vgg_conv`` object), prints
    the trainable status of each of its layers and prints the model summary.

    Returns:
        The compiled ``keras.models.Model``.
    """
    input_cnn = keras.layers.Input(shape=(y_image, x_image, num_channels),
                                   name='image_input')

    vgg_output = vgg_conv(input_cnn)

    # Only the last four layers of vgg_conv are fine-tuned.
    for layer in vgg_conv.layers[:-4]:
        layer.trainable = False

    # Check the trainable status of the individual layers
    for layer in vgg_conv.layers:
        print(layer, layer.trainable)

    x1 = keras.layers.Flatten()(vgg_output)
    x1 = keras.layers.Dense(1024, activation='relu', name='dense_1')(x1)
    x1 = keras.layers.Dropout(0.5, name='dropout')(x1)

    input_ffnn = keras.layers.Input(shape=(2,), name='coordinate_input')
    x2 = keras.layers.Dense(4, activation='relu', name='dense_2')(input_ffnn)

    # Per-time-step encoders, wrapped as models so TimeDistributed can apply them.
    ffnn = keras.models.Model(inputs=input_ffnn, outputs=x2)
    cnn = keras.models.Model(inputs=input_cnn, outputs=x1)

    I_sequence = keras.layers.Input(shape=(None, y_image, x_image, num_channels))
    X_sequence = keras.layers.Input(shape=(None, 2))

    coordinate_steps = keras.layers.TimeDistributed(ffnn)(X_sequence)
    image_steps = keras.layers.TimeDistributed(cnn)(I_sequence)

    # Keras 2 functional API; replaces the legacy merge(..., mode='concat').
    # The order (coordinates first, image features second) is kept as it was.
    merged = keras.layers.concatenate([coordinate_steps, image_steps])

    seq_dat = keras.layers.Bidirectional(
        keras.layers.LSTM(256, return_sequences=True))(merged)

    # `units` is the Keras 2 name of the former `output_dim` argument.
    hidden = keras.layers.Dense(units=1024, activation="relu")(seq_dat)
    outputs = keras.layers.Dense(units=n_classes, activation="softmax")(hidden)

    model = keras.models.Model(inputs=[I_sequence, X_sequence], outputs=outputs)

    model.compile(loss='categorical_crossentropy',
                  optimizer=keras.optimizers.Adam(lr=1e-4),
                  metrics=['acc'])
    model.summary()
    return model

In [None]:
def _stack_columns(frame, columns, dtype=None):
    """Stack the cells of `columns` into a single array, one entry per row.

    For a non-empty frame the result has shape (len(frame), len(columns), ...),
    the trailing dimensions being those of the cell contents.
    """
    return np.array([list(row) for row in frame[columns].values], dtype=dtype)


#order word images per page, shuffle pages (fixed seed => reproducible split).
df_page = df_seq.set_index(['page1',df_seq.index])
pages = np.unique(df_page.index.get_level_values(level=0))
shuffled = np.random.RandomState(seed=42).permutation(pages)
df_shuff = pd.concat([df_page.loc[i:i] for i in shuffled])

#create a mask for (at least) 20% of the data.
#The cut is extended to the end of the page it falls in: an image can occur in
#two sequences of the same page, so cutting inside a page would put the same
#image in both the test set and the train set.
split = int(math.ceil(len(df_shuff)*0.2))
df_shuff['idx'] = np.arange(len(df_shuff))
row_pages = df_shuff.index.get_level_values(level=0)
test_pages = row_pages[:split].unique()
mask = pd.Series(row_pages.isin(test_pages), index=df_shuff.index)

#create a test and train set
test = df_shuff.loc[mask]
train = df_shuff.loc[~mask]
assert len(test) > 0 and len(train) > 0, "page-wise split left the train or test set empty"

#get test and train data
#NOTE(review): everything is stored as float16; confirm that centroid
#coordinates are small enough to be represented exactly.
I_test = _stack_columns(test, ['image1','image2'], np.float16)
I_train = _stack_columns(train, ['image1','image2'], np.float16)

C_test = _stack_columns(test, ['centroid1','centroid2'], np.float16)
C_train = _stack_columns(train, ['centroid1','centroid2'], np.float16)

Y_test = _stack_columns(test, ['Y1','Y2'], np.float16)
Y_train = _stack_columns(train, ['Y1','Y2'], np.float16)

y_test = _stack_columns(test, ['y1','y2'], np.float16)
y_train = _stack_columns(train, ['y1','y2'], np.float16)

#image identifiers, flattened to one entry per image (two per sequence)
idx_test = _stack_columns(test, ['idx1','idx2']).reshape(-1)
idx_train = _stack_columns(train, ['idx1','idx2']).reshape(-1)

#dimensions used by the model builders and the training cells
n_test, seq_len, y_image, x_image, num_channels = I_test.shape
c_dim = C_train.shape[-1]
n_train = I_train.shape[0]
n_classes = Y_train.shape[2]
batch_size = 32

In [None]:
###############################
#Train MLP_CNN() model on fold#
###############################
model_path = '../models/MLP_CNN.h5'

# Create the checkpoint directory up front; otherwise ModelCheckpoint only
# fails once the first epoch has already been trained.
model_dir = os.path.dirname(model_path)
if model_dir and not os.path.isdir(model_dir):
    os.makedirs(model_dir)

# MLP_CNN classifies single images, so the sequence axis is flattened away:
# one row per image instead of one row per sequence.
X_train_flat = [I_train.reshape(-1, y_image, x_image, num_channels),
                C_train.reshape(-1, c_dim)]
X_test_flat = [I_test.reshape(-1, y_image, x_image, num_channels),
               C_test.reshape(-1, c_dim)]
Y_train_flat = Y_train.reshape(-1, n_classes)
Y_test_flat = Y_test.reshape(-1, n_classes)

model = MLP_CNN()
callbacks_list = [keras.callbacks.EarlyStopping(monitor='val_loss', patience=0),
                 keras.callbacks.ModelCheckpoint(filepath=model_path, monitor='val_loss', save_best_only=True)]

# NOTE(review): the test set doubles as validation data for early stopping and
# checkpoint selection, so the scores below are not from fully unseen data.
history = model.fit(X_train_flat,
                    Y_train_flat,
                    validation_data=(X_test_flat, Y_test_flat),
                    epochs=10,
                    batch_size=batch_size,
                    shuffle=True,
                    callbacks=callbacks_list,
                    verbose=1)

# Reload the best checkpoint written by ModelCheckpoint.
model = keras.models.load_model(model_path)

# NOTE(review): model.evaluate sees every row, so images that occur in two
# sequences are counted twice in this accuracy; the report below is de-duplicated.
scores = model.evaluate(X_test_flat, Y_test_flat)

y_pred = np.argmax(model.predict(X_test_flat, verbose=1), axis=1)

# Position of the first occurrence of every image id, so that images shared
# by two sequences are counted only once in the report.
first_occurrence = np.unique(idx_test, return_index=True)[1]

print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
print('\n')
print(classification_report(y_test.ravel()[first_occurrence], y_pred.ravel()[first_occurrence]))

In [None]:
#order word images per page, shuffle pages.
#NOTE(review): df_page / shuffled / df_shuff are rebuilt here but not used
#anywhere in this cell; kept only so the module-level state stays as it was.
df_page = df_seq.set_index(['page1',df_seq.index])
shuffled = np.random.RandomState(seed=42).permutation(np.unique(df_page.index.get_level_values(level=0)))
df_shuff = pd.concat([df_page.loc[i:i] for i in shuffled])


model_path = '../models/MLP_CNN_BLSTM.h5'

# Create the checkpoint directory up front; otherwise ModelCheckpoint only
# fails once the first epoch has already been trained.
model_dir = os.path.dirname(model_path)
if model_dir and not os.path.isdir(model_dir):
    os.makedirs(model_dir)

model = MLP_CNN_BLSTM()
callbacks_list = [keras.callbacks.EarlyStopping(monitor='val_loss', patience=0),
                 keras.callbacks.ModelCheckpoint(filepath=model_path, monitor='val_loss', save_best_only=True)]

# NOTE(review): the test set doubles as validation data for early stopping and
# checkpoint selection, so the scores below are not from fully unseen data.
history = model.fit([I_train,C_train],Y_train,
                    validation_data=([I_test, C_test], Y_test),
                    epochs=10,
                    batch_size=batch_size,
                    shuffle=True,
                    callbacks=callbacks_list,
                    verbose=1)

# Reload the best checkpoint written by ModelCheckpoint.
model = keras.models.load_model(model_path)

scores = model.evaluate([I_test, C_test], Y_test)

Y_pred = model.predict([I_test, C_test], verbose=1)

# Predicted class per sequence element: argmax over the class axis, with a
# trailing singleton axis -> shape (n_sequences, seq_len, 1).
y_pred = np.argmax(Y_pred, axis=-1)[..., np.newaxis]

# Position of the first occurrence of every image id, so that images shared
# by two sequences are counted only once in the report.
first_occurrence = np.unique(idx_test, return_index=True)[1]

print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
print('\n')
print(classification_report(y_test.ravel()[first_occurrence], y_pred.ravel()[first_occurrence]))
