## Training and Feature Extractor code VGG16

This notebook contains the code required to train the VGG16 model from the Wiedemann paper on the dataset presented in that paper, together with code to extract the finetuned vectors from the trained model. Please be aware that training can take quite some time; it took us approximately 6 hours on a decent GPU.

These functions assume that you have already downloaded the data and that the PNG images are stored in a folder inside the data folder.

## Training the Model

In [18]:
import os
import argparse
import tensorflow
from PIL import Image
from tqdm import tqdm
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import Nadam
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Raise Pillow's pixel-count limit so that very large page scans can be opened
# without tripping its oversized-image (decompression bomb) check.
Image.MAX_IMAGE_PIXELS = 1000000000

# Execute the shared utility script inside this notebook's namespace.
# NOTE(review): `pd` and `np` are used further down but are never imported in
# this notebook; presumably they are brought into scope by this script.
# Confirm, or import them explicitly alongside the other imports.
%run ../utils/metricutils.py

# Report how many GPUs TensorFlow can see; 0 means everything runs on the CPU.
print("Num GPUs Available: ", len(tensorflow.config.list_physical_devices('GPU')))

class ImageModelWiedemann:
    """VGG16-based binary classifier with a small fully connected head.

    The convolutional base is VGG16 pretrained on ImageNet (without its
    original classification head), applied to 300x300 RGB inputs. On top of
    it sits Dropout -> Dense(512) -> LeakyReLU -> Dropout -> Dense(256) ->
    LeakyReLU -> Dense(1, sigmoid).
    """

    def __init__(self, learning_rate=0.00001):
        """Build and compile the network.

        Args:
            learning_rate: learning rate handed to the Nadam optimizer.
        """
        # We use the VGG16 model pretrained on the imagenet corpus
        # as the basis of our network.
        model_vgg16 = VGG16(weights='imagenet', include_top=False,
                            input_shape=(300, 300, 3))

        # Freeze the first 13 entries of the VGG16 layer list so that only the
        # later convolutional layers (and our own head) are finetuned on the
        # dataset.
        # NOTE(review): the intent stated originally was to freeze the first 4
        # of the 5 convolutional blocks. In the stock Keras VGG16 the layer
        # list starts with the input layer, so this slice appears to stop at
        # block4_conv2 rather than at the end of block 4 (which would be
        # layers[:15]). Behaviour is left unchanged here; confirm against the
        # paper before altering it, since it changes what gets trained.
        for l in model_vgg16.layers[:13]:
            l.trainable = False

        # The Dense layers are named explicitly. The names equal what Keras
        # auto-generates in a fresh kernel ('dense', 'dense_1', 'dense_2'), but
        # auto-generated names drift ('dense_3', ...) whenever the model is
        # rebuilt in the same session, which would break code that looks up
        # the 'dense' layer of the saved model to extract feature vectors.
        top_model = Flatten()(model_vgg16.output)
        drop1 = Dropout(0.5)(top_model)
        dense1 = Dense(512, name='dense')(drop1)
        relu1 = LeakyReLU()(dense1)
        drop2 = Dropout(0.5)(relu1)
        dense2 = Dense(256, name='dense_1')(drop2)
        relu2 = LeakyReLU()(dense2)

        # After the output of the model, we pass the output through
        # a final linear layer and a sigmoid to obtain values for prediction.
        model_output = Dense(1, activation="sigmoid", name='dense_2')(relu2)

        model = Model(model_vgg16.input, model_output)
        # Set up the optimization steps as described in the original
        # Wiedemann paper.
        model.compile(loss='binary_crossentropy', optimizer=Nadam(learning_rate=learning_rate),
                      metrics=['AUC'])

        # Sub-model sharing the same layers that outputs the 512-unit Dense
        # layer (before its LeakyReLU), i.e. the intermediate representation.
        self.intermediate_activation = Model(model_vgg16.input, dense1)
        self.intermediate_activation.compile()

        # The full, compiled classifier.
        self.model = model

    def train(self, train_data, num_epochs=20):
        """Fit the classifier on `train_data` for `num_epochs` epochs."""
        self.model.fit(train_data, epochs=num_epochs)

    def predict(self, test_data):
        """Return the model's sigmoid outputs for `test_data` (verbose progress)."""
        y_predict = self.model.predict(test_data, verbose=True)
        return y_predict


def prepare_df_for_model(dataframe):
    """Add the columns the Keras dataframe iterators need, in place.

    A 'png' column is built as '<name>-<page>.png' from the 'name' and 'page'
    columns, and the 'label' column is converted to strings. The very same
    dataframe object that was passed in is modified and returned.
    """
    page_as_text = dataframe.page.astype(str)
    file_stem = dataframe.name + '-' + page_as_text
    dataframe['png'] = file_stem + '.png'

    label_as_text = dataframe['label'].astype(str)
    dataframe['label'] = label_as_text

    return dataframe


def prepare_test_streams(test_subdataframe, png_folder,
                         batch_size):
    """Create an unshuffled image iterator over the pages of one dataframe.

    Args:
        test_subdataframe: dataframe whose 'png' column holds the image file
            names to load.
        png_folder: directory in which those image files are looked up.
        batch_size: number of images per batch.

    Returns:
        The iterator produced by `ImageDataGenerator.flow_from_dataframe`.
        Images are resized to 300x300 and run through the VGG16
        `preprocess_input`; `class_mode=None` is used, so no labels are
        requested, and `shuffle=False` keeps the dataframe's row order.
    """
    image_loader = ImageDataGenerator(preprocessing_function=preprocess_input)

    flow_options = dict(
        dataframe=test_subdataframe,
        directory=png_folder,
        x_col='png',
        y_col='label',
        target_size=(300, 300),
        class_mode=None,
        batch_size=batch_size,
        shuffle=False,
        seed=42,
        validate_filenames=True,
    )

    return image_loader.flow_from_dataframe(**flow_options)

Num GPUs Available:  0


In [19]:
# Load the page tables and add the 'png' / string 'label' columns that the
# Keras dataframe iterators expect.
train_dataframe = prepare_df_for_model(pd.read_csv('../data/dataframes/train.csv'))
test_dataframe = prepare_df_for_model(pd.read_csv('../data/dataframes/test.csv'))

# Shuffled training iterator: 300x300 images, VGG16 preprocessing,
# binary labels, batches of 128.
train_gen = ImageDataGenerator(
    preprocessing_function=preprocess_input).flow_from_dataframe(
    dataframe=train_dataframe,
    directory='../data/images/train/',
    x_col='png',
    y_col='label',
    target_size=(300, 300),
    class_mode='binary',
    batch_size=128,
    shuffle=True,
    seed=42,
    validate_filenames=True)

# We either want to train our own model and save it, or use a
# model we trained earlier and only run the prediction step.

# Set the model
model = ImageModelWiedemann(learning_rate=0.00001)
# Train the model
model.train(train_data=train_gen, num_epochs=20)
# Save the model. It is written inside ../data because that is where
# get_vectors_from_model loads it from ('../data/trained_VGG16_model');
# saving it one directory higher left the loader unable to find it.
model.model.save('../data/trained_VGG16_model')


Found 63815 validated image filenames belonging to 2 classes.



KeyboardInterrupt



## Extracting Feature Vectors from the VGG16 model

Once we have trained the VGG16 model, we can load it and use it to obtain the finetuned vectors for the clustering and classification methods.

In [24]:
def get_vectors_from_model(mode="finetuned"):
    """Extract feature vectors for every page of the test set and save them.

    The test dataframe is processed one document ('name') at a time, with the
    pages of each document sorted by page number. The output of the model's
    'dense' layer is computed for every page, and the vectors of all
    documents are stacked into one array that is written to
    '../data/pretrained_vectors.npy' or '../data/finetuned_vectors.npy'.

    Rows of the saved array are ordered by document name (the order in which
    `groupby('name')` yields the groups) and, within a document, by page.

    Args:
        mode: "finetuned" loads the model from '../data/trained_VGG16_model';
            "pretrained" builds a stock ImageNet VGG16.

    Returns:
        None. The result is only written to disk.

    Raises:
        AssertionError: if `mode` is not "pretrained" or "finetuned".
        ValueError: if the test dataframe contains no documents.
    """
    assert mode in ["pretrained", "finetuned"]

    if mode == 'pretrained':
        # NOTE(review): this branch looks unusable as written. The stock
        # VGG16 with include_top=True presumably has no layer called 'dense'
        # (its fully connected layers have other names), and
        # prepare_test_streams resizes images to 300x300 while this model is
        # built for 224x224 inputs. Left unchanged because the intended
        # layer and input size cannot be determined from this notebook.
        model = VGG16(weights = 'imagenet', include_top=True, input_shape=(224, 224, 3))
    else:
        model = load_model('../data/trained_VGG16_model')

    layer_name = 'dense'
    model_top = Model(inputs=model.input, outputs=model.get_layer(layer_name).output)

    model_input = prepare_df_for_model(pd.read_csv('../data/dataframes/test.csv'))

    # Collect the vectors of every document. Previously each iteration
    # overwrote the result of the one before, so only the last document's
    # vectors were ever saved.
    stream_vectors = []
    for _, stream in tqdm(model_input.groupby('name')):
        # assign() returns a new frame instead of writing into the groupby
        # slice, which avoids pandas' SettingWithCopy warning.
        sorted_stream = stream.assign(
            page=stream['page'].astype(int)).sort_values(by='page')

        test_data = prepare_test_streams(sorted_stream, '../data/images/test',
                                         256)
        stream_vectors.append(model_top.predict(test_data))

    if not stream_vectors:
        raise ValueError("The test dataframe contains no documents; "
                         "there are no vectors to save.")

    vectors = np.concatenate(stream_vectors, axis=0)

    if mode == "pretrained":
        np.save('../data/pretrained_vectors.npy', vectors)
    elif mode == "finetuned":
        np.save('../data/finetuned_vectors.npy', vectors)
    return None


In [26]:
# Run the extraction with the default mode ("finetuned"); the function returns
# None and writes its result to '../data/finetuned_vectors.npy'.
get_vectors_from_model()




KeyboardInterrupt

