In [1]:
import os

import keras
import numpy as np
import pandas as pd
import tensorflow as tf

from PIL import Image

from keras.applications.vgg19 import VGG19, preprocess_input
from keras.layers import Dense, Flatten
from keras.models import Model
from keras.optimizers import Adam
from keras.preprocessing.image import ImageDataGenerator

Using TensorFlow backend.


In [3]:
# Data directory: the "data" folder that sits next to the current working directory.
working_dir = os.path.abspath(os.curdir)
data_folder = os.path.join(working_dir, os.pardir, "data")

In [4]:
# Path of the training-split CSV inside <data_folder>/processed.
processed_train_parts = ("processed", "train_train.csv")
training_data = os.path.join(data_folder, *processed_train_parts)

In [5]:
# Load the training-split CSV into a DataFrame.
train_df = pd.read_csv(filepath_or_buffer=training_data)

In [7]:
# Load the validation-split CSV from the same processed-data folder.
processed_validation_parts = ("processed", "train_validation.csv")
validation_data = os.path.join(data_folder, *processed_validation_parts)
validation_df = pd.read_csv(filepath_or_buffer=validation_data)

# Draw a 0.1% sample without replacement; the fixed seed keeps the subset
# identical from run to run.
subsampled_validation_df = validation_df.sample(
    frac=0.001,
    replace=False,
    random_state=0,
)

In [8]:
# Directory passed to the image generators as the root for `image_path` lookups.
image_folder_parts = ("raw", "train_jpg", "data", "competition_files", "train_jpg")
image_folder = os.path.join(data_folder, *image_folder_parts)

In [10]:
# Derive a file name per row by appending ".jpg" to the `image` column.
# NOTE(review): presumably `image` holds an extension-less image id - confirm.
train_df = train_df.assign(image_path=lambda frame: frame.image + ".jpg")

In [22]:
# Keep only the image file name and the regression target, dropping any row
# where either of the two is missing.
image_regression_columns = ['image_path', "deal_probability"]
train_image_df = train_df.loc[:, image_regression_columns].dropna()

# 0.1% sample without replacement, seeded for reproducibility.
subsampled_train_image_df = train_image_df.sample(
    frac=0.001,
    replace=False,
    random_state=0,
)

In [26]:
# Same preparation as for the training frame: add the ".jpg" file name,
# keep (image_path, deal_probability), and drop rows missing either value.
validation_df = validation_df.assign(image_path=lambda frame: frame.image + ".jpg")
validation_columns = ['image_path', "deal_probability"]
validation_image_df = validation_df.loc[:, validation_columns].dropna()

# 0.1% sample without replacement, seeded for reproducibility.
subsampled_validation_image_df = validation_image_df.sample(
    frac=0.001,
    replace=False,
    random_state=0,
)

In [27]:
# Preview the first five rows of the training subsample.
subsampled_train_image_df.head(n=5)

Unnamed: 0,image_path,deal_probability
917111,4a9e999536299d25f6b7413772363e66578168f5ce3283...,0.0
384978,2e8d030d65e53bcf2b96b7143b6197c423e9e6c9297654...,0.80322
28050,b4f477657db6eef86c12857ac2c6bc03f5341d5ac113cf...,0.0
362221,d29f14218e3bddaefdd29e1db73f4819bd6df250fc2bf5...,0.00849
786073,15678006cc16f8f5baf982ff685ac8ebd6489269cb900b...,0.0


In [28]:
# Preview the first five rows of the validation subsample.
subsampled_validation_image_df.head(n=5)

Unnamed: 0,image_path,deal_probability
53851,34cd8be979ff325e1d7118753555191fa2833094a570ad...,0.0
218050,dc69f4afe05df186302737152f296aa1a9dbf0320cc474...,0.07437
174130,7624d225530b700cb555b5e2d35aab810e156821afad2b...,0.0
299067,9c1141d191d52e340f0c146c45dcb88d146102f8b140b5...,0.0
67250,3f56ee330151432f446b32b9784e389932be873d316901...,0.05268


In [16]:
# Shared generator for the training and validation flows.
# The VGG19 backbone used below is loaded with pretrained weights and kept
# frozen, so every image must go through the same preprocessing the network
# was trained with. `preprocess_input` from `keras.applications.vgg19` does
# that; without it the frozen convolutional layers receive raw 0-255 RGB
# pixels they were never trained on.
image_generator = ImageDataGenerator(preprocessing_function=preprocess_input)

In [29]:
# Stream training images from `image_folder`, resized to 224x224 RGB, using
# `image_path` as the file name and `deal_probability` as the target column.
# NOTE(review): class_mode='other' is expected to pass the y_col values through
# unchanged as regression targets - confirm against the installed Keras version.
train_image_generator = image_generator.flow_from_dataframe(
    dataframe=subsampled_train_image_df,
    directory=image_folder,
    x_col="image_path",
    y_col="deal_probability",
    target_size=(224, 224),
    color_mode='rgb',
    class_mode='other',
)

Found 1112 images.


In [30]:
# Validation counterpart of the training flow: same folder, same 224x224 RGB
# resizing, same file-name and target columns, but fed from the validation
# subsample.
validation_image_generator = image_generator.flow_from_dataframe(
    dataframe=subsampled_validation_image_df,
    directory=image_folder,
    x_col="image_path",
    y_col="deal_probability",
    target_size=(224, 224),
    color_mode='rgb',
    class_mode='other',
)

Found 278 images.


In [31]:
# VGG19 convolutional backbone for 224x224 RGB input; include_top=False leaves
# out the original classifier so a custom head can be attached.
# `weights` is not passed, so the library default applies.
vgg19_model = VGG19(
    include_top=False,
    input_shape=(224, 224, 3),
)

In [32]:
# Start the custom head from the backbone's output tensor; the following cell
# stacks Flatten/Dense layers on top of `image_model`.
image_model = vgg19_model.output

In [33]:
# Regression head on top of the backbone features:
# flatten -> Dense(512, relu) -> Dense(256, relu) -> Dense(1).
# The last layer has no activation, so `preds` is an unbounded scalar per image.
flattened_features = Flatten()(image_model)
hidden_512 = Dense(units=512, activation='relu')(flattened_features)
image_model = Dense(units=256, activation='relu')(hidden_512)
preds = Dense(units=1)(image_model)

In [34]:
# End-to-end model: from the VGG19 input tensor to the scalar prediction.
model = Model(
    inputs=vgg19_model.input,
    outputs=preds,
)

In [36]:
# Freeze the VGG19 backbone and leave only the newly added head trainable.
# The split point is taken from the backbone itself rather than a hard-coded
# index, so it stays correct if the backbone is replaced by one with a
# different number of layers. As before, this relies on `model.layers`
# listing the backbone layers first, followed by the head layers.
n_backbone_layers = len(vgg19_model.layers)

for layer in model.layers[:n_backbone_layers]:
    layer.trainable = False
for layer in model.layers[n_backbone_layers:]:
    layer.trainable = True

In [37]:
# Sanity check: list every layer with its trainable flag, one layer per line.
trainability_report = [(layer.trainable, layer.name) for layer in model.layers]
for is_trainable, layer_name in trainability_report:
    print(is_trainable, layer_name)

False input_1
False block1_conv1
False block1_conv2
False block1_pool
False block2_conv1
False block2_conv2
False block2_pool
False block3_conv1
False block3_conv2
False block3_conv3
False block3_conv4
False block3_pool
False block4_conv1
False block4_conv2
False block4_conv3
False block4_conv4
False block4_pool
False block5_conv1
False block5_conv2
False block5_conv3
False block5_conv4
False block5_pool
True flatten_1
True dense_1
True dense_2
True dense_3


In [40]:
# Mean squared error is used both as the training objective and as the
# reported metric; the optimizer is selected by name.
model.compile(
    loss='mean_squared_error',
    metrics=['mean_squared_error'],
    optimizer='Adam',
)

In [41]:
# Train for 10 epochs on the training flow, evaluating on the validation flow.
# The returned History object is bound to a name so later cells can inspect
# the per-epoch metrics without relying on Out[...], which does not survive a
# kernel restart.
history = model.fit_generator(
    generator=train_image_generator,
    validation_data=validation_image_generator,
    epochs=10,
)
# Keep the History object as the cell's displayed value, as before.
history

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1758e90f780>

In [50]:
# Baseline for the model's validation MSE: the mean squared error obtained by
# always predicting the mean `deal_probability` of the validation subsample.
validation_targets = subsampled_validation_image_df.deal_probability
absolute_residuals = np.abs(validation_targets.values - validation_targets.mean())
np.mean(np.power(absolute_residuals, 2))

0.0723544687816741