In [68]:
import os

import keras
import numpy as np
import pandas as pd
import tensorflow as tf
from keras import backend as K
from keras.applications.vgg19 import VGG19
from keras.applications.vgg19 import preprocess_input
from keras.layers import Dense, Flatten, Embedding, Input
from keras.models import Model
from keras.optimizers import Adam
from keras.preprocessing.image import ImageDataGenerator
from PIL import Image

In [30]:
# Resolve the project's "data" directory, which sits one level above the
# directory the notebook is run from.
working_dir = os.path.abspath(os.curdir)
data_folder = os.path.join(working_dir, os.pardir, "data")

In [91]:
# Load the training split, derive each ad's image file name, and draw a
# reproducible 10% subsample (fixed random_state) for quick experiments.
training_data = os.path.join(data_folder, "processed", "train_train.csv")
train_df = pd.read_csv(training_data).assign(
    image_path=lambda frame: frame.image + ".jpg"
)
subsampled_train_df = train_df.sample(random_state=0, replace=False, frac=0.1)

In [90]:
# Load the validation split, derive each ad's image file name, and draw a
# reproducible 10% subsample (fixed random_state).
validation_data = os.path.join(data_folder, "processed", "train_validation.csv")
validation_df = pd.read_csv(validation_data).assign(
    image_path=lambda frame: frame.image + ".jpg"
)
subsampled_validation_df = validation_df.sample(random_state=0, replace=False, frac=0.1)

In [92]:
# Give every city / category present in the training subsample a 1-based integer id.
# Ids follow the sorted order in which np.unique returns the distinct values.
known_cities = np.unique(subsampled_train_df.city.values)
known_categories = np.unique(subsampled_train_df.category_name.values)

city_index_mapping = dict(zip(known_cities, range(1, len(known_cities) + 1)))
category_index_mapping = dict(zip(known_categories, range(1, len(known_categories) + 1)))

In [93]:
# Attach integer ids for city and category to both subsamples.
#
# The mappings are built from the training subsample only and start at 1, so a
# validation row whose city/category never occurred in training would map to NaN
# and silently turn the whole id column into floats. Id 0 is never produced by the
# mappings, so it is used here as the explicit "unknown" id and the columns are
# kept as int64.
UNKNOWN_INDEX = 0

subsampled_train_df = subsampled_train_df.assign(
    city_index=subsampled_train_df.city.map(city_index_mapping)
    .fillna(UNKNOWN_INDEX)
    .astype("int64"),
    category_index=subsampled_train_df.category_name.map(category_index_mapping)
    .fillna(UNKNOWN_INDEX)
    .astype("int64"),
)
subsampled_validation_df = subsampled_validation_df.assign(
    city_index=subsampled_validation_df.city.map(city_index_mapping)
    .fillna(UNKNOWN_INDEX)
    .astype("int64"),
    category_index=subsampled_validation_df.category_name.map(category_index_mapping)
    .fillna(UNKNOWN_INDEX)
    .astype("int64"),
)

In [94]:
# Preview the first rows, including the city_index / category_index columns.
subsampled_train_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,item_id,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,...,price,item_seq_number,activation_date,user_type,image,image_top_1,deal_probability,image_path,city_index,category_index
223617,24527,d2e60e7f02eb,8ab1dcf8ae00,Тюменская область,Тюмень,Транспорт,Автомобили,С пробегом,Peugeot,407,...,315000.0,1689,2017-03-19,Company,bfe0f269734a36022d8f9ed3080c52508ceda3948f7623...,1132.0,0.0,bfe0f269734a36022d8f9ed3080c52508ceda3948f7623...,1141,1
506290,138293,dfb1a5b562d7,d0a2ffcd7b0f,Иркутская область,Братск,Бытовая электроника,Настольные компьютеры,,,,...,18500.0,728,2017-03-25,Shop,14de02f13446f40796af7c9be5dd54afc0b9758af2761a...,2789.0,0.0,14de02f13446f40796af7c9be5dd54afc0b9758af2761a...,180,26
611342,205587,233bd90b2585,5ffa26fd101d,Челябинская область,Челябинск,Личные вещи,Детская одежда и обувь,Для девочек,Трикотаж,110-116 см (4-6 лет),...,50.0,345,2017-03-17,Company,08d733bb2c05103d1d45b874455fe5cf55fb3ae6db26c7...,111.0,0.0,08d733bb2c05103d1d45b874455fe5cf55fb3ae6db26c7...,1210,11
358153,1021800,9dcd4eaff44e,7a763d840a81,Краснодарский край,Ейск,Бытовая электроника,Аудио и видео,Телевизоры и проекторы,,,...,1600.0,9,2017-03-26,Company,c04d6059a66c4b786e933f78dec4caf41023fd88dcb821...,3022.0,0.21651,c04d6059a66c4b786e933f78dec4caf41023fd88dcb821...,339,3
874693,910900,43349e5ce6df,8db0650e5a00,Краснодарский край,Краснодар,Личные вещи,"Одежда, обувь, аксессуары",Женская одежда,Свадебные платья,Без размера,...,14000.0,1434,2017-03-23,Company,380eac54ee5214e07fcbdffd9517e0369050c0ea782b29...,560.0,0.0,380eac54ee5214e07fcbdffd9517e0369050c0ea782b29...,542,30


In [35]:
# Directory holding the training JPEGs, expressed as path components below data_folder.
image_subdirectories = ("raw", "train_jpg", "data", "competition_files", "train_jpg")
image_folder = os.path.join(data_folder, *image_subdirectories)

In [36]:
# Keep only the image file name and the regression target, dropping rows where
# either one is missing.
subsampled_train_image_df = (
    subsampled_train_df
    .loc[:, ["image_path", "deal_probability"]]
    .dropna()
)

In [37]:
# Keep only the image file name and the regression target, dropping rows where
# either one is missing.
subsampled_validation_image_df = (
    subsampled_validation_df
    .loc[:, ["image_path", "deal_probability"]]
    .dropna()
)

In [38]:
# Preview the image-path / target pairs used for training.
subsampled_train_image_df.head(n=5)

Unnamed: 0,image_path,deal_probability
223617,bfe0f269734a36022d8f9ed3080c52508ceda3948f7623...,0.0
506290,14de02f13446f40796af7c9be5dd54afc0b9758af2761a...,0.0
611342,08d733bb2c05103d1d45b874455fe5cf55fb3ae6db26c7...,0.0
358153,c04d6059a66c4b786e933f78dec4caf41023fd88dcb821...,0.21651
874693,380eac54ee5214e07fcbdffd9517e0369050c0ea782b29...,0.0


In [39]:
# Preview the image-path / target pairs used for validation.
subsampled_validation_image_df.head(n=5)

Unnamed: 0,image_path,deal_probability
52457,56fd8d24d67f4a46f257ca63195044cd51dffc844f46b5...,0.12869
295589,80d3e93abe6faf8487ce6a1a30b9cf8863f7872cc0773f...,0.80323
108117,617928b4c8d1df676c9a658ad4ab8117c239551dc06ad7...,0.0
169164,c0b5dae9d134087ad3db8479df4b54e076075ce4ea4518...,0.0
111649,45ce27a7d80f43c426f698c80bfef794615817df0559ca...,0.0


In [40]:
# The images are fed to a VGG19 base whose convolutional layers are frozen, so they
# must be preprocessed the way VGG19 expects: vgg19.preprocess_input converts
# RGB to BGR and zero-centres each channel. Without it the generator yields raw
# 0-255 RGB pixels, which the pretrained filters were never trained on.
image_generator = ImageDataGenerator(preprocessing_function=preprocess_input)

In [41]:
# Stream training images (file names from image_path, resolved inside image_folder)
# as 224x224 RGB inputs, with deal_probability as the target column.
train_image_generator = image_generator.flow_from_dataframe(
    dataframe=subsampled_train_image_df,
    directory=image_folder,
    x_col="image_path",
    y_col="deal_probability",
    target_size=(224, 224),
    color_mode='rgb',
    class_mode='other',
)

Found 1108 images.


In [42]:
# Stream validation images (file names from image_path, resolved inside image_folder)
# as 224x224 RGB inputs, with deal_probability as the target column.
validation_image_generator = image_generator.flow_from_dataframe(
    dataframe=subsampled_validation_image_df,
    directory=image_folder,
    x_col="image_path",
    y_col="deal_probability",
    target_size=(224, 224),
    color_mode='rgb',
    class_mode='other',
)

Found 280 images.


In [45]:
# Map each distinct city in the training subsample to a 1-based integer id
# (ids follow the sorted order returned by np.unique).
city_index_mapping = {
    city: position
    for position, city in enumerate(np.unique(subsampled_train_df.city.values), start=1)
}

In [46]:
# Show the integer id each training row's city maps to.
subsampled_train_df["city"].map(city_index_mapping)

223617     203
506290      32
611342     216
358153      54
874693      88
459541      19
1116380    223
710347     138
261897      55
928488     120
1172099    132
425146      55
641804      79
571644     132
862392      18
258896     192
762716     133
457868      37
428738     188
1190048    209
621192     123
741364      90
492611     219
772777      90
985536     164
144359     211
960261      67
911143     132
1166836    141
528528     133
          ... 
1120436     79
1073818     79
671431      40
912520      93
909208     123
93460      118
792565      55
491925      88
108720      85
927792      40
802167      96
995158     227
924820     148
199605     123
161887      88
447634      72
185612      90
821445      47
43170      120
63050       19
887683      55
170693     110
806928     199
1067583    131
28759       69
889711     148
443599     139
446463      55
922701      19
159567     167
Name: city, Length: 1203, dtype: int64

In [31]:
# Convolutional VGG19 base without its classification top, for 224x224 RGB inputs.
# weights="imagenet" is the library default, spelled out here for the reader.
vgg19_model = VGG19(
    include_top=False,
    weights="imagenet",
    input_shape=(224, 224, 3),
)

In [32]:
# Start the regression head from the output tensor of the VGG19 base
# (the base is built with include_top=False, so this is its last pooling output).
image_model = vgg19_model.output

In [33]:
# Regression head on top of the VGG19 features: flatten, two ReLU dense layers
# (512 then 256 units), and a single linear output unit.
image_model = Flatten()(image_model)
for hidden_units in (512, 256):
    image_model = Dense(hidden_units, activation='relu')(image_model)
preds = Dense(1)(image_model)

In [34]:
# End-to-end image regressor: VGG19 input through the dense head to the scalar prediction.
model = Model(
    outputs=preds,
    inputs=vgg19_model.input,
)

In [36]:
# Freeze the first 22 layers of the model; every layer from index 22 onward
# stays trainable.
for position, layer in enumerate(model.layers):
    layer.trainable = position >= 22

In [37]:
# List each layer's trainable flag next to its name to check which layers are frozen.
for layer in model.layers:
    print("{} {}".format(layer.trainable, layer.name))

False input_1
False block1_conv1
False block1_conv2
False block1_pool
False block2_conv1
False block2_conv2
False block2_pool
False block3_conv1
False block3_conv2
False block3_conv3
False block3_conv4
False block3_pool
False block4_conv1
False block4_conv2
False block4_conv3
False block4_conv4
False block4_pool
False block5_conv1
False block5_conv2
False block5_conv3
False block5_conv4
False block5_pool
True flatten_1
True dense_1
True dense_2
True dense_3


In [40]:
# Train the image regressor with Adam on mean squared error; MSE is also
# reported as a metric.
model.compile(
    loss='mean_squared_error',
    metrics=['mean_squared_error'],
    optimizer='Adam',
)

In [41]:
# Fit the image regressor for 10 epochs from the training generator, evaluating
# on the validation generator after each epoch.
model.fit_generator(
    generator=train_image_generator,
    epochs=10,
    validation_data=validation_image_generator,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1758e90f780>

In [50]:
# Baseline: mean squared error of always predicting the mean validation target.
validation_targets = subsampled_validation_image_df.deal_probability.values
mean_target = subsampled_validation_image_df.deal_probability.mean()
absolute_deviation = np.abs(validation_targets - mean_target)
np.mean(np.power(absolute_deviation, 2))

0.0723544687816741

In [112]:
def root_mean_squared_error(y_true, y_pred):
    """Root mean squared error over the whole batch, as a Keras loss.

    The squared errors are averaged over every element of the batch *before*
    the square root is taken. Averaging over ``axis=-1`` first (as the previous
    version did) reduces over a single element when the model has one output
    unit, so the square root was applied per sample and the loss degenerated
    into mean absolute error.

    Args:
        y_true: Tensor of target values.
        y_pred: Tensor of predictions with the same shape as ``y_true``.

    Returns:
        A scalar tensor holding sqrt(mean((y_pred - y_true) ** 2)).
    """
    squared_error = K.square(y_pred - y_true)
    return K.sqrt(K.mean(squared_error))

In [101]:
# Single integer input per sample: the ad's category id.
inputs = Input(shape=(1,))
# Category ids run from 1 to len(category_index_mapping) inclusive, and Embedding
# needs input_dim = largest index + 1. Using len(...) alone leaves the highest
# category id out of range, hence the "+ 1"; row 0 stays free for ids outside the mapping.
embedding = Embedding(len(category_index_mapping) + 1, 1, input_length=1)(inputs)

In [107]:
# Flatten the embedding output and project it through a 64-unit dense layer
# (no activation is specified, so the layer is linear).
flattened_embedding = Flatten()(embedding)
dense = Dense(64)(flattened_embedding)

In [108]:
# Single sigmoid unit, so the prediction lies in (0, 1).
probability_layer = Dense(1, activation='sigmoid')
outputs = probability_layer(dense)

In [114]:
# Category-only model. Note: this rebinds `model`, which earlier in the notebook
# referred to the VGG19-based image regressor.
model = Model(
    outputs=outputs,
    inputs=inputs,
)

In [115]:
# Optimise the custom root_mean_squared_error loss with Adam, reporting MSE as a metric.
model.compile(
    loss=root_mean_squared_error,
    metrics=['mean_squared_error'],
    optimizer='Adam',
)

In [116]:
# Fit the category-only model for 10 epochs in batches of 2048, validating on the
# validation subsample after each epoch.
train_category_ids = subsampled_train_df.category_index.values
train_targets = subsampled_train_df.deal_probability.values
validation_pair = (
    subsampled_validation_df.category_index.values,
    subsampled_validation_df.deal_probability.values,
)
model.fit(
    x=train_category_ids,
    y=train_targets,
    validation_data=validation_pair,
    batch_size=2048,
    epochs=10,
)

Train on 120274 samples, validate on 30068 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1f7b36dc3c8>