In [1]:
from datetime import datetime
import json
import os
import numpy as np

import tensorflow as tf
import pandas as pd
import tensorflow_hub as hub
from keras import Sequential
from keras.callbacks import ModelCheckpoint
from keras.layers import Lambda, Dense
import matplotlib.pyplot as plt

from utility.train_data_loader import load_train_data

Using TensorFlow backend.


In [42]:
epochs = 1
batch_size = 64
specialization = "beauty"

In [3]:
categories_file = open("../data/categories.json", "r")
categories = json.load(categories_file)


In [4]:
all_subcategories = {k.lower(): v for k, v in categories['Mobile'].items()}
all_subcategories.update({k.lower(): v for k, v in categories['Fashion'].items()})
all_subcategories.update({k.lower(): v for k, v in categories['Beauty'].items()})


In [5]:
data_root = "../../"+specialization+"_image/"


In [6]:
datagen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255)
valid_datagen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255)
test_datagen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255)


In [7]:
feature_extractor_url = "https://tfhub.dev/google/imagenet/mobilenet_v2_100_224/feature_vector/2"


In [8]:
trainData = load_train_data()
testData = pd.read_csv("../data/new_test.csv")

custom train data used


In [24]:
train_data_specialized = trainData[trainData['image_path'].str.contains(specialization)][::10]
train_data_specialized['image_path'] = train_data_specialized['image_path'].\
    map(lambda x: x.lstrip(specialization+'_image/'))


In [10]:
validation_data_specialized = trainData[trainData['image_path'].str.contains(specialization)][::100]
validation_data_specialized['image_path'] = validation_data_specialized['image_path'].\
    map(lambda x: x.lstrip(specialization+'_image/'))



In [63]:
test_data_specialized = testData[testData['image_path'].str.contains(specialization)]
test_data_specialized['image_path'] = test_data_specialized['image_path'].\
    map(lambda x: x.lstrip(specialization+'_image/'))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [69]:
test_data_specialized

Unnamed: 0,itemid,title,image_path
0,370855998,flormar white cream bb spf ml,1588591395c5a254bab84042005f2a9f.jpg
1,637234604,maybelline clear smooth all in one bb cream sp...,920985ed9587ea20f58686ea74e20f93.jpg
2,690282890,murah innisfree eco natural green tea bb cream...,90b40e5710f54352b243fcfb0f5d1d7f.jpg
3,930913462,loreal white perfect day cream spf pa white...,289c668ef3d70e1d929d602d52d5d78a.jpg
4,1039280071,hada labo cc cream ultimate anti aging spf ...,d5b3e652c5822d2306f4560488ec30c6.jpg
5,1327710392,cathy doll cc speed white powder pact spf o...,1e50828d5594721a7d5d5c1ff78afbd.jpg
6,1328802799,safi white natural brightening cream g,97ec852d5afc5d82ac02b80083cf292f.jpg
7,1330468145,light beige bioaqua bb cushion exquisite de...,8ce1a5fe546f0cc795329bad599a8d5a.jpg
8,1677309730,new produk missha m perfect bb cream share in ...,755fcc85c687e8cb53d2a8d43ebfe251.jpg
9,1683142205,ready laneige bb cushion anti aging spf pa,34b56398c099505c650cf2447dc9f21f.jpg


In [64]:
print(test_data_specialized.shape)

(76545, 3)


In [54]:
categories_specialized = {k.lower(): v for k, v in categories[specialization.capitalize()].items()}


In [55]:
categories_specialized

{'bb & cc cream': 5,
 'blush on': 2,
 'bronzer': 11,
 'concealer': 7,
 'contour': 6,
 'face palette': 0,
 'foundation': 1,
 'highlighter': 8,
 'lip gloss': 14,
 'lip liner': 15,
 'lip tint': 13,
 'lipstick': 12,
 'other face cosmetics': 4,
 'other lip cosmetics': 16,
 'powder': 3,
 'primer': 9,
 'setting spray': 10}

In [14]:
IMAGE_SIZE = hub.get_expected_image_size(hub.Module(feature_extractor_url))


INFO:tensorflow:Using C:\Users\HANSTA~1\AppData\Local\Temp\tfhub_modules to cache modules.


In [15]:
image_generator = datagen.flow_from_dataframe(train_data_specialized,
                                              directory=os.path.join(data_root),
                                              x_col="image_path",
                                              y_col="item_category",
                                              target_size=IMAGE_SIZE,
                                              color_mode="rgb",
                                              class_mode="categorical",
                                              shuffle=True,
                                              batch_size=64,
                                              )

Found 23209 images belonging to 17 classes.


In [16]:
valid_generator = valid_datagen.flow_from_dataframe(validation_data_specialized,
                                                    directory=os.path.join(data_root),
                                                    x_col="image_path",
                                                    y_col="item_category",
                                                    target_size=IMAGE_SIZE,
                                                    color_mode="rgb",
                                                    class_mode="categorical",
                                                    shuffle=True,
                                                    batch_size=64,
                                                    )

Found 2332 images belonging to 17 classes.


In [65]:
test_generator = test_datagen.flow_from_dataframe(test_data_specialized,
                                                   directory=os.path.join(data_root),
                                                   x_col="image_path",
                                                  y_col=None,
                                                  class_mode=None,
                                                  shuffle=False,
                                                   target_size=IMAGE_SIZE,
                                                   color_mode="rgb",
                                                   batch_size=64,
                                                   )

Found 62059 images.


In [35]:
label_names = sorted(image_generator.class_indices.items(), key=lambda pair:pair[1])
label_names = np.array([key.title() for key, value in label_names])


In [36]:
def feature_extractor(x):
    feature_extractor_module = hub.Module(feature_extractor_url)
    return feature_extractor_module(x)

In [37]:
model = Sequential()
model.add(Lambda(feature_extractor, input_shape=IMAGE_SIZE+[3], trainable=True))
model.add(Dense(len(categories_specialized), activation='softmax'))
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()


INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lambda_1 (Lambda)            (None, 1280)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 17)                21777     
Total params: 21,777
Trainable params: 21,777
Non-trainable params: 0
_________________________________________________________________


In [38]:
def gen_filename_h5():
    return 'epoch_'+str(epochs) + '_' + datetime.now().strftime("%m_%d_%Y_%H_%M_%S")


In [67]:
def gen_filename_csv():
    return 'epoch_'+str(epochs) + '_' + str(max_words) + '_' + datetime.now().strftime("%m_%d_%Y_%H_%M_%S")



In [39]:
filepath = "../checkpoints/"+gen_filename_h5()+"v2.hdf5"
checkpointer = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')

In [50]:
steps_per_epoch = image_generator.samples//image_generator.batch_size
valid_steps_per_epoch = valid_generator.samples // valid_generator.batch_size
test_steps_per_epoch = test_generator.samples // test_generator.batch_size

In [43]:
history = model.fit_generator(generator=image_generator,
                    steps_per_epoch=steps_per_epoch,
                    validation_data=valid_generator,
                    validation_steps=valid_steps_per_epoch,
                    epochs=epochs,
                    callbacks=[checkpointer],
                    )

Epoch 1/1

Epoch 00001: val_acc improved from -inf to 0.54277, saving model to ../checkpoints/epoch_25_03_14_2019_00_47_25v2.hdf5


In [45]:
def plot_history(history):
    plt.style.use('ggplot')
    acc = history.history['acc']
    val_acc = history.history['val_acc']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    x = range(1, len(acc) + 1)

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()
    plt.show()


In [46]:
plot_history(history)

In [51]:
prediction_specialized = model.predict_generator(test_generator, verbose=1, steps = test_steps_per_epoch)




In [59]:
predicted_label_specialized = [np.argmax(prediction_specialized[i]) for i in range(len(prediction_specialized))]


In [68]:
df = pd.DataFrame({'itemid': test_data_specialized['itemid'].astype(int), 'Category': predicted_label_specialized})


ValueError: array length 62016 does not match index length 76545