In [1]:
from datetime import datetime
import json
import os
import numpy as np

import tensorflow as tf
import pandas as pd
import tensorflow_hub as hub
from keras import Sequential
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Activation, Dropout, Embedding, Conv1D, GlobalMaxPooling1D, Bidirectional, CuDNNLSTM, \
    SpatialDropout1D, MaxPooling1D,Conv2D,MaxPooling2D,Flatten
import matplotlib.pyplot as plt

from utility.train_data_loader import load_train_data


Using TensorFlow backend.


In [2]:
epochs = 10
batch_size = 256
specialization = "fashion"
gen_test = True

categories_file = open("../data/categories.json", "r")
categories = json.load(categories_file)

all_subcategories = {k.lower(): v for k, v in categories['Mobile'].items()}
all_subcategories.update({k.lower(): v for k, v in categories['Fashion'].items()})
all_subcategories.update({k.lower(): v for k, v in categories['Beauty'].items()})

data_root = "../../"+specialization+"_image/"

datagen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255)
valid_datagen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255)
test_datagen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255)

feature_extractor_url = "https://tfhub.dev/google/imagenet/mobilenet_v2_100_224/feature_vector/2"

trainData = load_train_data()
testData = pd.read_csv("../data/test.csv")

validation_data_specialized = trainData[trainData['image_path'].str.contains(specialization)][::100]
validation_data_specialized['image_path'] = validation_data_specialized['image_path']. \
    map(lambda x: x.replace(specialization + '_image/', ''))

test_data_specialized = testData[testData['image_path'].str.contains(specialization)]
test_data_specialized['image_path'] = test_data_specialized['image_path'].\
    map(lambda x: x.replace(specialization+'_image/', ''))

inverted_categories_specialized = {k.lower(): v for k, v in categories[specialization.capitalize()].items()}

custom train data used


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [11]:
train_data_specialized = trainData[trainData['image_path'].str.contains(specialization)][::]
df_train = pd.DataFrame()
df_valid = pd.DataFrame()
num_train=2000
num_valid=int(0.1*num_train)
for k,v in inverted_categories_specialized.items():
    rows = train_data_specialized.loc[train_data_specialized['Category'] == v]
    num_images = rows.shape[0]
    if(num_train+num_valid>num_images):
        nt=int(0.9*num_images)
        nv=int(0.1*num_images)
    else:
        nt=num_train
        nv=num_valid
    # print(nt,nv)
    rows_train = rows[:nt]
    df_train = df_train.append(rows_train)
    rows_valid = rows[nt:(nt+num_valid)]
    df_valid = df_valid.append(rows_valid)

train_data_specialized = df_train
validation_data_specialized = df_valid

train_data_specialized['image_path'] = train_data_specialized['image_path']. \
    map(lambda x: x.replace(specialization + '_image/', ''))

validation_data_specialized['image_path'] = validation_data_specialized['image_path']. \
    map(lambda x: x.replace(specialization + '_image/', ''))



1503 167
2000 200
2000 200
2000 200
2000 200
2000 200
2000 200
2000 200
2000 200
2000 200
2000 200
2000 200
2000 200
1296 144


In [12]:
IMAGE_SIZE = hub.get_expected_image_size(hub.Module(feature_extractor_url))
image_generator = datagen.flow_from_dataframe(train_data_specialized,
                                              directory=os.path.join(data_root),
                                              x_col="image_path",
                                              y_col="item_category",
                                              target_size=IMAGE_SIZE,
                                              color_mode="rgb",
                                              class_mode="categorical",
                                              shuffle=True,
                                              batch_size=64,
                                              )

Found 26799 images belonging to 14 classes.


In [13]:
label_names = sorted(image_generator.class_indices.items(), key=lambda pair:pair[1])
label_names = np.array([key.title() for key, value in label_names])


def feature_extractor(x):
    feature_extractor_module = hub.Module(feature_extractor_url)
    return feature_extractor_module(x)


for image_batch, label_batch in image_generator:
    print("Image batch shape: ", image_batch.shape)
    print("Label batch shape: ", label_batch.shape)
    break

Image batch shape:  (64, 224, 224, 3)
Label batch shape:  (64, 14)


In [14]:
input_shape = IMAGE_SIZE+[3] 

model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=input_shape))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(len(inverted_categories_specialized), activation='softmax'))
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_2 (Conv2D)            (None, 222, 222, 32)      896       
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 111, 111, 32)      0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 111, 111, 32)      0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 394272)            0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               50466944  
_________________________________________________________________
dropout_4 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 14)                1806      
Total para

In [15]:
valid_generator = valid_datagen.flow_from_dataframe(validation_data_specialized,
                                                    directory=os.path.join(data_root),
                                                    x_col="image_path",
                                                    y_col="item_category",
                                                    target_size=IMAGE_SIZE,
                                                    color_mode="rgb",
                                                    class_mode="categorical",
                                                    shuffle=True,
                                                    batch_size=64,
                                                    )


test_generator = test_datagen.flow_from_dataframe(test_data_specialized,
                                                  directory=os.path.join(data_root),
                                                  x_col="image_path",
                                                  y_col=None,
                                                  target_size=IMAGE_SIZE,
                                                  color_mode="rgb",
                                                  class_mode=None,
                                                  shuffle=False,
                                                  batch_size=64,
                                                  )


Found 2713 images belonging to 14 classes.
Found 55440 images.


In [None]:
def gen_filename_h5():
    return 'epoch_'+str(epochs) + '_' + datetime.now().strftime("%m_%d_%Y_%H_%M_%S")


def gen_filename_csv():
    return 'epoch_'+str(epochs) + '_' + datetime.now().strftime("%m_%d_%Y_%H_%M_%S")


# Checkpoint auto
filepath = "../checkpoints/"+gen_filename_h5()+"v2.hdf5"
checkpointer = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')


steps_per_epoch = image_generator.samples//image_generator.batch_size
valid_steps_per_epoch = valid_generator.samples // valid_generator.batch_size
test_steps_per_epoch = test_generator.samples // test_generator.batch_size

history = model.fit_generator(generator=image_generator,
                              steps_per_epoch=steps_per_epoch,
                              validation_data=valid_generator,
                              validation_steps=valid_steps_per_epoch,
                              epochs=epochs,
                              callbacks=[checkpointer],
                              )


Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.15365, saving model to ../checkpoints/epoch_10_03_19_2019_00_19_17v2.hdf5
Epoch 2/10

Epoch 00002: val_acc improved from 0.15365 to 0.18309, saving model to ../checkpoints/epoch_10_03_19_2019_00_19_17v2.hdf5
Epoch 3/10

Epoch 00003: val_acc improved from 0.18309 to 0.19290, saving model to ../checkpoints/epoch_10_03_19_2019_00_19_17v2.hdf5
Epoch 4/10

Epoch 00004: val_acc improved from 0.19290 to 0.19366, saving model to ../checkpoints/epoch_10_03_19_2019_00_19_17v2.hdf5
Epoch 5/10

Epoch 00005: val_acc improved from 0.19366 to 0.20385, saving model to ../checkpoints/epoch_10_03_19_2019_00_19_17v2.hdf5
Epoch 6/10

In [None]:
def perform_test():
    prediction_specialized = model.predict_generator(test_generator, verbose=1, steps=test_steps_per_epoch)
    predicted_label_specialized = [np.argmax(prediction_specialized[i]) for i in range(len(prediction_specialized))]
    df = pd.DataFrame({'itemid': test_data_specialized['itemid'].astype(int), 'Category': predicted_label_specialized})
    df.to_csv(path_or_buf='res' + gen_filename_csv() + '.csv', index=False)

if gen_test:
    perform_test()
