In [36]:
from datetime import datetime
import itertools
import json
import matplotlib.pyplot as plt
import numpy as np
from keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub
from keras_preprocessing.sequence import pad_sequences
from sklearn.utils import shuffle

from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Embedding, Conv1D, GlobalMaxPooling1D, Flatten, LSTM, \
Bidirectional, CuDNNLSTM
from keras.preprocessing import text, sequence
from keras import utils
import pandas as pd

In [3]:
# Read the three CSV inputs in order; each name receives its own DataFrame.
trainData, testData, dictData = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/train_with_cname.csv",
        "../data/test.csv",
        "../data/kata_dasar_kbbi.csv",
    )
)
# Only opened here: the handle is parsed with json.load in a later cell.
categories_file = open("../data/categories.json", mode="r")

In [4]:
# Build a token -> coefficient-vector lookup from the GloVe text file.
embeddings_index = {}
# The context manager closes the handle as soon as the loop finishes
# (previously the file stayed open for the lifetime of the kernel).
# NOTE(review): the file is decoded as Latin-1; if it is actually UTF-8,
# non-ASCII tokens will be stored under mis-decoded keys -- confirm.
with open('../data/glove.840B.300d.txt', "r", encoding="Latin-1") as glove_file:
    for line in glove_file:
        values = line.split()
        # The trailing 300 fields are the coefficients; whatever precedes them
        # is the token, re-joined without a separator if split() broke it up.
        word = ''.join(values[:-300])
        coefs = np.asarray(values[-300:], dtype='float32')
        embeddings_index[word] = coefs

In [5]:
# Parse the category definitions, then release the handle that the data-loading
# cell opened (it was never closed before).
try:
    categories = json.load(categories_file)
finally:
    categories_file.close()

# Per-group reverse lookups: category id -> lower-cased category name.
inverted_categories_mobile = {v: k.lower() for k, v in categories['Mobile'].items()}
inverted_categories_fashion = {v: k.lower() for k, v in categories['Fashion'].items()}
inverted_categories_beauty = {v: k.lower() for k, v in categories['Beauty'].items()}

In [6]:
# Flatten the three groups into one lower-cased name -> id lookup.
# Groups are visited in the order Mobile, Fashion, Beauty; a name that shows up
# again later overwrites the earlier id.
all_subcategories = {}
for group_name in ('Mobile', 'Fashion', 'Beauty'):
    for label, class_id in categories[group_name].items():
        all_subcategories[label.lower()] = class_id


In [7]:
# Main settings

# Run switches
plot_history_check, gen_test = True, False

# Text preprocessing
max_length = 35  # original note: 32 is the max word count in train
max_words = 1000

# One output unit per known sub-category
num_classes = len(all_subcategories)

# Training for more epochs will likely lead to overfitting on this dataset.
# Recorded val-acc after 10 epochs: 0.71306
# You can try tweaking these hyperparameters when using this model with your own data.
batch_size, epochs = 256, 10

## Debug

In [7]:
# Debug dump: the merged name -> id lookup, followed by the class count.
category_count_line = "no of categories: " + str(num_classes)
for debug_line in (all_subcategories, category_count_line):
    print(debug_line)

{'others mobile & tablet': 35, 'smartfren': 53, 'infinix': 40, 'brandcode': 39, 'icherry': 52, 'advan': 45, 'iphone': 31, 'realme': 51, 'motorola': 49, 'maxtron': 56, 'nokia': 38, 'xiaomi': 34, 'mito': 46, 'sony': 33, 'spc': 57, 'lenovo': 37, 'alcatel': 55, 'samsung': 32, 'vivo': 42, 'evercoss': 44, 'strawberry': 50, 'blackberry': 36, 'asus': 43, 'honor': 54, 'oppo': 41, 'huawei': 47, 'sharp': 48, 'wedding dress': 23, 'shirt': 27, 'casual dress': 18, 'maxi dress': 20, 'big size dress': 24, 'bodycon dress': 22, 'party dress': 19, 'blouse': 26, 'tshirt': 25, 'crop top': 29, 'tanktop': 28, 'others': 17, 'a line dress': 21, 'big size top': 30, 'foundation': 1, 'face palette': 0, 'concealer': 7, 'lip gloss': 14, 'blush on': 2, 'highlighter': 8, 'bb & cc cream': 5, 'other face cosmetics': 4, 'lip tint': 13, 'bronzer': 11, 'lip liner': 15, 'powder': 3, 'setting spray': 10, 'primer': 9, 'contour': 6, 'other lip cosmetics': 16, 'lipstick': 12}
no of categories: 58


In [8]:
# Single source of truth for the ('<group>_image' key, group name) pairs.
_category_pairs = (
    ('fashion_image', 'Fashion'),
    ('beauty_image', 'Beauty'),
    ('mobile_image', 'Mobile'),
)
# '<group>_image' key -> group name
category_mapping = dict(_category_pairs)
# group name -> '<group>_image' key (the same pairs, flipped)
directory_mapping = {group: image_key for image_key, group in _category_pairs}

In [9]:
# Shuffle train data with a fixed seed so that a rerun reproduces the same row
# order. Keras' validation_split holds out the tail of the arrays it is given,
# so an unseeded shuffle would also change the validation set on every run.
trainData = shuffle(trainData, random_state=42)

In [10]:
# Row bookkeeping: the factor of 1 keeps every row, and 95% of that total is
# recorded as the train share. Both step values are 1 (no sub-sampling).
train_data_step = validate_data_step = 1
max_data_size = int(1 * len(trainData))
train_data_size = int(.95 * max_data_size)
print(train_data_size, max_data_size)

633284 666615


In [28]:
# Take every train_data_step-th row of the title and label columns.
train_stride = slice(None, None, train_data_step)
train_texts = trainData['title'][train_stride]
train_tags = trainData['Category'][train_stride]
# Test titles are used in full.
test_texts = testData['title']
print(len(train_texts), len(train_tags))

666615 666615


In [12]:
# Word-level tokenizer. The vocabulary cap now comes from the max_words setting
# instead of a second hard-coded 1000, so the two can no longer drift apart.
tokenize = text.Tokenizer(num_words=max_words, char_level=False)
tokenize.fit_on_texts(train_texts)  # only fit on train
# word_index holds every word seen during fitting.
# NOTE(review): per the Keras docs num_words only limits which indices
# texts_to_sequences emits, so word_index can be far larger -- confirm intended.
word_index = tokenize.word_index
x_train = tokenize.texts_to_sequences(train_texts)
x_test = tokenize.texts_to_sequences(test_texts)

In [13]:
# One 300-wide row per tokenizer index, plus one extra row for index 0.
vocab_rows = 1 + len(word_index)
embedding_matrix = np.zeros((vocab_rows, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # No pretrained vector for this word: its row stays all-zeros.
        continue
    embedding_matrix[i] = embedding_vector

In [14]:
# Pad sequences with zeros: padding goes after the tokens, target length is max_length.
pad_options = {'padding': 'post', 'maxlen': max_length}
x_train = pad_sequences(x_train, **pad_options)
x_test = pad_sequences(x_test, **pad_options)

In [29]:
# Integer class ids from the label column.
y_train = train_tags.values
# One-hot encode with an explicit width. Without num_classes, to_categorical
# sizes the output from the largest id present, which would not match the
# model's Dense(num_classes) output if the top id were missing from the sample.
y_train = utils.to_categorical(y_train, num_classes=num_classes)

In [44]:
# Layer stack, listed in forward order.
layer_stack = [
    # Embedding table initialised from embedding_matrix and left trainable.
    Embedding(
        len(word_index) + 1,
        300,
        input_length=max_length,
        weights=[embedding_matrix],
        trainable=True,
    ),
    # Two stacked bidirectional LSTMs; the first returns the full sequence so
    # that the second has a sequence to consume.
    Bidirectional(CuDNNLSTM(128, return_sequences=True)),
    Bidirectional(CuDNNLSTM(128)),
    Dense(256, activation='relu'),
    Dropout(0.5),
    # One softmax unit per class.
    Dense(num_classes, activation='softmax'),
]
model = Sequential(layer_stack)
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 35, 300)           24027600  
_________________________________________________________________
bidirectional_4 (Bidirection (None, 35, 256)           440320    
_________________________________________________________________
bidirectional_5 (Bidirection (None, 256)               395264    
_________________________________________________________________
dense_13 (Dense)             (None, 256)               65792     
_________________________________________________________________
dropout_6 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_14 (Dense)             (None, 58)                14906     
Total params: 24,943,882
Trainable params: 24,943,882
Non-trainable params: 0
________________________________________________________________

In [17]:
def gen_filename_h5(num_epochs=None, now=None):
    """Return the base name 'epoch_<n>_<MM_DD_YYYY_HH_MM_SS>' for a checkpoint file.

    Args:
        num_epochs: Epoch count to embed in the name. Defaults to the
            notebook-level ``epochs`` setting when omitted.
        now: ``datetime`` to stamp the name with. Defaults to the current
            local time when omitted.

    Returns:
        The file name stem as a string (no directory, no extension).
    """
    # Both fallbacks are resolved at call time, so a zero-argument call behaves
    # exactly as before; explicit values make the result reproducible.
    if num_epochs is None:
        num_epochs = epochs
    if now is None:
        now = datetime.now()
    return 'epoch_' + str(num_epochs) + '_' + now.strftime("%m_%d_%Y_%H_%M_%S")

In [18]:
def gen_filename_csv(num_epochs=None, now=None):
    """Return the base name 'epoch_<n>_<MM_DD_YYYY_HH_MM_SS>' for a result CSV.

    Args:
        num_epochs: Epoch count to embed in the name. Defaults to the
            notebook-level ``epochs`` setting when omitted.
        now: ``datetime`` to stamp the name with. Defaults to the current
            local time when omitted.

    Returns:
        The file name stem as a string (no prefix, no extension).
    """
    # Fallbacks are resolved at call time, so a zero-argument call is unchanged.
    if num_epochs is None:
        num_epochs = epochs
    if now is None:
        now = datetime.now()
    return 'epoch_' + str(num_epochs) + '_' + now.strftime("%m_%d_%Y_%H_%M_%S")

In [38]:
# Checkpoint auto: write the model to a timestamped file under ../checkpoints,
# keeping only the epoch with the highest val_acc.
checkpoint_name = gen_filename_h5() + "v2.hdf5"
filepath = "../checkpoints/" + checkpoint_name
checkpointer = ModelCheckpoint(
    filepath,
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    verbose=1,
)

In [24]:
# Sanity check: dimensions of the padded training inputs.
print(np.shape(x_train))

(666615, 35)


In [26]:
# Sanity check: dimensions of the training labels.
print(np.shape(y_train))

(666615,)


In [None]:
# Training options gathered in one place; 10% of the samples are held out for
# validation and the checkpoint callback runs during training.
fit_options = dict(
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
    shuffle=True,
    verbose=1,
    callbacks=[checkpointer],
)
history = model.fit([x_train], y=y_train, **fit_options)

Train on 599953 samples, validate on 66662 samples
Epoch 1/10

Epoch 00001: val_acc did not improve from 0.72440
Epoch 2/10

Epoch 00002: val_acc did not improve from 0.72440
Epoch 3/10

Epoch 00003: val_acc did not improve from 0.72440
Epoch 4/10

Epoch 00004: val_acc did not improve from 0.72440
Epoch 5/10

Epoch 00005: val_acc did not improve from 0.72440
Epoch 6/10

Epoch 00006: val_acc improved from 0.72440 to 0.72809, saving model to ../checkpoints/epoch_10_03_04_2019_00_24_51v2.hdf5
Epoch 7/10

Epoch 00007: val_acc improved from 0.72809 to 0.72820, saving model to ../checkpoints/epoch_10_03_04_2019_00_24_51v2.hdf5
Epoch 8/10

Epoch 00008: val_acc improved from 0.72820 to 0.72887, saving model to ../checkpoints/epoch_10_03_04_2019_00_24_51v2.hdf5
Epoch 9/10

Epoch 00009: val_acc improved from 0.72887 to 0.72914, saving model to ../checkpoints/epoch_10_03_04_2019_00_24_51v2.hdf5
Epoch 10/10

In [21]:
def plot_history(history):
    """Plot training vs. validation accuracy and loss in two side-by-side panels.

    Args:
        history: Object whose ``history`` attribute is a mapping containing
            the 'acc', 'val_acc', 'loss' and 'val_loss' series.
    """
    plt.style.use('ggplot')
    record = history.history
    # Read all four series up front, before any figure is created.
    acc, val_acc, loss, val_loss = (
        record[key] for key in ('acc', 'val_acc', 'loss', 'val_loss')
    )
    x = range(1, len(acc) + 1)

    # (training series, validation series, training label, validation label, title)
    panels = (
        (acc, val_acc, 'Training acc', 'Validation acc',
         'Training and validation accuracy'),
        (loss, val_loss, 'Training loss', 'Validation loss',
         'Training and validation loss'),
    )

    plt.figure(figsize=(12, 5))
    for position, (train_series, val_series, train_label, val_label, title) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(x, train_series, 'b', label=train_label)
        plt.plot(x, val_series, 'r', label=val_label)
        plt.title(title)
        plt.legend()
    plt.show()


In [None]:
# Honour the plot_history_check switch from the settings cell; it was defined
# there but never consulted, so the plot could not be turned off.
if plot_history_check:
    plot_history(history)

In [None]:
def perform_test():
    """Predict a category for every test row and write the result to a CSV file.

    Reads the notebook-level ``model``, ``x_test``, ``batch_size`` and
    ``testData``. The output is written to the current working directory as
    'res_<gen_filename_csv()>.csv' with columns 'itemid' and 'Category' and
    no index column.
    """
    prediction = model.predict(x_test, batch_size=batch_size, verbose=1)
    # Highest-scoring class per row, taken in one vectorized call instead of a
    # Python-level loop over every prediction.
    predicted_label = np.argmax(prediction, axis=1)
    df = pd.DataFrame({'itemid': testData['itemid'].astype(int), 'Category': predicted_label})
    df.to_csv(path_or_buf='res_' + gen_filename_csv() + '.csv', index=False)

In [None]:
# Generate predictions for the test set and write the result CSV.
# NOTE(review): this runs unconditionally; the gen_test switch in the settings
# cell is not consulted here -- confirm whether the call should be gated on it.
perform_test()