In [None]:
from datetime import datetime
import itertools
import json

import joblib
import matplotlib.pyplot as plt
import numpy as np
from keras.callbacks import ModelCheckpoint
import tensorflow as tf
import tensorflow_hub as hub
from keras.engine import Layer
from keras_preprocessing.sequence import pad_sequences
from sklearn.utils import shuffle

from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Embedding, Conv1D, GlobalMaxPooling1D, Flatten, LSTM, \
    Bidirectional, CuDNNLSTM, MaxPooling1D, ConvLSTM2D, CuDNNGRU, SpatialDropout1D
from keras.preprocessing import text, sequence
from keras import utils
import pandas as pd
from keras import backend as K
from utility.train_data_loader import load_train_data


In [None]:
def update_embeddings_index(lines=None, embedding_dim=300):
    """Parse GloVe-style text lines into a ``{word: vector}`` dictionary.

    Every usable line holds a word followed by ``embedding_dim``
    whitespace-separated numbers. The vector is always taken from the END of
    the line, so a word that itself splits into several tokens is still
    handled: its leading tokens are concatenated (without separator) to form
    the key.

    Args:
        lines: Iterable of text lines (an open file works). When omitted, the
            module-level ``glove_file`` is read, which keeps the original
            zero-argument call working.
        embedding_dim: Number of trailing tokens that make up the vector.

    Returns:
        dict mapping each word to a ``float32`` numpy array of length
        ``embedding_dim``. If a word occurs more than once, the last
        occurrence wins.

    Raises:
        ValueError: if ``embedding_dim`` is smaller than 1, or if a vector
            token cannot be converted to a float.
    """
    if embedding_dim < 1:
        raise ValueError("embedding_dim must be >= 1")
    if lines is None:
        # Backward-compatible fallback to the notebook-level file handle.
        lines = glove_file

    embeddings_index = {}
    for line in lines:
        values = line.split()
        if len(values) <= embedding_dim:
            # Blank or truncated line: there is no complete (word, vector)
            # pair, so storing it would only create a bogus '' entry.
            continue
        word = ''.join(values[:-embedding_dim])
        coefs = np.asarray(values[-embedding_dim:], dtype='float32')
        embeddings_index[word] = coefs
    return embeddings_index


# Load the GloVe vectors: prefer the joblib cache, otherwise parse the raw
# text file and write the cache for the next run.
try:
    print("using glove data from joblib...")
    embeddings_index = joblib.load("../data/glove.840B.300d.joblib")
    print("glove data loaded from joblib!")
except Exception as cache_error:
    # `Exception` rather than a bare `except:` so that KeyboardInterrupt and
    # SystemExit still propagate instead of triggering a silent re-parse.
    print("joblib cache not usable: %r" % (cache_error,))
    print("using glove data from txt...")
    # NOTE(review): the file is decoded as Latin-1, which never fails but
    # would garble non-ASCII words if the file is actually UTF-8 - confirm
    # before changing, since an existing joblib cache would need rebuilding.
    # The `with` block binds the module-level name `glove_file` (read by
    # update_embeddings_index) and guarantees the handle is closed.
    with open('../data/glove.840B.300d.txt', "r", encoding="Latin-1") as glove_file:
        embeddings_index = update_embeddings_index()
    print("glove data loaded from txt!")
    joblib.dump(embeddings_index, "../data/glove.840B.300d.joblib")
    print("glove data saved to joblib!")


In [None]:
# Read the CSV inputs, the category definitions and the training frame.
testData = pd.read_csv("../data/new_test.csv")
dictData = pd.read_csv("../data/kata_dasar_kbbi.csv")
# Read the JSON inside a `with` block so the file handle is closed right away
# instead of staying open for the lifetime of the kernel.
with open("../data/categories.json", "r") as categories_file:
    categories = json.load(categories_file)
trainData = load_train_data()

# Shuffle train data
# NOTE(review): no random_state is passed, so the order differs on every run.
trainData = shuffle(trainData)

# Per-group lookup from category value back to its lower-cased name.
inverted_categories_mobile = {v: k.lower() for k, v in categories['Mobile'].items()}
inverted_categories_fashion = {v: k.lower() for k, v in categories['Fashion'].items()}
inverted_categories_beauty = {v: k.lower() for k, v in categories['Beauty'].items()}

# Single lower-cased name -> value mapping across the three groups; on a name
# clash the later group (Fashion, then Beauty) overwrites the earlier one.
all_subcategories = {k.lower(): v for k, v in categories['Mobile'].items()}
all_subcategories.update({k.lower(): v for k, v in categories['Fashion'].items()})
all_subcategories.update({k.lower(): v for k, v in categories['Beauty'].items()})


In [None]:
# Main settings
# -- run switches
gen_test = False
plot_history_check = True

# -- vocabulary / sequence / embedding sizes
max_words = 2500
max_length = 35  # 32 is max word in train
EMBEDDING_DIM = 300  # Based on the txt file: glove 300d
num_classes = len(all_subcategories)

# -- training schedule
# Training for more epochs will likely lead to overfitting on this dataset
# (val-acc after 10 epochs: 0.71306).
# You can try tweaking these hyperparameters when using this model with your own data
epochs = 10
batch_size = 256

print(all_subcategories)
print("no of categories: {}".format(num_classes))



In [None]:

# Size bookkeeping: computed and printed here for reference only.
train_data_step = 1
validate_data_step = 1
max_data_size = int(len(trainData) * 1)
train_data_size = int(max_data_size * .95)
print(train_data_size, max_data_size)

# Raw inputs: every `train_data_step`-th training row, and all test titles.
train_tags = trainData['Category'][::train_data_step]
train_texts = trainData['title'][::train_data_step]
test_texts = testData['title']
print(len(train_texts), len(train_tags))

# Word-level tokenizer limited to `max_words`; the vocabulary is fitted on
# the training titles only.
tokenize = text.Tokenizer(char_level=False, num_words=max_words)
tokenize.fit_on_texts(train_texts)

# Convert titles to index sequences and zero-pad at the end up to
# `max_length` in one step per split.
x_train = pad_sequences(
    tokenize.texts_to_sequences(train_texts),
    maxlen=max_length,
    padding='post',
)
x_test = pad_sequences(
    tokenize.texts_to_sequences(test_texts),
    maxlen=max_length,
    padding='post',
)

# Category values are expanded with to_categorical for training.
y_train = utils.to_categorical(train_tags.values)

word_index = tokenize.word_index
print(len(word_index))

In [None]:
# One row per tokenizer index (plus row 0, which no word in word_index uses
# because the `+ 1` leaves it spare); rows start as zeros.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # No pretrained vector for this word: its row stays all-zeros.
        continue
    embedding_matrix[i] = embedding_vector
