# In [None]:
import os
import pickle

from tensorflow import keras
from keras.models import Sequential
from keras.models import Model
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.layers import Input
from keras.utils import to_categorical
from keras.preprocessing import sequence
from keras.utils import plot_model
from sklearn import preprocessing
from keras.layers import Activation, Dense, Embedding, Dropout, Conv2D, Reshape, GlobalMaxPooling2D
from sklearn.metrics import classification_report
# Load the raw data and pre-process the product names.
# NOTE(review): `pd`, `sp` and `clean_spm` are not defined in the visible part
# of this file -- presumably set up in an earlier cell; confirm.
df = pd.read_pickle('data.pk')
# Per product name: split into pieces, clean the pieces, then re-join them
# into a single space-separated string.
df['goods_name'] = (
    df['goods_name']
    .apply(sp.encode_as_pieces)
    .apply(clean_spm)
    .apply(' '.join)
)
# Load the pre-trained word vectors into a {token: vector} lookup.
# Every non-empty line is read as "<token> <v1> <v2> ...", split on whitespace.
# NOTE(review): if the file starts with a word2vec-style "<vocab> <dim>" header
# line, it ends up in the dict as a one-element vector -- confirm the format.
embedding_index = {}
# `with` closes the file even when a line fails to parse.
with open(os.path.join('', 'cate_w2v.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line: there is no token to index.
            continue
        word = values[0]
        # Convert to floats here so a malformed line fails at load time
        # rather than later, when the vector is copied into the matrix.
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

# Prepare the model inputs, then build the embedding matrix.
#
# The embedding matrix is indexed by the tokenizer's word ids, so it must be
# built from the tokenizer fitted below on the training texts. It is therefore
# constructed at the end of this section, after `fit_on_texts`.

# prepare modeling
X = df[['goods_name']]
y = df['depth_4']

# Map the category labels to integer ids.
label_encoder = preprocessing.LabelEncoder()
y = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=2020)
X_train, X_test = X_train['goods_name'], X_test['goods_name']

# Pin the one-hot width to the full label set. Without `num_classes`,
# to_categorical sizes its output from the largest id present, so train and
# test could get different widths if the last class is missing from a split.
num_classes = len(label_encoder.classes_)
y_train = to_categorical(y_train, num_classes=num_classes)
y_test = to_categorical(y_test, num_classes=num_classes)

max_len = 50
# Fit the vocabulary on the training texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Convert texts to integer sequences, padded/truncated to `max_len`.
sequences = tokenizer.texts_to_sequences(X_train)
X_train = sequence.pad_sequences(sequences, maxlen=max_len)
sequences = tokenizer.texts_to_sequences(X_test)
X_test = sequence.pad_sequences(sequences, maxlen=max_len)

# Build the embedding matrix: row i holds the pre-trained vector of the word
# whose tokenizer id is i. Words without a pre-trained vector keep a zero row.
word_index = tokenizer.word_index

# +1 so that the largest id in `word_index` is still a valid row index.
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    # Valid rows are 0 .. num_words - 1, hence `>=`.
    if i >= num_words:
        continue
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
print(num_words)
# modeling
# Embedding -> single 2D convolution spanning the full embedding width ->
# global max pooling -> dense classifier.
model = Sequential()
# Initialise the embedding layer from the pre-trained matrix.
embedding_layer = Embedding(num_words,
                            embedding_dim,
                            weights=[embedding_matrix],
                            input_length=max_len)
model.add(embedding_layer)
# Add a trailing channel axis so the embedded sequence can be fed to Conv2D.
model.add(Reshape((max_len, embedding_dim, 1), input_shape=(max_len, embedding_dim)))
# print (model.output_shape)
# The kernel covers 4 consecutive tokens across the whole embedding dimension.
model.add(Conv2D(filters=32, kernel_size=(4, embedding_dim), strides=(2, 2), padding='valid'))
model.add(GlobalMaxPooling2D())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.1))
# Size the output layer from the one-hot targets instead of hard-coding the
# class count, so the model keeps working when the label set changes.
model.add(Dense(y_train.shape[1], activation='softmax'))
# print (model.summary())
model.compile(loss='categorical_crossentropy', optimizer='RMSprop', metrics=['acc'])
# A single epoch, holding out 10% of the training data for validation.
history = model.fit(x=X_train, y=y_train, batch_size=128, epochs=1, verbose=1, validation_split=0.1)
# Evaluate on the held-out test split.
# `acc` holds the values returned by model.evaluate: the loss followed by the
# compiled metric.
acc = model.evaluate(X_test, y_test)
print('Loss: {:0.3f} | Accuracy: {:0.3f}'.format(*acc[:2]))
print('=' * 50)
# Collapse predicted probabilities and one-hot targets to class ids so they
# can be compared in a per-class report.
pred = model.predict(X_test)
pred_bool = np.argmax(pred, axis=1)
y_test_bool = np.argmax(y_test, axis=1)
print(classification_report(y_test_bool, pred_bool))
# Persist the trained classification model.
model.save('cate_food_model.h5')
# Persist the fitted tokenizer alongside it so the same vocabulary can be
# reused when the model is loaded again.
with open('cate_food_tokenizer.pickle', mode='wb') as handle:
    handle.write(pickle.dumps(tokenizer, protocol=pickle.HIGHEST_PROTOCOL))