In [1]:
import re
import math
import pymongo 
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_pickle('class_29.pk')
print (df.shape)

(2434424, 3)


In [3]:
df = df.rename(columns = {'mall_goods_name':'goods_name'})
# print ('class: ',df['depth_4'].nunique())
# df['depth_4'].value_counts()

In [38]:
import os
import pickle
import collections
import sentencepiece as spm

from gensim.models import Word2Vec

from tensorflow import keras
from keras.models import Sequential
from keras.models import Model
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.layers import Input
from keras.utils import to_categorical
from keras.preprocessing import sequence
from keras.utils import plot_model
from sklearn import preprocessing
from keras.layers import Activation, Dense, Embedding, Flatten, BatchNormalization, Dropout, Conv2D, MaxPooling2D, Reshape, GlobalMaxPooling2D, GlobalAveragePooling2D
from sklearn.metrics import classification_report

In [5]:
x = df[['goods_name','depth_4']]
x.to_csv(r'cate.txt', header=None, index=None, sep='\t')

spm.SentencePieceTrainer.train('--input=cate.txt --model_prefix=cate --vocab_size=20000 --model_type=unigram') # --model_type=unigram (default), bpe, char, or word
sp = spm.SentencePieceProcessor()
sp.load('cate.model')

True

In [6]:
def clean_spm(lst):
    def clean_text(text):
        return re.sub('[-=+,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》]', '', text)
    def clean_num(text):
        return re.sub('\d+', '', text)
    def _del(text):
        return text.replace('▁','')
    a = [clean_text(x) for x in lst] 
    b = [clean_num(x) for x in a] 
    c = [_del(x) for x in b]
    d = [x for x in c if len(x) != 0]
    e = ['즉석죽' if x=='죽' else x for x in d]
    f = ['껌껌' if x=='껌' else x for x in e]
    g = ['Tea' if x=='티' else x for x in f]
    h = [x for x in g if len(x) != 1]
    return h

In [7]:
df['goods_name'] = df['goods_name'].apply(sp.encode_as_pieces).apply(clean_spm).apply(lambda x:' '.join(x))

## modeling

In [29]:
sentences = df['goods_name'].drop_duplicates().apply(lambda x:x.split(' ')).to_list()

embedding_dim = 200
model = Word2Vec(sentences, size = embedding_dim, window = 3, min_count = 3, workers = 16)

word_vectors = model.wv
vocabs = word_vectors.vocab.keys()
word_vectors_list = [word_vectors[v] for v in vocabs]
print ('Vocab Size:',len(model.wv.vocab))

# print (word_vectors.similarity(w1 = '즉석밥', w2 = '햇반'))
# print (model.wv.most_similar('햇반')[:5])

filename = 'cate_w2v.txt'
model.wv.save_word2vec_format(filename, binary = False)
 
embedding_index = {}
f = open(os.path.join('','cate_w2v.txt'), encoding = 'utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:])
    embedding_index[word] = coefs
f.close()

Vocab Size: 13928


In [30]:
X = df[['goods_name']]
y = df['depth_4']

label_encoder = preprocessing.LabelEncoder()
y = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 2020)
X_train, X_test = X_train['goods_name'], X_test['goods_name']

y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

In [31]:
max_len = 50
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

sequences = tokenizer.texts_to_sequences(X_train)
X_train = sequence.pad_sequences(sequences,maxlen = max_len) #  padding='post'
sequences = tokenizer.texts_to_sequences(X_test)
X_test = sequence.pad_sequences(sequences, maxlen = max_len)

In [32]:
word_index = tokenizer.word_index

num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    if i > num_words:
        continue
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
print (num_words)

13881


In [33]:
model = Sequential()
embedding_layer = Embedding(num_words, 
                            embedding_dim, 
                            weights = [embedding_matrix],
                            input_length = max_len)
model.add(embedding_layer)
model.add(Reshape((max_len, embedding_dim, 1), input_shape = (max_len, embedding_dim)))
# print (model.output_shape)
model.add(Conv2D(filters = 32, kernel_size = (4, embedding_dim), strides = (2,2), padding = 'valid'))
model.add(GlobalMaxPooling2D())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(29, activation='softmax'))
# print (model.summary())
model.compile(loss='categorical_crossentropy', optimizer='RMSprop', metrics=['acc'])
history = model.fit(x = X_train, y = y_train, batch_size = 128, epochs = 1, verbose = 1, validation_split = 0.1)
# evaluate
acc = model.evaluate(X_test,y_test)
print('Loss: {:0.3f} | Accuracy: {:0.3f}'.format(acc[0],acc[1])) 
print ('=' * 50)
pred = model.predict(X_test)
pred_bool = np.argmax(pred,1)
y_test_bool = np.argmax(y_test,1)
print(classification_report(y_test_bool, pred_bool))

Train on 1971882 samples, validate on 219099 samples
Epoch 1/1
Loss: 0.091 | Accuracy: 0.978
              precision    recall  f1-score   support

           0       0.98      0.99      0.99     67038
           1       0.97      0.97      0.97      6182
           2       0.97      0.98      0.97      4303
           3       0.97      0.97      0.97     12783
           4       0.99      0.99      0.99      2878
           5       0.98      0.97      0.97     21725
           6       0.97      0.97      0.97      9677
           7       0.96      0.93      0.95      1532
           8       0.98      0.98      0.98      2346
           9       0.98      0.99      0.98      8022
          10       0.95      0.91      0.93      1418
          11       0.97      0.95      0.96       760
          12       0.99      0.97      0.98      1551
          13       0.97      0.97      0.97      9278
          14       0.99      0.96      0.97      1894
          15       0.97      0.95      0.9

In [34]:
text = '쉐프드 쉐푸드 명란오일파스타 285g 6종 즉석식품 냉동식품'
pre = ' '.join(clean_spm(sp.encode_as_pieces(text)))
print ('pre:',pre)
t = sequence.pad_sequences(tokenizer.texts_to_sequences([pre]), maxlen = max_len)
Preds = model.predict(t)

p = [np.argmax(x) for x in Preds]
prob = [np.max(x) for x in Preds]

pred = label_encoder.inverse_transform(p)
print(pred, prob)

pre: 쉐프 쉐푸드 명란 오일 파스타 즉석식품 냉동식품
['면류'] [0.9845915]


## model save

In [36]:
model.save('cate_food_model.h5')

In [39]:
with open('cate_food_tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)