In [35]:
import numpy as np
import pandas as pd

from konlpy.tag import Mecab

import tensorflow as tf

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Ordered menu-text clean-up rules as (pattern, replacement, is_regex).
# The order matters: every rule runs on the output of the previous one.
# `regex` is passed explicitly because the default of Series.str.replace
# differs between pandas versions.
_MENU_SUBSTITUTIONS = (
    ('/', ' ', False),
    (r'([<]{1}[ㄱ-힣\:\,\.\/\-A-Za-z 0-9]*[>]{1})', '', True),
    (r'([＜]{1}[ㄱ-힣\:\,\.\/\-A-Za-z 0-9]*[＞]{1})', '', True),
    (r'([(]{1}[ㄱ-힣\:\,\.\/\-A-Za-z 0-9]*[)]{1})', '', True),
    # The original pattern was '[ ]{2, }'; the blank inside the braces made it
    # match the literal text "{2, }" instead of a run of spaces.
    (r'[ ]{2,}', ' ', True),
    (r'\(New\)', '', True),
    (r'\(NeW\)', '', True),
    (r'[D]{1}', '', True),
    (r'[S]{1}', '', True),
    (r'\(쌀:국내산,돈육:국내', '', True),
    (r'고추가루:중국산\)', '', True),
    ('*', ' ', False),
    (r'[(]만두 고추 통계란[)]', '', True),
    (r'[(]모둠튀김 양념장[)]', '', True),
)

# Row positions in the concatenated train+test frame, keyed by the value stored
# in the '공휴일전후' column for them.
# NOTE(review): what distinguishes value 2 from value 1 is not visible here.
_HOLIDAY_ADJACENT_ROWS = {
    1: [3, 17, 62, 131, 152, 221, 224, 226, 245, 309, 311, 330, 379, 467, 470,
        565, 623, 651, 705, 709, 815, 864, 950, 951, 953, 954, 955, 971, 1038,
        1099, 1129, 1187],
    2: [310, 502],
}

_MENU_COLUMNS = ['조식메뉴', '중식메뉴', '석식메뉴']
_TARGET_COLUMNS = ['중식계', '석식계']


def _clean_menu_text(menus):
    """Apply every rule of _MENU_SUBSTITUTIONS to a menu column, then strip it."""
    for pattern, replacement, is_regex in _MENU_SUBSTITUTIONS:
        menus = menus.str.replace(pattern, replacement, regex=is_regex)
    return menus.str.strip()


def _attach_menu_features(frame, features):
    """Append `features` to `frame` row by row and drop the raw menu columns."""
    merged = pd.concat([frame.reset_index(drop=True),
                        features.reset_index(drop=True)], axis=1)
    return merged.drop(columns=_MENU_COLUMNS)


def process(config):
    """Load the CSV files and build train / validation / test feature frames.

    Reads ``./data/train.csv``, ``./data/test.csv`` and
    ``./data/sample_submission.csv`` (plus ``./data/temp.csv`` when
    ``config.temp`` is truthy), derives attendance ratios and holiday flags,
    cleans the three menu text columns, min-max scales the numeric columns and
    label-encodes the weekday (both fitted on the training rows only), and
    splits the 1205 training rows into 1000 train / 205 validation rows with a
    fixed NumPy seed.

    Args:
        config: object with the attributes ``temp``, ``sep_date`` and ``text``.
            When ``config.text == 'embedding'`` it is also handed to
            ``embedding``.

    Returns:
        ``(train_df, valid_df, test_df, train_y, valid_y, sample)``. With
        ``config.text == 'embedding'`` the raw menu columns are replaced by
        the embedding features; for any other value the cleaned menu text
        columns are left in place (the original code raised NameError there).
    """
    train_df = pd.read_csv('./data/train.csv', encoding='utf-8')
    test_df = pd.read_csv('./data/test.csv', encoding='utf-8')
    sample = pd.read_csv('./data/sample_submission.csv', encoding='utf-8')

    y = train_df[_TARGET_COLUMNS]

    TRAIN_LENGTH = 1205

    # ignore_index gives every row a unique label equal to its position.
    # Without it the test rows reuse the labels 0..len(test)-1 of the first
    # train rows, so label-based writes hit two rows at once and no row is
    # labelled TRAIN_LENGTH + k.
    df = pd.concat([train_df, test_df], axis=0, ignore_index=True)
    df['출근'] = df['본사정원수'] - (df['본사휴가자수'] + df['본사출장자수'] + df['현본사소속재택근무자수'])
    df['휴가비율'] = df['본사휴가자수'] / df['본사정원수']
    df['출장비율'] = df['본사출장자수'] / df['본사정원수']
    df['야근비율'] = df['본사시간외근무명령서승인건수'] / df['출근']
    df['재택비율'] = df['현본사소속재택근무자수'] / df['본사정원수']

    df = df.drop(columns=['본사정원수', '본사휴가자수', '본사출장자수', '현본사소속재택근무자수'])

    # Single .loc writes instead of chained `df[col][row] = value`, which is
    # not guaranteed to modify `df`. List indexers raise KeyError for a
    # missing row instead of silently appending one.
    df['공휴일전후'] = 0
    for flag, rows in _HOLIDAY_ADJACENT_ROWS.items():
        df.loc[rows, '공휴일전후'] = flag
    df.loc[[TRAIN_LENGTH + 10], '공휴일전후'] = 1
    # The original also flagged row TRAIN_LENGTH + 20 and, after one-hot
    # encoding, reset that row to class 0 again; the net effect is "not
    # flagged", so the row is simply left at 0 here.

    df = pd.get_dummies(df, columns=['공휴일전후'])

    if config.temp:
        temp = pd.read_csv('./data/temp.csv', encoding='utf-8')
        df = pd.merge(df, temp, on='일자')

    if config.sep_date:
        # Dates are sliced as text: first 4 chars = year, chars -5..-3 = month,
        # last 2 chars = day.
        df['year'] = df['일자'].apply(lambda x: int(x[:4]))
        df['month'] = df['일자'].apply(lambda x: int(x[-5:-3]))
        df['day'] = df['일자'].apply(lambda x: int(x[-2:]))
        df['week'] = df['day'] // 7

        df = df.drop(columns=['일자'])
    else:
        df['일자'] = pd.to_datetime(df['일자'])
        df = df.rename(columns={'일자': 'ds'})

    for col in _MENU_COLUMNS:
        df[col] = _clean_menu_text(df[col])

    # Normalize: scalers are fitted on the training rows only.
    scaling_cols = ['본사시간외근무명령서승인건수', '출근', '휴가비율', '야근비율', '재택비율', '출장비율']
    for col in scaling_cols:
        ms = MinMaxScaler()
        ms.fit(df[col].iloc[:TRAIN_LENGTH].values.reshape(-1, 1))
        df[col] = ms.transform(df[col].values.reshape(-1, 1)).ravel()

    le = LabelEncoder()
    le.fit(df['요일'].values[:TRAIN_LENGTH])
    df['요일'] = le.transform(df['요일'].values)

    # Fixed seed so the 1000 / 205 train-validation split is reproducible.
    np.random.seed(0)
    idx = np.random.permutation(TRAIN_LENGTH)
    train_idx = idx[:1000]
    valid_idx = idx[1000:]

    # drop() returns new frames, so nothing is modified through an iloc slice.
    train_df = df.iloc[train_idx, :].drop(columns=_TARGET_COLUMNS)
    valid_df = df.iloc[valid_idx, :].drop(columns=_TARGET_COLUMNS)
    test_df = df.iloc[TRAIN_LENGTH:, :].drop(columns=_TARGET_COLUMNS)
    train_y = y.iloc[train_idx, :]
    valid_y = y.iloc[valid_idx, :]

    if config.text == 'embedding':
        train_tmp, valid_tmp, test_tmp = embedding(config, train_df, valid_df, test_df)
        train_df = _attach_menu_features(train_df, train_tmp)
        valid_df = _attach_menu_features(valid_df, valid_tmp)
        test_df = _attach_menu_features(test_df, test_tmp)

    # if config.text == 'subword':
    #     tmp = subword(config, train_df, valid_df, test_df)

    print('|TRAIN| : {} |VALID| : {} |TEST| : {}'.format(train_df.shape, valid_df.shape, test_df.shape))
    print(train_y.shape, valid_y.shape)
    return train_df, valid_df, test_df, train_y, valid_y, sample


def autoencoding(train_df, test_df, TARGET=1, verbose=2):
    """Replace the three menu text columns by auto-encoder codes.

    Each menu column is tokenised with Mecab, turned into a bag-of-words
    matrix (vocabulary fitted on the training rows only) and compressed by its
    own ``AutoEncoder``, trained on a 90/10 split of the training rows with
    early stopping.

    Args:
        train_df: training frame containing '조식메뉴', '중식메뉴', '석식메뉴'.
        test_df: test frame with the same columns.
        TARGET: lower bound of the code width per menu column. The layer
            widths shrink in five equal integer steps, so the actual width is
            ``TARGET + (vocabulary_size - TARGET) % 5``.
        verbose: forwarded to ``model.fit``.

    Returns:
        ``(train_df_enc, test_df_enc)``: the input frames without the menu
        columns, followed by the code columns named ``0..k-1``. Both frames get
        a fresh 0-based index. The inputs are not modified.
    """
    menu_cols = ['조식메뉴', '중식메뉴', '석식메뉴']
    # Derived from the data instead of the previous hard-coded 1205, so the
    # train/test boundary is right for any training frame size.
    n_train = len(train_df)

    # Unique positional labels: the frames may both start their index at 0.
    df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

    mecab = Mecab(dicpath='C:/mecab/mecab-ko-dic')

    def tokenize(menu):
        # Dishes are joined with commas first, then split into morphemes.
        return ' '.join(mecab.morphs(','.join(menu.split())))

    bags = []
    for col in menu_cols:
        docs = df[col].map(tokenize).values
        vect = CountVectorizer()
        vect.fit(docs[:n_train])
        bags.append(vect.transform(docs).toarray())

    encoded = []
    for i, menu in enumerate(bags, start=1):
        print('+' * 10, 'Train {}'.format(i), '+' * 10)
        train_X, valid_X = train_test_split(menu[:n_train],
                                            test_size=0.1,
                                            shuffle=True,
                                            random_state=0)
        INPUT = menu.shape[1]
        STEP = (INPUT - TARGET) // 5

        model = AutoEncoder(INPUT, STEP)
        # metrics / callbacks are passed as lists, the documented form.
        model.compile(loss='mse', optimizer='adam', metrics=['mse'])
        model.fit(train_X, train_X,
                  validation_data=(valid_X, valid_X),
                  epochs=1000,
                  callbacks=[tf.keras.callbacks.EarlyStopping(patience=10)],
                  verbose=verbose)
        encoded.append(pd.DataFrame(np.array(model.encoder(menu))))
        print()

    enc_df = pd.concat(encoded, axis=1)
    enc_df.columns = list(range(enc_df.shape[1]))

    # Work on copies and align purely by position; concatenating on the
    # original indexes would pair test rows (labels 0..) with code rows
    # labelled n_train.. and produce NaN-padded output.
    train_base = train_df.drop(columns=menu_cols).reset_index(drop=True)
    test_base = test_df.drop(columns=menu_cols).reset_index(drop=True)

    train_df_enc = pd.concat(
        [train_base, enc_df.iloc[:n_train].reset_index(drop=True)], axis=1)
    test_df_enc = pd.concat(
        [test_base, enc_df.iloc[n_train:].reset_index(drop=True)], axis=1)

    return train_df_enc, test_df_enc


def get_data(config):
    """Return the processed data splits for ``config``.

    Runs ``process(config)`` and, only when ``config.text`` equals
    ``'autoencode'``, passes the train and test frames through
    ``autoencoding`` with ``TARGET=config.dim``. The validation frame, both
    target frames and the sample submission are returned as produced by
    ``process``.
    """
    train_df, valid_df, test_df, train_y, valid_y, sample = process(config)

    if config.text != 'autoencode':
        return train_df, valid_df, test_df, train_y, valid_y, sample

    train_df, test_df = autoencoding(train_df, test_df, TARGET=config.dim)
    return train_df, valid_df, test_df, train_y, valid_y, sample

In [36]:
import re
from argparse import Namespace

import numpy as np
import pandas as pd

import fasttext
from konlpy.tag import Mecab

import tensorflow as tf
from tensorflow.keras.layers import *


TRAIN_LENGTH = 1205


def pretext(text):
    """Strip trailing whitespace from `text`, then remove every '▁' character."""
    trimmed = text.rstrip()
    return trimmed.replace('▁', '')

def get_fasttext_model(config, train_df, save=False):
    """Train an unsupervised fastText model on the training menu texts.

    The breakfast, lunch and dinner menu columns of `train_df` are written,
    one entry per line, to ``./data/corpus.train.txt``. When
    ``config.dummy_corpus`` is truthy, ``main`` is called to derive
    ``./data/corpus.train.dummy.txt`` from that file and the model is trained
    on the derived file instead.
    NOTE(review): ``main`` is not defined in the visible part of this file —
    confirm where it comes from.

    Args:
        config: provides ``fasttext_model_fn``, ``dummy_corpus``, ``dim``,
            ``window_size``, ``fasttext_epoch``, ``min_count``, ``min_ngram``
            and ``max_ngram``.
        train_df: frame holding the three menu text columns.
        save: when true, the model is saved under
            ``./data/<config.fasttext_model_fn>``.

    Returns:
        The trained fastText model.
    """
    print('"{}" DOES NOT EXIST. START TRAINING MODEL'.format(config.fasttext_model_fn))

    corpus_fn = './data/corpus.train.txt'
    dummy_fn = './data/corpus.train.dummy.txt'

    menu_lines = np.concatenate(
        [train_df[name].values for name in ('조식메뉴', '중식메뉴', '석식메뉴')],
        axis=0,
    )
    with open(corpus_fn, 'w', -1, encoding='utf-8') as corpus_file:
        corpus_file.write('\n'.join(menu_lines))

    if config.dummy_corpus:
        main(Namespace(load_fn=corpus_fn, save_fn=dummy_fn, iter=2, verbose=300))
        training_fn = dummy_fn
    else:
        training_fn = corpus_fn

    hyperparams = dict(
        dim=config.dim,
        ws=config.window_size,
        epoch=config.fasttext_epoch,
        min_count=config.min_count,
        minn=config.min_ngram,
        maxn=config.max_ngram,
    )
    model = fasttext.train_unsupervised(training_fn, **hyperparams)

    if save:
        model.save_model('./data/{}'.format(config.fasttext_model_fn))
    return model

def embedding(config, train_df, valid_df, test_df):
    """Turn the three menu text columns into fastText sentence-vector features.

    Model selection:
      * ``config.pretrained`` truthy: load ``cc.ko.300.bin`` and reduce it to
        ``config.dim`` dimensions.
      * otherwise, if ``config.fasttext_model_fn`` is not None: load
        ``./data/<fasttext_model_fn>``; if loading fails, train a new model on
        `train_df` and save it under that name.
      * otherwise: train a new model on `train_df` without saving it.

    Every menu string is mapped to ``get_sentence_vector(text)`` divided by
    ``(number of whitespace-separated tokens + 1)``.

    Args:
        config: provides ``pretrained``, ``fasttext_model_fn`` and ``dim``
            (plus whatever ``get_fasttext_model`` reads when training).
        train_df, valid_df, test_df: frames containing '조식메뉴', '중식메뉴'
            and '석식메뉴'.

    Returns:
        ``(train_tmp, valid_tmp, test_tmp)``: frames with the columns
        ``breakfast_i``, ``lunch_i``, ``dinner_i`` for ``i < config.dim``,
        sliced in the order train, validation, test.
    """
    if config.pretrained:
        # `import fasttext` alone does not load the `util` submodule; importing
        # the function directly also avoids rebinding `fasttext` locally.
        from fasttext.util import reduce_model

        print('USE PRETRAINED MODEL: cc.ko.300.bin')
        model = fasttext.load_model('cc.ko.300.bin')
        reduce_model(model, config.dim)
    elif config.fasttext_model_fn is not None:
        try:
            model = fasttext.load_model('./data/{}'.format(config.fasttext_model_fn))
        except Exception:
            # Best-effort fallback kept from the original, but no longer a
            # bare `except:` that would also swallow KeyboardInterrupt.
            model = get_fasttext_model(config, train_df, save=True)
    else:
        model = get_fasttext_model(config, train_df, save=False)

    n_train, n_valid = train_df.shape[0], valid_df.shape[0]

    df = pd.concat([train_df, valid_df, test_df], axis=0)
    # Row count taken from the data instead of the previous hard-coded 1255.
    n_rows = len(df)

    def sentence_vectors(menus):
        vectors = np.zeros((n_rows, config.dim))
        for row, menu in enumerate(menus):
            vectors[row] = model.get_sentence_vector(menu) / (len(menu.split()) + 1)
        return vectors

    parts = []
    for prefix, column in (('breakfast', '조식메뉴'),
                           ('lunch', '중식메뉴'),
                           ('dinner', '석식메뉴')):
        names = ['{}_{}'.format(prefix, i) for i in range(config.dim)]
        parts.append(pd.DataFrame(sentence_vectors(df[column].values), columns=names))

    tmp = pd.concat(parts, axis=1)
    train_tmp = tmp[:n_train]
    valid_tmp = tmp[n_train: n_train + n_valid]
    test_tmp = tmp[n_train + n_valid:]

    return train_tmp, valid_tmp, test_tmp

# def subword(config):
#     # corpus = '\n'.join(np.concatenate([breakfast[:TRAIN_LENGTH],
#     #                                    lunch[:TRAIN_LENGTH],
#     #                                    dinner[:TRAIN_LENGTH]], axis=0))
#     # with open('./data/corpus.train.txt', 'w', encoding='utf-8') as f:
#     #     f.write(corpus)
#     #
#     # corpus = '\n'.join(np.concatenate([breakfast[TRAIN_LENGTH:],
#     #                                    lunch[TRAIN_LENGTH:],
#     #                                    dinner[TRAIN_LENGTH:]], axis=0))
#     # with open('./data/corpus.test.txt', 'w', encoding='utf-8') as f:
#     #     f.write(corpus)
#
#     # =================== subword segment=========================
#     with open('./data/corpus.train.sub.txt', 'r', encoding='utf-8') as f :
#         data = f.readlines()
#     corpus_sub_train = list(map(pretext, data))
#     with open('./data/corpus.valid.sub.txt', 'r', encoding='utf-8') as f :
#         data = f.readlines()
#     corpus_sub_valid = list(map(pretext, data))
#     with open('./data/corpus.test.sub.txt', 'r', encoding='utf-8') as f :
#         data = f.readlines()
#     corpus_sub_test = list(map(pretext, data))
#
#     tokenizer = Tokenizer(oov_token='<oov>')
#     tokenizer.fit_on_texts(corpus_sub_train)
#
#     vocab_size = len(tokenizer.word_index)
#     print('VOCAB SIZE : {}'.format(vocab_size))
#     cnt = 0
#     for i in tokenizer.word_counts.values():
#         if i == 1:
#             cnt += 1
#     print('Freq 1 word : {}'.format(cnt))
#     tokenizer = Tokenizer(oov_token='<oov>', num_words=vocab_size \
#         if not config.sub_sparse_word \
#         else vocab_size - cnt + 2)
#
#     tokenizer.fit_on_texts(corpus_sub_train)
#
#     corpus_sub = corpus_sub_train + corpus_sub_valid + corpus_sub_test
#     corpus_seq = tokenizer.texts_to_sequences(corpus_sub)
#
#     embeds = nn.Embedding(839, config.dim)
#
#     def get_sentence_vect(seq) :
#         result = torch.tensor(torch.zeros((config.dim)))
#         for idx in seq :
#             result += embeds(torch.tensor(idx, dtype=torch.long))
#         return (result / len(seq)).detach().numpy()
#
#     tmp = pd.DataFrame(np.array(list(map(get_sentence_vect, corpus_seq))))
#     tmp = pd.concat([tmp.iloc[:1255, :],
#                      tmp.iloc[1255 :2510, :].reset_index(drop=True),
#                      tmp.iloc[2510 :, :].reset_index(drop=True)], axis=1)
#     columns = []
#
#     for menu in ['breakfast', 'lunch', 'dinner']:
#         for i in range(config.dim):
#             columns.append('{}_{}'.format(menu, i))
#
#     tmp.columns = columns
#
#     return tmp

class Encoder(tf.keras.models.Model):
    """Dense encoder whose layer widths shrink by `step` units per layer.

    Four ReLU layers of width ``input_size - step * k`` (k = 1..4) are
    followed by a linear layer of width ``input_size - step * 5``.
    """

    def __init__(self, step, input_size):
        super().__init__()
        hidden_layers = [
            Dense(input_size - step * k, activation='relu') for k in range(1, 5)
        ]
        code_layer = Dense(input_size - step * 5)
        self.model = tf.keras.models.Sequential(
            [InputLayer(input_shape=(input_size,)), *hidden_layers, code_layer]
        )

    def call(self, x):
        """Return the code produced by the layer stack for `x`."""
        return self.model(x)

class Decoder(tf.keras.models.Model):
    """Dense decoder expanding a code of width `input_size` to `output_size`.

    Two ReLU layers of width ``output_size - step * 4`` and
    ``output_size - step * 2`` are followed by a linear layer of width
    ``output_size``. The intermediate widths ``step * 3`` and ``step * 1``
    are deliberately not used (they were disabled in the original).
    """

    def __init__(self, step, input_size, output_size):
        super().__init__()
        hidden_layers = [
            Dense(output_size - step * k, activation='relu') for k in (4, 2)
        ]
        output_layer = Dense(output_size)
        self.model = tf.keras.models.Sequential(
            [InputLayer(input_shape=(input_size,)), *hidden_layers, output_layer]
        )

    def call(self, x):
        """Return the reconstruction produced by the layer stack for `x`."""
        return self.model(x)

class AutoEncoder(tf.keras.models.Model):
    """Encoder/decoder pair reconstructing inputs of width `input_size`.

    The code between both halves has width ``input_size - step * 5``. The
    halves are exposed as ``self.encoder`` and ``self.decoder``.
    """

    def __init__(self, input_size, step):
        super().__init__()
        code_size = input_size - step * 5
        self.encoder = Encoder(step, input_size)
        self.decoder = Decoder(step, code_size, input_size)

    def call(self, x):
        """Encode `x` and return its reconstruction."""
        return self.decoder(self.encoder(x))

In [52]:
# Build the train / validation / test feature frames, both target frames and
# the sample submission from `config` (the shapes are printed by process()).
# NOTE(review): `config` is assigned in the cell numbered In [56] further down,
# so this cell presumably ran after a config already existed — confirm.
train_df, valid_df, test_df, train_y, valid_y, sample = get_data(config)

|TRAIN| : (1000, 25) |VALID| : (205, 25) |TEST| : (50, 25)
(1000, 2) (205, 2)




In [56]:
from argparse import Namespace
# Run configuration: temperature merge on, date split into parts, menus
# embedded with a self-trained 3-dimensional fastText model.
# NOTE(review): if './data/basemodel.bin' cannot be loaded, embedding() falls
# back to get_fasttext_model(), which also reads dummy_corpus, window_size,
# fasttext_epoch, min_count, min_ngram and max_ngram — none are set here.
config = Namespace(
    temp=True,
    sep_date=True,
    text='embedding',
    pretrained=False,
    fasttext_model_fn='basemodel.bin',
    dim=3,
)
train_df, valid_df, test_df, train_y, valid_y, sample = get_data(config)

# Put both targets back next to their features, aligned by row position.
train_df, valid_df = (
    pd.concat([features.reset_index(drop=True), targets.reset_index(drop=True)], axis=1)
    for features, targets in ((train_df, train_y), (valid_df, valid_y))
)

|TRAIN| : (1000, 25) |VALID| : (205, 25) |TEST| : (50, 25)
(1000, 2) (205, 2)




In [None]:
from pycaret.regression import *

FOLD = 10
cat_columns = ['요일', 'month', 'week']

# The dinner model selected its categorical features by column position.
# setup() documents `categorical_features` as a list of column names, so the
# positions are resolved to names against the frame that is passed in.
# NOTE(review): with the columns built by process() for this config these
# presumably are the weekday, the three holiday dummies and year / month /
# day / week — confirm against train_df.columns.
dinner_cat_positions = [0, 7, 8, 9, 12, 13, 14, 15]
dinner_feature_names = train_df.drop(columns=['중식계']).columns
dinner_cat_columns = [dinner_feature_names[i] for i in dinner_cat_positions]

# One identical pipeline per target: (target, other target to hide, categoricals).
# A plain loop (not a function) keeps reg / best5 / final_model / pred available
# as notebook globals afterwards, exactly as before.
for target, other_target, categorical in (
        ('중식계', '석식계', cat_columns),
        ('석식계', '중식계', dinner_cat_columns)):
    reg = setup(data=train_df.drop(columns=[other_target]), target=target,
                test_data=valid_df.drop(columns=[other_target]),
                categorical_features=categorical, fold=FOLD)

    print('=' * 10, 'Comparing models...', '=' * 10)
    best5 = compare_models(fold=FOLD, sort='mae', n_select=5)

    print('=' * 10, 'Tuning models', '=' * 10)
    tuned_best5 = [tune_model(i, optimize='mae',
                              early_stopping=True,
                              early_stopping_max_iters=100,
                              choose_better=True) for i in best5]
    print('=' * 10, 'Blending...', '=' * 10)
    blend_best5 = blend_models(estimator_list=tuned_best5, fold=FOLD, optimize='mae',
                               choose_better=True)

    print('=' * 10, 'Finalizing...', '=' * 10)
    # First call scores the blend on the hold-out data given to setup().
    pred = predict_model(blend_best5)
    final_model = finalize_model(blend_best5)
    pred = predict_model(final_model, data=test_df)
    print('=' * 10, 'Completed!', '=' * 10)
    sample[target] = pred.Label.values

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,67.6833,8117.0848,89.9612,0.8078,0.1122,0.0827,2.957
lr,Linear Regression,71.3398,8881.0406,94.017,0.7905,0.1147,0.0857,0.719
br,Bayesian Ridge,71.6421,8992.5124,94.5545,0.7888,0.1156,0.0862,0.013
gbr,Gradient Boosting Regressor,72.8986,9621.2803,97.504,0.7749,0.1196,0.0881,0.136
ridge,Ridge Regression,73.1297,9415.956,96.6698,0.7797,0.1189,0.0883,0.011
et,Extra Trees Regressor,73.4256,10270.2269,100.8695,0.759,0.125,0.0892,0.237
lightgbm,Light Gradient Boosting Machine,73.641,9697.7581,98.2696,0.7725,0.1207,0.0891,0.36
lasso,Lasso Regression,74.6752,9815.214,98.6234,0.7714,0.1215,0.0908,0.011
xgboost,Extreme Gradient Boosting,77.1152,10535.8192,102.3917,0.7489,0.1269,0.0932,0.441
huber,Huber Regressor,77.9077,10949.0148,103.9741,0.7449,0.1293,0.0937,0.025




IntProgress(value=0, description='Processing: ', max=7)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE


Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.9min


In [63]:
# Write the sample-submission frame, whose '중식계' and '석식계' columns were
# filled with the predictions above, to disk without the row index.
sample.to_csv('submission_pycaret.csv', index=False)

# submission_pycaret.csv
- 5dim fasttext / n_select 2
- 10dim fasttext / n_select 3 / normalize / multicollinearity ths 0.9