In [1]:
import random

import numpy as np
import pandas as pd

from konlpy.tag import Mecab

import fasttext
from catboost import CatBoostRegressor, Pool

import tensorflow as tf
from tensorflow.keras.layers import *

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [2]:
def dummy_data(corpus, iter=10, seed=0):
    """Augment a corpus with word-order-shuffled copies of its sentences.

    On every pass each sentence currently in the corpus gets one extra copy
    whose whitespace-separated tokens are randomly permuted, so the corpus
    doubles per pass before de-duplication.

    Parameters
    ----------
    corpus : sequence of str
        Sentences to augment (e.g. a 1-D numpy array of menu strings).
    iter : int
        Number of doubling passes.  (The name shadows the builtin but is kept
        because callers pass it by keyword.)
    seed : int
        Seed for the permutation generator.

    Returns
    -------
    numpy.ndarray
        1-D array of the unique sentences, in first-seen order.
    """
    # A private generator makes `seed` effective: seeding the stdlib `random`
    # module has no influence on numpy's permutation routine.
    rng = np.random.RandomState(seed)

    sentences = list(corpus)
    for _ in range(iter):
        shuffled = []
        for sentence in sentences:
            tokens = sentence.split()
            order = rng.permutation(len(tokens))
            shuffled.append(' '.join(tokens[j] for j in order))
        # Extend once per pass instead of re-allocating the array per sentence.
        sentences.extend(shuffled)

    # dict.fromkeys de-duplicates while keeping a deterministic order
    # (a plain set() would order the strings differently between processes).
    return np.array(list(dict.fromkeys(sentences)))

In [3]:
def _clean_menu_column(menu):
    """Normalise one menu column (a pandas Series of strings).

    Slashes and asterisks become spaces, parenthesised annotations matching
    the pattern below are removed, runs of spaces are collapsed and the
    result is stripped.  ``regex`` is passed explicitly because the default
    of ``Series.str.replace`` is not the same across pandas versions.
    """
    menu = menu.str.replace('/', ' ', regex=False)
    menu = menu.str.replace(r'[(]{1}[ㄱ-힣:,.A-Za-z]*[)]{1}', '', regex=True)
    # '{2,}' (no blank inside the braces) is the valid "two or more" quantifier.
    menu = menu.str.replace(r'[ ]{2,}', ' ', regex=True)
    menu = menu.str.replace('*', ' ', regex=False)
    return menu.str.strip()


def _mean_sentence_vectors(model, sentences, dim):
    """Return an (len(sentences), dim) array of scaled fastText sentence vectors.

    Each vector from ``model.get_sentence_vector`` is divided by the number of
    whitespace tokens in the sentence; the divisor is clamped to 1 so an empty
    sentence does not divide by zero.
    """
    vectors = np.zeros((len(sentences), dim))
    for row, sentence in enumerate(sentences):
        n_tokens = max(len(sentence.split()), 1)
        vectors[row] = model.get_sentence_vector(sentence) / n_tokens
    return vectors


def get_data(text='embedding',
             DIM=2,
             epoch=1000,
             use_pca=0,
             use_tok=False,
             sum_reduction=False,
             sep_date=True,
             use_corpus=True,
             min_count=5,
             window_size=3,
             min_ngram=2,
             max_ngram=4,
             dummy_corpus=True,
             corpus_iter=10,
             ):
    """Load the CSV files and build the train / test feature frames.

    Reads 'train.csv', 'test.csv', 'sample_submission.csv' and 'temp.csv' from
    the working directory, cleans the three menu text columns, turns them into
    numeric features according to ``text``, min-max scales the head-count
    columns (fitted on the training rows only) and label-encodes '요일'.

    Parameters
    ----------
    text : str
        'embedding' -> fastText sentence vectors per meal;
        'tokenize'  -> padded Keras token-id sequences per meal;
        any other value (e.g. 'raw') leaves the cleaned menu strings in place.
    DIM : int
        fastText vector dimension.
    epoch, min_count, window_size, min_ngram, max_ngram
        Forwarded to ``fasttext.train_unsupervised`` as epoch / min_count /
        ws / minn / maxn.
    use_pca : int
        With text='tokenize', a value > 0 reduces the padded sequences to that
        many PCA components (fitted on the training rows).
    use_tok : bool
        With use_corpus=False, train fastText on './data/<meal>.tok.txt'
        instead of './data/<meal>.txt'.
        NOTE(review): the '.tok.txt' files are not written by this function;
        presumably they come from an external tokenisation step - confirm.
    sum_reduction : bool
        With use_corpus=False, collapse each sentence vector to the min-max
        scaled sum of its components (one column per meal).  Not consulted
        when use_corpus=True.
    sep_date : bool
        True replaces '일자' by 'month' and 'week' (day-of-month // 7);
        False converts '일자' to datetime.
    use_corpus : bool
        True trains a single fastText model on all three meals (train and
        test rows); False trains one model per meal on the training rows.
    dummy_corpus, corpus_iter
        With use_corpus=True, augment the corpus through
        ``dummy_data(corpus, iter=corpus_iter)`` before training.

    Returns
    -------
    (train_df, test_df, y, sample)
        Feature frames for the training and test rows, the two target columns
        ('중식계', '석식계') of the training file, and the sample submission.
    """
    menu_cols = ['조식메뉴', '중식메뉴', '석식메뉴']

    train_df = pd.read_csv('train.csv', encoding='utf-8')
    test_df = pd.read_csv('test.csv', encoding='utf-8')
    y = train_df[['중식계', '석식계']]
    sample = pd.read_csv('sample_submission.csv', encoding='utf-8')

    # crawling
    temp = pd.read_csv('temp.csv', encoding='utf-8')

    # Derived from the data instead of the previous hard-coded 1205.
    TRAIN_LENGTH = len(train_df)

    train_df = train_df.drop(columns=['중식계', '석식계'])

    df = pd.concat([train_df, test_df], axis=0)
    df = pd.merge(df, temp, on='일자')
    # Row count after the merge (replaces the previous hard-coded 1255).
    n_rows = len(df)

    if sep_date:
        # '일자' is sliced as a '...MM-DD' string.
        df['month'] = df['일자'].apply(lambda x: int(x[-5:-3]))
        df['day'] = df['일자'].apply(lambda x: int(x[-2:]))
        df['week'] = df['day'].apply(lambda x: x // 7)
        df = df.drop(columns=['일자', 'day'])
    else:
        df['일자'] = pd.to_datetime(df['일자'])

    for col in menu_cols:
        df[col] = _clean_menu_column(df[col])

    if text == 'embedding':
        meals = [('breakfast', df['조식메뉴'].values),
                 ('launch', df['중식메뉴'].values),
                 ('dinner', df['석식메뉴'].values)]
        ft_kwargs = dict(dim=DIM,
                         ws=window_size,
                         epoch=epoch,
                         min_count=min_count,
                         minn=min_ngram,
                         maxn=max_ngram)

        if use_corpus:
            corpus = np.concatenate([menu for _, menu in meals], axis=0)
            if dummy_corpus:
                corpus = dummy_data(corpus, iter=corpus_iter)
            with open('./data/corpus.txt', 'w', -1, encoding='utf-8') as f:
                f.write('\n'.join(corpus))
            shared_model = fasttext.train_unsupervised('./data/corpus.txt', **ft_kwargs)
            models = [shared_model] * len(meals)
            reduce_to_sum = False  # the shared-corpus path never used sum_reduction
        else:
            models = []
            for name, menu in meals:
                with open('./data/{}.txt'.format(name), 'w', -1, encoding='utf-8') as f:
                    f.write('\n'.join(menu[:TRAIN_LENGTH]))
                suffix = '.tok.txt' if use_tok else '.txt'
                models.append(fasttext.train_unsupervised('./data/{}{}'.format(name, suffix),
                                                          **ft_kwargs))
            reduce_to_sum = sum_reduction

        if reduce_to_sum:
            scaled = {}
            for (name, menu), model in zip(meals, models):
                totals = np.zeros((n_rows, 1))
                for row, sentence in enumerate(menu):
                    totals[row] = sum(model.get_sentence_vector(sentence))
                scaler = MinMaxScaler()
                scaler.fit(totals[:TRAIN_LENGTH])
                scaled[name] = scaler.transform(totals).ravel()
            tmp = pd.DataFrame(scaled)
        else:
            tmp = pd.concat([
                pd.DataFrame(_mean_sentence_vectors(model, menu, DIM),
                             columns=['{}_{}'.format(name, i) for i in range(DIM)])
                for (name, menu), model in zip(meals, models)], axis=1)

    if text == 'tokenize':
        from tensorflow.keras.preprocessing.text import Tokenizer
        from tensorflow.keras.preprocessing.sequence import pad_sequences
        from sklearn.decomposition import PCA

        for col in menu_cols:
            tokenizer = Tokenizer(oov_token='<OOV>')
            tokenizer.fit_on_texts(df[col].iloc[:TRAIN_LENGTH])
            pad = pad_sequences(tokenizer.texts_to_sequences(df[col]))

            if use_pca > 0:
                pca = PCA(n_components=use_pca)
                pca.fit(pad[:TRAIN_LENGTH])
                pad = pca.transform(pad)

            pad = pd.DataFrame(pad, columns=['{}_{}'.format(col, i) for i in range(pad.shape[1])])
            df = pd.concat([df.reset_index(drop=True), pad], axis=1)
        df = df.drop(columns=menu_cols)

    # Normalize
    scaling_cols = ['본사정원수', '본사휴가자수', '본사출장자수',
                    '본사시간외근무명령서승인건수', '현본사소속재택근무자수']
    for col in scaling_cols:
        scaler = MinMaxScaler()
        # Fit on the training rows only, then transform every row.
        scaler.fit(df[col].iloc[:TRAIN_LENGTH].values.reshape(-1, 1))
        df[col] = scaler.transform(df[col].values.reshape(-1, 1)).ravel()

    if text == 'embedding':
        new_df = pd.concat([df.reset_index(drop=True), tmp.reset_index(drop=True)], axis=1)
        new_df = new_df.drop(columns=menu_cols)
    else:
        new_df = df.reset_index(drop=True)

    le = LabelEncoder()
    le.fit(new_df['요일'].values[:TRAIN_LENGTH])
    new_df['요일'] = le.transform(new_df['요일'].values)

    # Copies (not views) so later edits by callers do not hit pandas'
    # chained-assignment warnings; the row index is left untouched.
    train_df = new_df.iloc[:TRAIN_LENGTH].copy()
    test_df = new_df.iloc[TRAIN_LENGTH:].copy()

    return train_df, test_df, y, sample

In [3]:
# Load the raw training table for ad-hoc inspection.
# Note: this reads '../data/train.csv', whereas get_data() reads 'train.csv'
# from the working directory.
train_df = pd.read_csv('../data/train.csv', encoding='utf-8')

In [7]:
# Look at the raw '석식메뉴' (dinner menu) strings before any cleaning.
train_df.석식메뉴

0       쌀밥/잡곡밥 (쌀,현미흑미:국내산) 육개장  자반고등어구이  두부조림  건파래무침 ...
1       콩나물밥*양념장 (쌀,현미흑미:국내산) 어묵국  유산슬 (쇠고기:호주산) 아삭고추무...
2       쌀밥/잡곡밥 (쌀,현미흑미:국내산) 청국장찌개  황태양념구이 (황태:러시아산) 고기...
3       미니김밥*겨자장 (쌀,현미흑미:국내산) 우동  멕시칸샐러드  군고구마  무피클  포...
4       쌀밥/잡곡밥 (쌀,현미흑미:국내산) 차돌박이찌개 (쇠고기:호주산) 닭갈비 (닭고기:...
                              ...                        
1200       김치볶음밥 미니쫄우동*맛살튀김 브로콜리깨소스무침 계란후라이 고들빼기무침 겉절이김치 
1201              흑미밥 쇠고기무국 삼치양념구이 비엔나채소볶음 숙주나물당근무침 포기김치 
1202          흑미밥 수제비국 수제맛쵸킹탕수육 유부채소겨자냉채 참나물무침 갓김치/겉절이김치 
1203              흑미밥 열무된장국 장어강정*데리야끼소스 깻잎쌈*생강채 오이선 포기김치 
1204           (New)할라피뇨멸치주먹밥 잔치국수 수제고기육전 쑥갓나물 양파초절임 깍두기 
Name: 석식메뉴, Length: 1205, dtype: object

In [323]:
def autoencoding(train_df, test_df, TARGET=1, verbose=2, dicpath='C:/mecab/mecab-ko-dic'):
    """Replace the three menu text columns by autoencoder latent features.

    Each menu column is split into morphemes with Mecab, turned into a
    bag-of-words count matrix (vocabulary fitted on the training rows only)
    and compressed by a dense autoencoder trained on the training rows.  The
    encoder outputs of the three meals are appended to the remaining columns.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames that still contain '조식메뉴', '중식메뉴' and '석식메뉴'.
        They are not modified.
    TARGET : int
        Width of the latent code produced for each meal.
    verbose : int
        Forwarded to ``Model.fit``.
    dicpath : str
        Location of the Mecab dictionary; the default is the path that was
        previously hard-coded.

    Returns
    -------
    (train_df_enc, test_df_enc)
        The input frames without the menu columns, followed by the latent
        columns named 0 .. 3*TARGET-1.
    """
    menu_cols = ['조식메뉴', '중식메뉴', '석식메뉴']
    # Derived from the data instead of the previous hard-coded 1205.
    TRAIN_LENGTH = len(train_df)

    df = pd.concat([train_df, test_df], axis=0)

    mecab = Mecab(dicpath=dicpath)

    def to_morphemes(menu):
        # Join the dishes with commas, then re-split into morphemes.
        return ' '.join(mecab.morphs(','.join(menu.split())))

    # Series.apply replaces element-wise chained assignment on a slice, which
    # pandas does not guarantee to write through.
    count_matrices = []
    for col in menu_cols:
        texts = df[col].apply(to_morphemes).values
        vect = CountVectorizer()
        vect.fit(texts[:TRAIN_LENGTH])
        count_matrices.append(vect.transform(texts).toarray().astype('float32'))

    def build_autoencoder(input_size, step, latent_size):
        """Return (encoder, autoencoder) sharing the same encoder weights."""
        encoder = tf.keras.models.Sequential([
            InputLayer(input_shape=(input_size,)),
            Dense(input_size - step * 1, activation='relu'),
            Dense(input_size - step * 2, activation='relu'),
            Dense(input_size - step * 3, activation='relu'),
            Dense(input_size - step * 4, activation='relu'),
            # Explicit latent width: input_size - step * 5 only equals the
            # requested size when (input_size - latent_size) is a multiple of 5.
            Dense(latent_size),
        ])
        decoder = tf.keras.models.Sequential([
            InputLayer(input_shape=(latent_size,)),
            Dense(input_size - step * 4, activation='relu'),
            Dense(input_size - step * 2, activation='relu'),
            Dense(input_size),
        ])
        return encoder, tf.keras.models.Sequential([encoder, decoder])

    encoded = []
    for number, menu in enumerate(count_matrices, start=1):
        print('+' * 10, 'Train {}'.format(number), '+' * 10)
        train_X, valid_X = train_test_split(menu[:TRAIN_LENGTH],
                                            test_size=0.1,
                                            shuffle=True,
                                            random_state=0)

        INPUT = menu.shape[1]
        # Never let the layers grow when the vocabulary is smaller than TARGET.
        STEP = max((INPUT - TARGET) // 5, 0)

        encoder, model = build_autoencoder(INPUT, STEP, TARGET)
        model.compile(loss='mse', optimizer='adam', metrics=['mse'])
        model.fit(train_X, train_X,
                  validation_data=(valid_X, valid_X),
                  epochs=1000,
                  callbacks=[tf.keras.callbacks.EarlyStopping(patience=10)],
                  verbose=verbose)
        encoded.append(np.array(encoder(menu)))
        print()

    # Built once with the row index of the stacked input, so the column-wise
    # concat below aligns whatever index the caller's frames carry.
    enc_df = pd.DataFrame(np.concatenate(encoded, axis=1), index=df.index)

    train_base = train_df.drop(columns=menu_cols)
    test_base = test_df.drop(columns=menu_cols)

    train_df_enc = pd.concat([train_base, enc_df.iloc[:TRAIN_LENGTH]], axis=1)
    test_df_enc = pd.concat([test_base, enc_df.iloc[TRAIN_LENGTH:]], axis=1)

    # Keep every non-menu column name (previously assumed to be exactly 8).
    column_names = list(train_base.columns) + list(range(enc_df.shape[1]))
    train_df_enc.columns = column_names
    test_df_enc.columns = column_names

    return train_df_enc, test_df_enc

In [324]:
def processing(text='embedding', 
               DIM=3, 
               sep_date=False, 
               verbose=2,
               min_count=5,
               window_size=3,
               min_ngram=2,
               max_ngram=4,
               sum_reduction=False,
               use_tok=False,
               dummy_corpus=True,
               corpus_iter=10,
              ): 
    """Build the train / test features with the requested text representation.

    Parameters
    ----------
    text : str
        'autoencode' loads the data with the menu text left in place
        (``get_data(text='raw')``) and compresses it with ``autoencoding``;
        any other value is passed straight to ``get_data``.
    DIM : int
        Latent width for 'autoencode' (passed as ``TARGET``), otherwise the
        ``DIM`` argument of ``get_data``.
    sep_date : bool
        Forwarded to ``get_data``.
    verbose : int
        Training verbosity of the autoencoder; only used for 'autoencode'.
    min_count, window_size, min_ngram, max_ngram, sum_reduction, use_tok,
    dummy_corpus, corpus_iter
        Forwarded to ``get_data``; not used for 'autoencode'.

    Returns
    -------
    (train_df, test_df, y, sample) as returned by ``get_data``, with the two
    frames replaced by their encoded versions for 'autoencode'.
    """
    if text == 'autoencode':
        train_df, test_df, y, sample = get_data(text='raw',
                                                sep_date=sep_date,
                                               )
        # `verbose` used to be accepted but silently dropped here.
        train_df_enc, test_df_enc = autoencoding(train_df, test_df,
                                                 TARGET=DIM,
                                                 verbose=verbose)

        return train_df_enc, test_df_enc, y, sample

    train_df, test_df, y, sample = get_data(text=text,
                                            DIM=DIM,
                                            sep_date=sep_date,
                                            use_tok=use_tok,
                                            min_count=min_count,
                                            window_size=window_size,
                                            min_ngram=min_ngram,
                                            max_ngram=max_ngram,
                                            sum_reduction=sum_reduction,
                                            dummy_corpus=dummy_corpus,
                                            corpus_iter=corpus_iter
                                           )

    return train_df, test_df, y, sample

In [329]:
def _fit_catboost_mae(train_X, train_y, valid_X, valid_y, cat_features, depth, verbose):
    """Fit one MAE CatBoost regressor with early stopping; return (model, validation MAE)."""
    train_pool = Pool(train_X, train_y, cat_features=cat_features)
    valid_pool = Pool(valid_X, valid_y, cat_features=cat_features)

    reg = CatBoostRegressor(loss_function='MAE',
                            has_time=True,
                            eval_metric='MAE',
                            iterations=3000,
                            depth=depth,
                            rsm=0.9,
                            random_seed=0,
                            boost_from_average=True,
                            reg_lambda=20.0)
    reg.fit(train_pool,
            eval_set=valid_pool,
            use_best_model=True,
            early_stopping_rounds=100,
            verbose=verbose)

    mae = mean_absolute_error(valid_y, reg.predict(valid_X))
    return reg, mae


def train_catboost(train_df, test_df, depth=2, lr=0.3, verbose=100, k=0, test_size=0.2, seed=0):
    """Train CatBoost regressors for the lunch ('중식계') and dinner ('석식계') targets.

    The targets are read from the notebook-level variable ``y``.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training features.  When column 8 is named 'month', columns 0, 8 and 9
        are passed to CatBoost as categorical features; otherwise none are.
    test_df
        Accepted for call compatibility; not used.
    depth : int
        Tree depth.
    lr : float
        Accepted for call compatibility; not used.
        NOTE(review): it is not forwarded to CatBoostRegressor - confirm
        whether that is intended before wiring it in, as doing so changes the
        fitted models.
    verbose : int
        Forwarded to ``CatBoostRegressor.fit``.
    k : int
        > 0 runs k-fold cross-validation; otherwise a single hold-out split.
    test_size, seed
        Hold-out fraction and ``random_state`` of the single split (k <= 0).

    Returns
    -------
    k > 0 : (models_launch, models_dinner), dicts mapping fold number to model.
    otherwise : (reg_launch, reg_dinner), the two fitted regressors.
    """
    # Guard the positional look-up so frames with fewer than nine columns fall
    # through to "no categorical features" instead of raising IndexError.
    if len(train_df.columns) > 8 and train_df.columns[8] == 'month':
        cat_features = [0, 8, 9]
    else:
        cat_features = []

    print('Categorical Features :', cat_features)

    if k > 0:
        skf = KFold(n_splits=k, shuffle=True, random_state=42)
        folds = list(skf.split(train_df, y['중식계']))

        # Kept from the original; the fold split and CatBoost carry their own seeds.
        random.seed(42)
        models_launch = {}
        models_dinner = {}
        scores_launch = []
        scores_dinner = []
        for fold, (train_idx, valid_idx) in enumerate(folds):
            print(f'===================================={fold + 1}============================================')
            train_X, valid_X = train_df.iloc[train_idx], train_df.iloc[valid_idx]

            # The fold indices are positions, so the targets are taken with .iloc.
            models_launch[fold], mae_launch = _fit_catboost_mae(
                train_X, y['중식계'].iloc[train_idx],
                valid_X, y['중식계'].iloc[valid_idx],
                cat_features, depth, verbose)
            scores_launch.append(mae_launch)

            models_dinner[fold], mae_dinner = _fit_catboost_mae(
                train_X, y['석식계'].iloc[train_idx],
                valid_X, y['석식계'].iloc[valid_idx],
                cat_features, depth, verbose)
            scores_dinner.append(mae_dinner)
            print(f'================================================================================\n\n')

        print('-' * 50, 'Result', '-' * 50, '\n')
        print('Mean launch : {}'.format(np.mean(scores_launch, axis=0)))
        print('Mean dinner : {}'.format(np.mean(scores_dinner, axis=0)))
        print('Total : {}'.format((np.mean(scores_launch, axis=0) + np.mean(scores_dinner, axis=0)) / 2))
        print('_' * 106)

        return models_launch, models_dinner

    # Single hold-out split: the same random_state is used for both targets.
    train_X, valid_X, train_y, valid_y = train_test_split(train_df,
                                                          y['중식계'],
                                                          test_size=test_size,
                                                          random_state=seed)
    reg_launch, mae_launch = _fit_catboost_mae(train_X, train_y, valid_X, valid_y,
                                               cat_features, depth, verbose)

    train_X, valid_X, train_y, valid_y = train_test_split(train_df,
                                                          y['석식계'],
                                                          test_size=test_size,
                                                          random_state=seed)
    reg_dinner, mae_dinner = _fit_catboost_mae(train_X, train_y, valid_X, valid_y,
                                               cat_features, depth, verbose)

    print('MAE_LAUNCH :', mae_launch)
    print('MAE_DINNER :', mae_dinner)
    print('TOTAL SCORE :', (mae_launch + mae_dinner) / 2)

    return reg_launch, reg_dinner

In [326]:
def _average_predictions(models, features):
    """Predict with one model or average over a dict / list / tuple of models."""
    if isinstance(models, dict):
        models = list(models.values())
    elif not isinstance(models, (list, tuple)):
        models = [models]
    if not models:
        raise ValueError('no fitted models were supplied')
    return np.mean([model.predict(features) for model in models], axis=0)


def save_submission(reg_launch, reg_dinner, sample, file_fn='new', test_df=None):
    """Predict both targets for the test rows and write the submission CSV.

    Parameters
    ----------
    reg_launch, reg_dinner
        Either a single fitted model or a collection of fitted models (the
        fold -> model dicts returned by ``train_catboost`` with k > 0, or a
        list / tuple).  Collections are averaged over all their members.
    sample : pandas.DataFrame
        Submission template; its '중식계' and '석식계' columns are overwritten
        in place.
    file_fn : str
        Suffix of the output file 'submission_<file_fn>.csv'.
    test_df : pandas.DataFrame, optional
        Test features.  When omitted, the notebook-level ``test_df`` variable
        is used, as before.

    Raises
    ------
    NameError
        If ``test_df`` is omitted and no notebook-level ``test_df`` exists.
    ValueError
        If an empty collection of models is passed.
    """
    if test_df is None:
        try:
            test_df = globals()['test_df']
        except KeyError:
            raise NameError("name 'test_df' is not defined") from None

    # The former check `type(reg_launch) == 'list'` compared a type with a
    # string and was never true, so ensembles were never averaged.
    sample['중식계'] = _average_predictions(reg_launch, test_df)
    sample['석식계'] = _average_predictions(reg_dinner, test_df)

    sample.to_csv('submission_{}.csv'.format(file_fn), index=False)
    print(sample.tail())

In [327]:
# text options: 'embedding', 'tokenize', 'raw', 'autoencode'
# (processing() tests for the exact string 'autoencode').
# Note: `verbose` has no effect on processing() when text='embedding'.
train_df, test_df, y, sample = processing(text='embedding', 
                                          DIM=2,
                                          verbose=0,
                                          min_count=0,
                                          window_size=3,
                                          min_ngram=2,
                                          max_ngram=3,   
                                          sum_reduction=False,
                                          use_tok=False,
                                          sep_date=True,
                                          dummy_corpus=True,
                                          corpus_iter=2
                                         )
# Optional experiment: keep only the first eight columns.
# train_df = train_df.iloc[:, :8]
# test_df = test_df.iloc[:, :8]

# k=0 -> a single hold-out split (test_size=0.2) instead of k-fold CV.
reg_launch, reg_dinner = train_catboost(train_df, 
                                        test_df,
                                        depth=2, 
                                        verbose=100,
                                        k=0, 
                                        test_size=0.2,
                                        seed=2)

Categorical Features : [0, 8, 9, 10]


CatBoostError: Invalid type for cat_feature[non-default value idx=0,feature_idx=10]=-0.08910762518644333 : cat_features must be integer or string, real number values and NaN values should be converted to string.

In [330]:

# Re-run the hold-out training with the same arguments as the previous cell
# (that run stopped with the CatBoostError shown above).
reg_launch, reg_dinner = train_catboost(train_df, 
                                        test_df,
                                        depth=2, 
                                        verbose=100,
                                        k=0, 
                                        test_size=0.2,
                                        seed=2)

Categorical Features : [0, 8, 9]
0:	learn: 163.7071879	test: 168.2167625	best: 168.2167625 (0)	total: 20.4ms	remaining: 1m 1s
100:	learn: 86.1392338	test: 87.8508187	best: 87.8508187 (100)	total: 886ms	remaining: 25.4s
200:	learn: 76.0338330	test: 76.3176693	best: 76.3176693 (200)	total: 1.83s	remaining: 25.5s
300:	learn: 71.0614750	test: 72.2836064	best: 72.2836064 (300)	total: 2.9s	remaining: 26s
400:	learn: 68.3371029	test: 71.4206280	best: 71.4206280 (400)	total: 3.7s	remaining: 24s
500:	learn: 66.1450491	test: 70.8753028	best: 70.8569770 (497)	total: 4.45s	remaining: 22.2s
600:	learn: 64.4417721	test: 70.0613253	best: 70.0562338 (599)	total: 5.2s	remaining: 20.8s
700:	learn: 62.8715975	test: 69.4688371	best: 69.4497303 (695)	total: 5.92s	remaining: 19.4s
800:	learn: 61.5184769	test: 68.9559463	best: 68.9526442 (799)	total: 6.66s	remaining: 18.3s
900:	learn: 60.2518474	test: 68.4964217	best: 68.4937468 (899)	total: 7.46s	remaining: 17.4s
1000:	learn: 59.1131799	test: 68.0318036	bes

In [129]:
# Rebuild the embedding features; dummy_corpus and corpus_iter are left at
# processing()'s defaults (True and 10) in this call.
train_df, test_df, y, sample = processing(text='embedding', 
                                          DIM=2,
                                          verbose=0,
                                          min_count=0,
                                          window_size=3,
                                          min_ngram=2,
                                          max_ngram=3,   
                                          sum_reduction=False,
                                          use_tok=False,
                                          sep_date=True,
                                         )

In [130]:
# Inspect the engineered training features.
train_df

Unnamed: 0,요일,본사정원수,본사휴가자수,본사출장자수,본사시간외근무명령서승인건수,현본사소속재택근무자수,temp,rain,month,day,week,breakfast_0,breakfast_1,launch_0,launch_1,dinner_0,dinner_1
0,3,0.000000,0.022481,0.323442,0.227969,0.000000,0.183128,0.000000,2,1,0,-0.328111,0.924677,0.958274,0.211348,-0.333639,0.914947
1,4,0.000000,0.022481,0.391691,0.305556,0.000000,0.197531,0.000000,2,2,0,-0.301556,0.938399,0.951181,0.260686,-0.529107,0.735462
2,2,0.000000,0.027477,0.412463,0.106322,0.000000,0.257202,0.000000,2,3,0,-0.307250,0.935701,0.468310,0.820731,-0.292870,0.923697
3,1,0.000000,0.067444,0.531157,0.340038,0.000000,0.292181,0.000000,2,4,0,-0.268320,0.943259,0.957419,0.215146,-0.970038,0.109891
4,0,0.000000,0.212323,0.415430,0.032567,0.000000,0.261317,0.000000,2,5,0,-0.318779,0.930272,0.933504,0.321258,-0.365625,0.894952
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1200,2,0.542614,0.043297,0.465875,0.003831,0.733583,0.304527,0.000000,1,20,2,0.341600,0.835125,0.490622,0.784972,-0.913245,0.334217
1201,1,0.542614,0.057452,0.563798,0.442529,0.658537,0.399177,0.063668,1,21,3,0.323341,0.859361,0.807818,0.422196,-0.264632,0.919909
1202,0,0.542614,0.193172,0.614243,0.000958,0.568480,0.419753,0.000692,1,22,3,0.212831,0.892905,0.368154,0.852467,-0.455512,0.869566
1203,3,0.542614,0.069942,0.332344,0.590038,0.613508,0.469136,0.000000,1,25,3,0.297104,0.859851,0.764566,0.492922,-0.481827,0.833512


In [148]:
# Inspect the training features again; the embedding columns shown below
# differ slightly from the earlier display of the same frame.
train_df

Unnamed: 0,요일,본사정원수,본사휴가자수,본사출장자수,본사시간외근무명령서승인건수,현본사소속재택근무자수,temp,rain,month,day,week,breakfast_0,breakfast_1,launch_0,launch_1,dinner_0,dinner_1
0,3,0.000000,0.022481,0.323442,0.227969,0.000000,0.183128,0.000000,2,1,0,-0.384382,0.899315,0.963856,0.185830,-0.341007,0.912270
1,4,0.000000,0.022481,0.391691,0.305556,0.000000,0.197531,0.000000,2,2,0,-0.357028,0.915522,0.957702,0.236133,-0.532368,0.733278
2,2,0.000000,0.027477,0.412463,0.106322,0.000000,0.257202,0.000000,2,3,0,-0.368673,0.909323,0.486355,0.809349,-0.299320,0.921431
3,1,0.000000,0.067444,0.531157,0.340038,0.000000,0.292181,0.000000,2,4,0,-0.328957,0.921337,0.962963,0.190820,-0.970942,0.101644
4,0,0.000000,0.212323,0.415430,0.032567,0.000000,0.261317,0.000000,2,5,0,-0.377070,0.904571,0.942352,0.295522,-0.373294,0.891523
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1200,2,0.542614,0.043297,0.465875,0.003831,0.733583,0.304527,0.000000,1,20,2,0.306769,0.845391,0.508735,0.774032,-0.917556,0.323175
1201,1,0.542614,0.057452,0.563798,0.442529,0.658537,0.399177,0.063668,1,21,3,0.282533,0.871354,0.821642,0.394880,-0.271240,0.918021
1202,0,0.542614,0.193172,0.614243,0.000958,0.568480,0.419753,0.000692,1,22,3,0.171729,0.899214,0.387554,0.843478,-0.448166,0.873202
1203,3,0.542614,0.069942,0.332344,0.590038,0.613508,0.469136,0.000000,1,25,3,0.257112,0.868527,0.782051,0.463389,-0.489620,0.827982


In [65]:
# Predict the test rows with both regressors and write 'submission_new.csv'
# (file_fn defaults to 'new').
save_submission(reg_launch, reg_dinner, sample)

            일자          중식계         석식계
0   2021-01-27  1059.124527  339.564667
1   2021-01-28   975.859173  423.681958
2   2021-01-29   700.566329  231.822373
3   2021-02-01  1266.773316  465.702315
4   2021-02-02  1045.126225  408.930388
5   2021-02-03   977.272879  350.067663
6   2021-02-04   971.669293  386.524374
7   2021-02-05   705.797848  282.220481
8   2021-02-08  1205.932265  523.682720
9   2021-02-09  1096.225787  456.717027
10  2021-02-10   865.119962  265.356322
11  2021-02-15  1313.701883  624.725493
12  2021-02-16  1116.281739  530.168552
13  2021-02-17  1038.450521  334.889811
14  2021-02-18   854.868356  416.699744
15  2021-02-19   695.631073  303.494322
16  2021-02-22  1229.828118  532.592874
17  2021-02-23  1062.149589  534.693198
18  2021-02-24   896.656916  343.654939
19  2021-02-25   875.596015  400.567985
20  2021-02-26   630.739475  211.876763
21  2021-03-02  1133.777522  560.328366
22  2021-03-03  1017.979485  362.290720
23  2021-03-04   890.260112  461.063305
