In [10]:
import os
import random

import fasttext
import numpy as np
import pandas as pd

from catboost import CatBoostRegressor, Pool
from pycaret.regression import *

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [44]:
def _clean_menu_text(menu):
    """Normalise one menu column (a pandas Series of strings).

    Steps, in order: turn '/' into a space, delete parenthesised annotations
    made of Hangul / Latin letters and ``: , .``, turn '*' into a space,
    collapse runs of two or more spaces into one, and strip both ends.
    """
    # `regex=` is always passed explicitly: the pandas default changed between
    # major versions, and '*' on its own is not a valid regular expression.
    menu = menu.str.replace('/', ' ', regex=False)
    menu = menu.str.replace(r'[(]{1}[ㄱ-힣:,.A-Za-z]*[)]{1}', '', regex=True)
    menu = menu.str.replace('*', ' ', regex=False)
    # The quantifier must not contain a space: '{2, }' is matched as literal
    # text by `re`. Collapsing happens last so that the spaces introduced by
    # the replacements above are squeezed as well.
    menu = menu.str.replace(r'[ ]{2,}', ' ', regex=True)
    return menu.str.strip()


def get_data(text='embedding', DIM=30, use_pca=0, total=False):
    """Load the train/test CSVs and build model-ready feature frames.

    Reads 'train.csv', 'test.csv' and 'sample_submission.csv' from the current
    working directory, stacks train and test, derives date features, cleans the
    three menu text columns, turns them into numeric features, min-max scales
    the headcount columns and label-encodes the weekday column. Every fitted
    transformer (fastText, Tokenizer, PCA, MinMaxScaler, LabelEncoder) is fitted
    on the training rows only and then applied to all rows.

    Parameters
    ----------
    text : str
        'embedding' - train one unsupervised fastText model per meal (corpus
        files are written under './data/') and add ``DIM`` sentence-vector
        columns per meal named 'breakfast_i', 'launch_i', 'dinner_i'.
        'tokenize'  - fit a Keras ``Tokenizer`` per meal and add the padded
        token-id sequences (optionally PCA-reduced) as '<menu column>_i'.
        Any other value leaves the cleaned menu text columns in the output.
    DIM : int
        fastText vector size; only used when ``text == 'embedding'``.
    use_pca : int
        When > 0 and ``text == 'tokenize'``, number of PCA components kept for
        each meal's padded sequence matrix.
    total : bool
        When True, replace the four headcount columns by a single 'total'
        column (capacity minus vacation, business-trip and remote workers).

    Returns
    -------
    tuple
        ``(train_df, test_df, y, sample)`` - feature rows for train and test,
        the two target columns of the training set, and the sample submission.
    """
    train_df = pd.read_csv('train.csv', encoding='utf-8')
    test_df = pd.read_csv('test.csv', encoding='utf-8')
    sample = pd.read_csv('sample_submission.csv', encoding='utf-8')

    target_cols = ['중식계', '석식계']
    y = train_df[target_cols]
    train_df = train_df.drop(columns=target_cols)

    # Row counts are taken from the data instead of being hard-coded.
    train_length = len(train_df)
    df = pd.concat([train_df, test_df], axis=0).reset_index(drop=True)
    n_rows = len(df)

    # Date features: month, and day-of-month // 7 as a coarse week bucket.
    df['month'] = df['일자'].str[-5:-3].astype('int64')
    day_of_month = df['일자'].str[-2:].astype('int64')
    df['week'] = day_of_month // 7

    if total:
        df['total'] = df['본사정원수'] - (df['본사휴가자수'] + df['본사출장자수'] + df['현본사소속재택근무자수'])
        df = df.drop(columns=['일자', '본사정원수', '본사휴가자수', '본사출장자수', '현본사소속재택근무자수'])
    else:
        df = df.drop(columns=['일자'])

    menu_cols = ['조식메뉴', '중식메뉴', '석식메뉴']
    for col in menu_cols:
        df[col] = _clean_menu_text(df[col])

    if text == 'embedding':
        # The corpus files are written here; make sure the directory exists.
        os.makedirs('./data', exist_ok=True)

        embedding_frames = []
        for meal, col in zip(['breakfast', 'launch', 'dinner'], menu_cols):
            sentences = df[col].values
            corpus_path = './data/{}.txt'.format(meal)
            # Only the training rows go into the fastText corpus.
            with open(corpus_path, 'w', encoding='utf-8') as f:
                f.write('\n'.join(sentences[:train_length]))
            model = fasttext.train_unsupervised(corpus_path,
                                                dim=DIM,
                                                ws=5,
                                                epoch=100,
                                                min_count=5,
                                                minn=2,
                                                maxn=4,
                                                )

            vectors = np.zeros((n_rows, DIM))
            for i, sentence in enumerate(sentences):
                vectors[i] = model.get_sentence_vector(sentence)
            embedding_frames.append(
                pd.DataFrame(vectors, columns=['{}_{}'.format(meal, i) for i in range(DIM)]))

        tmp = pd.concat(embedding_frames, axis=1)

    if text == 'tokenize':
        # Imported lazily so the 'embedding' path does not require TensorFlow.
        from tensorflow.keras.preprocessing.text import Tokenizer
        from tensorflow.keras.preprocessing.sequence import pad_sequences
        from sklearn.decomposition import PCA

        for col in menu_cols:
            tokenizer = Tokenizer(oov_token='<OOV>')
            tokenizer.fit_on_texts(df[col].iloc[:train_length])
            pad = pad_sequences(tokenizer.texts_to_sequences(df[col]))

            if use_pca > 0:
                pca = PCA(n_components=use_pca)
                pca.fit(pad[:train_length])
                pad = pca.transform(pad)

            token_df = pd.DataFrame(
                pad, columns=['{}_{}'.format(col, i) for i in range(pad.shape[1])])
            df = pd.concat([df, token_df], axis=1)
        df = df.drop(columns=menu_cols)

    # Normalize
    if total:
        scaling_cols = ['total', '본사시간외근무명령서승인건수']
    else:
        scaling_cols = ['본사정원수', '본사휴가자수', '본사출장자수',
                        '본사시간외근무명령서승인건수', '현본사소속재택근무자수']
    for col in scaling_cols:
        ms = MinMaxScaler()
        ms.fit(df[col].iloc[:train_length].values.reshape(-1, 1))
        df[col] = ms.transform(df[col].values.reshape(-1, 1)).ravel()

    if text == 'embedding':
        new_df = pd.concat([df, tmp], axis=1).drop(columns=menu_cols)
    else:
        new_df = df

    le = LabelEncoder()
    le.fit(new_df['요일'].values[:train_length])
    new_df['요일'] = le.transform(new_df['요일'].values)

    train_df = new_df.iloc[:train_length]
    test_df = new_df.iloc[train_length:]

    return train_df, test_df, y, sample

In [45]:
# Build the feature frames from tokenised menu text.
# NOTE: DIM is only read by get_data's 'embedding' branch, so it has no
# effect while text='tokenize'.
total = False
train_df, test_df, y, sample = get_data(
    text='tokenize',
    DIM=3,
    total=total,
)

In [51]:
# Put the training features and both target columns side by side.
df = pd.concat(objs=[train_df, y], axis='columns')

In [52]:
# Lunch target ('중식계'): PyCaret setup + model comparison.
# pycaret.regression is already star-imported in the import cell at the top,
# so it is not imported again here.
print('Setup start, input split size')

# get_data(text='tokenize') names the token-id columns '<menu column>_<i>'.
# Select them by name instead of by position (train_df.columns[8:]): the number
# of leading columns changes with the `total` flag, so a fixed offset silently
# drops or mislabels columns.
menu_prefixes = ('조식메뉴_', '중식메뉴_', '석식메뉴_')
token_cols = [c for c in train_df.columns if str(c).startswith(menu_prefixes)]

reg = setup(data=df.drop(columns=['석식계']), target='중식계', train_size=0.8,
            categorical_features=token_cols or None)
print('=' * 10, 'Comparing models...', '=' * 10)
best3 = compare_models(fold=5, sort='mae', n_select=3, exclude=['dt', 'en', 'knn', 'huber', 'par', 'lar', 'ransac', 'kr'])
# The remaining pipeline steps are disabled; while they stay commented out,
# sample['중식계'] is never filled with predictions.
# print('=' * 10, 'Tuning models', '=' * 10)
# tuned_best3 = [tune_model(i, optimize='mae', 
#                           early_stopping=True, 
#                           early_stopping_max_iters=10,
#                           choose_better=True) for i in best3]
# print('=' * 10, 'Blending...', '=' * 10)
# blend_best3 = blend_models(estimator_list=tuned_best3, fold=5, optimize='mae',
#                            choose_better=True)
# print('=' * 10, 'Finalizing...', '=' * 10)
# pred = predict_model(blend_best3)
# final_model = finalize_model(blend_best3)
# pred = predict_model(final_model, data=test_df)
# print('=' * 10, 'Completed!', '=' * 10)
# sample['중식계'] = pred.Label.values

Unnamed: 0,Description,Value
0,session_id,6092
1,Target,중식계
2,Original Data,"(1205, 51)"
3,Missing Values,False
4,Numeric Features,6
5,Categorical Features,44
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(964, 2472)"




IntProgress(value=0, description='Processing: ', max=59)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,70.1798,8377.2863,91.2042,0.8104,0.1115,0.0847,5.56
lightgbm,Light Gradient Boosting Machine,72.3363,8944.5161,94.4298,0.7971,0.1155,0.087,0.43
gbr,Gradient Boosting Regressor,73.8438,9300.2731,96.1737,0.7896,0.1172,0.0892,1.012
rf,Random Forest Regressor,75.4972,9909.9903,99.3305,0.7755,0.1209,0.091,1.818
xgboost,Extreme Gradient Boosting,75.9085,9620.0613,97.8157,0.7821,0.1192,0.0908,3.87
et,Extra Trees Regressor,80.7583,11785.5235,108.2429,0.7334,0.1319,0.0977,3.072
lasso,Lasso Regression,84.8115,12043.9294,109.6267,0.7269,0.1311,0.1013,0.746
ada,AdaBoost Regressor,88.8392,12356.7746,111.038,0.7194,0.1365,0.1088,2.856
br,Bayesian Ridge,100.9891,17322.2264,131.5316,0.6063,0.1607,0.1235,5.346
llar,Lasso Least Angle Regression,101.1843,16322.4968,127.6643,0.6296,0.1517,0.1218,0.096


KeyboardInterrupt: 

In [7]:
# Dinner target ('석식계'): PyCaret setup + model comparison.
print('Setup start, input split size')

# get_data drops the 'day' column, so only request categorical handling for
# the columns that are actually present in `df`; naming a missing column in
# categorical_features makes setup() fail.
requested_categoricals = ['요일', 'month', 'day', 'week']
categorical_cols = [c for c in requested_categoricals if c in df.columns]

reg = setup(data=df.drop(columns=['중식계']), target='석식계', train_size=0.8, 
            categorical_features=categorical_cols, 
            normalize=True, 
            feature_selection=True,
            feature_interaction=True,
            remove_multicollinearity=True)
print('=' * 10, 'Comparing models...', '=' * 10)
best3 = compare_models(fold=5, sort='mae', n_select=3, exclude=['dt', 'en', 'knn', 'huber', 'par', 'lar', 'ransac', 'kr'])
# The remaining pipeline steps are disabled; while they stay commented out,
# sample['석식계'] is never filled with predictions.
# print('=' * 10, 'Tuning models, (It takes a long time)', '=' * 10)
# tuned_best3 = [tune_model(i, optimize='mae', fold=2, 
#                           n_iter=100,
#                           early_stopping=True, 
#                           early_stopping_max_iters=10,
#                           choose_better=True) for i in best3]
# print('=' * 10, 'Blending...', '=' * 10)
# blend_best3 = blend_models(estimator_list=tuned_best3, fold=5, optimize='mae',
#                            choose_better=True)
# print('=' * 10, 'Finalizing...', '=' * 10)
# pred = predict_model(blend_best3)
# final_model = finalize_model(blend_best3)
# pred = predict_model(final_model, data=test_df)
# print('=' * 10, 'Completed!', '=' * 10)
# sample['석식계'] = pred.Label.values

Unnamed: 0,Description,Value
0,session_id,3490
1,Target,석식계
2,Original Data,"(1205, 41)"
3,Missing Values,False
4,Numeric Features,36
5,Categorical Features,4
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(964, 296)"




IntProgress(value=0, description='Processing: ', max=59)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,53.5454,6045.4226,77.428,0.6966,0.6544,0.1259,1.134
catboost,CatBoost Regressor,53.8638,5790.1579,75.8438,0.7084,0.8853,0.1222,33.96
gbr,Gradient Boosting Regressor,55.508,6027.0118,77.4001,0.6955,0.785,0.1294,1.202
rf,Random Forest Regressor,57.9114,6527.5775,80.4458,0.6715,0.7846,0.1344,2.13
lasso,Lasso Regression,58.1283,6575.3729,80.8469,0.6654,0.8938,0.1333,0.724
lightgbm,Light Gradient Boosting Machine,58.3595,6492.2741,80.2696,0.6729,0.7809,0.1358,1.09
xgboost,Extreme Gradient Boosting,59.7243,7241.1544,84.6817,0.6341,0.754,0.136,1.588
br,Bayesian Ridge,59.8089,6805.2502,82.2854,0.6563,0.8579,0.1395,0.08
omp,Orthogonal Matching Pursuit,62.1533,7704.1406,87.5304,0.6025,0.9266,0.1418,0.042
ridge,Ridge Regression,65.6047,8136.1979,90.0777,0.5837,0.9182,0.15,0.036


KeyboardInterrupt: 

In [63]:
# Write the submission frame to disk without the index column.
# NOTE(review): the lines that assign predictions into `sample` are commented
# out in the modelling cells — confirm `sample` holds predictions before use.
sample.to_csv(path_or_buf='submission_pycaret.csv', index=False)

# submission_pycaret.csv
- 5-dimensional fastText embeddings / n_select = 2
- 10-dimensional fastText embeddings / n_select = 3 / normalize / multicollinearity threshold 0.9