In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))
display(HTML("<style>.input_area pre {font-family: Consolas; font-size: 11pt; line-height: 140%;}</style>"))
display(HTML("<style>.output_area pre {font-family: Consolas; font-size: 11pt; line-height: 140%;}</style>"))

In [2]:
# pip install --upgrade gensim

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor

from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA



In [4]:
# HYPER PARAMETERS

class CFG:
    emb_dim = 200

args = CFG

In [6]:
df_train = pd.read_csv('../data/train.csv')
df_test = pd.read_csv('../data/test.csv')

# Word Embedding

In [7]:
df_all = pd.concat([df_train, df_test])

In [8]:
df_all.head(2)

Unnamed: 0,일자,요일,본사정원수,본사휴가자수,본사출장자수,본사시간외근무명령서승인건수,현본사소속재택근무자수,조식메뉴,중식메뉴,석식메뉴,중식계,석식계
0,2016-02-01,월,2601,50,150,238,0.0,모닝롤/찐빵 우유/두유/주스 계란후라이 호두죽/쌀밥 (쌀:국내산) 된장찌개 쥐...,"쌀밥/잡곡밥 (쌀,현미흑미:국내산) 오징어찌개 쇠불고기 (쇠고기:호주산) 계란찜 ...","쌀밥/잡곡밥 (쌀,현미흑미:국내산) 육개장 자반고등어구이 두부조림 건파래무침 ...",1039.0,331.0
1,2016-02-02,화,2601,50,173,319,0.0,모닝롤/단호박샌드 우유/두유/주스 계란후라이 팥죽/쌀밥 (쌀:국내산) 호박젓국찌...,"쌀밥/잡곡밥 (쌀,현미흑미:국내산) 김치찌개 가자미튀김 모둠소세지구이 마늘쫑무...","콩나물밥*양념장 (쌀,현미흑미:국내산) 어묵국 유산슬 (쇠고기:호주산) 아삭고추무...",867.0,560.0


In [9]:
# Basic text preprocessing

def split_process(x, q):
    x_ = []
    x = x.split(' ')
    for i in x:
        if '(' in i and ':' in i and ')' in i:
            continue
        if '/' in i:
            x_.extend(i.split('/'))
        else:
            x_.append(i)
    x_ = list(set(x_))
    x_.remove('')
    return x_

In [10]:
# Get all combinations for training w2v (train + test)

food_combinations = []
for i in ['조식메뉴', '중식메뉴', '석식메뉴']:
    food_combinations += df_all[i].apply(lambda x: split_process(x, i)).to_list()

In [11]:
len(food_combinations)

3765

In [12]:
food_combinations

[['우유',
  '호두죽',
  '된장찌개',
  '쥐어채무침',
  '주스',
  '포기김치',
  '쌀밥',
  '두유',
  '계란후라이',
  '찐빵',
  '모닝롤'],
 ['우유',
  '주스',
  '팥죽',
  '단호박샌드',
  '호박젓국찌개',
  '시래기조림',
  '포기김치',
  '쌀밥',
  '두유',
  '계란후라이',
  '모닝롤'],
 ['우유',
  '표고버섯죽',
  '콩나물국',
  '주스',
  '포기김치',
  '쌀밥',
  '베이글',
  '두유',
  '계란후라이',
  '모닝롤',
  '느타리호박볶음'],
 ['우유',
  '닭죽',
  '근대국',
  '주스',
  '포기김치',
  '쌀밥',
  '두유',
  '계란후라이',
  '모닝롤',
  '토마토샌드',
  '멸치볶음'],
 ['우유', '주스', '방풍나물', '포기김치', '쇠고기죽', '재첩국', '쌀밥', '두유', '계란후라이', '모닝롤', '와플'],
 ['우유',
  '주스',
  '팬케익',
  '견과류죽',
  '감자찌개',
  '포기김치',
  '명엽채무침',
  '쌀밥',
  '두유',
  '계란후라이',
  '찐빵'],
 ['우유',
  '주스',
  '숙주나물',
  '봄동된장국',
  '포기김치',
  '쌀밥',
  '두유',
  '계란후라이',
  '고구마죽',
  '모닝롤',
  '야채샌드'],
 ['우유',
  '주스',
  '콩조림',
  '잣죽',
  '포기김치',
  '쌀밥',
  '두유',
  '계란후라이',
  '민물새우찌개',
  '모닝롤',
  '치즈프레즐'],
 ['우유', '단호박죽', '어묵국', '주스', '포기김치', '마늘빵', '쌀밥', '두유', '계란후라이', '김구이', '모닝롤'],
 ['우유',
  '참치샌드',
  '주스',
  '포기김치',
  '무생채',
  '쌀밥',
  '두유',
  '계란후라이',
  '모닝롤',
  '북어계란국',
  '흑임자죽'],
 ['우유', '인절미토스트

In [13]:
# Train or load w2v model

TRAIN_W2V = True
try:
    model = Word2Vec.load('food_embedding.model')
    print("Model loaded")
except:
    if TRAIN_W2V:
        print("Training w2v")
        model = Word2Vec(sentences=food_combinations, vector_size=args.emb_dim, window=7, min_count=0, workers=4, sg=0, epochs =5000)
        model.save('food_embedding.model')
    else:
        print("Model loading failed. Do not train.")

Model loaded


In [14]:
# w2v demo
model.wv.most_similar('된장찌개')

[('차돌박이찌개', 0.5862919688224792),
 ('감자국', 0.5642290115356445),
 ('조랭이떡미역국', 0.5440123677253723),
 ('오징어국', 0.5439069271087646),
 ('민물새우찌개', 0.5349516868591309),
 ('열무된장국', 0.5214429497718811),
 ('대구매운탕', 0.5209029912948608),
 ('얼갈이국', 0.5151638984680176),
 ('어묵국', 0.5150087475776672),
 ('무채국', 0.5106183886528015)]

# Preprocess

In [15]:
def process_date(df):
    df['일자'] = pd.to_datetime(df['일자'], format="%Y-%m-%d")
    df['year'] = df['일자'].dt.year
    df['month'] = df['일자'].dt.month
    df['day'] = df['일자'].dt.day
    df = df.drop('일자', axis=1)
    return df

def get_food_embedding(x):
    x_ = []
    x = x.split(' ')
    for i in x:
        if '(' in i and ':' in i and ')' in i:
            continue
        if '/' in i:
            x_.extend(i.split('/'))
        else:
            x_.append(i)
    x_ = list(set(x_))
    x_.remove('')
    vec_ = np.zeros(args.emb_dim)
    for i in x_:
        vec = model.wv.get_vector(i)
        vec_ += vec
    vec_ /= len(x_)
    return vec_

In [16]:
# General preprocessing
df_train = process_date(df_train)
day_encoder = LabelEncoder()
df_train['요일'] = day_encoder.fit_transform(df_train['요일'])

In [17]:
df_train.head(2)

Unnamed: 0,요일,본사정원수,본사휴가자수,본사출장자수,본사시간외근무명령서승인건수,현본사소속재택근무자수,조식메뉴,중식메뉴,석식메뉴,중식계,석식계,year,month,day
0,3,2601,50,150,238,0.0,모닝롤/찐빵 우유/두유/주스 계란후라이 호두죽/쌀밥 (쌀:국내산) 된장찌개 쥐...,"쌀밥/잡곡밥 (쌀,현미흑미:국내산) 오징어찌개 쇠불고기 (쇠고기:호주산) 계란찜 ...","쌀밥/잡곡밥 (쌀,현미흑미:국내산) 육개장 자반고등어구이 두부조림 건파래무침 ...",1039.0,331.0,2016,2,1
1,4,2601,50,173,319,0.0,모닝롤/단호박샌드 우유/두유/주스 계란후라이 팥죽/쌀밥 (쌀:국내산) 호박젓국찌...,"쌀밥/잡곡밥 (쌀,현미흑미:국내산) 김치찌개 가자미튀김 모둠소세지구이 마늘쫑무...","콩나물밥*양념장 (쌀,현미흑미:국내산) 어묵국 유산슬 (쇠고기:호주산) 아삭고추무...",867.0,560.0,2016,2,2


In [18]:
# Get embedding
df_train['조식메뉴_embedding'] = df_train['조식메뉴'].apply(lambda x: get_food_embedding(x))
df_train['중식메뉴_embedding'] = df_train['중식메뉴'].apply(lambda x: get_food_embedding(x))
df_train['석식메뉴_embedding'] = df_train['석식메뉴'].apply(lambda x: get_food_embedding(x))

In [19]:
y_lunch = df_train['중식계']
y_dinner = df_train['석식계']
df_train.drop(['조식메뉴', '중식메뉴', '석식메뉴', '중식계', '석식계'], axis=1, inplace=True)

In [20]:
X_common = df_train.iloc[:, :9]
X_common.head(2)

Unnamed: 0,요일,본사정원수,본사휴가자수,본사출장자수,본사시간외근무명령서승인건수,현본사소속재택근무자수,year,month,day
0,3,2601,50,150,238,0.0,2016,2,1
1,4,2601,50,173,319,0.0,2016,2,2


In [21]:
emb_arr_lunch = np.array(df_train.iloc[:, 10].to_numpy().tolist()) # 점심메뉴단어임베딩벡터 추가 
emb_arr_dinner = np.array(df_train.iloc[:, 11].to_numpy().tolist()) # 저녁메뉴단어임베딩벡터 추가

X_train_lunch = np.concatenate((X_common.to_numpy(), emb_arr_lunch), axis=1)
X_train_dinner = np.concatenate((X_common.to_numpy(), emb_arr_dinner), axis=1)

In [22]:
X_train_lunch.shape

(1205, 209)

In [23]:
X_train_dinner.shape

(1205, 209)

In [24]:
X_train_lunch, X_test_lunch, y_train_lunch, y_test_lunch = train_test_split(X_train_lunch, y_lunch, test_size=0.1, random_state=42)
X_train_dinner, X_test_dinner, y_train_dinner, y_test_dinner = train_test_split(X_train_dinner, y_dinner, test_size=0.1, random_state=42)

# Modeling

In [25]:
# Simple LGBM Regressor w/o tuning
model_lunch = LGBMRegressor()
model_lunch.fit(X_train_lunch, y_train_lunch)

model_dinner = LGBMRegressor()
model_dinner.fit(X_train_dinner, y_train_dinner)

# Validate
pred_lunch = model_lunch.predict(X_test_lunch)
pred_dinner = model_dinner.predict(X_test_dinner)

print("lunch mae: ", mean_absolute_error(y_test_lunch, pred_lunch))
print("dinner mae: ", mean_absolute_error(y_test_dinner, pred_dinner))

lunch mae:  81.13258088998049
dinner mae:  47.23218586192287


# Inference

In [26]:
df_test = pd.read_csv('../data/test.csv')

# Apply general preprocessing
df_test= process_date(df_test)
df_test['요일'] = day_encoder.transform(df_test['요일'])
df_test['조식메뉴_embedding'] = df_test['조식메뉴'].apply(lambda x: get_food_embedding(x))
df_test['중식메뉴_embedding'] = df_test['중식메뉴'].apply(lambda x: get_food_embedding(x))
df_test['석식메뉴_embedding'] = df_test['석식메뉴'].apply(lambda x: get_food_embedding(x))
df_test.drop(['조식메뉴', '중식메뉴', '석식메뉴'], axis=1, inplace=True)
X_test_common = df_test.iloc[:, :9]

In [27]:
emb_arr_lunch = np.array(df_train.iloc[:, 10].to_numpy().tolist()) # Ver 2
emb_arr_dinner = np.array(df_train.iloc[:, 11].to_numpy().tolist()) # Ver 2

In [28]:
# Get embedding
test_emb_arr_lunch = np.array(df_test.iloc[:, 10].to_numpy().tolist()) # Ver 2
test_emb_arr_dinner = np.array(df_test.iloc[:, 11].to_numpy().tolist()) # Ver 2
# Concat
test_lunch = np.concatenate((X_test_common.to_numpy(), test_emb_arr_lunch), axis=1)
test_dinner = np.concatenate((X_test_common.to_numpy(), test_emb_arr_dinner), axis=1)

In [30]:
# Inference

test_pred_lunch = model_lunch.predict(test_lunch)
test_pred_dinner = model_dinner.predict(test_dinner)

submission_df = pd.read_csv('../data/sample_submission.csv')
submission_df['중식계'] = test_pred_lunch
submission_df['석식계'] = test_pred_dinner

In [31]:
# Save

submission_df.to_csv('../sub_2nd_0618.csv', index=False)