In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_excel("data/train.xlsx")
df.head()

Unnamed: 0,일자,식사명,식사내용,수량
0,20030301,아침,"과일샐러드,닭죽,돈육마늘장조림,떡만두국,부추김무침,쌀밥,딸기잼(중),비엔나구이,스크...",37.472924
1,20030301,저녁,"감자으깸샐러드,비프까스,스위트피클,쌀밥,옥수수스프",19.566787
2,20030301,점심(일반),"골뱅이야채무침,새우맛살튀김,쌀밥(사무직),열무겉절이,칼국수",31.191336
3,20030302,아침,"계란죽,곤약멸치조림,김치국,마카로니샐러드,쌀밥,오징어회무침,딸기잼(중),삶은계란,야...",36.101083
4,20030302,저녁,"계란탕,단무지잔파무침,자장소스,잡채밥,탕수만두",21.949458


## Normalize Date
* Year : min-max scaling [0,1]
* Month : (sin,cos) transform [-1,1]
* Day : (sin,cos) transform [-1,1]

In [3]:
df['year'] = (df['일자'] / 10000).astype(int)
df['month'] = (df['일자'] % 10000 / 100).astype(int)
df['day'] = (df['일자'] % 100).astype(int)
df.drop(['일자'], axis=1, inplace=True)
df.head()

Unnamed: 0,식사명,식사내용,수량,year,month,day
0,아침,"과일샐러드,닭죽,돈육마늘장조림,떡만두국,부추김무침,쌀밥,딸기잼(중),비엔나구이,스크...",37.472924,2003,3,1
1,저녁,"감자으깸샐러드,비프까스,스위트피클,쌀밥,옥수수스프",19.566787,2003,3,1
2,점심(일반),"골뱅이야채무침,새우맛살튀김,쌀밥(사무직),열무겉절이,칼국수",31.191336,2003,3,1
3,아침,"계란죽,곤약멸치조림,김치국,마카로니샐러드,쌀밥,오징어회무침,딸기잼(중),삶은계란,야...",36.101083,2003,3,2
4,저녁,"계란탕,단무지잔파무침,자장소스,잡채밥,탕수만두",21.949458,2003,3,2


In [4]:
df['year'] = (df['year']-min(df['year'])) / (max(df['year'])-min(df['year']))
df['month_sin'] = [np.sin(x*2*np.pi/12) for x in df['month']]
df['month_cos'] = [np.cos(x*2*np.pi/12) for x in df['month']]
df['day_sin'] = [np.sin(x*2*np.pi/31) for x in df['day']]
df['day_cos'] = [np.cos(x*2*np.pi/31) for x in df['day']]
df.drop(['month', 'day'], axis=1, inplace=True)
df.head()

Unnamed: 0,식사명,식사내용,수량,year,month_sin,month_cos,day_sin,day_cos
0,아침,"과일샐러드,닭죽,돈육마늘장조림,떡만두국,부추김무침,쌀밥,딸기잼(중),비엔나구이,스크...",37.472924,0.0,1.0,6.123234000000001e-17,0.201299,0.97953
1,저녁,"감자으깸샐러드,비프까스,스위트피클,쌀밥,옥수수스프",19.566787,0.0,1.0,6.123234000000001e-17,0.201299,0.97953
2,점심(일반),"골뱅이야채무침,새우맛살튀김,쌀밥(사무직),열무겉절이,칼국수",31.191336,0.0,1.0,6.123234000000001e-17,0.201299,0.97953
3,아침,"계란죽,곤약멸치조림,김치국,마카로니샐러드,쌀밥,오징어회무침,딸기잼(중),삶은계란,야...",36.101083,0.0,1.0,6.123234000000001e-17,0.394356,0.918958
4,저녁,"계란탕,단무지잔파무침,자장소스,잡채밥,탕수만두",21.949458,0.0,1.0,6.123234000000001e-17,0.394356,0.918958


## Convert 식사명 to One-hot Vector

In [5]:
df = df.join(pd.get_dummies(df['식사명'], prefix='식사명'))
df.drop(['식사명'], axis=1, inplace=True)
df.head()

Unnamed: 0,식사내용,수량,year,month_sin,month_cos,day_sin,day_cos,식사명_아침,식사명_저녁,식사명_점심(양식),식사명_점심(일반)
0,"과일샐러드,닭죽,돈육마늘장조림,떡만두국,부추김무침,쌀밥,딸기잼(중),비엔나구이,스크...",37.472924,0.0,1.0,6.123234000000001e-17,0.201299,0.97953,1,0,0,0
1,"감자으깸샐러드,비프까스,스위트피클,쌀밥,옥수수스프",19.566787,0.0,1.0,6.123234000000001e-17,0.201299,0.97953,0,1,0,0
2,"골뱅이야채무침,새우맛살튀김,쌀밥(사무직),열무겉절이,칼국수",31.191336,0.0,1.0,6.123234000000001e-17,0.201299,0.97953,0,0,0,1
3,"계란죽,곤약멸치조림,김치국,마카로니샐러드,쌀밥,오징어회무침,딸기잼(중),삶은계란,야...",36.101083,0.0,1.0,6.123234000000001e-17,0.394356,0.918958,1,0,0,0
4,"계란탕,단무지잔파무침,자장소스,잡채밥,탕수만두",21.949458,0.0,1.0,6.123234000000001e-17,0.394356,0.918958,0,1,0,0


In [6]:
df.head()

Unnamed: 0,식사내용,수량,year,month_sin,month_cos,day_sin,day_cos,식사명_아침,식사명_저녁,식사명_점심(양식),식사명_점심(일반)
0,"과일샐러드,닭죽,돈육마늘장조림,떡만두국,부추김무침,쌀밥,딸기잼(중),비엔나구이,스크...",37.472924,0.0,1.0,6.123234000000001e-17,0.201299,0.97953,1,0,0,0
1,"감자으깸샐러드,비프까스,스위트피클,쌀밥,옥수수스프",19.566787,0.0,1.0,6.123234000000001e-17,0.201299,0.97953,0,1,0,0
2,"골뱅이야채무침,새우맛살튀김,쌀밥(사무직),열무겉절이,칼국수",31.191336,0.0,1.0,6.123234000000001e-17,0.201299,0.97953,0,0,0,1
3,"계란죽,곤약멸치조림,김치국,마카로니샐러드,쌀밥,오징어회무침,딸기잼(중),삶은계란,야...",36.101083,0.0,1.0,6.123234000000001e-17,0.394356,0.918958,1,0,0,0
4,"계란탕,단무지잔파무침,자장소스,잡채밥,탕수만두",21.949458,0.0,1.0,6.123234000000001e-17,0.394356,0.918958,0,1,0,0


## Convert 식사내용 to Vector based on Bag-of-Word

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
def tokenize(text):
    return text.split(',')

In [9]:
vectorizer = CountVectorizer(tokenizer=tokenize)

In [10]:
bow = vectorizer.fit_transform(df['식사내용']).toarray()
print(bow.shape)
bow

(20010, 1874)


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [11]:
vectorizer.get_feature_names()

[' 알감자 쇠고기 조림',
 '1회용케찹+1회용허니',
 'la갈비구이',
 '가리비젓갈',
 '가오리찜',
 '가자미구이',
 '가자미무조림',
 '가자미무조림(가공)',
 '가자미미역국',
 '가자미빵가루튀김',
 '가자미양념장구이',
 '가자미찜',
 '가자미찜2',
 '가자미카레튀김',
 '가자미튀김',
 '가자미튀김양념',
 '가지굴소스볶음',
 '가지깐풍',
 '가지나물',
 '가지냉국',
 '가지볶음',
 '가지볶음2',
 '가지쇠고기볶음',
 '가지오이냉국',
 '가지전',
 '가지토마토소스볶음',
 '간장파닭',
 '갈릭산적고추장조림',
 '갈비만두(찐)',
 '갈비산적간장조림',
 '갈비살구이',
 '갈비탕',
 '갈치감자조림',
 '갈치구이',
 '갈치그릴구이',
 '갈치단호박조림',
 '갈치무조림',
 '갈치무조림(원양)',
 '갈치조림',
 '갈치찜',
 '갈치카레구이',
 '갈치호박조림',
 '감식초',
 '감자간장조림',
 '감자고르케',
 '감자고추장찌개',
 '감자고추장찌개(느타리)',
 '감자그라탕',
 '감자냉국',
 '감자냉이국',
 '감자다시마국',
 '감자당면찌개',
 '감자맛탕',
 '감자버터구이',
 '감자베이컨볶음',
 '감자베이컨조림',
 '감자비엔나조림',
 '감자샐러드',
 '감자소세지볶음',
 '감자수제비국',
 '감자수제비국(큰그릇)',
 '감자스모크햄볶음',
 '감자양파국',
 '감자오이샐러드',
 '감자육계조림',
 '감자으깸샐러드',
 '감자전',
 '감자조림',
 '감자채베이컨볶음(i',
 '감자채볶음',
 '감자채브로콜리볶음',
 '감자채카레볶음',
 '감자채피망볶음 ',
 '감자탕',
 '감자탕1',
 '감자튀김',
 '감자튀김(40)',
 '감자튀김(경양식)',
 '감자튀김(맛)',
 '감자튀김(반달)',
 '감자풋고추볶음',
 '건문어초무침',
 '건미역맛살초무침',
 '건미역무침',
 '건미역오이초무침',
 '건빵튀김',
 '건새우마늘쫑볶음',
 '건새우무채국',
 '건새우볶음',
 '건새

In [12]:
df = df.join(pd.DataFrame(bow, columns=vectorizer.get_feature_names()))
df.drop(['식사내용'], axis=1, inplace=True)
df.head()

Unnamed: 0,수량,year,month_sin,month_cos,day_sin,day_cos,식사명_아침,식사명_저녁,식사명_점심(양식),식사명_점심(일반),...,휘),흑미밥,흑미밥(현장),흑임자밥,흑임자죽,흑콩견과류조림,흑콩밥,흑콩조림,흰죽,흰콩곤약조림
0,37.472924,0.0,1.0,6.123234000000001e-17,0.201299,0.97953,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,19.566787,0.0,1.0,6.123234000000001e-17,0.201299,0.97953,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,31.191336,0.0,1.0,6.123234000000001e-17,0.201299,0.97953,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,36.101083,0.0,1.0,6.123234000000001e-17,0.394356,0.918958,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,21.949458,0.0,1.0,6.123234000000001e-17,0.394356,0.918958,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
