In [1]:
import datetime
import os

import numpy as np
import pandas as pd

def load_hotel_reserve():
  """Read the customer, hotel and reservation CSV files under ``./data``.

  Returns:
    A 3-tuple ``(customer_tb, hotel_tb, reserve_tb)`` of DataFrames, read from
    ``customer.csv``, ``hotel.csv`` and ``reserve.csv`` in that order.
  """
  # Paths are relative to the current working directory.
  csv_paths = ('./data/customer.csv', './data/hotel.csv', './data/reserve.csv')
  return tuple(pd.read_csv(csv_path) for csv_path in csv_paths)


def load_holiday_mst():
  """Read ``./data/holiday_mst.csv`` and return it as a DataFrame.

  ``index_col=False`` is passed so that pandas does not use the first
  column of the file as the index.
  """
  return pd.read_csv('./data/holiday_mst.csv', index_col=False)


def load_production():
  """Read ``./data/production.csv`` and return it as a DataFrame."""
  return pd.read_csv('./data/production.csv')


def load_production_missing_num():
  """Read ``./data/production_missing_num.csv`` and return it as a DataFrame."""
  return pd.read_csv('./data/production_missing_num.csv')


def load_production_missing_category():
  """Read ``./data/production_missing_category.csv`` and return it as a DataFrame."""
  return pd.read_csv('./data/production_missing_category.csv')


def load_monthly_index():
  """Read ``./data/monthly_index.csv`` and return it as a DataFrame."""
  return pd.read_csv('./data/monthly_index.csv')


def load_meros_txt(encoding=None):
  """Return the entire contents of ``./data/txt/meros.txt`` as one string.

  Args:
    encoding: Text encoding forwarded to ``open``. The default ``None`` keeps
      the previous behaviour (the platform's default encoding is used).

  Returns:
    The whole file content as a ``str``.
  """
  # The ``with`` statement closes the file on exit, so no explicit close() call
  # is needed.
  with open('./data/txt/meros.txt', 'r', encoding=encoding) as f:
    return f.read()


In [2]:
# Load the three hotel-reservation tables; reserve_tb is the one the following cells work on.
customer_tb, hotel_tb, reserve_tb = load_hotel_reserve()

#### 3. 일시 데이터 전처리 기법
- 문자열 시간 데이터를 날짜형으로 변환하기

    -> 일시형에는 Timestamp와 DateTime 같은 자료형이 존재

    -> 데이터를 읽을 때 연월일·시간 문자열이나 UNIXTIME 형태의 값은 일시형으로 변환해야 함

In [None]:
# Convert string columns of the hotel reservation records to datetime and date types.
# NOTE: none of the conversion results below are assigned, so reserve_tb is left unchanged
# by this cell; only the final expression (reserve_tb) is displayed.
# Full timestamp parsed from the reserve_datetime strings.
pd.to_datetime(reserve_tb['reserve_datetime'], format='%Y-%m-%d %H:%M:%S')
# Check-in timestamp built by joining the date and time strings with a space.
pd.to_datetime(reserve_tb['checkin_date'] + ' ' + reserve_tb['checkin_time'],
                format='%Y-%m-%d %H:%M:%S')

# Date-only values obtained through the .dt.date accessor.
pd.to_datetime(reserve_tb['reserve_datetime'],
               format='%Y-%m-%d %H:%M:%S').dt.date
pd.to_datetime(reserve_tb['checkin_date'], format='%Y-%m-%d').dt.date

reserve_tb

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date,people_num,total_price
0,r1,h_75,c_1,2016-03-06 13:09:42,2016-03-26,10:00:00,2016-03-29,4,97200
1,r2,h_219,c_1,2016-07-16 23:39:55,2016-07-20,11:30:00,2016-07-21,2,20600
2,r3,h_179,c_1,2016-09-24 10:03:17,2016-10-19,09:00:00,2016-10-22,2,33600
3,r4,h_214,c_1,2017-03-08 03:20:10,2017-03-29,11:00:00,2017-03-30,4,194400
4,r5,h_16,c_1,2017-09-05 19:50:37,2017-09-22,10:30:00,2017-09-23,3,68100
...,...,...,...,...,...,...,...,...,...
4025,r4026,h_129,c_999,2017-06-27 23:00:02,2017-07-10,09:30:00,2017-07-11,2,16000
4026,r4027,h_97,c_999,2017-09-29 05:24:57,2017-10-09,10:30:00,2017-10-10,2,41800
4027,r4028,h_27,c_999,2018-03-14 05:01:45,2018-04-02,11:30:00,2018-04-04,2,74800
4028,r4029,h_48,c_1000,2016-04-16 15:20:17,2016-05-10,09:30:00,2016-05-13,4,540000


- 연, 월, 일, 시 등 시간 구성 요소 분리

In [54]:
# First two values of reserve_datetime in the existing reservation table.
# NOTE(review): the recorded output already shows dtype datetime64[ns] and this cell's
# execution count is higher than the conversion cell below, so it was presumably last run
# after that conversion — confirm with Restart & Run All.
reserve_tb['reserve_datetime'].head(2)

0   2016-03-06 13:09:42
1   2016-07-16 23:39:55
Name: reserve_datetime, dtype: datetime64[ns]

In [50]:
# Extract year, month, day, day-of-week, hour, minute and second from reserve_datetime.
# The column is parsed as datetime64 first so that the .dt accessor can be used.
reserve_tb['reserve_datetime'] = pd.to_datetime(
    reserve_tb['reserve_datetime'], format='%Y-%m-%d %H:%M:%S'
)

# (printed header, .dt attribute name) pairs, in display order.
for header, component in (
    ("--년도--", 'year'),
    ("--월--", 'month'),
    ("--일--", 'day'),
    ("--요일--", 'dayofweek'),  # pandas numbers weekdays Monday=0 ... Sunday=6
    ("--시--", 'hour'),
    ("--분--", 'minute'),
    ("--초--", 'second'),
):
    print(header)
    print(getattr(reserve_tb['reserve_datetime'].dt, component).head(2))

--년도--
0    2016
1    2016
Name: reserve_datetime, dtype: int32
--월--
0    3
1    7
Name: reserve_datetime, dtype: int32
--일--
0     6
1    16
Name: reserve_datetime, dtype: int32
--요일--
0    6
1    5
Name: reserve_datetime, dtype: int32
--시--
0    13
1    23
Name: reserve_datetime, dtype: int32
--분--
0     9
1    39
Name: reserve_datetime, dtype: int32
--초--
0    42
1    55
Name: reserve_datetime, dtype: int32


- 시간 간격 계산

    -> 일시형 데이터가 여러 개 있을 때 데이터 사이의 차이(연수, 월수, 주수, 일수, 시간차)를 계산

In [52]:
# Parse the reservation timestamp as datetime64.
reserve_tb['reserve_datetime'] = pd.to_datetime(
    reserve_tb['reserve_datetime'], format='%Y-%m-%d %H:%M:%S'
)

# Build the check-in timestamp by joining the date and time strings with a space.
reserve_tb['checkin_datetime'] = pd.to_datetime(
    reserve_tb['checkin_date'] + ' ' + reserve_tb['checkin_time'],
    format='%Y-%m-%d %H:%M:%S',
)

reserved_at = reserve_tb['reserve_datetime']
checkin_at = reserve_tb['checkin_datetime']

# Year difference: compares only the calendar-year fields (reserve minus check-in).
print("--연도--")
year_gap = reserved_at.dt.year - checkin_at.dt.year
print(year_gap.head(3))

# Month difference: each timestamp is turned into a month count (year * 12 + month)
# before subtracting, so differences across a year boundary are handled.
print("--월--")
reserved_months = reserved_at.dt.year * 12 + reserved_at.dt.month
checkin_months = checkin_at.dt.year * 12 + checkin_at.dt.month
print((reserved_months - checkin_months).head(3))

# Day difference: whole-day part of the timedelta, cast back to a timedelta dtype.
print("--일--")
day_gap = (reserved_at - checkin_at).dt.days
print(day_gap.astype('timedelta64[D]').head(3))


--연도--
0    0
1    0
2    0
dtype: int32
--월--
0    0
1    0
2   -1
dtype: int32
--일--
0   -20 days
1    -4 days
2   -25 days
dtype: timedelta64[s]


In [53]:
# reserve_datetime minus checkin_datetime, expressed as float seconds.
# This cell uses the checkin_datetime column without creating it, so that column
# must already exist on reserve_tb.
elapsed_seconds = (
    reserve_tb['reserve_datetime'] - reserve_tb['checkin_datetime']
).dt.total_seconds()

# Hour difference
print("--시--")
print((elapsed_seconds / 3600).astype('timedelta64[h]').head(3))

# Minute difference
print("--분--")
print((elapsed_seconds / 60).astype('timedelta64[m]').head(3))

# Second difference
print("--초--")
print(elapsed_seconds.astype('timedelta64[s]').head(3))

--시--
0   -20 days +04:00:00
1    -4 days +13:00:00
2   -25 days +02:00:00
dtype: timedelta64[s]
--분--
0   -20 days +03:10:00
1    -4 days +12:10:00
2   -25 days +01:04:00
dtype: timedelta64[s]
--초--
0   -20 days +03:09:42
1    -4 days +12:09:55
2   -25 days +01:03:17
dtype: timedelta64[s]


- 시간 정보의 증분 처리

In [62]:
# Add 1 day, 1 hour, 1 minute and 1 second to the reservation timestamp.
# `datetime` is imported with the other imports in the first cell of the notebook,
# so this cell does not import it again.
reserve_tb['reserve_datetime'] = \
    pd.to_datetime(reserve_tb['reserve_datetime'], format='%Y-%m-%d %H:%M:%S')

reserve_tb['reserve_date'] = reserve_tb['reserve_datetime'].dt.date  # date-only column

# Original timestamps
print("--기존 시간--")
print(reserve_tb['reserve_datetime'].head(2))

# Add 1 day to the datetime column
print("--1일 증가--")
print((reserve_tb['reserve_datetime'] + datetime.timedelta(days=1)).head(2))

# Add 1 day to the date-only column
print("--1일 증가--")
print((reserve_tb['reserve_date'] + datetime.timedelta(days=1)).head(2))

# Add 1 hour
print("--1시간 증가--")
print((reserve_tb['reserve_datetime'] + datetime.timedelta(hours=1)).head(2))

# Add 1 minute
print("--1분 증가--")
print((reserve_tb['reserve_datetime'] + datetime.timedelta(minutes=1)).head(2))

# Add 1 second
print("--1초 증가--")
print((reserve_tb['reserve_datetime'] + datetime.timedelta(seconds=1)).head(2))


--기존 시간--
0   2016-03-06 13:09:42
1   2016-07-16 23:39:55
Name: reserve_datetime, dtype: datetime64[ns]
--1일 증가--
0   2016-03-07 13:09:42
1   2016-07-17 23:39:55
Name: reserve_datetime, dtype: datetime64[ns]
--1일 증가--
0    2016-03-07
1    2016-07-17
Name: reserve_date, dtype: object
--1시간 증가--
0   2016-03-06 14:09:42
1   2016-07-17 00:39:55
Name: reserve_datetime, dtype: datetime64[ns]
--1분 증가--
0   2016-03-06 13:10:42
1   2016-07-16 23:40:55
Name: reserve_datetime, dtype: datetime64[ns]
--1초 증가--
0   2016-03-06 13:09:43
1   2016-07-16 23:39:56
Name: reserve_datetime, dtype: datetime64[ns]


- 계절 항목으로 변환

In [63]:
# Derive a season label for each reservation from the month of reserve_datetime.
# The column is parsed as datetime first; the month is later read through the .dt accessor.
reserve_tb['reserve_datetime'] = pd.to_datetime(
    reserve_tb['reserve_datetime'], format='%Y-%m-%d %H:%M:%S'
)

def to_season(month_num):
    """Map a month number to a season name.

    Args:
        month_num: Month number. 3-5 maps to 'spring', 6-8 to 'summer'
            and 9-11 to 'autumn'.

    Returns:
        One of 'spring', 'summer', 'autumn' or 'winter'. Every value outside
        the 3-11 ranges above (12, 1, 2, and anything else) yields 'winter'.
    """
    if 3 <= month_num <= 5:
        return 'spring'
    if 6 <= month_num <= 8:
        return 'summer'
    if 9 <= month_num <= 11:
        return 'autumn'
    # Fallback for everything that is not spring, summer or autumn.
    return 'winter'

# Label each reservation month with its season and store it as a categorical column
# whose categories are listed explicitly in spring -> winter order.
reserve_tb['reserve_season'] = pd.Categorical(
    reserve_tb['reserve_datetime'].dt.month.map(to_season),
    categories=['spring', 'summer', 'autumn', 'winter'],
)

reserve_tb

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date,people_num,total_price,checkin_datetime,reserve_date,reserve_season
0,r1,h_75,c_1,2016-03-06 13:09:42,2016-03-26,10:00:00,2016-03-29,4,97200,2016-03-26 10:00:00,2016-03-06,spring
1,r2,h_219,c_1,2016-07-16 23:39:55,2016-07-20,11:30:00,2016-07-21,2,20600,2016-07-20 11:30:00,2016-07-16,summer
2,r3,h_179,c_1,2016-09-24 10:03:17,2016-10-19,09:00:00,2016-10-22,2,33600,2016-10-19 09:00:00,2016-09-24,autumn
3,r4,h_214,c_1,2017-03-08 03:20:10,2017-03-29,11:00:00,2017-03-30,4,194400,2017-03-29 11:00:00,2017-03-08,spring
4,r5,h_16,c_1,2017-09-05 19:50:37,2017-09-22,10:30:00,2017-09-23,3,68100,2017-09-22 10:30:00,2017-09-05,autumn
...,...,...,...,...,...,...,...,...,...,...,...,...
4025,r4026,h_129,c_999,2017-06-27 23:00:02,2017-07-10,09:30:00,2017-07-11,2,16000,2017-07-10 09:30:00,2017-06-27,summer
4026,r4027,h_97,c_999,2017-09-29 05:24:57,2017-10-09,10:30:00,2017-10-10,2,41800,2017-10-09 10:30:00,2017-09-29,autumn
4027,r4028,h_27,c_999,2018-03-14 05:01:45,2018-04-02,11:30:00,2018-04-04,2,74800,2018-04-02 11:30:00,2018-03-14,spring
4028,r4029,h_48,c_1000,2016-04-16 15:20:17,2016-05-10,09:30:00,2016-05-13,4,540000,2016-05-10 09:30:00,2016-04-16,spring


- 평일/주말 및 공휴일 구분

In [65]:
# Attach the holiday master columns (holiday flag, day-before-holiday flag) to each
# reservation by matching checkin_date against the master's target_day column.
holiday_mst = load_holiday_mst()

reserve_tb.merge(holiday_mst, left_on='checkin_date', right_on='target_day')

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date,people_num,total_price,checkin_datetime,reserve_date,reserve_season,target_day,holidayday_flg,nextday_is_holiday_flg
0,r1,h_75,c_1,2016-03-06 13:09:42,2016-03-26,10:00:00,2016-03-29,4,97200,2016-03-26 10:00:00,2016-03-06,spring,2016-03-26,True,True
1,r2,h_219,c_1,2016-07-16 23:39:55,2016-07-20,11:30:00,2016-07-21,2,20600,2016-07-20 11:30:00,2016-07-16,summer,2016-07-20,False,False
2,r3,h_179,c_1,2016-09-24 10:03:17,2016-10-19,09:00:00,2016-10-22,2,33600,2016-10-19 09:00:00,2016-09-24,autumn,2016-10-19,False,False
3,r4,h_214,c_1,2017-03-08 03:20:10,2017-03-29,11:00:00,2017-03-30,4,194400,2017-03-29 11:00:00,2017-03-08,spring,2017-03-29,False,False
4,r5,h_16,c_1,2017-09-05 19:50:37,2017-09-22,10:30:00,2017-09-23,3,68100,2017-09-22 10:30:00,2017-09-05,autumn,2017-09-22,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4025,r4026,h_129,c_999,2017-06-27 23:00:02,2017-07-10,09:30:00,2017-07-11,2,16000,2017-07-10 09:30:00,2017-06-27,summer,2017-07-10,False,False
4026,r4027,h_97,c_999,2017-09-29 05:24:57,2017-10-09,10:30:00,2017-10-10,2,41800,2017-10-09 10:30:00,2017-09-29,autumn,2017-10-09,True,False
4027,r4028,h_27,c_999,2018-03-14 05:01:45,2018-04-02,11:30:00,2018-04-04,2,74800,2018-04-02 11:30:00,2018-03-14,spring,2018-04-02,False,False
4028,r4029,h_48,c_1000,2016-04-16 15:20:17,2016-05-10,09:30:00,2016-05-13,4,540000,2016-05-10 09:30:00,2016-04-16,spring,2016-05-10,False,False
