### 데이터 불러오기

In [1]:
import pandas as pd
import numpy as np
import os

def load_hotel_reserve():
  """Read the customer, hotel and reservation CSV files under ./data.

  Returns:
    A 3-tuple of DataFrames in the order (customer, hotel, reserve).
  """
  csv_paths = ('./data/customer.csv', './data/hotel.csv', './data/reserve.csv')
  # Files are read in the listed order and returned as a tuple for unpacking.
  return tuple(pd.read_csv(csv_path) for csv_path in csv_paths)


def load_holiday_mst():
  """Read ./data/holiday_mst.csv and return it as a DataFrame.

  index_col=False tells pandas not to use the first column as the index.
  """
  return pd.read_csv('./data/holiday_mst.csv', index_col=False)


def load_production():
  """Read ./data/production.csv and return it as a DataFrame."""
  return pd.read_csv('./data/production.csv')


def load_production_missing_num():
  """Read ./data/production_missing_num.csv and return it as a DataFrame."""
  return pd.read_csv('./data/production_missing_num.csv')


def load_production_missing_category():
  """Read ./data/production_missing_category.csv and return it as a DataFrame."""
  return pd.read_csv('./data/production_missing_category.csv')


def load_monthly_index():
  """Read ./data/monthly_index.csv and return it as a DataFrame."""
  return pd.read_csv('./data/monthly_index.csv')


def load_meros_txt():
  """Return the entire contents of ./data/txt/meros.txt as a single string.

  The file is opened in text mode with the platform's default encoding.
  NOTE(review): if the file is not in the platform default encoding, pass an
  explicit encoding= to open() — the file's encoding is not visible from here.
  """
  # The with-statement closes the file on exit, so no explicit close() is needed.
  with open('./data/txt/meros.txt', 'r') as f:
    return f.read()


In [27]:
# Load the customer, hotel and reservation tables with load_hotel_reserve().
customer_tb, hotel_tb, reserve_tb = load_hotel_reserve()

### 3. 일자 데이터 전처리

#### 문자열 데이터 -> 날짜형 데이터

날짜형 데이터: Timestamp, DateTime으로 변환

In [28]:
# Print the column dtypes and non-null counts of the reservation table.
reserve_tb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4030 entries, 0 to 4029
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   reserve_id        4030 non-null   object
 1   hotel_id          4030 non-null   object
 2   customer_id       4030 non-null   object
 3   reserve_datetime  4030 non-null   object
 4   checkin_date      4030 non-null   object
 5   checkin_time      4030 non-null   object
 6   checkout_date     4030 non-null   object
 7   people_num        4030 non-null   int64 
 8   total_price       4030 non-null   int64 
dtypes: int64(2), object(7)
memory usage: 283.5+ KB


In [30]:
# Preview the first rows of the date/time-related columns.
reserve_tb[['checkin_date', 'checkin_time', 'reserve_datetime']].head()

Unnamed: 0,checkin_date,checkin_time,reserve_datetime
0,2016-03-26,10:00:00,2016-03-06 13:09:42
1,2016-07-20,11:30:00,2016-07-16 23:39:55
2,2016-10-19,09:00:00,2016-09-24 10:03:17
3,2017-03-29,11:00:00,2017-03-08 03:20:10
4,2017-09-22,10:30:00,2017-09-05 19:50:37


In [None]:
# Convert strings such as "2016-03-06 13:09:42" to datetime.
# The strings contain a space between date and time, so the format includes it too.
pd.to_datetime(reserve_tb['reserve_datetime'], format='%Y-%m-%d %H:%M:%S')

0      2016-03-06 13:09:42
1      2016-07-16 23:39:55
2      2016-09-24 10:03:17
3      2017-03-08 03:20:10
4      2017-09-05 19:50:37
               ...        
4025   2017-06-27 23:00:02
4026   2017-09-29 05:24:57
4027   2018-03-14 05:01:45
4028   2016-04-16 15:20:17
4029   2016-06-06 08:16:51
Name: reserve_datetime, Length: 4030, dtype: datetime64[ns]

In [None]:
reserve_tb['checkin_date'] + reserve_tb['checkin_time'] # plain concatenation leaves no space between date and time

0       2016-03-2610:00:00
1       2016-07-2011:30:00
2       2016-10-1909:00:00
3       2017-03-2911:00:00
4       2017-09-2210:30:00
               ...        
4025    2017-07-1009:30:00
4026    2017-10-0910:30:00
4027    2018-04-0211:30:00
4028    2016-05-1009:30:00
4029    2016-07-0609:00:00
Length: 4030, dtype: object

In [32]:
# Plain concatenation yields no space between date and time, so the format omits the space as well.
pd.to_datetime(reserve_tb['checkin_date'] + reserve_tb['checkin_time'], format='%Y-%m-%d%H:%M:%S')

0      2016-03-26 10:00:00
1      2016-07-20 11:30:00
2      2016-10-19 09:00:00
3      2017-03-29 11:00:00
4      2017-09-22 10:30:00
               ...        
4025   2017-07-10 09:30:00
4026   2017-10-09 10:30:00
4027   2018-04-02 11:30:00
4028   2016-05-10 09:30:00
4029   2016-07-06 09:00:00
Length: 4030, dtype: datetime64[ns]

In [33]:
# Alternatively, insert a space between the two columns when concatenating and keep the space in the format.
pd.to_datetime(reserve_tb['checkin_date'] + ' ' + reserve_tb['checkin_time'], format='%Y-%m-%d %H:%M:%S')

0      2016-03-26 10:00:00
1      2016-07-20 11:30:00
2      2016-10-19 09:00:00
3      2017-03-29 11:00:00
4      2017-09-22 10:30:00
               ...        
4025   2017-07-10 09:30:00
4026   2017-10-09 10:30:00
4027   2018-04-02 11:30:00
4028   2016-05-10 09:30:00
4029   2016-07-06 09:00:00
Length: 4030, dtype: datetime64[ns]

#### 연/월/일/시 분리하기

In [34]:
# Convert reserve_datetime to datetime64 and store it back in the table,
# then print the dtypes to confirm the change.
reserve_tb['reserve_datetime'] = pd.to_datetime(reserve_tb['reserve_datetime'], format='%Y-%m-%d %H:%M:%S')
reserve_tb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4030 entries, 0 to 4029
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   reserve_id        4030 non-null   object        
 1   hotel_id          4030 non-null   object        
 2   customer_id       4030 non-null   object        
 3   reserve_datetime  4030 non-null   datetime64[ns]
 4   checkin_date      4030 non-null   object        
 5   checkin_time      4030 non-null   object        
 6   checkout_date     4030 non-null   object        
 7   people_num        4030 non-null   int64         
 8   total_price       4030 non-null   int64         
dtypes: datetime64[ns](1), int64(2), object(6)
memory usage: 283.5+ KB


In [44]:
# Extract individual date/time components through the .dt accessor.
# Only the first row is printed for each component; the strftime result is printed in full.
print('연도:\n' ,reserve_tb['reserve_datetime'].dt.year.head(1))    # year
print('월:\n' , reserve_tb['reserve_datetime'].dt.month.head(1))    # month
print('날짜:\n' , reserve_tb['reserve_datetime'].dt.day.head(1))     # day of month
print('요일:\n' , reserve_tb['reserve_datetime'].dt.dayofweek.head(1))   # day of week as an integer
print('요일 이름:\n' , reserve_tb['reserve_datetime'].dt.day_name().head(1))   # day-of-week name
print('시간:\n' , reserve_tb['reserve_datetime'].dt.hour.head(1))        # hour
print('분:\n' , reserve_tb['reserve_datetime'].dt.minute.head(1))        # minute
print('초:\n' , reserve_tb['reserve_datetime'].dt.second.head(1))        # second
print('지정 문자열로 변환:\n' , reserve_tb['reserve_datetime'].dt.strftime('%Y-%m-%d %H:%M:%S'))  # format back to a string

연도:
 0    2016
Name: reserve_datetime, dtype: int32
월:
 0    3
Name: reserve_datetime, dtype: int32
날짜:
 0    6
Name: reserve_datetime, dtype: int32
요일:
 0    6
Name: reserve_datetime, dtype: int32
요일 이름:
 0    Sunday
Name: reserve_datetime, dtype: object
시간:
 0    13
Name: reserve_datetime, dtype: int32
분:
 0    9
Name: reserve_datetime, dtype: int32
초:
 0    42
Name: reserve_datetime, dtype: int32
지정 문자열로 변환:
 0       2016-03-06 13:09:42
1       2016-07-16 23:39:55
2       2016-09-24 10:03:17
3       2017-03-08 03:20:10
4       2017-09-05 19:50:37
               ...         
4025    2017-06-27 23:00:02
4026    2017-09-29 05:24:57
4027    2018-03-14 05:01:45
4028    2016-04-16 15:20:17
4029    2016-06-06 08:16:51
Name: reserve_datetime, Length: 4030, dtype: object


#### 시간 간격 계산

In [48]:
# Convert reserve_datetime to datetime64 and store it back in the table.
reserve_tb['reserve_datetime'] = pd.to_datetime(reserve_tb['reserve_datetime'], format='%Y-%m-%d %H:%M:%S')
# Build checkin_datetime by joining checkin_date and checkin_time with a space
# and parsing the combined string as datetime64.
reserve_tb['checkin_datetime'] = pd.to_datetime(reserve_tb['checkin_date'] + ' ' + reserve_tb['checkin_time'], format='%Y-%m-%d %H:%M:%S')

In [50]:
# Preview the first rows of the reservation table.
reserve_tb.head()

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date,people_num,total_price,checkin_datetime
0,r1,h_75,c_1,2016-03-06 13:09:42,2016-03-26,10:00:00,2016-03-29,4,97200,2016-03-26 10:00:00
1,r2,h_219,c_1,2016-07-16 23:39:55,2016-07-20,11:30:00,2016-07-21,2,20600,2016-07-20 11:30:00
2,r3,h_179,c_1,2016-09-24 10:03:17,2016-10-19,09:00:00,2016-10-22,2,33600,2016-10-19 09:00:00
3,r4,h_214,c_1,2017-03-08 03:20:10,2017-03-29,11:00:00,2017-03-30,4,194400,2017-03-29 11:00:00
4,r5,h_16,c_1,2017-09-05 19:50:37,2017-09-22,10:30:00,2017-09-23,3,68100,2017-09-22 10:30:00


In [49]:
# Year difference: reservation year minus check-in year.
reserve_tb['reserve_datetime'].dt.year - reserve_tb['checkin_datetime'].dt.year

0       0
1       0
2       0
3       0
4       0
       ..
4025    0
4026    0
4027    0
4028    0
4029    0
Length: 4030, dtype: int32

In [52]:
# Month difference: turn each timestamp into a month count (year * 12 + month),
# subtract the two counts, and take the absolute value.
abs( \
(reserve_tb['reserve_datetime'].dt.year * 12 + reserve_tb['reserve_datetime'].dt.month) -  \
(reserve_tb['checkin_datetime'].dt.year * 12 + reserve_tb['checkin_datetime'].dt.month) )

0       0
1       0
2       1
3       0
4       0
       ..
4025    1
4026    1
4027    1
4028    1
4029    1
Length: 4030, dtype: int32

In [54]:
# Day difference between reservation and check-in.
# NOTE(review): .dt.days is taken on (reservation - check-in) before abs(). When the
# reservation precedes check-in the difference is negative and .days rounds it down,
# so a partial day counts as a full one (row 0 is 19 days 20:50:18 apart but yields 20).
# Taking abs() of the timedelta before .dt.days would give elapsed whole days — confirm intent.
abs((reserve_tb['reserve_datetime'] - reserve_tb['checkin_datetime']).dt.days)

0       20
1        4
2       25
3       22
4       17
        ..
4025    13
4026    11
4027    20
4028    24
4029    31
Length: 4030, dtype: int64

Timedelta: 두 시점의 차이(시간 간격)를 나타내는 자료형이다. datetime 값끼리 빼면 결과의 data type은 곧바로 timedelta가 된다.

In [55]:
# Day difference, converted from an integer day count to a timedelta column (date_diff).
# NOTE(review): the day count is abs() of .dt.days on (reservation - check-in), so a
# partial day is counted as a full day when the reservation precedes check-in.
# NOTE(review): the 'timedelta64[D]' unit is not retained — the column ends up as
# timedelta64[s]; pd.to_timedelta(..., unit='D') may express the intent more explicitly — confirm.
reserve_tb['date_diff'] = abs((reserve_tb['reserve_datetime'] - reserve_tb['checkin_datetime']).dt.days).astype('timedelta64[D]')

In [56]:
# Print the dtypes again to check the columns added so far.
reserve_tb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4030 entries, 0 to 4029
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   reserve_id        4030 non-null   object        
 1   hotel_id          4030 non-null   object        
 2   customer_id       4030 non-null   object        
 3   reserve_datetime  4030 non-null   datetime64[ns]
 4   checkin_date      4030 non-null   object        
 5   checkin_time      4030 non-null   object        
 6   checkout_date     4030 non-null   object        
 7   people_num        4030 non-null   int64         
 8   total_price       4030 non-null   int64         
 9   checkin_datetime  4030 non-null   datetime64[ns]
 10  date_diff         4030 non-null   timedelta64[s]
dtypes: datetime64[ns](2), int64(2), object(6), timedelta64[s](1)
memory usage: 346.5+ KB


#### 시간 증분 처리

In [57]:
# Convert to datetime64 and store back (left disabled; presumably because
# reserve_datetime was already converted in an earlier cell — confirm).
# reserve_tb['reserve_datetime'] = pd.to_datetime(reserve_tb['reserve_datetime'], format='%Y-%m-%d %H:%M:%S')

In [None]:
# Keep only the date part of the reservation timestamp in a new reserve_date column,
# then display it.
reserve_tb['reserve_date'] = reserve_tb['reserve_datetime'].dt.date
reserve_tb['reserve_date']

0       2016-03-06
1       2016-07-16
2       2016-09-24
3       2017-03-08
4       2017-09-05
           ...    
4025    2017-06-27
4026    2017-09-29
4027    2018-03-14
4028    2016-04-16
4029    2016-06-06
Name: reserve_date, Length: 4030, dtype: object

datetime.timedelta: 시간 간격을 나타내는 객체로, 날짜/시간 값에 더하거나 빼서 증분을 계산할 때 사용한다.

In [60]:
# Import the standard-library datetime module.
import datetime

# Shift every reservation timestamp forward by one day.
reserve_tb['reserve_datetime'] + datetime.timedelta(days=1)

0      2016-03-07 13:09:42
1      2016-07-17 23:39:55
2      2016-09-25 10:03:17
3      2017-03-09 03:20:10
4      2017-09-06 19:50:37
               ...        
4025   2017-06-28 23:00:02
4026   2017-09-30 05:24:57
4027   2018-03-15 05:01:45
4028   2016-04-17 15:20:17
4029   2016-06-07 08:16:51
Name: reserve_datetime, Length: 4030, dtype: datetime64[ns]

In [None]:
# Shift every reservation timestamp forward by one hour.
reserve_tb['reserve_datetime'] + datetime.timedelta(hours=1)

0      2016-03-06 14:09:42
1      2016-07-17 00:39:55
2      2016-09-24 11:03:17
3      2017-03-08 04:20:10
4      2017-09-05 20:50:37
               ...        
4025   2017-06-28 00:00:02
4026   2017-09-29 06:24:57
4027   2018-03-14 06:01:45
4028   2016-04-16 16:20:17
4029   2016-06-06 09:16:51
Name: reserve_datetime, Length: 4030, dtype: datetime64[ns]

In [62]:
# Shift every reservation timestamp forward by one minute.
reserve_tb['reserve_datetime'] + datetime.timedelta(minutes=1)

0      2016-03-06 13:10:42
1      2016-07-16 23:40:55
2      2016-09-24 10:04:17
3      2017-03-08 03:21:10
4      2017-09-05 19:51:37
               ...        
4025   2017-06-27 23:01:02
4026   2017-09-29 05:25:57
4027   2018-03-14 05:02:45
4028   2016-04-16 15:21:17
4029   2016-06-06 08:17:51
Name: reserve_datetime, Length: 4030, dtype: datetime64[ns]

In [63]:
# Shift every reservation timestamp forward by one second.
reserve_tb['reserve_datetime'] + datetime.timedelta(seconds=1)

0      2016-03-06 13:09:43
1      2016-07-16 23:39:56
2      2016-09-24 10:03:18
3      2017-03-08 03:20:11
4      2017-09-05 19:50:38
               ...        
4025   2017-06-27 23:00:03
4026   2017-09-29 05:24:58
4027   2018-03-14 05:01:46
4028   2016-04-16 15:20:18
4029   2016-06-06 08:16:52
Name: reserve_datetime, Length: 4030, dtype: datetime64[ns]

#### 계절 항목으로 변환

In [None]:
# Convert to datetime64 and store back (left disabled; presumably because
# reserve_datetime was already converted in an earlier cell — confirm).
# reserve_tb['reserve_datetime'] = pd.to_datetime(reserve_tb['reserve_datetime'], format='%Y-%m-%d %H:%M:%S')

In [64]:
# Helper that maps a month number to a season label.
def to_season(month_num):
    """Return the season name for a month number.

    Months 3-5 map to 'spring', 6-8 to 'summer', 9-11 to 'autumn';
    every other value (including 12, 1 and 2) maps to 'winter'.
    """
    season_ranges = (
        (3, 5, 'spring'),
        (6, 8, 'summer'),
        (9, 11, 'autumn'),
    )
    for first_month, last_month, label in season_ranges:
        if first_month <= month_num <= last_month:
            return label
    # Anything outside the three ranges above falls back to winter.
    return 'winter'

In [65]:
# Apply to_season to the reservation month and store the result in a new column
# as categorical data with an explicit category order, then display the column.
reserve_tb['reserve_season'] = pd.Categorical(
    reserve_tb['reserve_datetime'].dt.month.apply(to_season), 
    categories=['spring', 'summer','autumn', 'winter']
)
reserve_tb['reserve_season']

0       spring
1       summer
2       autumn
3       spring
4       autumn
         ...  
4025    summer
4026    autumn
4027    spring
4028    spring
4029    summer
Name: reserve_season, Length: 4030, dtype: category
Categories (4, object): ['spring', 'summer', 'autumn', 'winter']

#### 평일,주말,공휴일 구분

In [67]:
# Load the holiday master and attach its columns to each reservation by matching
# checkin_date against target_day, then preview the first ten rows.
# NOTE(review): pd.merge defaults to an inner join, so reservations whose checkin_date
# has no row in the holiday master would be dropped — confirm the master covers all dates.
holiday_mst = load_holiday_mst()
new_reserve_df = pd.merge(reserve_tb, holiday_mst, left_on='checkin_date', right_on='target_day')
new_reserve_df.head(10)

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date,people_num,total_price,checkin_datetime,date_diff,reserve_date,reserve_season,target_day,holidayday_flg,nextday_is_holiday_flg
0,r1,h_75,c_1,2016-03-06 13:09:42,2016-03-26,10:00:00,2016-03-29,4,97200,2016-03-26 10:00:00,20 days,2016-03-06,spring,2016-03-26,True,True
1,r2,h_219,c_1,2016-07-16 23:39:55,2016-07-20,11:30:00,2016-07-21,2,20600,2016-07-20 11:30:00,4 days,2016-07-16,summer,2016-07-20,False,False
2,r3,h_179,c_1,2016-09-24 10:03:17,2016-10-19,09:00:00,2016-10-22,2,33600,2016-10-19 09:00:00,25 days,2016-09-24,autumn,2016-10-19,False,False
3,r4,h_214,c_1,2017-03-08 03:20:10,2017-03-29,11:00:00,2017-03-30,4,194400,2017-03-29 11:00:00,22 days,2017-03-08,spring,2017-03-29,False,False
4,r5,h_16,c_1,2017-09-05 19:50:37,2017-09-22,10:30:00,2017-09-23,3,68100,2017-09-22 10:30:00,17 days,2017-09-05,autumn,2017-09-22,False,True
5,r6,h_241,c_1,2017-11-27 18:47:05,2017-12-04,12:00:00,2017-12-06,3,36000,2017-12-04 12:00:00,7 days,2017-11-27,autumn,2017-12-04,False,False
6,r7,h_256,c_1,2017-12-29 10:38:36,2018-01-25,10:30:00,2018-01-28,1,103500,2018-01-25 10:30:00,27 days,2017-12-29,winter,2018-01-25,False,False
7,r8,h_241,c_1,2018-05-26 08:42:51,2018-06-08,10:00:00,2018-06-09,1,6000,2018-06-08 10:00:00,14 days,2018-05-26,spring,2018-06-08,False,True
8,r9,h_217,c_2,2016-03-05 13:31:06,2016-03-25,09:30:00,2016-03-27,3,68400,2016-03-25 09:30:00,20 days,2016-03-05,spring,2016-03-25,False,True
9,r10,h_240,c_2,2016-06-25 09:12:22,2016-07-14,11:00:00,2016-07-17,4,320400,2016-07-14 11:00:00,20 days,2016-06-25,summer,2016-07-14,False,False
