## 빅데이터 전처리

### 1회차 – 전처리개요

In [1]:
import pandas as pd
import os

def load_hotel_reserve():
  """Load the customer, hotel and reservation tables from CSV.

  Returns:
    A 3-tuple ``(customer_tb, hotel_tb, reserve_tb)`` of DataFrames read
    from ``./data/customer.csv``, ``./data/hotel.csv`` and
    ``./data/reserve.csv`` (paths are relative to the working directory).
  """
  csv_paths = (
    './data/customer.csv',
    './data/hotel.csv',
    './data/reserve.csv',
  )
  # Files are read in the same order as the returned tuple.
  customers, hotels, reservations = (pd.read_csv(path) for path in csv_paths)
  return customers, hotels, reservations


def load_holiday_mst():
  """Load the holiday master table from ``./data/holiday_mst.csv``.

  Returns:
    A DataFrame with the contents of the CSV file.
  """
  # index_col=False tells pandas never to use the first column as the index.
  return pd.read_csv('./data/holiday_mst.csv', index_col=False)


def load_production():
  """Load the production table from ``./data/production.csv``.

  Returns:
    A DataFrame with the contents of the CSV file.
  """
  return pd.read_csv('./data/production.csv')


def load_production_missing_num():
  """Load the table stored in ``./data/production_missing_num.csv``.

  Returns:
    A DataFrame with the contents of the CSV file.
  """
  return pd.read_csv('./data/production_missing_num.csv')


def load_production_missing_category():
  """Load the table stored in ``./data/production_missing_category.csv``.

  Returns:
    A DataFrame with the contents of the CSV file.
  """
  return pd.read_csv('./data/production_missing_category.csv')


def load_monthly_index():
  """Load the monthly index table from ``./data/monthly_index.csv``.

  Returns:
    A DataFrame with the contents of the CSV file.
  """
  return pd.read_csv('./data/monthly_index.csv')


def load_meros_txt():
  """Read ``./data/txt/meros.txt`` and return its whole content.

  Returns:
    str: the full text of the file.

  Raises:
    OSError: if the file cannot be opened (for example FileNotFoundError
      when it does not exist).
  """
  # The with-statement closes the file on exit, so no explicit close() call
  # is needed.
  # NOTE(review): no encoding is passed, so the platform default is used.
  # Confirm the file's encoding and pass encoding=... if it must be portable.
  with open('./data/txt/meros.txt', 'r') as f:
    return f.read()


In [2]:
# Load the customer, hotel and reservation tables from their CSV files
customer_tb, hotel_tb, reserve_tb = load_hotel_reserve()

# Print the reservation table to inspect its contents
print(reserve_tb)

     reserve_id hotel_id customer_id     reserve_datetime checkin_date  \
0            r1     h_75         c_1  2016-03-06 13:09:42   2016-03-26   
1            r2    h_219         c_1  2016-07-16 23:39:55   2016-07-20   
2            r3    h_179         c_1  2016-09-24 10:03:17   2016-10-19   
3            r4    h_214         c_1  2017-03-08 03:20:10   2017-03-29   
4            r5     h_16         c_1  2017-09-05 19:50:37   2017-09-22   
...         ...      ...         ...                  ...          ...   
4025      r4026    h_129       c_999  2017-06-27 23:00:02   2017-07-10   
4026      r4027     h_97       c_999  2017-09-29 05:24:57   2017-10-09   
4027      r4028     h_27       c_999  2018-03-14 05:01:45   2018-04-02   
4028      r4029     h_48      c_1000  2016-04-16 15:20:17   2016-05-10   
4029      r4030    h_117      c_1000  2016-06-06 08:16:51   2016-07-06   

     checkin_time checkout_date  people_num  total_price  
0        10:00:00    2016-03-29           4        9

### 2회차 – 데이터 구성 전처리


#### 1. 필요한 데이터만 추출하기
- 지정된 데이터 열 추출  

    -> 예약 테이블에서 reserve_id, hotel_id, customer_id, reserve_datetime, checkin_date, checkin_time, checkout_date를 선택하여 추출

In [3]:
# Method 1: index reserve_tb with a list of the column names to extract
reserve_tb[['reserve_id', 'hotel_id', 'customer_id', 'reserve_datetime', 
            'checkin_date', 'checkin_time','checkout_date']]

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date
0,r1,h_75,c_1,2016-03-06 13:09:42,2016-03-26,10:00:00,2016-03-29
1,r2,h_219,c_1,2016-07-16 23:39:55,2016-07-20,11:30:00,2016-07-21
2,r3,h_179,c_1,2016-09-24 10:03:17,2016-10-19,09:00:00,2016-10-22
3,r4,h_214,c_1,2017-03-08 03:20:10,2017-03-29,11:00:00,2017-03-30
4,r5,h_16,c_1,2017-09-05 19:50:37,2017-09-22,10:30:00,2017-09-23
...,...,...,...,...,...,...,...
4025,r4026,h_129,c_999,2017-06-27 23:00:02,2017-07-10,09:30:00,2017-07-11
4026,r4027,h_97,c_999,2017-09-29 05:24:57,2017-10-09,10:30:00,2017-10-10
4027,r4028,h_27,c_999,2018-03-14 05:01:45,2018-04-02,11:30:00,2018-04-04
4028,r4029,h_48,c_1000,2016-04-16 15:20:17,2016-05-10,09:30:00,2016-05-13


In [4]:
# Method 2: use loc and pass the list of column names as its second (column)
# indexer; the ':' in the first (row) position keeps every row
reserve_tb.loc[:, ['reserve_id', 'hotel_id', 'customer_id', 'reserve_datetime', 
                   'checkin_date', 'checkin_time','checkout_date']]

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date
0,r1,h_75,c_1,2016-03-06 13:09:42,2016-03-26,10:00:00,2016-03-29
1,r2,h_219,c_1,2016-07-16 23:39:55,2016-07-20,11:30:00,2016-07-21
2,r3,h_179,c_1,2016-09-24 10:03:17,2016-10-19,09:00:00,2016-10-22
3,r4,h_214,c_1,2017-03-08 03:20:10,2017-03-29,11:00:00,2017-03-30
4,r5,h_16,c_1,2017-09-05 19:50:37,2017-09-22,10:30:00,2017-09-23
...,...,...,...,...,...,...,...
4025,r4026,h_129,c_999,2017-06-27 23:00:02,2017-07-10,09:30:00,2017-07-11
4026,r4027,h_97,c_999,2017-09-29 05:24:57,2017-10-09,10:30:00,2017-10-10
4027,r4028,h_27,c_999,2018-03-14 05:01:45,2018-04-02,11:30:00,2018-04-04
4028,r4029,h_48,c_1000,2016-04-16 15:20:17,2016-05-10,09:30:00,2016-05-13


In [5]:
# Method 3: remove the unneeded columns with drop; axis=1 targets columns and
# inplace=True modifies reserve_tb itself instead of returning a new DataFrame.
# NOTE: because reserve_tb is changed in place, running this cell a second time
# raises KeyError (the two columns no longer exist).
reserve_tb.drop(['people_num', 'total_price'], axis=1, inplace=True)
print(reserve_tb)

     reserve_id hotel_id customer_id     reserve_datetime checkin_date  \
0            r1     h_75         c_1  2016-03-06 13:09:42   2016-03-26   
1            r2    h_219         c_1  2016-07-16 23:39:55   2016-07-20   
2            r3    h_179         c_1  2016-09-24 10:03:17   2016-10-19   
3            r4    h_214         c_1  2017-03-08 03:20:10   2017-03-29   
4            r5     h_16         c_1  2017-09-05 19:50:37   2017-09-22   
...         ...      ...         ...                  ...          ...   
4025      r4026    h_129       c_999  2017-06-27 23:00:02   2017-07-10   
4026      r4027     h_97       c_999  2017-09-29 05:24:57   2017-10-09   
4027      r4028     h_27       c_999  2018-03-14 05:01:45   2018-04-02   
4028      r4029     h_48      c_1000  2016-04-16 15:20:17   2016-05-10   
4029      r4030    h_117      c_1000  2016-06-06 08:16:51   2016-07-06   

     checkin_time checkout_date  
0        10:00:00    2016-03-29  
1        11:30:00    2016-07-21  
2        

- 조건이 부여되어 있는 데이터 행 추출

In [6]:
# Parse the checkout_date column into pandas datetime values
reserve_tb['checkout_date'] = pd.to_datetime(reserve_tb['checkout_date'])

In [7]:
# Method 1: index the DataFrame with a boolean (True/False) mask built from the
# conditions; only the rows whose mask value is True are extracted
reserve_tb[(reserve_tb['checkout_date'] >= '2016-10-13') &
           (reserve_tb['checkout_date'] <= '2016-10-14')]

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date
284,r285,h_121,c_67,2016-09-27 06:13:19,2016-10-12,12:00:00,2016-10-14
513,r514,h_74,c_120,2016-10-06 03:12:04,2016-10-11,12:30:00,2016-10-14
1065,r1066,h_205,c_261,2016-09-14 02:57:59,2016-10-11,10:00:00,2016-10-14
1480,r1481,h_116,c_364,2016-09-17 17:45:39,2016-10-11,11:30:00,2016-10-13
1546,r1547,h_149,c_377,2016-09-27 08:19:24,2016-10-10,11:00:00,2016-10-13
1709,r1710,h_59,c_422,2016-09-19 04:17:25,2016-10-10,12:00:00,2016-10-13
1932,r1933,h_113,c_477,2016-09-24 09:04:26,2016-10-12,11:30:00,2016-10-13
2058,r2059,h_9,c_517,2016-09-19 15:32:35,2016-10-11,12:30:00,2016-10-13
2115,r2116,h_77,c_527,2016-10-05 00:44:09,2016-10-11,09:00:00,2016-10-13
2170,r2171,h_177,c_540,2016-09-28 01:21:26,2016-10-11,10:00:00,2016-10-13


In [8]:
# Method 2: use loc and pass the boolean (True/False) mask as its first (row)
# indexer; the ':' in the second (column) position keeps every column
reserve_tb.loc[(reserve_tb['checkout_date'] >= '2016-10-13') &
           (reserve_tb['checkout_date'] <= '2016-10-14'), :]

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date
284,r285,h_121,c_67,2016-09-27 06:13:19,2016-10-12,12:00:00,2016-10-14
513,r514,h_74,c_120,2016-10-06 03:12:04,2016-10-11,12:30:00,2016-10-14
1065,r1066,h_205,c_261,2016-09-14 02:57:59,2016-10-11,10:00:00,2016-10-14
1480,r1481,h_116,c_364,2016-09-17 17:45:39,2016-10-11,11:30:00,2016-10-13
1546,r1547,h_149,c_377,2016-09-27 08:19:24,2016-10-10,11:00:00,2016-10-13
1709,r1710,h_59,c_422,2016-09-19 04:17:25,2016-10-10,12:00:00,2016-10-13
1932,r1933,h_113,c_477,2016-09-24 09:04:26,2016-10-12,11:30:00,2016-10-13
2058,r2059,h_9,c_517,2016-09-19 15:32:35,2016-10-11,12:30:00,2016-10-13
2115,r2116,h_77,c_527,2016-10-05 00:44:09,2016-10-11,09:00:00,2016-10-13
2170,r2171,h_177,c_540,2016-09-28 01:21:26,2016-10-11,10:00:00,2016-10-13


In [9]:
# Method 3: extract the rows with the query function and a condition string
reserve_tb.query('"2016-10-13" <= checkout_date <= "2016-10-14"')

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date
284,r285,h_121,c_67,2016-09-27 06:13:19,2016-10-12,12:00:00,2016-10-14
513,r514,h_74,c_120,2016-10-06 03:12:04,2016-10-11,12:30:00,2016-10-14
1065,r1066,h_205,c_261,2016-09-14 02:57:59,2016-10-11,10:00:00,2016-10-14
1480,r1481,h_116,c_364,2016-09-17 17:45:39,2016-10-11,11:30:00,2016-10-13
1546,r1547,h_149,c_377,2016-09-27 08:19:24,2016-10-10,11:00:00,2016-10-13
1709,r1710,h_59,c_422,2016-09-19 04:17:25,2016-10-10,12:00:00,2016-10-13
1932,r1933,h_113,c_477,2016-09-24 09:04:26,2016-10-12,11:30:00,2016-10-13
2058,r2059,h_9,c_517,2016-09-19 15:32:35,2016-10-11,12:30:00,2016-10-13
2115,r2116,h_77,c_527,2016-10-05 00:44:09,2016-10-11,09:00:00,2016-10-13
2170,r2171,h_177,c_540,2016-09-28 01:21:26,2016-10-11,10:00:00,2016-10-13


- 중복성을 고려하지 않은 랜덤 샘플링

In [10]:
# Randomly sample about 50% of the hotel reservation records.
# No random_state is given, so each run returns a different sample.
reserve_tb.sample(frac=0.5)

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date
3316,r3317,h_134,c_831,2016-08-24 05:02:38,2016-09-01,10:30:00,2016-09-04
3276,r3277,h_151,c_824,2017-01-19 03:41:43,2017-02-11,12:30:00,2017-02-14
1623,r1624,h_162,c_398,2016-02-14 01:57:08,2016-03-02,10:00:00,2016-03-03
2276,r2277,h_36,c_570,2016-12-15 16:58:32,2017-01-09,09:00:00,2017-01-11
1866,r1867,h_206,c_462,2016-07-08 03:20:55,2016-07-14,09:00:00,2016-07-16
...,...,...,...,...,...,...,...
2408,r2409,h_63,c_603,2016-11-20 07:02:30,2016-11-27,10:30:00,2016-11-29
297,r298,h_201,c_69,2018-07-08 19:43:45,2018-07-20,10:30:00,2018-07-21
588,r589,h_163,c_143,2016-12-30 09:47:00,2017-01-15,12:30:00,2017-01-18
492,r493,h_293,c_114,2017-08-08 06:17:16,2017-08-14,09:00:00,2017-08-17


고객 ID에 기반한 샘플링 – 중복성 제거  

-> 어느 한쪽으로 편향된 샘플링을 하게 되면 이후 분석에서 잘못된 결과를 도출할 수 있음

-> 공평하게 샘플링하려면 분석 대상의 단위와 샘플링 단위를 서로 맞춰야 함

-> 해결 방법: 예약 테이블의 고객 ID를 대상으로 랜덤 샘플링을 실행하고 샘플링한 고객 ID의 예약 레코드만 추출

- 집약 ID에 기반한 샘플링

In [11]:
# Remove duplicate customer IDs with unique, wrap the result in a pandas.Series
# so that sample can be called on it, and randomly pick 50% of the IDs
target = pd.Series(reserve_tb['customer_id'].unique()).sample(frac=0.5)

In [12]:
# Keep only the rows whose customer_id matches one of the sampled IDs; isin
# returns True for values contained in the collection passed as its argument
reserve_tb[reserve_tb['customer_id'].isin(target)]

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date
16,r17,h_115,c_3,2016-05-10 12:20:32,2016-05-17,10:00:00,2016-05-19
17,r18,h_132,c_3,2016-10-22 02:18:48,2016-11-12,12:00:00,2016-11-13
18,r19,h_23,c_3,2017-01-11 22:54:09,2017-02-08,10:00:00,2017-02-10
19,r20,h_292,c_3,2017-02-23 07:10:30,2017-03-03,11:00:00,2017-03-04
20,r21,h_153,c_3,2017-04-06 18:12:10,2017-04-16,09:00:00,2017-04-19
...,...,...,...,...,...,...,...
4001,r4002,h_263,c_993,2017-02-20 16:13:02,2017-02-27,10:30:00,2017-02-28
4002,r4003,h_262,c_993,2017-08-01 23:47:40,2017-08-03,10:00:00,2017-08-05
4003,r4004,h_201,c_993,2017-08-31 01:33:58,2017-09-13,09:30:00,2017-09-15
4020,r4021,h_159,c_997,2016-01-08 20:30:10,2016-01-09,12:00:00,2016-01-11
