In [1]:
import pandas as pd
import os

def load_hotel_reserve():
  customer_tb = pd.read_csv('./data/customer.csv')
  hotel_tb = pd.read_csv('./data/hotel.csv')
  reserve_tb = pd.read_csv('./data/reserve.csv')
  return customer_tb, hotel_tb, reserve_tb


def load_holiday_mst():
  holiday_tb = pd.read_csv('./data/holiday_mst.csv',
                           index_col=False)
  return holiday_tb


def load_production():
  production_tb = pd.read_csv('./data/production.csv')
  return production_tb


def load_production_missing_num():
  production_tb = pd.read_csv('./data/production_missing_num.csv')
  return production_tb


def load_production_missing_category():
  production_tb = pd.read_csv('./awesomebook-master/data/production_missing_category.csv')
  return production_tb


def load_monthly_index():
  monthly_index_tb = pd.read_csv('./data/monthly_index.csv')
  return monthly_index_tb


def load_meros_txt():
  with open('./data/txt/meros.txt', 'r') as f:
    meros = f.read()
    f.close()
  return meros


In [2]:
customer_tb, hotel_tb, reserve_tb = load_hotel_reserve()

print(reserve_tb)

     reserve_id hotel_id customer_id     reserve_datetime checkin_date  \
0            r1     h_75         c_1  2016-03-06 13:09:42   2016-03-26   
1            r2    h_219         c_1  2016-07-16 23:39:55   2016-07-20   
2            r3    h_179         c_1  2016-09-24 10:03:17   2016-10-19   
3            r4    h_214         c_1  2017-03-08 03:20:10   2017-03-29   
4            r5     h_16         c_1  2017-09-05 19:50:37   2017-09-22   
...         ...      ...         ...                  ...          ...   
4025      r4026    h_129       c_999  2017-06-27 23:00:02   2017-07-10   
4026      r4027     h_97       c_999  2017-09-29 05:24:57   2017-10-09   
4027      r4028     h_27       c_999  2018-03-14 05:01:45   2018-04-02   
4028      r4029     h_48      c_1000  2016-04-16 15:20:17   2016-05-10   
4029      r4030    h_117      c_1000  2016-06-06 08:16:51   2016-07-06   

     checkin_time checkout_date  people_num  total_price  
0        10:00:00    2016-03-29           4        9

### 1. 필요한 데이터만 추출하기

#### 지정된 데이터 열 추출

예약테이블에서reserve_id, hotel_id, customer_id, reserve_datetime을 선택하여 추출한다

In [3]:
# 방법1. Reserve_tb의 배열에 추출할 열 이름을 문자열로 지정한다
reserve_tb[['reserve_id', 'hotel_id', 'customer_id', 'reserve_datetime', 'checkin_time', 'checkout_date']]

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_time,checkout_date
0,r1,h_75,c_1,2016-03-06 13:09:42,10:00:00,2016-03-29
1,r2,h_219,c_1,2016-07-16 23:39:55,11:30:00,2016-07-21
2,r3,h_179,c_1,2016-09-24 10:03:17,09:00:00,2016-10-22
3,r4,h_214,c_1,2017-03-08 03:20:10,11:00:00,2017-03-30
4,r5,h_16,c_1,2017-09-05 19:50:37,10:30:00,2017-09-23
...,...,...,...,...,...,...
4025,r4026,h_129,c_999,2017-06-27 23:00:02,09:30:00,2017-07-11
4026,r4027,h_97,c_999,2017-09-29 05:24:57,10:30:00,2017-10-10
4027,r4028,h_27,c_999,2018-03-14 05:01:45,11:30:00,2018-04-04
4028,r4029,h_48,c_1000,2016-04-16 15:20:17,09:30:00,2016-05-13


In [4]:
# 방법2. Loc함수의 2차원 배열의 2차원 항목에 추출할 열 이름을 배열로 지정하여 열을 추출한다
reserve_tb.loc[ : , ['reserve_id', 'hotel_id', 'customer_id', 'reserve_datetime', 'checkin_time', 'checkout_date']]

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_time,checkout_date
0,r1,h_75,c_1,2016-03-06 13:09:42,10:00:00,2016-03-29
1,r2,h_219,c_1,2016-07-16 23:39:55,11:30:00,2016-07-21
2,r3,h_179,c_1,2016-09-24 10:03:17,09:00:00,2016-10-22
3,r4,h_214,c_1,2017-03-08 03:20:10,11:00:00,2017-03-30
4,r5,h_16,c_1,2017-09-05 19:50:37,10:30:00,2017-09-23
...,...,...,...,...,...,...
4025,r4026,h_129,c_999,2017-06-27 23:00:02,09:30:00,2017-07-11
4026,r4027,h_97,c_999,2017-09-29 05:24:57,10:30:00,2017-10-10
4027,r4028,h_27,c_999,2018-03-14 05:01:45,11:30:00,2018-04-04
4028,r4029,h_48,c_1000,2016-04-16 15:20:17,09:30:00,2016-05-13


In [5]:
#  방법3. Drop 함수로 불필요한 열을 제거한다. axis=1은 열을 의미하고, inplace는 reserve_tb를 갱신한다
reserve_tb.drop(['people_num', 'total_price'], axis=1)

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date
0,r1,h_75,c_1,2016-03-06 13:09:42,2016-03-26,10:00:00,2016-03-29
1,r2,h_219,c_1,2016-07-16 23:39:55,2016-07-20,11:30:00,2016-07-21
2,r3,h_179,c_1,2016-09-24 10:03:17,2016-10-19,09:00:00,2016-10-22
3,r4,h_214,c_1,2017-03-08 03:20:10,2017-03-29,11:00:00,2017-03-30
4,r5,h_16,c_1,2017-09-05 19:50:37,2017-09-22,10:30:00,2017-09-23
...,...,...,...,...,...,...,...
4025,r4026,h_129,c_999,2017-06-27 23:00:02,2017-07-10,09:30:00,2017-07-11
4026,r4027,h_97,c_999,2017-09-29 05:24:57,2017-10-09,10:30:00,2017-10-10
4027,r4028,h_27,c_999,2018-03-14 05:01:45,2018-04-02,11:30:00,2018-04-04
4028,r4029,h_48,c_1000,2016-04-16 15:20:17,2016-05-10,09:30:00,2016-05-13


In [6]:
# 최종 결과
# 방법3 적용하기
reserve_tb.drop(['people_num', 'total_price'], axis=1, inplace=True)
reserve_tb.head(1)

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date
0,r1,h_75,c_1,2016-03-06 13:09:42,2016-03-26,10:00:00,2016-03-29


#### 조건이 부여되어 있는 데이터 행 추출

In [7]:
#  방법1 : DataFrame 배열에 지정한 조건의 결과값인 True/False를 가지는 행의 배열을 지정하여 추출한다
#  Checkout_date가 2016-10-13일과 2016-10-14일 사이인 행을 추출
reserve_tb[ (reserve_tb['checkin_date'] >= '2016-10-13') &
            (reserve_tb['checkin_date'] <= '2016-10-14') ].head()

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date
47,r48,h_299,c_8,2016-10-05 08:53:39,2016-10-14,09:30:00,2016-10-16
327,r328,h_48,c_76,2016-10-11 15:51:25,2016-10-14,11:00:00,2016-10-16
959,r960,h_215,c_237,2016-09-19 05:51:20,2016-10-14,11:00:00,2016-10-16
1225,r1226,h_176,c_300,2016-09-19 22:11:19,2016-10-14,09:30:00,2016-10-16
1960,r1961,h_244,c_486,2016-09-19 03:05:50,2016-10-14,12:00:00,2016-10-15


In [8]:
#  방법2 : loc 함수의 2차원 배열의 1차원 항목에 지정한 조건의 결과값인 True/False를 지정하여 추출한다
reserve_tb.loc[ (reserve_tb['checkin_date'] >= '2016-10-13') &
                (reserve_tb['checkin_date'] <= '2016-10-14') , : ].head()


Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date
47,r48,h_299,c_8,2016-10-05 08:53:39,2016-10-14,09:30:00,2016-10-16
327,r328,h_48,c_76,2016-10-11 15:51:25,2016-10-14,11:00:00,2016-10-16
959,r960,h_215,c_237,2016-09-19 05:51:20,2016-10-14,11:00:00,2016-10-16
1225,r1226,h_176,c_300,2016-09-19 22:11:19,2016-10-14,09:30:00,2016-10-16
1960,r1961,h_244,c_486,2016-09-19 03:05:50,2016-10-14,12:00:00,2016-10-15


In [9]:
# 방법3 : Query함수와 조건식을 이용하여 행을 추출한다
reserve_tb.query( ' "2016-10-13" <= checkout_date <= "2016-10-14" ').head()

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date
284,r285,h_121,c_67,2016-09-27 06:13:19,2016-10-12,12:00:00,2016-10-14
513,r514,h_74,c_120,2016-10-06 03:12:04,2016-10-11,12:30:00,2016-10-14
1065,r1066,h_205,c_261,2016-09-14 02:57:59,2016-10-11,10:00:00,2016-10-14
1480,r1481,h_116,c_364,2016-09-17 17:45:39,2016-10-11,11:30:00,2016-10-13
1546,r1547,h_149,c_377,2016-09-27 08:19:24,2016-10-10,11:00:00,2016-10-13


#### 중복성을 고려하지 않은 랜덤 샘플링

In [10]:
# 호텔 예약 레코드를 활용하여 약 50%의 랜덤 샘플링을 수행한다
reserve_tb.sample(frac=0.5)

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date
2553,r2554,h_151,c_639,2016-04-06 18:51:35,2016-05-01,12:30:00,2016-05-03
3163,r3164,h_199,c_796,2018-04-21 23:44:16,2018-05-02,09:00:00,2018-05-03
577,r578,h_282,c_137,2016-05-24 12:41:15,2016-05-29,12:30:00,2016-05-31
2711,r2712,h_39,c_682,2018-04-18 11:31:15,2018-04-21,11:00:00,2018-04-24
3915,r3916,h_278,c_975,2016-06-12 06:11:15,2016-06-20,12:00:00,2016-06-23
...,...,...,...,...,...,...,...
3178,r3179,h_211,c_799,2017-10-11 17:42:03,2017-10-29,11:30:00,2017-11-01
1517,r1518,h_189,c_372,2016-03-06 05:22:09,2016-03-15,11:30:00,2016-03-18
1580,r1581,h_296,c_386,2017-02-19 07:37:10,2017-03-17,11:00:00,2017-03-18
1967,r1968,h_58,c_492,2016-04-04 15:47:30,2016-04-06,12:00:00,2016-04-07


#### 중복성을 제거한 랜덤 샘플링

어느 한 쪽으로 편향된 샘플링을 하게되면 이후 분석에서 잘못된 결과를 도출할 수 있다.

➢ 공평하게 샘플링하려면 *분석 대상의 단위*와 *샘플링 단위*를 서로 맞춰야 한다.  
• 연간 예약 횟수별 고객 수의 비율 계산시 샘플링 전후 분석 결과의 차이가 발생함  
• 이유 → 분석 대상 단위가 고객 한 명인데, 샘플링 단위는 예약한 건수가 됨.   
• 분석 대상의 단위와 샘플링 단위가 서로 달라서 발생한 현상.         

➢ 해결 방법 : 고객들 중 랜덤으로 50%만 선택하고 (예: 100명의 고객이 있으면 50명만 선택), 그 고객들의 데이터를 모두 반환(예: 총 데이터가 1000건이라면 그중 위에서 고른 고객들의 데이터들은 560건)

In [11]:
# 고객 ID를 대상으로 50%로 샘플링한다 
reserve_tb.sample(frac=0.5).head()

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date
423,r424,h_191,c_98,2016-09-11 20:33:31,2016-09-28,09:00:00,2016-09-29
14,r15,h_92,c_2,2018-04-19 11:25:00,2018-05-04,12:30:00,2018-05-05
1504,r1505,h_40,c_369,2018-04-06 03:28:11,2018-05-02,09:00:00,2018-05-05
1459,r1460,h_37,c_357,2016-04-17 10:36:28,2016-05-08,12:30:00,2016-05-09
1297,r1298,h_78,c_316,2016-05-29 07:51:40,2016-06-04,10:00:00,2016-06-05


In [13]:
#  isin 함수로 매개변수로 전달된 리스트 값 중 일치하는 열의 값만을 추출한다.
target = pd.Series(reserve_tb['customer_id'].unique()).sample(frac=0.5)
reserve_tb_sample = reserve_tb[ reserve_tb['customer_id'].isin(target) ]
reserve_tb_sample.head(3)

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date
16,r17,h_115,c_3,2016-05-10 12:20:32,2016-05-17,10:00:00,2016-05-19
17,r18,h_132,c_3,2016-10-22 02:18:48,2016-11-12,12:00:00,2016-11-13
18,r19,h_23,c_3,2017-01-11 22:54:09,2017-02-08,10:00:00,2017-02-10


In [15]:
reserve_tb_sample['reserve_id'].unique()

array(['r17', 'r18', 'r19', ..., 'r4011', 'r4012', 'r4013'],
      shape=(2079,), dtype=object)