#### 데이터 로드하기

In [3]:
import pandas as pd
import os

def load_hotel_reserve():
  customer_tb = pd.read_csv('./data/customer.csv')
  hotel_tb = pd.read_csv('./data/hotel.csv')
  reserve_tb = pd.read_csv('./data/reserve.csv')
  return customer_tb, hotel_tb, reserve_tb


def load_holiday_mst():
  holiday_tb = pd.read_csv('./data/holiday_mst.csv',
                           index_col=False)
  return holiday_tb


def load_production():
  production_tb = pd.read_csv('./data/production.csv')
  return production_tb


def load_production_missing_num():
  production_tb = pd.read_csv('./data/production_missing_num.csv')
  return production_tb


def load_production_missing_category():
  production_tb = pd.read_csv('./awesomebook-master/data/production_missing_category.csv')
  return production_tb


def load_monthly_index():
  monthly_index_tb = pd.read_csv('./data/monthly_index.csv')
  return monthly_index_tb


def load_meros_txt():
  with open('./data/txt/meros.txt', 'r') as f:
    meros = f.read()
    f.close()
  return meros


In [4]:
customer_tb, hotel_tb, reserve_tb = load_hotel_reserve()

reserve_tb.head()

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date,people_num,total_price
0,r1,h_75,c_1,2016-03-06 13:09:42,2016-03-26,10:00:00,2016-03-29,4,97200
1,r2,h_219,c_1,2016-07-16 23:39:55,2016-07-20,11:30:00,2016-07-21,2,20600
2,r3,h_179,c_1,2016-09-24 10:03:17,2016-10-19,09:00:00,2016-10-22,2,33600
3,r4,h_214,c_1,2017-03-08 03:20:10,2017-03-29,11:00:00,2017-03-30,4,194400
4,r5,h_16,c_1,2017-09-05 19:50:37,2017-09-22,10:30:00,2017-09-23,3,68100


### 2. 분석 단위를 손실 없이 변경하기

데이터를 가치 손실 없이 압축하여 단위를 변환    
예) 과목별로 평균값을 계산  
방법: Group_by로 묶어 count, sum 지정

.agg()와 일반 집계 함수(.sum(), .count() 등)의 차이
* 일반 집계 함수는 헌 열에 하나의 집계만 가능하다
```df.groupby('customer_id')['price'].sum()```
```df.groupby('customer_id')['price'].mean()```
* agg()는 열 별로 각각 다른 집계 함수를 한 번에 적용할 수 있다!
- 여러 개의 열에 일괄 적용 
```df.groupby('customer_id')['price'].agg(['sum', 'mean', 'max'])```
- 여러 열에 각각 다른 집계 적용

```df.groupby('customer_id').agg({```   
   ``` 'price': ['sum', 'mean'],      # 가격은 합계와 평균```   
    ```'reserve_id': 'count',          # 예약ID는 개수```   
    ```'visit_date': ['first', 'last'] # 방문일은 첫/마지막```  
```})```

#### 1. agg  산출

* count : 중복 허용
* nunique : 중복 제외

In [14]:
# agg() 함수를 사용한 개수 산출
result = reserve_tb \
         .groupby('hotel_id') \
         .agg( {'reserve_id': 'count', 'customer_id': 'nunique'} )
result.head(2)

Unnamed: 0_level_0,reserve_id,customer_id
hotel_id,Unnamed: 1_level_1,Unnamed: 2_level_1
h_1,10,10
h_10,3,3


In [15]:
result.reset_index(inplace=True)
result.head(5)

Unnamed: 0,hotel_id,reserve_id,customer_id
0,h_1,10,10
1,h_10,3,3
2,h_100,20,19
3,h_101,17,17
4,h_102,13,13


#### 2. sum()을 이용한 매출 합계 산출

In [16]:
result = reserve_tb \
         .groupby( ['hotel_id', 'people_num'])['total_price'] \
         .sum().reset_index()
result.head(2)

Unnamed: 0,hotel_id,people_num,total_price
0,h_1,1,156600
1,h_1,2,156600


In [None]:
result.rename(columns={'total_price':'price_sum'}, inplace=True)
result.head()

Unnamed: 0,hotel_id,people_num,price_sum
0,h_1,1,156600
1,h_1,2,156600
2,h_1,3,391500
3,h_1,4,417600
4,h_10,1,11200


#### 3. max, min, mean, median, percentile(백분위수) 산출

In [19]:
result = reserve_tb \
         .groupby('hotel_id') \
         .agg({ 'total_price' : ['max', 'min','mean','median',
                                 lambda x: np.percentile(x, q=20)] }) \
         .reset_index()
result.head(2)

Unnamed: 0_level_0,hotel_id,total_price,total_price,total_price,total_price,total_price
Unnamed: 0_level_1,Unnamed: 1_level_1,max,min,mean,median,<lambda_0>
0,h_1,208800,26100,112230.0,104400.0,73080.0
1,h_10,67200,11200,42933.333333,50400.0,26880.0


In [20]:
result.columns = ['hotel_id', 'price_max', 'price_min', 'price_mean', 'price_median', 'price_20per']
result.head(5)

Unnamed: 0,hotel_id,price_max,price_min,price_mean,price_median,price_20per
0,h_1,208800,26100,112230.0,104400.0,73080.0
1,h_10,67200,11200,42933.333333,50400.0,26880.0
2,h_100,57600,4800,27600.0,28800.0,9600.0
3,h_101,168000,14000,75764.705882,56000.0,30800.0
4,h_102,72000,12000,32769.230769,24000.0,18000.0


#### 4. Variance(분산)과 Standard deviation(표준편차) 산출

In [37]:
result = reserve_tb \
         .groupby('hotel_id') \
         .agg({ 'total_price' : ['var', 'std'] }) \
         .reset_index()
result_columns = ['hotel_id', 'price_Var', 'price_std']
result.head()

Unnamed: 0_level_0,hotel_id,total_price,total_price
Unnamed: 0_level_1,Unnamed: 1_level_1,var,std
0,h_1,3186549000.0,56449.526127
1,h_10,825813300.0,28736.968061
2,h_100,319831600.0,17883.835689
3,h_101,2402441000.0,49014.703676
4,h_102,357692300.0,18912.755159


In [None]:
result.fillna(0, inplace=True)

#### 5. mode(최빈값) 계산

In [38]:
reserve_tb['total_price'].round(-3).mode()

0    10000
1    20000
2    40000
Name: total_price, dtype: int64

#### 6. rank()를 이용한 순위 계산

In [41]:
# 그룹별로 순서를 정렬하고 순위를 매겨서 새로 추가된 열에 기록
reserve_tb['reserve_datetime'] = pd.to_datetime(
    reserve_tb['reserve_datetime'], format='%Y-%m-%d %H:%M:%S'
)
reserve_tb.head(1)

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date,people_num,total_price
0,r1,h_75,c_1,2016-03-06 13:09:42,2016-03-26,10:00:00,2016-03-29,4,97200


In [42]:
reserve_tb['log_no'] = reserve_tb.groupby('customer_id')['reserve_datetime'] \
                                 .rank(ascending=True, method='first')
reserve_tb.head(1)

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date,people_num,total_price,log_no
0,r1,h_75,c_1,2016-03-06 13:09:42,2016-03-26,10:00:00,2016-03-29,4,97200,1.0


In [46]:
rsv_cnt_tb = reserve_tb.groupby('hotel_id').size().reset_index()
rsv_cnt_tb.columns = ['hotel_id', 'rsv_cnt']
rsv_cnt_tb.head(3)

Unnamed: 0,hotel_id,rsv_cnt
0,h_1,10
1,h_10,3
2,h_100,20


In [47]:
rsv_cnt_tb['rsv_cnt_rank'] = rsv_cnt_tb['rsv_cnt'].rank(ascending=False, method='min')
rsv_cnt_tb.head(3)

Unnamed: 0,hotel_id,rsv_cnt,rsv_cnt_rank
0,h_1,10,235.0
1,h_10,3,300.0
2,h_100,20,12.0


In [48]:
rsv_cnt_tb.drop('rsv_cnt', axis=1, inplace=True)
rsv_cnt_tb.head(3)

Unnamed: 0,hotel_id,rsv_cnt_rank
0,h_1,235.0
1,h_10,300.0
2,h_100,12.0
