In [1]:
import pandas as pd
import numpy as np

data = pd.read_csv('data/hotel_bookings.csv')
data.shape

(119390, 32)

In [2]:
data.columns

Index(['hotel', 'is_canceled', 'lead_time', 'arrival_date_year',
       'arrival_date_month', 'arrival_date_week_number',
       'arrival_date_day_of_month', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'adults', 'children', 'babies', 'meal',
       'country', 'market_segment', 'distribution_channel',
       'is_repeated_guest', 'previous_cancellations',
       'previous_bookings_not_canceled', 'reserved_room_type',
       'assigned_room_type', 'booking_changes', 'deposit_type', 'agent',
       'company', 'days_in_waiting_list', 'customer_type', 'adr',
       'required_car_parking_spaces', 'total_of_special_requests',
       'reservation_status', 'reservation_status_date'],
      dtype='object')

## feature 컬럼 전처리

In [3]:
# Direct/non direct
data.loc[data["distribution_channel"]=='Direct', 'Direct(bool)'] = 1
data.loc[data["distribution_channel"]!='Direct', 'Direct(bool)'] = 0

#previous_cancel = 0 or <0
data['precancel(bool)'] = 1
data.loc[data['previous_cancellations'] == 0, 'precancel(bool)'] = 0

#previous_book_not_cancel = 0 or <0
data['prebook(bool)'] = 1
data.loc[data['previous_bookings_not_canceled'] == 0, 'prebook(bool)'] = 0

#change_room (방이 바뀌었으면 1)
data['change_room(bool)'] = 0
data.loc[data['assigned_room_type'] != data['reserved_room_type'], 'change_room(bool)'] = 1

#예약 사항 변경(booking changes =0 or <0)
data['reserve_change(bool)'] = 0
data.loc[data['previous_cancellations'] > 0, 'reserve_change(bool)'] = 1 

#주차 공간 요구
data.loc[data['required_car_parking_spaces'] == 0, 'parkinglot(bool)'] = 0
data.loc[data['required_car_parking_spaces'] > 0,'parkinglot(bool)'] = 1

#특별 요청 사항 요구
data.loc[data['total_of_special_requests'] == 0, 'special_request(bool)'] = 0
data.loc[data['total_of_special_requests'] > 0,'special_request(bool)'] = 1

#customer type 원 핫인코딩
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(data["customer_type"])
label_encoded = le.transform(data["customer_type"])
data["customer_type(label)"] = label_encoded.reshape((-1, 1))
ohe = pd.get_dummies(data["customer_type"]).reset_index()
data_resetindex = data.reset_index()
data = data_resetindex.merge(ohe,on='index')

#투숙객이 0명인 데이터 삭제
data['group_total'] = data['adults'] + data['children'] + data['babies']

num1 = data[data['group_total'] == 0].index
data = data.drop(num1)

In [4]:
data.head()

Unnamed: 0,index,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,...,change_room(bool),reserve_change(bool),parkinglot(bool),special_request(bool),customer_type(label),Contract,Group,Transient,Transient-Party,group_total
0,0,Resort Hotel,0,342,2015,July,27,1,0,0,...,0,0,0.0,0.0,2,0,0,1,0,2.0
1,1,Resort Hotel,0,737,2015,July,27,1,0,0,...,0,0,0.0,0.0,2,0,0,1,0,2.0
2,2,Resort Hotel,0,7,2015,July,27,1,0,1,...,1,0,0.0,0.0,2,0,0,1,0,1.0
3,3,Resort Hotel,0,13,2015,July,27,1,0,1,...,0,0,0.0,0.0,2,0,0,1,0,1.0
4,4,Resort Hotel,0,14,2015,July,27,1,0,2,...,0,0,0.0,1.0,2,0,0,1,0,2.0


In [5]:
label=['is_canceled']
feature = ['hotel','is_canceled','lead_time','Direct(bool)','precancel(bool)','prebook(bool)','change_room(bool)','reserve_change(bool)', 'parkinglot(bool)','special_request(bool)','customer_type(label)','Contract','Group','Transient','Transient-Party']

In [6]:
ml_data = data[feature]
ml_data

Unnamed: 0,hotel,is_canceled,lead_time,Direct(bool),precancel(bool),prebook(bool),change_room(bool),reserve_change(bool),parkinglot(bool),special_request(bool),customer_type(label),Contract,Group,Transient,Transient-Party
0,Resort Hotel,0,342,1.0,0,0,0,0,0.0,0.0,2,0,0,1,0
1,Resort Hotel,0,737,1.0,0,0,0,0,0.0,0.0,2,0,0,1,0
2,Resort Hotel,0,7,1.0,0,0,1,0,0.0,0.0,2,0,0,1,0
3,Resort Hotel,0,13,0.0,0,0,0,0,0.0,0.0,2,0,0,1,0
4,Resort Hotel,0,14,0.0,0,0,0,0,0.0,1.0,2,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119385,City Hotel,0,23,0.0,0,0,0,0,0.0,0.0,2,0,0,1,0
119386,City Hotel,0,102,0.0,0,0,0,0,0.0,1.0,2,0,0,1,0
119387,City Hotel,0,34,0.0,0,0,0,0,0.0,1.0,2,0,0,1,0
119388,City Hotel,0,109,0.0,0,0,0,0,0.0,0.0,2,0,0,1,0


In [7]:
ml_resort_data = ml_data[ml_data["hotel"]=="Resort Hotel"]
ml_city_data = ml_data[ml_data["hotel"]=="City Hotel"]

In [8]:
ml_resort_data = ml_resort_data.drop(['hotel'], axis=1)
ml_city_data = ml_city_data.drop(['hotel'], axis=1)
ml_city_data = ml_city_data.reset_index(drop=True)

In [9]:
ml_city_data.to_csv("data/ml_city_data.csv")

In [10]:
ml_resort_data.to_csv("data/ml_resort_data.csv")