# Hotel Booking Demand - Processing

In [75]:
import pandas as pd
import numpy as np

In [208]:
# H1 is a resort hotel
h1 = pd.read_csv('H1.csv')
h1.head()

Unnamed: 0,IsCanceled,LeadTime,ArrivalDateYear,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,...,DepositType,Agent,Company,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,ReservationStatusDate
0,0,342,2015,July,27,1,0,0,2,0,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,0,737,2015,July,27,1,0,0,2,0,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,0,7,2015,July,27,1,0,1,1,0,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,0,13,2015,July,27,1,0,1,1,0,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,0,14,2015,July,27,1,0,2,2,0,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [209]:
h1.columns

Index(['IsCanceled', 'LeadTime', 'ArrivalDateYear', 'ArrivalDateMonth',
       'ArrivalDateWeekNumber', 'ArrivalDateDayOfMonth',
       'StaysInWeekendNights', 'StaysInWeekNights', 'Adults', 'Children',
       'Babies', 'Meal', 'Country', 'MarketSegment', 'DistributionChannel',
       'IsRepeatedGuest', 'PreviousCancellations',
       'PreviousBookingsNotCanceled', 'ReservedRoomType', 'AssignedRoomType',
       'BookingChanges', 'DepositType', 'Agent', 'Company',
       'DaysInWaitingList', 'CustomerType', 'ADR', 'RequiredCarParkingSpaces',
       'TotalOfSpecialRequests', 'ReservationStatus', 'ReservationStatusDate'],
      dtype='object')

#### Verify data types for each column

In [210]:
types = []
for column in h1.columns:
    types.append(type(h1[column][0]))
    
col_type = dict(zip(h1.columns, types))

In [211]:
col_type

{'IsCanceled': numpy.int64,
 'LeadTime': numpy.int64,
 'ArrivalDateYear': numpy.int64,
 'ArrivalDateMonth': str,
 'ArrivalDateWeekNumber': numpy.int64,
 'ArrivalDateDayOfMonth': numpy.int64,
 'StaysInWeekendNights': numpy.int64,
 'StaysInWeekNights': numpy.int64,
 'Adults': numpy.int64,
 'Children': numpy.int64,
 'Babies': numpy.int64,
 'Meal': str,
 'Country': str,
 'MarketSegment': str,
 'DistributionChannel': str,
 'IsRepeatedGuest': numpy.int64,
 'PreviousCancellations': numpy.int64,
 'PreviousBookingsNotCanceled': numpy.int64,
 'ReservedRoomType': str,
 'AssignedRoomType': str,
 'BookingChanges': numpy.int64,
 'DepositType': str,
 'Agent': str,
 'Company': str,
 'DaysInWaitingList': numpy.int64,
 'CustomerType': str,
 'ADR': numpy.float64,
 'RequiredCarParkingSpaces': numpy.int64,
 'TotalOfSpecialRequests': numpy.int64,
 'ReservationStatus': str,
 'ReservationStatusDate': str}

In [212]:
h1['ReservedRoomType'].unique()

array(['C               ', 'A               ', 'D               ',
       'E               ', 'G               ', 'F               ',
       'H               ', 'L               ', 'P               ',
       'B               '], dtype=object)

In [213]:
h1['ReservedRoomType'] = h1['ReservedRoomType'].str.strip()
h1['ReservedRoomType'].unique()

array(['C', 'A', 'D', 'E', 'G', 'F', 'H', 'L', 'P', 'B'], dtype=object)

In [214]:
h1['AssignedRoomType'].unique()

array(['C               ', 'A               ', 'D               ',
       'E               ', 'G               ', 'F               ',
       'I               ', 'B               ', 'H               ',
       'P               ', 'L               '], dtype=object)

In [215]:
h1['AssignedRoomType'] = h1['AssignedRoomType'].str.strip()
h1['AssignedRoomType'].unique()

array(['C', 'A', 'D', 'E', 'G', 'F', 'I', 'B', 'H', 'P', 'L'],
      dtype=object)

In [216]:
h1['Agent'].unique()[:20]

array(['       NULL', '        304', '        240', '        303',
       '         15', '        241', '          8', '        250',
       '        115', '          5', '        175', '        134',
       '        156', '        243', '        242', '          3',
       '        105', '         40', '        147', '        306'],
      dtype=object)

In [217]:
h1['Agent'] = h1['Agent'].str.strip()
h1['Agent'].unique()[:20]

array(['NULL', '304', '240', '303', '15', '241', '8', '250', '115', '5',
       '175', '134', '156', '243', '242', '3', '105', '40', '147', '306'],
      dtype=object)

In [218]:
h1['Company'].unique()[:20]

array(['       NULL', '        110', '        113', '        270',
       '        178', '        240', '        154', '        144',
       '        307', '        268', '         59', '        204',
       '        312', '        318', '         94', '        174',
       '        274', '        195', '        223', '        317'],
      dtype=object)

In [219]:
h1['Company'] = h1['Company'].str.strip()
h1['Company'].unique()[:20]

array(['NULL', '110', '113', '270', '178', '240', '154', '144', '307',
       '268', '59', '204', '312', '318', '94', '174', '274', '195', '223',
       '317'], dtype=object)

In [220]:
h1['CustomerType'] = h1['CustomerType'].str.strip()
h1['CustomerType'].unique()

array(['Transient', 'Contract', 'Transient-Party', 'Group'], dtype=object)

In [221]:
h1['DepositType'] = h1['DepositType'].str.strip()
h1['DepositType'].unique()

array(['No Deposit', 'Refundable', 'Non Refund'], dtype=object)

----
#### Converting `ArrivalDateMonth` column from string to integer

In [10]:
# create month dict
month_dict = {'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5, 'June': 6, 'July': 7, 'August': 8, 'September': 9, 'October': 10, 'November': 11, 'December': 12}

# define month function to apply to column
def month(val):
    return month_dict[val]

In [11]:
# apply month() to column, reassign to dataframe
h1['ArrivalDateMonth'] = h1['ArrivalDateMonth'].apply(month)

Create new column `ArrivalDate`, which is `ArrivalDateYear`, `ArrivalDateMonth`, `ArrivalDateDayOfMonth` in YYYY-MM-DD format (as in `ReservationStatusDate`).

In [13]:
def create_arrival_date(s):
    return "{}-{}-{}".format(s['ArrivalDateYear'], s['ArrivalDateMonth'], s['ArrivalDateDayOfMonth'])

arrival_dates = h1.apply(create_arrival_date, axis=1)

In [14]:
arrival_dates

0         2015-7-1
1         2015-7-1
2         2015-7-1
3         2015-7-1
4         2015-7-1
           ...    
40055    2017-8-31
40056    2017-8-30
40057    2017-8-29
40058    2017-8-31
40059    2017-8-31
Length: 40060, dtype: object

----

Check relationship between `IsCanceled` and `ReservationStatus`

In [20]:
h1['ReservationStatus'].value_counts()

Check-Out    28938
Canceled     10831
No-Show        291
Name: ReservationStatus, dtype: int64

In [21]:
h1[['IsCanceled', 'ReservationStatus']].value_counts()

IsCanceled  ReservationStatus
0           Check-Out            28938
1           Canceled             10831
            No-Show                291
dtype: int64

In [27]:
h1[(h1['IsCanceled']==1) & (h1['ReservationStatus'] =='No-Show')]

Unnamed: 0,IsCanceled,LeadTime,ArrivalDateYear,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,...,DepositType,Agent,Company,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,ReservationStatusDate
0,0,342,2015,7,27,1,0,0,2,0,...,No Deposit,,,0,Transient,0.00,0,0,Check-Out,2015-07-01
1,0,737,2015,7,27,1,0,0,2,0,...,No Deposit,,,0,Transient,0.00,0,0,Check-Out,2015-07-01
2,0,7,2015,7,27,1,0,1,1,0,...,No Deposit,,,0,Transient,75.00,0,0,Check-Out,2015-07-02
3,0,13,2015,7,27,1,0,1,1,0,...,No Deposit,304,,0,Transient,75.00,0,0,Check-Out,2015-07-02
4,0,14,2015,7,27,1,0,2,2,0,...,No Deposit,240,,0,Transient,98.00,0,1,Check-Out,2015-07-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40055,0,212,2017,8,35,31,2,8,2,1,...,No Deposit,143,,0,Transient,89.75,0,0,Check-Out,2017-09-10
40056,0,169,2017,8,35,30,2,9,2,0,...,No Deposit,250,,0,Transient-Party,202.27,0,1,Check-Out,2017-09-10
40057,0,204,2017,8,35,29,4,10,2,0,...,No Deposit,250,,0,Transient,153.57,0,3,Check-Out,2017-09-12
40058,0,211,2017,8,35,31,4,10,2,0,...,No Deposit,40,,0,Contract,112.80,0,1,Check-Out,2017-09-14


In [26]:
291 / h1.shape[0]

0.007264103844233649

### Removing No-Shows (tentative)

Because less than 1% of customers fail to show up in H1, I will remove them from the dataset.

In [222]:
h1 = h1[~((h1['IsCanceled']==1) & (h1['ReservationStatus'] =='No-Show'))]

#### Remove `Country` column

In [223]:
h1 = h1.drop('Country', axis=1)

#### Examine columns related to travel agencies, tour operators, booking channels 

A **hotel distribution channel** can be any method or platform by which your hotel sells its rooms. Examples include online travel agents, booking engines (website + social media), phone/email/walk-ins, metasearch, global distribution system.

In [48]:
h1['DistributionChannel'].value_counts()

TA/TO        28925
Direct        7865
Corporate     3269
Name: DistributionChannel, dtype: int64

In [47]:
h1 = h1[h1['DistributionChannel'] != 'Undefined']

In [52]:
misc = ['MarketSegment', 'DepositType', 'Agent', 'Company', 'CustomerType']

In [55]:
h1ta = h1[misc]
h1ta.head()

Unnamed: 0,MarketSegment,DepositType,Agent,Company,CustomerType
0,Direct,No Deposit,,,Transient
1,Direct,No Deposit,,,Transient
2,Direct,No Deposit,,,Transient
3,Corporate,No Deposit,304.0,,Transient
4,Online TA,No Deposit,240.0,,Transient


In [69]:
print(len(h1ta['Company'].unique()))

h1ta['Company'].value_counts()

236


       NULL    36951
        223      784
        281      138
        154      133
        405      100
               ...  
         10        1
        246        1
        413        1
        408        1
        410        1
Name: Company, Length: 236, dtype: int64

In [70]:
print(len(h1ta['Agent'].unique()))

h1ta['Agent'].value_counts()

186


        240    13905
       NULL     8208
        250     2869
        241     1721
         40     1002
               ...  
         64        1
        406        1
        333        1
        431        1
        187        1
Name: Agent, Length: 186, dtype: int64

In [73]:
# since company and agent are categorical variables and some agencies/companies appear a few times throughout the dataset, will convert
# these two variables to binary (1 if it was booked through an agency/company, 0 if not). 

# Also in each dataset, agent/company is "incomplete" (i.e. we see company #410 but only 236 unique companies are present in H1) since
# each dataset is compiled from a database -- companies that don't appear in H1 can appear in H2, vice versa, and 
# the same applies for `agent`

In [254]:
with_agent = np.where(h1['Agent']=='NULL', 0, 1)
with_company = np.where(h1['Company']=='NULL', 0, 1)

In [255]:
h1['agent'] = with_agent
h1['company'] = with_company

In [63]:
# calculating % of bookings that had a deposit
dt_vc = h1ta['DepositType'].value_counts()
(dt_vc[1] + dt_vc[2]) / h1.shape[0]

0.04645647669687211

In [120]:
h1[['IsCanceled', 'DepositType']].value_counts(normalize=True)

IsCanceled  DepositType    
0           No Deposit         0.717641
1           No Deposit         0.235902
            Non Refund         0.041189
0           Refundable         0.002996
            Non Refund         0.001722
1           Refundable         0.000549
dtype: float64

In [126]:
h1[['DepositType', 'agent']].value_counts(normalize=True)

DepositType      agent
No Deposit       1        0.763823
                 0        0.189720
Non Refund       1        0.030505
                 0        0.012407
Refundable       0        0.002771
                 1        0.000774
dtype: float64

In [131]:
h1[['DepositType', 'company']].value_counts(normalize=True)

DepositType      company
No Deposit       0          0.881550
                 1          0.071994
Non Refund       0          0.039617
                 1          0.003295
Refundable       1          0.002297
                 0          0.001248
dtype: float64

In [133]:
deposit_type = {
    'No Deposit': 0,
    'Non Refund': 1, 
    'Refundable': 2
}

def encode_deposit_type(entry):
    return deposit_type[entry]

h1['deposit_type'] = h1['DepositType'].str.strip().apply(encode_deposit_type)

-----

In [94]:
h1['CustomerType'].value_counts()

Transient          30208
Transient-Party     7791
Contract            1776
Group                284
Name: CustomerType, dtype: int64

In [107]:
customer_type = {
    'Transient': 1,
    'Transient-Party': 2, 
    'Contract': 3,
    'Group': 4
}

def encode_customer_type(s):
    return customer_type[s]

In [108]:
h1['CustomerType'] = h1['CustomerType'].apply(encode_customer_type)

In [109]:
h1['CustomerType']

0        1
1        1
2        1
3        1
4        1
        ..
40055    1
40056    2
40057    1
40058    3
40059    1
Name: CustomerType, Length: 40059, dtype: int64

----

In [122]:
h1[['MarketSegment', 'IsCanceled']].value_counts()

MarketSegment  IsCanceled
Online TA      0             11481
Offline TA/TO  0              6334
Online TA      1              6248
Direct         0              5634
Groups         0              3362
               1              2474
Corporate      0              1958
Offline TA/TO  1              1138
Direct         1               878
Corporate      1               351
Complementary  0               168
               1                33
dtype: int64

----
#### Encode `Meal` column

- Undefined/SC – no meal package;
- BB – Bed & Breakfast;
- HB – Half board (breakfast and one other meal – usually dinner);
- FB – Full board (breakfast, lunch and dinner)

Convert to binary variable: 1 if there is a meal package (BB, HB, FB), 0 otherwise

In [96]:
h1['Meal'].value_counts()

BB           30005
HB            8045
Undefined     1169
FB             754
SC              86
Name: Meal, dtype: int64

In [239]:
meals = {
    'BB': 1,
    'HB': 1,
    'FB': 1,
    'SC': 0,
    'Undefined': 0
}

def encode_meals(s):
    return meals[s]

In [246]:
h1['Meal'] = h1['Meal'].str.strip().apply(encode_meals)

----
#### Creating new variable based on room assignment

`AssignedRoomType` and `ReservedRoomType` are encoded for anonymity. It may be helpful to look at the types of rooms, if requested but not assigned (and also the other way around), that will cause people to cancel their bookings. With this in mind, we can combine these two columns to create binary variable - whether the assigned room is different from the reserved room.

Note: According to the metadata, customers may request to change their room type, but we cannot tell if the room type differences is due to overbooking or by request because the room types are coded.

In [226]:
h1[['AssignedRoomType', 'ReservedRoomType']]

Unnamed: 0,AssignedRoomType,ReservedRoomType
0,C,C
1,C,C
2,C,A
3,A,A
4,A,A
...,...,...
40055,A,A
40056,E,E
40057,E,E
40058,D,D


In [227]:
# Factor in `BookingChanges`?
h1['room_type_difference'] = np.where(h1['AssignedRoomType'] == h1['ReservedRoomType'], 0, 1)

-----

In [228]:
h1.head()

Unnamed: 0,IsCanceled,LeadTime,ArrivalDateYear,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,...,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,ReservationStatusDate,agent,company,room_type_difference
0,0,342,2015,July,27,1,0,0,2,0,...,0,Transient,0.0,0,0,Check-Out,2015-07-01,0,0,0
1,0,737,2015,July,27,1,0,0,2,0,...,0,Transient,0.0,0,0,Check-Out,2015-07-01,0,0,0
2,0,7,2015,July,27,1,0,1,1,0,...,0,Transient,75.0,0,0,Check-Out,2015-07-02,0,0,1
3,0,13,2015,July,27,1,0,1,1,0,...,0,Transient,75.0,0,0,Check-Out,2015-07-02,1,0,0
4,0,14,2015,July,27,1,0,2,2,0,...,0,Transient,98.0,0,1,Check-Out,2015-07-03,1,0,0


In [229]:
h1.columns

Index(['IsCanceled', 'LeadTime', 'ArrivalDateYear', 'ArrivalDateMonth',
       'ArrivalDateWeekNumber', 'ArrivalDateDayOfMonth',
       'StaysInWeekendNights', 'StaysInWeekNights', 'Adults', 'Children',
       'Babies', 'Meal', 'MarketSegment', 'DistributionChannel',
       'IsRepeatedGuest', 'PreviousCancellations',
       'PreviousBookingsNotCanceled', 'ReservedRoomType', 'AssignedRoomType',
       'BookingChanges', 'DepositType', 'Agent', 'Company',
       'DaysInWaitingList', 'CustomerType', 'ADR', 'RequiredCarParkingSpaces',
       'TotalOfSpecialRequests', 'ReservationStatus', 'ReservationStatusDate',
       'agent', 'company', 'room_type_difference'],
      dtype='object')

In [230]:
h1 = h1.join(pd.get_dummies(h1['DepositType']))

In [231]:
h1 = h1.join(pd.get_dummies(h1['CustomerType']))

In [241]:
h1 = h1.drop(columns=['ReservationStatus'])

In [256]:
h1.to_csv('h1_proc2.csv', index=False)

------

# Processing for H2

In [236]:
h2 = pd.read_csv('h2.csv')
h2.head()

Unnamed: 0,IsCanceled,LeadTime,ArrivalDateYear,ArrivalDateMonth,ArrivalDateWeekNumber,ArrivalDateDayOfMonth,StaysInWeekendNights,StaysInWeekNights,Adults,Children,...,DepositType,Agent,Company,DaysInWaitingList,CustomerType,ADR,RequiredCarParkingSpaces,TotalOfSpecialRequests,ReservationStatus,ReservationStatusDate
0,0,6,2015,July,27,1,0,2,1,0.0,...,No Deposit,6,,0,Transient,0.0,0,0,Check-Out,2015-07-03
1,1,88,2015,July,27,1,0,4,2,0.0,...,No Deposit,9,,0,Transient,76.5,0,1,Canceled,2015-07-01
2,1,65,2015,July,27,1,0,4,1,0.0,...,No Deposit,9,,0,Transient,68.0,0,1,Canceled,2015-04-30
3,1,92,2015,July,27,1,2,4,2,0.0,...,No Deposit,9,,0,Transient,76.5,0,2,Canceled,2015-06-23
4,1,100,2015,July,27,2,0,2,2,0.0,...,No Deposit,9,,0,Transient,76.5,0,1,Canceled,2015-04-02


In [210]:
types = []
for column in h2.columns:
    types.append(type(h2[column][0]))
    
col_type = dict(zip(h2.columns, types))

In [211]:
col_type

{'IsCanceled': numpy.int64,
 'LeadTime': numpy.int64,
 'ArrivalDateYear': numpy.int64,
 'ArrivalDateMonth': str,
 'ArrivalDateWeekNumber': numpy.int64,
 'ArrivalDateDayOfMonth': numpy.int64,
 'StaysInWeekendNights': numpy.int64,
 'StaysInWeekNights': numpy.int64,
 'Adults': numpy.int64,
 'Children': numpy.int64,
 'Babies': numpy.int64,
 'Meal': str,
 'Country': str,
 'MarketSegment': str,
 'DistributionChannel': str,
 'IsRepeatedGuest': numpy.int64,
 'PreviousCancellations': numpy.int64,
 'PreviousBookingsNotCanceled': numpy.int64,
 'ReservedRoomType': str,
 'AssignedRoomType': str,
 'BookingChanges': numpy.int64,
 'DepositType': str,
 'Agent': str,
 'Company': str,
 'DaysInWaitingList': numpy.int64,
 'CustomerType': str,
 'ADR': numpy.float64,
 'RequiredCarParkingSpaces': numpy.int64,
 'TotalOfSpecialRequests': numpy.int64,
 'ReservationStatus': str,
 'ReservationStatusDate': str}

In [212]:
h2['ReservedRoomType'].unique()

array(['C               ', 'A               ', 'D               ',
       'E               ', 'G               ', 'F               ',
       'H               ', 'L               ', 'P               ',
       'B               '], dtype=object)

In [213]:
h2['ReservedRoomType'] = h2['ReservedRoomType'].str.strip()
h2['ReservedRoomType'].unique()

array(['C', 'A', 'D', 'E', 'G', 'F', 'H', 'L', 'P', 'B'], dtype=object)

In [214]:
h2['AssignedRoomType'].unique()

array(['C               ', 'A               ', 'D               ',
       'E               ', 'G               ', 'F               ',
       'I               ', 'B               ', 'H               ',
       'P               ', 'L               '], dtype=object)

In [215]:
h2['AssignedRoomType'] = h2['AssignedRoomType'].str.strip()
h2['AssignedRoomType'].unique()

array(['C', 'A', 'D', 'E', 'G', 'F', 'I', 'B', 'H', 'P', 'L'],
      dtype=object)

In [216]:
h2['Agent'].unique()[:20]

array(['       NULL', '        304', '        240', '        303',
       '         15', '        241', '          8', '        250',
       '        115', '          5', '        175', '        134',
       '        156', '        243', '        242', '          3',
       '        105', '         40', '        147', '        306'],
      dtype=object)

In [217]:
h2['Agent'] = h2['Agent'].str.strip()
h2['Agent'].unique()[:20]

array(['NULL', '304', '240', '303', '15', '241', '8', '250', '115', '5',
       '175', '134', '156', '243', '242', '3', '105', '40', '147', '306'],
      dtype=object)

In [218]:
h2['Company'].unique()[:20]

array(['       NULL', '        110', '        113', '        270',
       '        178', '        240', '        154', '        144',
       '        307', '        268', '         59', '        204',
       '        312', '        318', '         94', '        174',
       '        274', '        195', '        223', '        317'],
      dtype=object)

In [219]:
h2['Company'] = h2['Company'].str.strip()
h2['Company'].unique()[:20]

array(['NULL', '110', '113', '270', '178', '240', '154', '144', '307',
       '268', '59', '204', '312', '318', '94', '174', '274', '195', '223',
       '317'], dtype=object)

In [220]:
h2['CustomerType'] = h2['CustomerType'].str.strip()
h2['CustomerType'].unique()

array(['Transient', 'Contract', 'Transient-Party', 'Group'], dtype=object)

In [221]:
h2['DepositType'] = h2['DepositType'].str.strip()
h2['DepositType'].unique()

array(['No Deposit', 'Refundable', 'Non Refund'], dtype=object)

In [237]:
h2[['IsCanceled', 'ReservationStatus']].value_counts()

IsCanceled  ReservationStatus
0           Check-Out            46228
1           Canceled             32186
            No-Show                916
dtype: int64

In [238]:
916 / h2.shape[0]

0.011546703643010211

In [243]:
h2 = h2[~((h2['IsCanceled']==1) & (h2['ReservationStatus'] =='No-Show'))]

In [244]:
h2 = h2.drop('Country', axis=1)

In [245]:
h2['Agent'] = np.where(h2['Agent']=='NULL', 0, 1)
h2['Company'] = np.where(h2['Company']=='NULL', 0, 1)

In [248]:
h2['Meal'] = h2['Meal'].str.strip().apply(encode_meals)

In [249]:
h2['room_type_difference'] = np.where(h2['AssignedRoomType'] == h2['ReservedRoomType'], 0, 1)


In [250]:
h2 = h2.join(pd.get_dummies(h2['DepositType']))
h2 = h2.join(pd.get_dummies(h2['CustomerType']))
h2 = h2.drop(columns=['ReservationStatus'])

In [267]:
for i in h2.columns:
    if any(h2[i][h2[i].isna()]):
           print(i)

Children


In [269]:
h2['Children'][h2['Children'].isna()]

540    NaN
607    NaN
619    NaN
1100   NaN
Name: Children, dtype: float64

In [271]:
h2.shape

(78414, 37)

In [272]:
h2 = h2[~(h2['Children'].isna())]

In [273]:
h2.to_csv('h2_proc.csv', index=False)