In [None]:
import numpy as np
import pandas as pd
import hvplot.pandas
from collections import Counter
from pathlib import Path
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Set the random seed for reproducibility
# Seed NumPy's global random state.
from numpy.random import seed
seed(1)
# Seed TensorFlow's global random state. This import binds the bare name
# `random` to tensorflow.random (not the standard-library module) from here on.
# NOTE(review): no TensorFlow model appears in the visible cells; the
# scikit-learn calls below pass their own random_state -- confirm this is still needed.
from tensorflow import random
random.set_seed(2)

In [None]:
# Load the hotel bookings dataset (path is relative to the notebook's working directory)
csv_path = Path('../Resources/hotel_bookings.csv')

df = pd.read_csv(csv_path)
df.head()

In [None]:
# Month Name to Number
def month_num(df):
    """Convert the ``arrival_date_month`` column from month names to numbers.

    The column is overwritten in place on ``df`` and the same frame is
    returned. Month names are looked up in ``calendar.month_name`` (which
    follows the current locale) and mapped to 1-12. Values that are already
    integers in 1-12 are kept as they are, so re-running the cell that calls
    this function does not fail on the already-converted column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``arrival_date_month`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``arrival_date_month`` holding integers.

    Raises
    ------
    KeyError
        If the column is missing, or a value is neither a known month name
        nor an integer in 1-12.
    """
    import calendar
    import numbers

    # calendar.month_name[0] is an empty placeholder, hence the ``if num`` filter.
    name_to_num = {name: num for num, name in enumerate(calendar.month_name) if num}

    def to_number(value):
        # Pass through values that were converted by an earlier call.
        already_numeric = (
            isinstance(value, numbers.Integral)
            and not isinstance(value, bool)
            and 1 <= value <= 12
        )
        return int(value) if already_numeric else name_to_num[value]

    df["arrival_date_month"] = df["arrival_date_month"].apply(to_number)
    return df

In [None]:
# combine year, months, day
def combine_date(df):
    """Add a ``date`` column built from the arrival year, month and day columns.

    The three columns are converted to strings, joined as
    ``"<year>-<month>-<day>"`` and parsed with ``pd.to_datetime``. The result
    is stored on ``df`` in place under ``'date'`` and ``df`` is returned.
    """
    year, month, day = (
        df[column].map(str)
        for column in (
            'arrival_date_year',
            'arrival_date_month',
            'arrival_date_day_of_month',
        )
    )
    df['date'] = pd.to_datetime(year + '-' + month + '-' + day)
    return df

In [None]:
# Filter by Hotel Type
def hotel_df(df, hotel):
    """Return the rows for the selected hotel type(s), without the ``hotel`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``hotel`` column.
    hotel : str or list-like of str
        Hotel label(s) to keep. A single string is treated as a one-element
        list.

    Returns
    -------
    pandas.DataFrame
        A new frame with the matching rows and the ``hotel`` column removed;
        ``df`` itself is not modified.
    """
    # Series.isin rejects a bare string, so wrap a single label in a list.
    wanted = [hotel] if isinstance(hotel, str) else hotel
    return df.loc[df['hotel'].isin(wanted)].drop(columns=['hotel'])

In [None]:
# Filter out segments like Undefined
def market_segment(df, list):
    """Drop, in place, the rows that belong to unwanted market segments.

    A row is removed when its ``market_segment`` value is any of the labels in
    ``list``, or when its ``meal`` value equals the first label in ``list``.
    Any number of labels is accepted; an empty ``list`` removes nothing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``market_segment`` and ``meal`` columns. It is
        modified in place.
    list : list-like of str
        Labels to filter out. (The parameter name shadows the built-in
        ``list`` inside this function; it is kept for caller compatibility.)

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the matching rows removed.
    """
    # A single value can never equal several different labels at once, so the
    # segments are matched with isin (logical OR) rather than chained ANDs.
    unwanted = df['market_segment'].isin(list)
    if len(list) > 0:
        # NOTE(review): the meal column is compared against the first label
        # only, as in the earlier code -- presumably to catch 'Undefined'
        # meals; confirm this is the intended rule.
        unwanted = unwanted | (df['meal'] == list[0])
    df.drop(df.index[unwanted], inplace=True)
    return df

In [None]:
# Filter out Min Prices
def adr_min(df, num):
    """Remove, in place, every row whose ``adr`` is less than or equal to ``num``.

    Returns the same ``df`` object after the rows have been dropped.
    """
    at_or_below = df['adr'] <= num
    df.drop(index=df.index[at_or_below], inplace=True)
    return df

In [None]:
# Drop unnecessary columns
def drop_columns(df, columns):
    """Return a copy of ``df`` without ``columns`` and without incomplete rows.

    After the named columns are removed, any row that still contains a missing
    value is dropped as well. The input frame is left unchanged.
    """
    return df.drop(columns=columns).dropna()

In [None]:
# filter by country
def country_df(df, country):
    """Return the rows for the selected country code(s), without the ``country`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``country`` column.
    country : str or list-like of str
        Country value(s) to keep. A single string is treated as a one-element
        list.

    Returns
    -------
    pandas.DataFrame
        A new frame with the matching rows and the ``country`` column removed;
        ``df`` itself is not modified.
    """
    # Series.isin rejects a bare string, so wrap a single code in a list.
    wanted = [country] if isinstance(country, str) else country
    return df.loc[df['country'].isin(wanted)].drop(columns=['country'])

In [None]:
# get dummies
def get_dummies(df, columns):
    """One-hot encode ``columns`` of ``df`` with ``pd.get_dummies``.

    Returns the encoded frame produced by pandas; columns not listed are
    passed through unchanged.
    """
    return pd.get_dummies(data=df, columns=columns)

In [None]:
# Month Name to Number
# month_num overwrites the arrival_date_month column with month numbers.
df = month_num(df)
df

In [None]:
# Add a 'date' datetime column built from the arrival year, month and day-of-month columns.
df = combine_date(df)
df

In [None]:
# filter by 'Resort Hotel' or 'City Hotel'
# Only 'Resort Hotel' rows are kept here; hotel_df also removes the 'hotel' column.
df = hotel_df(df, ['Resort Hotel'])
df

In [None]:
# Remove bookings whose adr is at or below 20.0 (adr_min drops rows with adr <= num).
df = adr_min(df, 20.0)
df

In [None]:
# Apply the market-segment filter, passing the segment labels that should be excluded.
df = market_segment(df, ['Undefined', 'Aviation', 'Complementary', 'Corporate'])
df

In [None]:
# Drop unnecessary columns.
# drop_columns also removes rows that still contain missing values afterwards.
# Each column is listed once ('distribution_channel' and 'assigned_room_type'
# previously appeared twice).
df = drop_columns(df, [
    'is_repeated_guest',
    'arrival_date_day_of_month',
    'deposit_type',
    'is_canceled',
    'booking_changes',
    'days_in_waiting_list',
    'agent',
    'company',
    'reservation_status',
    'reservation_status_date',
    'customer_type',
    'assigned_room_type',
    'distribution_channel',
    'market_segment',
    'previous_cancellations',
    'previous_bookings_not_canceled',
    'total_of_special_requests',
    'adults',
    'children',
    'babies',
    'required_car_parking_spaces'])

df

In [None]:
# Drop any rows with missing values (drop_columns has already called dropna() on its result).
df = df.dropna()
df

In [None]:
# Keep only rows whose country is 'BRA'; country_df also removes the 'country' column.
df = country_df(df, ['BRA'])
df

In [None]:
# One-hot encode the 'meal' and 'reserved_room_type' columns via pd.get_dummies.
df = get_dummies(df, ['meal', 'reserved_room_type'])
df

In [None]:
# Target: adr rounded to the nearest 10 (round(-1) rounds to the tens place).
y = df['adr'].round(-1)
y.head()

In [None]:
# Features: every remaining column except the target 'adr'.
X = df.drop(columns='adr')
X

In [None]:
# The full datetime column is removed from the feature matrix;
# it stays available on df for later use.
X = X.drop(columns='date')

In [None]:
# Normal train-test split
# No test_size is passed, so scikit-learn's default split proportions apply.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Fit a Random Forest Classifier
# y holds adr rounded to the nearest 10, so each distinct rounded value acts
# as a separate class label for the classifier.
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=1000, random_state=1)
rf.fit(X_train, y_train)

In [None]:
# Predict the rounded adr class for every row of the held-out test set.
predictions = rf.predict(X_test)

# Put actual (y_test) and predicted values side by side, attach the arrival
# date by index alignment with df, and order the rows chronologically.
Results = (
    y_test.to_frame()
    .assign(**{"Predicted Value": predictions, "Date": df['date']})
    .sort_values(by="Date", ascending=True)
)

# Signed error: positive means the model predicted above the actual value.
Results['Diff'] = Results['Predicted Value'] - Results['adr']

Results

In [None]:
# Scatter plot of the prediction error (Diff) over arrival date.
Results.hvplot.scatter(x='Date', y='Diff')

In [None]:
# TODO: feature importance (e.g. inspect rf.feature_importances_ against X.columns)


In [None]:
# TODO: evaluate the classifier with a confusion matrix, e.g.
# from sklearn.metrics import confusion_matrix
# confusion_matrix(Results['adr'], Results['Predicted Value'])