# 0.0. IMPORTS

In [3]:
import math
import optuna
import numpy  as np
import pandas as pd
import warnings
import seaborn as sns
import xgboost as xgb

from sklearn.decomposition   import PCA
from IPython.display         import HTML
from sklearn.manifold        import TSNE
from scipy                 import stats  as ss
from matplotlib            import pyplot as plt
from IPython.display       import Image
from IPython.core.display  import HTML

from sklearn.decomposition   import TruncatedSVD

from sklearn        import metrics
from sklearn.ensemble      import RandomForestClassifier
from sklearn.linear_model  import LinearRegression, Lasso
from sklearn.preprocessing import RobustScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
warnings.filterwarnings( 'ignore' )

## 0.2. Loading data

In [6]:
df = pd.read_csv('../data/train.csv')

## 0.3. Funções

In [16]:
def embedding_svd(x, X_test, y):
    """Append a 3-component tree-leaf embedding of ``x`` and return it with ``y``.

    A RandomForestClassifier (100 trees, random_state=42) is fitted on
    ``(x, y)``. The leaf index that every tree assigns to each row is then
    reduced to three components with TruncatedSVD and stored in the columns
    ``embedding_x``, ``embedding_y`` and ``embedding_z``.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature matrix. It is not modified; a copy carrying the three extra
        columns is returned.
    X_test : object
        Not used by this function. Kept only so existing calls keep working.
    y : array-like
        Target labels aligned with the rows of ``x``.

    Returns
    -------
    tuple
        ``(x_with_embedding, y)`` where ``x_with_embedding`` is the copy of
        ``x`` with the embedding columns added and ``y`` is returned as given.
    """
    # Tree based embedding: supervised forest fitted on the raw features only
    rf_model_emb = RandomForestClassifier( n_estimators = 100, random_state = 42 )
    rf_model_emb.fit( x, y )

    # Leaf index of every row in every tree. This has to be computed while
    # `x` still has exactly the columns the forest was fitted on; applying the
    # forest after the embedding columns exist makes sklearn reject the frame.
    df_leaf_emb = pd.DataFrame( rf_model_emb.apply( x ) )

    reducer_emb = TruncatedSVD( n_components = 3, random_state = 42 )
    embedding = reducer_emb.fit_transform( df_leaf_emb )

    # Work on a copy so the caller's frame keeps its original columns
    x_emb = x.copy()
    x_emb['embedding_x'] = embedding[: , 0]
    x_emb['embedding_y'] = embedding[: , 1]
    x_emb['embedding_z'] = embedding[: , 2]

    return x_emb, y

## 1.3. Data Types

In [3]:
df.dtypes

id                                        int64
Classificação do hotel                   object
Meses da reserva até o check-in           int64
Número de pernoites reservadas            int64
Número de hospedes                      float64
Regime de alimentação                    object
Nacionalidade                            object
Forma de Reserva                         object
Já se hospedou anterioremente            object
Tipo do quarto reservado                 object
Reserva feita por agência de turismo     object
Reserva feita por empresa                object
Reserva com Estacionamento               object
Reserva com Observações                  object
Reserva Cancelada                         int64
dtype: object

## 1.4. Check NA

In [7]:
df.isna().sum()

id                                         0
Classificação do hotel                     0
Meses da reserva até o check-in            0
Número de pernoites reservadas             0
Número de hospedes                         3
Regime de alimentação                      0
Nacionalidade                           1093
Forma de Reserva                           0
Já se hospedou anterioremente              0
Tipo do quarto reservado                   0
Reserva feita por agência de turismo       0
Reserva feita por empresa                  0
Reserva com Estacionamento                 0
Reserva com Observações                    0
Reserva Cancelada                          0
dtype: int64

## 1.5. Drop NA

In [8]:
# Discard every row that has at least one missing value
df = df.dropna()

In [9]:
df.isna().sum()

id                                      0
Classificação do hotel                  0
Meses da reserva até o check-in         0
Número de pernoites reservadas          0
Número de hospedes                      0
Regime de alimentação                   0
Nacionalidade                           0
Forma de Reserva                        0
Já se hospedou anterioremente           0
Tipo do quarto reservado                0
Reserva feita por agência de turismo    0
Reserva feita por empresa               0
Reserva com Estacionamento              0
Reserva com Observações                 0
Reserva Cancelada                       0
dtype: int64

# 3.0. PASSO 03 - FILTRAGEM DE VARIÁVEIS

In [10]:
df3 = df.copy()
df3.head()

Unnamed: 0,id,Classificação do hotel,Meses da reserva até o check-in,Número de pernoites reservadas,Número de hospedes,Regime de alimentação,Nacionalidade,Forma de Reserva,Já se hospedou anterioremente,Tipo do quarto reservado,Reserva feita por agência de turismo,Reserva feita por empresa,Reserva com Estacionamento,Reserva com Observações,Reserva Cancelada
0,33571,5 estrelas,5,5,2.0,Café da manha e jantar,France,Agência,Não,Amethyst,Sim,Não,Sim,Nenhuma,0
1,82458,4 estrelas,167,3,2.0,Café da manha,Spain,Agência,Não,Amethyst,Sim,Não,Não,Nenhuma,1
2,94061,4 estrelas,4,3,2.0,Café da manha,Belgium,Agência,Não,Amethyst,Sim,Não,Não,1 a 3,0
3,75196,4 estrelas,13,2,2.0,Café da manha,Spain,Agência,Não,Amethyst,Sim,Não,Não,Nenhuma,1
4,82940,4 estrelas,4,2,2.0,Café da manha,Spain,Agência,Não,Amethyst,Sim,Não,Não,Nenhuma,1


In [11]:
df3.columns

Index(['id', 'Classificação do hotel', 'Meses da reserva até o check-in',
       'Número de pernoites reservadas', 'Número de hospedes',
       'Regime de alimentação', 'Nacionalidade', 'Forma de Reserva',
       'Já se hospedou anterioremente', 'Tipo do quarto reservado',
       'Reserva feita por agência de turismo', 'Reserva feita por empresa',
       'Reserva com Estacionamento', 'Reserva com Observações',
       'Reserva Cancelada'],
      dtype='object')

## 3.1. Filtragem das Linhas


In [12]:
# Keep only bookings that have both a non-zero guest count and a non-zero number of nights
df3 = df3[
    (df3['Número de hospedes'] != 0)
    & (df3['Número de pernoites reservadas'] != 0)
]

# 4.0. Transformação

In [13]:
df4 = df3.copy()

In [15]:
def preproc(df4, freq_quarto=None):
    """Encode the raw booking columns as numbers and drop the unused ones.

    The input frame is never modified; all work happens on a copy, so the
    function can be called repeatedly on the same raw frame.

    Encodings applied:
    - 'Regime de alimentação': ordinal 0..3 via a fixed dictionary.
    - 'Tipo do quarto reservado': relative frequency of each room type.
    - 'Reserva com Estacionamento': 1 when the value is 'Sim', else 0.
    - 'Reserva com Observações': ordinal 0..2 via a fixed dictionary.
    - 'Nacionalidade': 1 when the value is 'Spain', else 0.
    - 'Classificação do hotel': 5 when the value is '5 estrelas', else 4.

    Values missing from a dictionary become NaN.

    Parameters
    ----------
    df4 : pandas.DataFrame
        Raw bookings with the columns of the training CSV (the target column
        is optional and passes through untouched).
    freq_quarto : mapping or pandas.Series, optional
        Room type -> frequency to use for 'Tipo do quarto reservado'. When
        omitted, the frequencies are computed from ``df4`` itself. Pass the
        frequencies of the training data here to encode another dataset
        consistently.

    Returns
    -------
    pandas.DataFrame
        New frame with the encoded columns, without 'Forma de Reserva' (its
        Agência/Balcão/B2B indicator columns are removed),
        'Reserva feita por agência de turismo', 'Reserva feita por empresa'
        and 'Já se hospedou anterioremente'.

    Raises
    ------
    KeyError
        If one of the expected raw columns is missing from ``df4``.
    """
    # Copy first: mapping a column on the caller's frame would turn a second
    # call on the same frame into all-NaN values.
    df4 = df4.copy()

    # Regime de alimentação
    dic_alim = {"Café da manha":1,
               "Sem refeicao":0,
                "Café da manha e jantar":2,
                "Café da manha, almoco e jantar":3
               }
    df4['Regime de alimentação'] = df4['Regime de alimentação'].map(dic_alim)

    # Forma de Reserva: expanded to indicator columns; the three known
    # channels are removed again further down.
    df4 = pd.get_dummies( df4, prefix=['reserva'], columns=['Forma de Reserva'] )

    # Tipo do quarto reservado: frequency encoding
    if freq_quarto is None:
        freq_quarto = df4['Tipo do quarto reservado'].value_counts(normalize=True)
    df4['Tipo do quarto reservado'] = df4['Tipo do quarto reservado'].map(dict(freq_quarto))

    # Reserva com Estacionamento
    df4['Reserva com Estacionamento'] = df4['Reserva com Estacionamento'].eq('Sim').astype('int64')

    # Reserva com Observações
    dic_obs = {"1 a 3":1,
               "Nenhuma":0,
                "Mais de 3":2
               }
    df4['Reserva com Observações'] = df4['Reserva com Observações'].map(dic_obs)

    # Nacionalidade: only "Spain vs. everything else" is kept
    df4['Nacionalidade'] = df4['Nacionalidade'].eq("Spain").astype('int64')

    # Classificação do hotel
    df4['Classificação do hotel'] = df4['Classificação do hotel'].eq('5 estrelas').map({True: 5, False: 4})

    # A booking channel that does not occur in this frame produces no
    # indicator column, so only the ones actually present are dropped.
    colunas_reserva = [
        coluna
        for coluna in ('reserva_Agência', 'reserva_Balcão', 'reserva_B2B')
        if coluna in df4.columns
    ]
    df4 = df4.drop(columns=['Reserva feita por agência de turismo',
                            'Reserva feita por empresa',
                            'Já se hospedou anterioremente'] + colunas_reserva)
    return df4

In [16]:
df4 = preproc(df4)

# 5.0. Embedding

In [17]:
df5 = df4.copy()

In [18]:
# Target and feature matrix for the supervised leaf embedding.
# NOTE(review): only the target is removed here, so the `id` column is still
# part of the features the embedding forest is fitted on.
y_emb = df5['Reserva Cancelada']
X_emb = df5.drop(columns = 'Reserva Cancelada')
X_emb_1 = df5.drop(columns = 'Reserva Cancelada')

In [19]:
# Tree Based embedding

# Class weights: each class is weighted by the relative frequency of the
# *other* class, so the rarer class counts more. The frequencies are looked up
# by label: value_counts() orders its result by frequency, so positional
# access would swap the two weights whenever class 1 is the majority.
weights_emb = df5['Reserva Cancelada'].value_counts(normalize = True)
weights_emb = {0: float(weights_emb.loc[1]), 1: float(weights_emb.loc[0])}

# Model definition
rf_model_emb = RandomForestClassifier( n_estimators = 1000, random_state = 42, class_weight = weights_emb )

# Model training
# NOTE(review): the forest is fitted on every labelled row, before the
# train/test split made later in the notebook, so the embedding derived from
# it has already seen the labels of the rows used for evaluation.
rf_model_emb.fit( X_emb, y_emb )

# Leaf: index of the leaf each row lands in, one column per tree
df_leaf_emb = pd.DataFrame( rf_model_emb.apply( X_emb ) )

In [114]:
# SVD: compress the leaf-index matrix down to 10 components
reducer_emb = TruncatedSVD(n_components = 10, random_state = 42)

embedding = reducer_emb.fit_transform( df_leaf_emb )

# One column per component, named embedding_a ... embedding_j
for position, letter in enumerate('abcdefghij'):
    df5['embedding_' + letter] = embedding[:, position]

# Plot leaves
#plt.figure(figsize=(6, 6))
#sns.scatterplot( x = 'embedding_x', y = 'embedding_y',
#                 data = df_leaf_emb,
#                 hue = 'Reserva Cancelada');

# 6.0. Data Preparation

In [115]:
df6 = df5.copy()

## 6.1. Rescaling

In [116]:
# Rescale every embedding component to the [0, 1] range.
# Each column gets its own MinMaxScaler, and that same scaler is the one that
# is fitted. The fitted scalers are kept in `embedding_scalers`, keyed by
# column name, so the identical transformation can be applied to new data.
# NOTE(review): the scalers are fitted on all rows of df6, i.e. before the
# train/test split performed further down.
embedding_scalers = {}
for emb_col in ['embedding_' + letter for letter in 'abcdefghij']:
    embedding_scalers[emb_col] = MinMaxScaler()
    df6[emb_col] = embedding_scalers[emb_col].fit_transform(df6[[emb_col]]).ravel()

# Separação das Variáveis

In [117]:
df7 = df6.copy()

In [118]:
x = df7.drop(columns=['id','Reserva Cancelada'])
y = df7['Reserva Cancelada']

In [119]:
# Hold out 30% of the rows for evaluation. The fixed seed keeps the split,
# and therefore every score computed from it, identical across kernel restarts.
x_treino, x_teste, y_treino, y_teste = train_test_split(x, y, test_size=0.3, random_state=42)

# 7.0. PASSO 07 - MACHINE LEARNING MODELLING

## 7.4. Random Forest

In [120]:
# Baseline random forest, scored with F1 on the hold-out rows.
# The seed makes the fitted forest (and the printed score) reproducible.
modelo_random_forest = RandomForestClassifier(n_estimators=2000, random_state=42)
modelo_random_forest.fit(x_treino,y_treino)
y_previsao = modelo_random_forest.predict(x_teste)
print(metrics.f1_score(y_teste,y_previsao))

0.9577054577687729


## Optuna

In [122]:
def objective(trial):
    """Optuna objective for the random forest: F1 score on the hold-out split.

    Samples ``max_depth``, ``min_samples_split``, ``min_samples_leaf`` and
    ``max_features`` from ``trial``, fits a RandomForestClassifier
    (random_state=42) on ``x_treino``/``y_treino`` and returns the F1 score
    of its predictions for ``x_teste``.

    NOTE(review): the score is computed on ``x_teste``/``y_teste``, the same
    rows the other cells use to report model quality, so the best trial value
    is selected on the evaluation data.
    """
    # Define hyperparameters to search over
    max_depth = trial.suggest_int("max_depth", 6, 10)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 2, 10)
    # suggest_float replaces suggest_uniform, which Optuna has deprecated
    max_features = trial.suggest_float("max_features", 0.4, 1.0)

    # Initialize random forest classifier with hyperparameters
    clf = RandomForestClassifier(
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42,
    )

    # Train and predict on validation set
    clf.fit(x_treino, y_treino)
    y_pred = clf.predict(x_teste)

    # Calculate F1 score
    f1 = metrics.f1_score(y_teste, y_pred)

    return f1

# Create Optuna study and optimize hyperparameters
# (stops after 200 trials or 600 seconds, whichever comes first)
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=200, timeout=600)

# Print best hyperparameters and the corresponding F1 score.
# The objective returns metrics.f1_score, so the value is labelled as such.
best_trial = study.best_trial
print(f"Best trial: {best_trial.number}")
print(f"F1 score: {best_trial.value}")
for key, value in best_trial.params.items():
    print(f"{key}: {value}")

[32m[I 2023-03-26 08:09:52,866][0m A new study created in memory with name: no-name-ed7e24b2-e78b-4695-9d39-425aa5ab1c61[0m
[32m[I 2023-03-26 08:11:41,184][0m Trial 0 finished with value: 0.8824819805703542 and parameters: {'n_estimators': 500, 'max_depth': 5, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_features': 0.478216930883984}. Best is trial 0 with value: 0.8824819805703542.[0m
[32m[I 2023-03-26 08:15:53,412][0m Trial 1 finished with value: 0.9244372990353699 and parameters: {'n_estimators': 400, 'max_depth': 8, 'min_samples_split': 4, 'min_samples_leaf': 8, 'max_features': 0.9665148601346435}. Best is trial 1 with value: 0.9244372990353699.[0m
[32m[I 2023-03-26 08:18:06,154][0m Trial 2 finished with value: 0.8406072106261859 and parameters: {'n_estimators': 700, 'max_depth': 4, 'min_samples_split': 9, 'min_samples_leaf': 9, 'max_features': 0.5050927843041215}. Best is trial 1 with value: 0.9244372990353699.[0m
[32m[I 2023-03-26 08:19:48,450][0m Trial 3 fin

KeyboardInterrupt: 

In [52]:
# Random forest with hand-picked hyper-parameters, scored with F1 on the
# hold-out rows. The seed makes the fitted forest reproducible.
modelo_random_forest = RandomForestClassifier(n_estimators = 2000,
    max_depth = 7,
    min_samples_split = 4,
    min_samples_leaf = 2,
    max_features = 0.5,
    random_state = 42)
modelo_random_forest.fit(x_treino,y_treino)
y_previsao = modelo_random_forest.predict(x_teste)
print(metrics.f1_score(y_teste,y_previsao))

0.7466440582167585


## 7.5. XGBoost 

In [43]:
# Baseline gradient-boosted trees, scored with F1 on the hold-out rows
modelo_xgb = xgb.XGBClassifier(
    n_estimators = 500,
    max_depth = 6,
    random_state = 42,
)
modelo_xgb.fit(x_treino, y_treino)

y_xgb = modelo_xgb.predict(x_teste)
print(metrics.f1_score(y_teste, y_xgb))

0.9468882779305173


## Optuna

In [45]:
def objective(trial):
    """Optuna objective for XGBoost: F1 score on the hold-out split.

    Samples ``max_depth``, ``learning_rate``, ``min_child_weight`` and
    ``subsample`` from ``trial``, fits an XGBClassifier on
    ``x_treino``/``y_treino`` and returns the F1 score of its predictions
    for ``x_teste``.

    NOTE(review): this function has the same name as the random-forest
    objective defined earlier in the notebook; whichever cell ran last is the
    one ``study.optimize(objective, ...)`` picks up.
    """
    # suggest_float(..., log=True) replaces suggest_loguniform, which Optuna
    # has deprecated; the sampled distribution is the same.
    params = {
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0, log=True),
    }

    # Fit the model
    optuna_model = xgb.XGBClassifier(**params)
    optuna_model.fit(x_treino, y_treino)

    # Make predictions (once; the result is all the metric needs)
    xgb_predictions = optuna_model.predict(x_teste)

    # Evaluate predictions
    return metrics.f1_score(y_teste, xgb_predictions)

# Search for the parameters that maximise the value returned by `objective`;
# the search stops after 100 trials or 600 seconds, whichever comes first.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials = 100, timeout = 600)

# Report how many trials ran and the best one found
print('Number of finished trials: {}'.format(len(study.trials)))
print('Best trial:')
trial = study.best_trial

print('  Value: {}'.format(trial.value))
print('  Params: ')

# One line per tuned hyper-parameter of the best trial
for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))

[32m[I 2023-03-26 06:17:07,278][0m A new study created in memory with name: no-name-faa1c4ca-7a1d-4d83-aaae-dd386d88d50c[0m
[32m[I 2023-03-26 06:17:09,574][0m Trial 0 finished with value: 0.9414191934979681 and parameters: {'max_depth': 7, 'learning_rate': 0.6470280915279841, 'min_child_weight': 7, 'subsample': 0.735556348732361}. Best is trial 0 with value: 0.9414191934979681.[0m
[32m[I 2023-03-26 06:17:10,359][0m Trial 1 finished with value: 0.8570896911053218 and parameters: {'max_depth': 2, 'learning_rate': 0.1849224895408612, 'min_child_weight': 7, 'subsample': 0.8447441322974777}. Best is trial 0 with value: 0.9414191934979681.[0m
[32m[I 2023-03-26 06:17:11,611][0m Trial 2 finished with value: 0.9324815232368783 and parameters: {'max_depth': 5, 'learning_rate': 0.6453389027836505, 'min_child_weight': 4, 'subsample': 0.5931230733382671}. Best is trial 0 with value: 0.9414191934979681.[0m
[32m[I 2023-03-26 06:17:12,601][0m Trial 3 finished with value: 0.84285624882658

[32m[I 2023-03-26 06:18:02,083][0m Trial 31 finished with value: 0.9446946946946947 and parameters: {'max_depth': 9, 'learning_rate': 0.23476429304239543, 'min_child_weight': 1, 'subsample': 0.6664100008580992}. Best is trial 12 with value: 0.94657251765294.[0m
[32m[I 2023-03-26 06:18:04,073][0m Trial 32 finished with value: 0.9439929991248907 and parameters: {'max_depth': 9, 'learning_rate': 0.18790796345080818, 'min_child_weight': 1, 'subsample': 0.6233329497148732}. Best is trial 12 with value: 0.94657251765294.[0m
[32m[I 2023-03-26 06:18:05,854][0m Trial 33 finished with value: 0.9440690690690691 and parameters: {'max_depth': 8, 'learning_rate': 0.32569947906583646, 'min_child_weight': 2, 'subsample': 0.5776717832817283}. Best is trial 12 with value: 0.94657251765294.[0m
[32m[I 2023-03-26 06:18:07,750][0m Trial 34 finished with value: 0.9403824521934759 and parameters: {'max_depth': 9, 'learning_rate': 0.1630037985284386, 'min_child_weight': 3, 'subsample': 0.59858147328

[32m[I 2023-03-26 06:19:00,490][0m Trial 62 finished with value: 0.9468265427836021 and parameters: {'max_depth': 10, 'learning_rate': 0.6628886542650841, 'min_child_weight': 6, 'subsample': 0.7943049239964611}. Best is trial 50 with value: 0.948774217819299.[0m
[32m[I 2023-03-26 06:19:02,440][0m Trial 63 finished with value: 0.9452037617554859 and parameters: {'max_depth': 9, 'learning_rate': 0.6634984568332163, 'min_child_weight': 5, 'subsample': 0.7876589222148689}. Best is trial 50 with value: 0.948774217819299.[0m
[32m[I 2023-03-26 06:19:04,553][0m Trial 64 finished with value: 0.9469403078463271 and parameters: {'max_depth': 10, 'learning_rate': 0.9198525178173353, 'min_child_weight': 6, 'subsample': 0.8614913952014299}. Best is trial 50 with value: 0.948774217819299.[0m
[32m[I 2023-03-26 06:19:06,534][0m Trial 65 finished with value: 0.9447990987607962 and parameters: {'max_depth': 9, 'learning_rate': 0.39158352967754534, 'min_child_weight': 5, 'subsample': 0.86863161

[32m[I 2023-03-26 06:20:02,583][0m Trial 93 finished with value: 0.9482466595571168 and parameters: {'max_depth': 9, 'learning_rate': 0.5014341561007198, 'min_child_weight': 1, 'subsample': 0.7292028492493547}. Best is trial 84 with value: 0.9489693313222725.[0m
[32m[I 2023-03-26 06:20:04,604][0m Trial 94 finished with value: 0.9491312801856614 and parameters: {'max_depth': 9, 'learning_rate': 0.496460462572989, 'min_child_weight': 1, 'subsample': 0.7314719104836033}. Best is trial 94 with value: 0.9491312801856614.[0m
[32m[I 2023-03-26 06:20:06,452][0m Trial 95 finished with value: 0.946840521564694 and parameters: {'max_depth': 8, 'learning_rate': 0.5875638298693988, 'min_child_weight': 1, 'subsample': 0.7262555854181587}. Best is trial 94 with value: 0.9491312801856614.[0m
[32m[I 2023-03-26 06:20:08,582][0m Trial 96 finished with value: 0.9482066716829697 and parameters: {'max_depth': 9, 'learning_rate': 0.5271072140430856, 'min_child_weight': 1, 'subsample': 0.7099019496

Number of finished trials: 100
Best trial:
  Value: 0.9491312801856614
  Params: 
    max_depth: 9
    learning_rate: 0.496460462572989
    min_child_weight: 1
    subsample: 0.7314719104836033


In [121]:
# XGBoost refitted with the best parameters reported by the Optuna study,
# using 1500 boosting rounds; scored with F1 on the hold-out rows.
modelo_xgb = xgb.XGBClassifier(
    n_estimators = 1500,
    max_depth = 9,
    learning_rate = 0.496460462572989,
    min_child_weight = 1,
    subsample = 0.7314719104836033,
    random_state = 42,
)
modelo_xgb.fit(x_treino, y_treino)

y_xgb = modelo_xgb.predict(x_teste)
print(metrics.f1_score(y_teste, y_xgb))

0.9572931187428716


In [None]:
df7.columns

# Final

In [54]:
dados = pd.read_csv('test.csv')
dados2 = preproc(dados)
dados2.isna().sum()

id                                 0
Classificação do hotel             0
Meses da reserva até o check-in    0
Número de pernoites reservadas     0
Número de hospedes                 1
Regime de alimentação              0
Nacionalidade                      0
Tipo do quarto reservado           0
Reserva com Estacionamento         0
Reserva com Observações            0
dtype: int64

In [62]:
# Fill the missing guest count with the most frequent value.
# mode() returns a Series (with several entries on a tie), so its first entry
# is taken explicitly instead of calling int() on the whole Series.
dados2['Número de hospedes'] = dados2['Número de hospedes'].fillna(int(dados2['Número de hospedes'].mode().iloc[0]))

In [63]:
dados2.isna().sum()

id                                 0
Classificação do hotel             0
Meses da reserva até o check-in    0
Número de pernoites reservadas     0
Número de hospedes                 0
Regime de alimentação              0
Nacionalidade                      0
Tipo do quarto reservado           0
Reserva com Estacionamento         0
Reserva com Observações            0
dtype: int64

In [64]:
# NOTE(review): x_test holds only the preprocessed raw columns, while the
# models were fitted on `x`, which also carries the embedding_a..embedding_j
# columns. Predicting on x_test therefore fails with a feature-count
# ValueError until the same embedding is computed for dados2.
x_test = dados2.drop(columns='id')

In [65]:
# Submission frame: one row per booking id plus the predicted label.
# Double brackets keep `id` as a one-column DataFrame; on a plain Series the
# assignment below would not add a 'Reserva Cancelada' column.
y_test = dados2[['id']].copy()
y_test['Reserva Cancelada'] = modelo_random_forest.predict(x_test)

ValueError: X has 9 features, but RandomForestClassifier is expecting 11 features as input.

In [None]:
y_test.to_csv('submission_19.csv', index = False)