In [57]:
# Importação de bibliotecas
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

# Leitura de dados

In [26]:
# Ler arquivo .csv para um DataFrame pandas
tb_hotel = pd.read_csv("tb_hotel_traintest.csv")
# Validar número de colunas
# Validar e tratar tipo das colunas (principalmente datas através da função pd.to_datetime)
tb_hotel_cleaned = tb_hotel[['is_cancelled',
                              'hotel',
                              'stays_in_weekend_nights',
                              'stays_in_week_nights',
                              'adults',
                              'children',
                              'babies',
                              #'meal',
                              #'country',
                              #'market_segment',
                              'is_repeated_guest',
                              'previous_cancellations',
                              'previous_bookings_not_canceled',
                              #'reserved_room_type',
                              'booking_changes',
                              #'deposit_type',
                              'days_in_waiting_list',
                              #'customer_type',
                              'adr',
                              'required_car_parking_spaces',
                              'total_of_special_requests'
                              #'reservation_status_date',
                              #'arrival_date'
                              ]]
tb_hotel_cleaned = tb_hotel_cleaned.dropna()

In [27]:
#transformação de variável input 'hotel' em numérico
tb_hotel_cleaned = pd.concat([pd.get_dummies(tb_hotel_cleaned['hotel'],dtype=int,drop_first = True),tb_hotel_cleaned],axis=1)
tb_hotel_cleaned = tb_hotel_cleaned.drop('hotel', axis=1)

In [28]:
tb_hotel_cleaned.head()

Unnamed: 0,Resort Hotel,is_cancelled,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests
0,1,0,0,0,2,0.0,0,0,0,0,3,0,0.0,0,0
1,1,0,0,0,2,0.0,0,0,0,0,4,0,0.0,0,0
2,1,0,0,1,1,0.0,0,0,0,0,0,0,75.0,0,0
3,1,0,0,1,1,0.0,0,0,0,0,0,0,75.0,0,0
4,1,0,0,2,2,0.0,0,0,0,0,0,0,98.0,0,1


In [29]:
# separando em variáveis X e Y
tb_hotel_cleaned_Y = tb_hotel_cleaned['is_cancelled']
tb_hotel_cleaned_X = tb_hotel_cleaned.drop(['is_cancelled'], axis=1)

# EDA (Análise Exploratória de Dados)

#### Avaliação de problemas nos dados (missing value e informações incorretas)

In [None]:
tb_hotel.describe()

In [None]:
tb_hotel.info()

In [None]:
tb_hotel.head()

In [None]:
sns.histplot(data=tb_hotel[['hotel','is_cancelled']],x='is_cancelled', hue='hotel', multiple="dodge")

In [None]:
# dúvida
# Como faço pra plotar sem esses números do meio (precisa transformar em boolean)

In [None]:
tb_city_resort = tb_hotel[['is_cancelled','hotel','id_booking']].groupby(by=['hotel', 'is_cancelled']).count()
print(tb_city_resort)

In [None]:
tb_city_resort = tb_city_resort.reset_index()

In [None]:
tb_city_resort['sum_bookings'] = tb_city_resort.groupby(by=['hotel']).transform(sum)['id_booking']
tb_city_resort['%cancelled'] = tb_city_resort['id_booking'] / tb_city_resort['sum_bookings']
print(tb_city_resort)

In [None]:
tb_hotel.corr()

# Modelo Baseline

In [None]:
# Variáveis:
# hotel -- criar boolean

In [None]:
# dúvida
# todas as variáveis deram uma correlação meio baixa, é isso mesmo?

#### Divisão do dataset em train e test para evitar leakage

In [59]:
X_train, X_test, y_train, y_test = train_test_split(tb_hotel_cleaned_X, tb_hotel_cleaned_Y, test_size = 0.2)

In [60]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 90724 entries, 82200 to 64574
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Resort Hotel                    90724 non-null  int32  
 1   stays_in_weekend_nights         90724 non-null  int64  
 2   stays_in_week_nights            90724 non-null  int64  
 3   adults                          90724 non-null  int64  
 4   children                        90724 non-null  float64
 5   babies                          90724 non-null  int64  
 6   is_repeated_guest               90724 non-null  int64  
 7   previous_cancellations          90724 non-null  int64  
 8   previous_bookings_not_canceled  90724 non-null  int64  
 9   booking_changes                 90724 non-null  int64  
 10  days_in_waiting_list            90724 non-null  int64  
 11  adr                             90724 non-null  float64
 12  required_car_parking_spaces 

#### PCA evitará não convergência de regressão logística

In [61]:
norm = StandardScaler().fit(X_train)
X_train_norm = norm.transform(X_train)
pca_t = PCA()
pca_t.fit(X_train_norm)
X_train_pca = pca_t.transform(X_train_norm)

In [62]:
X_test_norm = norm.transform(X_test)
X_test_pca = pca_t.transform(X_test_norm)

#### Regressão logística

In [63]:
# falta evitar leakage
logistic = LogisticRegression()
logistic.fit(X=X_train_pca, y=y_train)

LogisticRegression()

In [66]:
np.mean(y_train)

0.37026586129359373

In [68]:
np.mean(logistic.predict_proba(X_test_pca)[:,1])

0.37195493311232714

In [71]:

y_test['pred_prob'] = logistic.predict_proba(X_test_pca)[:,1]

In [72]:
type(y_test)

pandas.core.series.Series

In [73]:
y_test

68766                                                        1
3687                                                         1
13147                                                        0
6183                                                         1
34339                                                        0
                                   ...                        
98925                                                        0
100817                                                       0
112301                                                       0
64338                                                        1
pred_prob    [0.4731090814008845, 0.30566869009788905, 0.17...
Name: is_cancelled, Length: 22683, dtype: object

#### 1-NN (kNN com n-neighbors = 1)

In [16]:
knn_fit = KNeighborsClassifier(n_neighbors=1)
knn_fit.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=1)

In [17]:
y_test['pred_prob'] = knn_fit.predict(X_test)

In [43]:
y_test

69684                                                        1
27028                                                        0
98961                                                        0
99134                                                        0
73664                                                        1
                                   ...                        
81992                                                        0
80708                                                        0
30547                                                        0
72923                                                        1
pred_prob    [[0.0, 1.0], [1.4925838343060605e-11, 0.999999...
Name: is_cancelled, Length: 22683, dtype: object

#### Teste do sample

In [18]:
sample = pd.read_csv("tb_hotel_feat_valid.csv")
logistic_output = pd.DataFrame()
KNN_output = pd.DataFrame()

In [19]:
sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5981 entries, 0 to 5980
Data columns (total 28 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   hotel                           5981 non-null   object 
 1   lead_time                       5981 non-null   int64  
 2   stays_in_weekend_nights         5981 non-null   int64  
 3   stays_in_week_nights            5981 non-null   int64  
 4   adults                          5981 non-null   int64  
 5   children                        5980 non-null   float64
 6   babies                          5981 non-null   int64  
 7   meal                            5981 non-null   object 
 8   country                         5951 non-null   object 
 9   market_segment                  5981 non-null   object 
 10  distribution_channel            5981 non-null   object 
 11  is_repeated_guest               5981 non-null   int64  
 12  previous_cancellations          59

In [20]:
sample = sample[['hotel',
                              'stays_in_weekend_nights',
                              'stays_in_week_nights',
                              'adults',
                              'children',
                              'babies',
                              #'meal',
                              #'country',
                              #'market_segment',
                              'is_repeated_guest',
                              'previous_cancellations',
                              'previous_bookings_not_canceled',
                              #'reserved_room_type',
                              'booking_changes',
                              #'deposit_type',
                              'days_in_waiting_list',
                              #'customer_type',
                              'adr',
                              'required_car_parking_spaces',
                              'total_of_special_requests',
                 'id_booking'
                              #'reservation_status_date',
                              #'arrival_date'
                              ]]
sample = sample.dropna()

In [None]:
logistic_output['id_booking'] = sample['id_booking'].copy()
KNN_output['id_booking'] = sample['id_booking'].copy()

In [None]:
sample_X = sample.drop(['id_booking'], axis=1)
sample_X = pd.concat([pd.get_dummies(sample_X['hotel'],dtype=int,drop_first = True),sample_X],axis=1)
sample_X = sample.drop('hotel', axis=1)

In [21]:
sample_X_norm = norm.transform(sample_X)
sample_X_pca = pca_t.transform(sample_X_norm)



In [None]:
logistic_output['is_cancelled'] = logistic.predict_proba(sample_X_pca)
KNN_output['is_cancelled'] = knn_fit.predict(sample_X_pca)

In [24]:
X_train.info()

AttributeError: 'numpy.ndarray' object has no attribute 'info'