In [42]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

pd.options.mode.copy_on_write = True
# two solutions
# random ffill, bfill
# regrission sol for [p-c, p-not-c]

In [43]:
X = pd.read_csv("./first_inten_project.csv")
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

In [44]:
X_train.drop(["Booking_ID", "date of reservation", "repeated"], inplace=True, axis=1)

X_train_1 = X_train.loc[X_train["P-not-C"] != X_train["P-C"]]
X_test_1 = X_train.loc[X_train["P-not-C"] == X_train["P-C"]]

In [45]:
pnc = X_train_1["P-not-C"]
pc = X_train_1["P-C"]
X_train_1.drop(columns=["P-C", "P-not-C"], inplace=True)

In [46]:
cats = ['number of adults', 'number of children', 'number of weekend nights',
       'number of week nights', 'type of meal', 'car parking space',
       'room type', 'market segment type',
       'special requests']

joint_rfreq_dis = {}
for cat in cats:
    joint_rfreq_dis[cat] = pd.crosstab(X_train["booking status"], X_train[cat], margins=True, normalize=True)
    print(pd.crosstab(X_train["booking status"], X_train[cat], margins=True, normalize=True).T, "\n\n")

booking status    Canceled  Not_Canceled       All
number of adults                                  
0                 0.001275      0.002756  0.004031
1                 0.050744      0.160810  0.211554
2                 0.251757      0.469857  0.721614
3                 0.022943      0.039445  0.062388
4                 0.000103      0.000310  0.000413
All               0.326822      0.673178  1.000000 


booking status      Canceled  Not_Canceled       All
number of children                                  
0                   0.298539      0.626051  0.924590
1                   0.015020      0.029730  0.044750
2                   0.013091      0.016984  0.030074
3                   0.000138      0.000344  0.000482
9                   0.000034      0.000034  0.000069
10                  0.000000      0.000034  0.000034
All                 0.326822      0.673178  1.000000 


booking status            Canceled  Not_Canceled       All
number of weekend nights                          

In [47]:
for cat in cats:
    X_train_1[cat + " canceled"] = X_train_1[cat].map(joint_rfreq_dis[cat].loc["Canceled"])
    X_train_1[cat + " not canceled"] = X_train_1[cat].map(joint_rfreq_dis[cat].loc["Not_Canceled"])

In [48]:
X_train_1.drop(columns=cats, inplace=True)

In [49]:
X_train_1["booking status"] = X_train_1["booking status"].map({"Canceled": 0, "Not_Canceled": 1})

In [50]:
X_train_1.isna().sum()

lead time                                0
average price                            0
booking status                           0
number of adults canceled                0
number of adults not canceled            0
number of children canceled              0
number of children not canceled          0
number of weekend nights canceled        0
number of weekend nights not canceled    0
number of week nights canceled           0
number of week nights not canceled       0
type of meal canceled                    0
type of meal not canceled                0
car parking space canceled               0
car parking space not canceled           0
room type canceled                       0
room type not canceled                   0
market segment type canceled             0
market segment type not canceled         0
special requests canceled                0
special requests not canceled            0
dtype: int64

In [51]:
from sklearn.ensemble import RandomForestRegressor

forest_reg_pc_bs = RandomForestRegressor()
forest_reg_pnc_bs = RandomForestRegressor()
forest_reg_pc_bs.fit(X_train_1, pc)
forest_reg_pnc_bs.fit(X_train_1, pnc)

In [52]:
for cat in cats:
    X_test_1[cat + " canceled"] = X_test_1[cat].map(joint_rfreq_dis[cat].loc["Canceled"])
    X_test_1[cat + " not canceled"] = X_test_1[cat].map(joint_rfreq_dis[cat].loc["Not_Canceled"])

In [53]:
X_test_1.isna().sum()

number of adults                         0
number of children                       0
number of weekend nights                 0
number of week nights                    0
type of meal                             0
car parking space                        0
room type                                0
lead time                                0
market segment type                      0
P-C                                      0
P-not-C                                  0
average price                            0
special requests                         0
booking status                           0
number of adults canceled                0
number of adults not canceled            0
number of children canceled              0
number of children not canceled          0
number of weekend nights canceled        0
number of weekend nights not canceled    0
number of week nights canceled           0
number of week nights not canceled       0
type of meal canceled                    0
type of mea

In [54]:
X_test_1.drop(columns=cats, inplace=True)
X_test_1["booking status"] = X_test_1["booking status"].map({"Canceled": 0, "Not_Canceled": 1})

In [55]:
X_test_1.isna().sum()

lead time                                0
P-C                                      0
P-not-C                                  0
average price                            0
booking status                           0
number of adults canceled                0
number of adults not canceled            0
number of children canceled              0
number of children not canceled          0
number of weekend nights canceled        0
number of weekend nights not canceled    0
number of week nights canceled           0
number of week nights not canceled       0
type of meal canceled                    0
type of meal not canceled                0
car parking space canceled               0
car parking space not canceled           0
room type canceled                       0
room type not canceled                   0
market segment type canceled             0
market segment type not canceled         0
special requests canceled                0
special requests not canceled            0
dtype: int6

In [56]:
X_test_1.drop(columns=["P-C", "P-not-C"], inplace=True)

In [57]:
X_train_1.columns == X_test_1.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True])

In [58]:
PC = forest_reg_pc_bs.predict(X_test_1)
PnotC = forest_reg_pnc_bs.predict(X_test_1)

In [59]:
accuracy_score(X_test_1["booking status"], (PC <= PnotC))

0.7144572356804859

In [60]:
X_test_1_copy = X_test_1.copy()
X_test_1_copy["P-not-C"] = PnotC
X_test_1_copy["P-C"] = PC
X_train_2 = X_test_1_copy[X_test_1["booking status"]  == (PC <= PnotC)]
X_test_2 = X_test_1_copy[X_test_1["booking status"]  != (PC <= PnotC)]

In [62]:
from sklearn.neighbors import KNeighborsRegressor

In [63]:
forest_reg_pc_bs_1 = KNeighborsRegressor()
forest_reg_pnc_bs_1 = KNeighborsRegressor()
forest_reg_pc_bs_1.fit(pd.concat([X_train_2.drop(columns=["P-not-C", "P-C"]), X_train_1]), pd.concat([X_train_2["P-C"], pc]))
forest_reg_pnc_bs_1.fit(pd.concat([X_train_2.drop(columns=["P-not-C", "P-C"]), X_train_1]), pd.concat([X_train_2["P-not-C"], pnc]))

In [64]:
X_test_2.drop(columns=["P-not-C", "P-C"], inplace=True)

In [65]:
PNC = forest_reg_pnc_bs_1.predict(X_test_2)
PC = forest_reg_pc_bs_1.predict(X_test_2)

In [70]:
accuracy_score(X_test_2["booking status"], PNC>=PC)

0.5484788523373733

In [71]:
X_test_2["P-not-C"]=  PNC
X_test_2["P-C"]=  PC

In [74]:
X_test_1 = pd.concat([X_test_2, X_train_2])

In [83]:
X_train_1["P-C"] = pc
X_train_1["P-not-C"] = pnc

In [87]:
X_train = pd.concat([X_train_1, X_test_1])

In [89]:
accuracy_score(X_train["booking status"], (X_train["P-C"] <= X_train["P-not-C"]))

0.8708143861099628

In [91]:
from sklearn.preprocessing import StandardScaler

In [93]:
scaller = StandardScaler()
scaller.fit(X_train.drop(columns=["P-C", "P-not-C", "booking status"]))

In [94]:
forest_reg_pc = RandomForestRegressor(n_jobs=-1)
forest_reg_pnc = RandomForestRegressor(n_jobs=-1)

forest_reg_pc.fit(scaller.transform(X_train.drop(columns=["P-C", "P-not-C", "booking status"])), X_train["P-C"])
forest_reg_pnc.fit(scaller.transform(X_train.drop(columns=["P-C", "P-not-C", "booking status"])), X_train["P-not-C"])

In [96]:
for cat in cats:
    X_test[cat + " canceled"] = X_test[cat].map(joint_rfreq_dis[cat].loc["Canceled"])
    X_test[cat + " not canceled"] = X_test[cat].map(joint_rfreq_dis[cat].loc["Not_Canceled"])

In [97]:
X_test.drop(columns=cats, inplace=True)
X_test["booking status"] = X_test["booking status"].map({"Canceled": 0, "Not_Canceled": 1})

In [98]:
y_test = X_test["booking status"].copy()

In [102]:
X_test.drop(columns=["Booking_ID", "date of reservation", "repeated"], inplace=True)

In [104]:
X_test_ready = X_test[X_test["P-C"]!=X_test["P-not-C"]]
X_test_dirty = X_test[X_test["P-C"]==X_test["P-not-C"]]

In [105]:
X_test.drop(columns=["P-not-C", "P-C", "booking status"], inplace=True)

In [113]:
PC = forest_reg_pc.predict(scaller.transform(X_test_dirty.drop(columns=["P-not-C", "P-C", "booking status"])))
PNC = forest_reg_pnc.predict(scaller.transform(X_test_dirty.drop(columns=["P-not-C", "P-C", "booking status"])))

In [114]:
X_test_dirty["P-C"] = PC
X_test_dirty["P-not-C"] = PNC

In [116]:
X_test = pd.concat([X_test_dirty, X_test_ready])

In [119]:
accuracy_score(X_test["booking status"], X_test["P-C"] <= X_test["P-not-C"])

0.7977125533967204

In [122]:
y_train = X_train["booking status"].copy()
y_test = X_test["booking status"].copy()

X_train.drop(columns=["booking status"], inplace=True)
X_test.drop(columns=["booking status"], inplace=True)

In [124]:
from sklearn.preprocessing import StandardScaler

scaller = StandardScaler()
X_train = scaller.fit_transform(X_train)

In [125]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_jobs=-1)
model.fit(X_train, y_train)

In [126]:
cross_val_score(
    model,
    X_train,
    y_train,
    cv=10,
    scoring="accuracy",
    n_jobs=-1,
    verbose=3
).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    1.2s remaining:    2.8s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    1.3s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    1.4s finished


0.8929338171282684

In [127]:
from sklearn.svm import SVC

model = SVC()
model.fit(X_train, y_train)

cross_val_score(
    model,
    X_train,
    y_train,
    cv=10,
    scoring="accuracy",
    n_jobs=-1,
    verbose=3
).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:   22.7s remaining:   53.0s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   24.7s remaining:   10.6s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   33.1s finished


0.892452590098458

In [143]:
model = RandomForestClassifier(n_jobs=-1)

cross_val_score(
    model,
    X_train,
    y_train,
    cv=10,
    scoring="accuracy",
    n_jobs=-1,
    verbose=3
).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    3.3s remaining:    7.6s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    3.4s remaining:    1.5s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    4.0s finished


0.9533919852392533

In [144]:
model.fit(X_train, y_train)

In [151]:
scaller = StandardScaler()
accuracy_score(y_test, model.predict(scallert.fit_transform(X_test)))

0.9388357034587295


In [152]:
import joblib
joblib.dump(model, "model.pkl")
joblib.dump(forest_reg_pc, "forest_reg_pc.pkl")
joblib.dump(forest_reg_pnc, "forest_reg_pnc.pkl")
joblib.dump(joint_rfreq_dis, "joint_rfreq_dis.pkl")

['joint_rfreq_dis.pkl']

In [148]:
X_test.columns

Index(['lead time', 'P-C', 'P-not-C', 'average price ',
       'number of adults canceled', 'number of adults not canceled',
       'number of children canceled', 'number of children not canceled',
       'number of weekend nights canceled',
       'number of weekend nights not canceled',
       'number of week nights canceled', 'number of week nights not canceled',
       'type of meal canceled', 'type of meal not canceled',
       'car parking space canceled', 'car parking space not canceled',
       'room type canceled', 'room type not canceled',
       'market segment type canceled', 'market segment type not canceled',
       'special requests canceled', 'special requests not canceled'],
      dtype='object')

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [23]:
X_train.columns == X_test.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True])

In [24]:
X_train.isna().sum()

lead time                                0
average price                            0
booking status                           0
number of adults canceled                0
number of adults not canceled            0
number of children canceled              0
number of children not canceled          0
number of weekend nights canceled        0
number of weekend nights not canceled    0
number of week nights canceled           0
number of week nights not canceled       0
type of meal canceled                    0
type of meal not canceled                0
car parking space canceled               0
car parking space not canceled           0
room type canceled                       0
room type not canceled                   0
market segment type canceled             0
market segment type not canceled         0
special requests canceled                0
special requests not canceled            0
P-C                                      0
P-not-C                                  0
dtype: int6

In [28]:
########################################################################################################################

In [29]:
X_new = pd.concat([X_test, X_train])

In [30]:
X_new.sample(20)

Unnamed: 0,lead time,average price,booking status,number of adults canceled,number of adults not canceled,number of children canceled,number of children not canceled,number of weekend nights canceled,number of weekend nights not canceled,number of week nights canceled,...,car parking space canceled,car parking space not canceled,room type canceled,room type not canceled,market segment type canceled,market segment type not canceled,special requests canceled,special requests not canceled,P-C,P-not-C
30234,50,150.3,1,0.0,0.003268,0.017429,0.96841,0.008715,0.240741,0.001089,...,0.017429,0.831155,0.0,0.069717,0.007625,0.09695,0.0,0.06427,2.42,10.42
29671,36,81.62,1,0.007625,0.213508,0.017429,0.96841,0.008715,0.240741,0.0,...,0.017429,0.831155,0.0,0.007625,0.007625,0.09695,0.0,0.336601,1.67,5.03
5160,4,65.0,1,0.009804,0.764706,0.017429,0.96841,0.007625,0.643791,0.001089,...,0.017429,0.831155,0.01634,0.854031,0.003268,0.643791,0.017429,0.552288,0.451119,3.216137
10549,22,95.0,1,0.009804,0.764706,0.017429,0.96841,0.008715,0.240741,0.002179,...,0.017429,0.831155,0.01634,0.854031,0.007625,0.09695,0.0,0.06427,0.64,1.29
23317,4,140.0,0,0.007625,0.213508,0.017429,0.96841,0.008715,0.240741,0.002179,...,0.017429,0.831155,0.01634,0.854031,0.007625,0.09695,0.017429,0.552288,0.9,2.93
24643,54,96.0,1,0.007625,0.213508,0.017429,0.96841,0.008715,0.240741,0.001089,...,0.017429,0.831155,0.01634,0.854031,0.007625,0.09695,0.0,0.336601,0.41,5.39
26951,56,207.9,1,0.007625,0.213508,0.0,0.013072,0.008715,0.240741,0.010893,...,0.017429,0.831155,0.0,0.010893,0.007625,0.09695,0.0,0.336601,0.36,3.78
23708,3,0.0,1,0.009804,0.764706,0.017429,0.96841,0.008715,0.240741,0.002179,...,0.017429,0.831155,0.01634,0.854031,0.0,0.133987,0.0,0.336601,0.12,2.893714
7671,78,138.6,1,0.007625,0.213508,0.017429,0.96841,0.007625,0.643791,0.001089,...,0.017429,0.831155,0.0,0.069717,0.007625,0.09695,0.0,0.336601,0.32,2.24
3604,180,100.0,1,0.007625,0.213508,0.017429,0.96841,0.008715,0.240741,0.010893,...,0.017429,0.831155,0.01634,0.854031,0.006536,0.091503,0.0,0.336601,10.39,2.89


In [39]:
X_train, X_test = train_test_split(X_new, test_size=0.2, random_state=42)

In [40]:
y_train = X_train["booking status"]
y_test = X_test["booking status"]
X_train.drop("booking status", axis=1, inplace=True)
X_test.drop("booking status", axis=1, inplace=True)

In [41]:
cols = X_train.columns

In [42]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFECV

scaller = StandardScaler()
X_train = scaller.fit_transform(X_train)
X_train = pd.DataFrame(X_train, columns=cols)

In [45]:
rfecv = RFECV(RandomForestClassifier(n_jobs=-1, random_state=42), step=1, cv=5, scoring='accuracy')
rfecv.fit(X_train, y_train)

In [52]:
X_train_best_selected = X_train.loc[:, rfecv.support_]

# KNN

In [63]:
from sklearn.neighbors import KNeighborsClassifier

cross_val_score(
    KNeighborsClassifier(n_jobs=-1),
    X_train_best_selected.to_numpy(),
    y_train,
    n_jobs=-1,
    cv=10,
    scoring="accuracy",
    verbose=3
).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    0.3s remaining:    0.8s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    0.3s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.5s finished


0.8548636921277593

# SVC

In [64]:
from sklearn.svm import SVC

cross_val_score(
    SVC(),
    X_train_best_selected.to_numpy(),
    y_train,
    n_jobs=-1,
    cv=10,
    scoring="accuracy",
    verbose=3
).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:   30.4s remaining:  1.2min
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   32.3s remaining:   13.8s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   43.7s finished


0.8463840847458324

# ExtraTree

In [66]:
from sklearn.tree import ExtraTreeClassifier

cross_val_score(
    ExtraTreeClassifier(random_state=42),
    X_train_best_selected.to_numpy(),
    y_train,
    n_jobs=-1,
    cv=10,
    scoring="accuracy",
    verbose=3
).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.1s finished


0.8795049303104369

# MLP

In [67]:
from sklearn.neural_network import MLPClassifier

cross_val_score(
    MLPClassifier(random_state=42),
    X_train_best_selected.to_numpy(),
    y_train,
    n_jobs=-1,
    cv=10,
    scoring="accuracy",
    verbose=3
).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:   13.0s remaining:   30.3s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   13.3s remaining:    5.7s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   21.0s finished


0.8854827641429244

# RandomForest

In [68]:
cross_val_score(
    RandomForestClassifier(n_jobs=-1, random_state=42),
    X_train_best_selected,
    y_train,
    n_jobs=-1,
    cv=10,
    scoring="accuracy",
    verbose=3
).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    2.5s remaining:    5.9s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    2.6s remaining:    1.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    3.2s finished


0.9312550618924673

In [73]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_estimators': [50, 100 , 150, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

grid_search = GridSearchCV(RandomForestClassifier(n_jobs=-1), param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=3)

In [74]:
grid_search.fit(X_train_best_selected, y_train)

Fitting 5 folds for each of 144 candidates, totalling 720 fits
[CV 3/5] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.936 total time=   1.0s
[CV 4/5] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.925 total time=   1.8s
[CV 2/5] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.928 total time=   4.5s
[CV 3/5] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100;, score=0.933 total time=   1.9s
[CV 4/5] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=150;, score=0.923 total time=   3.4s
[CV 2/5] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=50;, score=0.918 total time=   1.2s
[CV 1/5] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=100;, score=0.918 total time=   2.0s
[CV 3/5] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=150;, score=0

[CV 1/5] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.925 total time=   1.1s
[CV 1/5] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=150;, score=0.926 total time=   2.7s
[CV 4/5] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.925 total time=   4.3s
[CV 5/5] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100;, score=0.920 total time=   1.9s
[CV 3/5] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200;, score=0.936 total time=   3.7s
[CV 5/5] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=50;, score=0.917 total time=   1.2s
[CV 1/5] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=150;, score=0.918 total time=   3.3s
[CV 4/5] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=200;, score=0.914 total time=   4.0s
[CV 2/5] END max_depth=None, min_sample

[CV 2/5] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.926 total time=   2.2s
[CV 5/5] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=150;, score=0.926 total time=   3.6s
[CV 3/5] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50;, score=0.933 total time=   1.3s
[CV 2/5] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100;, score=0.927 total time=   2.1s
[CV 5/5] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=150;, score=0.921 total time=   3.8s
[CV 3/5] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=50;, score=0.927 total time=   1.1s
[CV 3/5] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=100;, score=0.926 total time=   2.3s
[CV 2/5] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=200;, score=0.921 total time=   4.1s
[CV 2/5] END max_depth=None, min_sample

[CV 1/5] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.925 total time=   2.4s
[CV 1/5] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.926 total time=   4.4s
[CV 5/5] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50;, score=0.923 total time=   1.3s
[CV 3/5] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=150;, score=0.933 total time=   3.7s
[CV 1/5] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=50;, score=0.917 total time=   1.3s
[CV 4/5] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=50;, score=0.911 total time=   1.1s
[CV 4/5] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=100;, score=0.915 total time=   2.3s
[CV 3/5] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=200;, score=0.927 total time=   4.4s
[CV 4/5] END max_depth=None, min_sample

[CV 3/5] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.936 total time=   2.0s
[CV 4/5] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=150;, score=0.925 total time=   3.6s
[CV 2/5] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50;, score=0.925 total time=   1.3s
[CV 1/5] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100;, score=0.925 total time=   2.2s
[CV 1/5] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=200;, score=0.924 total time=   4.8s
[CV 2/5] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=100;, score=0.920 total time=   2.3s
[CV 1/5] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=200;, score=0.918 total time=   4.6s
[CV 3/5] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.929 total time=   2.5s
[CV 2/5] END max_depth=None, min_sample

In [76]:
grid_search.best_score_

0.929239062192825

In [None]:
final1_test = X_test

In [99]:
X_test_best = X_test.loc[:, rfecv.support_]

In [100]:
X_test_best["booking status"] = y_test.to_numpy()

In [101]:
X_test_best.to_csv("final_t.csv", index=None)

In [77]:
final1 = X_train_best_selected.copy()
final1["booking status"] = y_train
final1.to_csv("final1.csv")


[CV 4/5] END max_depth=30, min_samples_leaf=1, min_samples_split=5, n_estimators=150;, score=0.924 total time=   3.5s
[CV 3/5] END max_depth=30, min_samples_leaf=1, min_samples_split=10, n_estimators=50;, score=0.929 total time=   1.4s
[CV 2/5] END max_depth=30, min_samples_leaf=1, min_samples_split=10, n_estimators=100;, score=0.917 total time=   2.3s
[CV 5/5] END max_depth=30, min_samples_leaf=1, min_samples_split=10, n_estimators=150;, score=0.917 total time=   3.5s
[CV 2/5] END max_depth=30, min_samples_leaf=2, min_samples_split=2, n_estimators=50;, score=0.919 total time=   1.1s
[CV 2/5] END max_depth=30, min_samples_leaf=2, min_samples_split=2, n_estimators=100;, score=0.921 total time=   2.4s
[CV 4/5] END max_depth=30, min_samples_leaf=2, min_samples_split=2, n_estimators=150;, score=0.916 total time=   3.4s
[CV 2/5] END max_depth=30, min_samples_leaf=2, min_samples_split=5, n_estimators=50;, score=0.920 total time=   1.2s
[CV 5/5] END max_depth=30, min_samples_leaf=2, min_samp

[CV 2/5] END max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=50;, score=0.925 total time=   1.4s
[CV 2/5] END max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.928 total time=   2.4s
[CV 3/5] END max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=150;, score=0.935 total time=   3.6s
[CV 2/5] END max_depth=30, min_samples_leaf=1, min_samples_split=5, n_estimators=50;, score=0.924 total time=   1.3s
[CV 5/5] END max_depth=30, min_samples_leaf=1, min_samples_split=5, n_estimators=50;, score=0.920 total time=   1.3s
[CV 5/5] END max_depth=30, min_samples_leaf=1, min_samples_split=5, n_estimators=100;, score=0.922 total time=   2.4s
[CV 3/5] END max_depth=30, min_samples_leaf=1, min_samples_split=5, n_estimators=200;, score=0.934 total time=   4.6s
[CV 5/5] END max_depth=30, min_samples_leaf=1, min_samples_split=10, n_estimators=100;, score=0.915 total time=   1.8s
[CV 2/5] END max_depth=30, min_samples_leaf=1, min_samples

In [84]:
y_train.isna().sum()

0

In [85]:
final1 = X_train.copy()

In [92]:
final1["booking status"] = y_train.to_numpy()

In [90]:
X_train.shape

(28773, 22)

In [91]:
y_train.shape

(28773,)

In [93]:
final1.isna().sum()

lead time                                0
average price                            0
number of adults canceled                0
number of adults not canceled            0
number of children canceled              0
number of children not canceled          0
number of weekend nights canceled        0
number of weekend nights not canceled    0
number of week nights canceled           0
number of week nights not canceled       0
type of meal canceled                    0
type of meal not canceled                0
car parking space canceled               0
car parking space not canceled           0
room type canceled                       0
room type not canceled                   0
market segment type canceled             0
market segment type not canceled         0
special requests canceled                0
special requests not canceled            0
P-C                                      0
P-not-C                                  0
booking status                           0
dtype: int6

In [94]:
final1.to_csv("final1.csv", index=None)

In [96]:
pd.read_csv("final1.csv").isna().sum()

lead time                                0
average price                            0
number of adults canceled                0
number of adults not canceled            0
number of children canceled              0
number of children not canceled          0
number of weekend nights canceled        0
number of weekend nights not canceled    0
number of week nights canceled           0
number of week nights not canceled       0
type of meal canceled                    0
type of meal not canceled                0
car parking space canceled               0
car parking space not canceled           0
room type canceled                       0
room type not canceled                   0
market segment type canceled             0
market segment type not canceled         0
special requests canceled                0
special requests not canceled            0
P-C                                      0
P-not-C                                  0
booking status                           0
dtype: int6

In [104]:
X_test_best.columns

Index(['lead time', 'average price ', 'number of adults canceled',
       'number of adults not canceled', 'number of children not canceled',
       'number of weekend nights canceled',
       'number of weekend nights not canceled',
       'number of week nights canceled', 'number of week nights not canceled',
       'type of meal canceled', 'type of meal not canceled',
       'car parking space not canceled', 'room type canceled',
       'room type not canceled', 'market segment type canceled',
       'market segment type not canceled', 'special requests canceled',
       'special requests not canceled', 'P-C', 'P-not-C', 'booking status'],
      dtype='object')

In [107]:
X_train_best_selected.

(20,)

In [111]:
def enc(X):
    X_copy = X.cop()
    for cat in cats:
        X_copy[cat + " canceled"] = X_copy[cat].map(joint_rfreq_dis[cat].loc["Canceled"])
        X_copy[cat + " not canceled"] = X_copy[cat].map(joint_rfreq_dis[cat].loc["Not_Canceled"])
    return X_copy.drop(columns=cats)

encoder = FunctionTransformer(enc)

In [113]:
from sklearn.impute import KNNImputer

imp = KNNImputer()
imp.fit(X_train_best_selected)

In [115]:
import joblib 
joblib.dump(imp, "imputer.pkl")

['imputer.pkl']

In [134]:
def imputer(X):
    X_copy = X.copy()
    X_copy.loc[:, ["P-not-C", "P-C"]] = np.nan
    
    return imp.transform(X_copy)

In [124]:
y_test = X_test_best["booking status"].to_numpy()
X_test_best.drop(columns="booking status", inplace=True)

In [145]:
v = X_test_best.sample(10)

In [128]:
X_test_best["booking status"] = y_test

In [136]:
imputer(v.drop("booking status", axis=1))

array([[ 9.60000000e+01,  1.32300000e+02,  7.62527233e-03,
         2.13507625e-01,  9.68409586e-01,  8.71459695e-03,
         2.40740741e-01,  0.00000000e+00,  9.80392157e-03,
         1.63398693e-02,  9.41176471e-01,  8.31154684e-01,
         0.00000000e+00,  6.97167756e-02,  7.62527233e-03,
         9.69498911e-02,  0.00000000e+00,  6.42701525e-02,
         6.88263749e-01, -3.27103715e-01]])

In [None]:
scaller = StandardScaler()

In [139]:
forest_clf = RandomForestClassifier(n_jobs=-1)
forest_clf.fit(X_train_best_selected, y_train)

In [148]:
imputer(v.drop("booking status", axis=1))

array([[ 2.00000000e+00,  1.30000000e+02,  0.00000000e+00,
         3.26797386e-03,  9.68409586e-01,  1.08932462e-03,
         9.47712418e-02,  1.08932462e-03,  1.90631808e-01,
         1.63398693e-02,  9.41176471e-01,  8.31154684e-01,
         0.00000000e+00,  6.97167756e-02,  7.62527233e-03,
         9.69498911e-02,  0.00000000e+00,  6.42701525e-02,
         1.06691512e-01, -1.80506250e-01],
       [ 5.00000000e+00,  9.50000000e+01,  9.80392157e-03,
         7.64705882e-01,  9.68409586e-01,  7.62527233e-03,
         6.43790850e-01,  1.08932462e-03,  1.08932462e-01,
         1.63398693e-02,  9.41176471e-01,  8.31154684e-01,
         1.63398693e-02,  8.54030501e-01,  0.00000000e+00,
         1.63398693e-02,  1.74291939e-02,  5.52287582e-01,
         1.06691512e-01, -1.80506250e-01],
       [ 1.47000000e+02,  5.59600000e+01,  7.62527233e-03,
         2.13507625e-01,  9.68409586e-01,  8.71459695e-03,
         2.40740741e-01,  1.08932462e-03,  1.08932462e-01,
         1.63398693e-02,  9.4

In [147]:
v

Unnamed: 0,lead time,average price,number of adults canceled,number of adults not canceled,number of children not canceled,number of weekend nights canceled,number of weekend nights not canceled,number of week nights canceled,number of week nights not canceled,type of meal canceled,...,car parking space not canceled,room type canceled,room type not canceled,market segment type canceled,market segment type not canceled,special requests canceled,special requests not canceled,P-C,P-not-C,booking status
24122,2,130.0,0.0,0.003268,0.96841,0.001089,0.094771,0.001089,0.190632,0.01634,...,0.831155,0.0,0.069717,0.007625,0.09695,0.0,0.06427,3.78,11.48,1
25581,5,95.0,0.009804,0.764706,0.96841,0.007625,0.643791,0.001089,0.108932,0.01634,...,0.831155,0.01634,0.854031,0.0,0.01634,0.017429,0.552288,0.125,1.25,1
20236,147,55.96,0.007625,0.213508,0.96841,0.008715,0.240741,0.001089,0.108932,0.01634,...,0.831155,0.01634,0.854031,0.007625,0.09695,0.017429,0.552288,1.0,4.154667,1
18023,74,78.3,0.007625,0.213508,0.96841,0.008715,0.240741,0.001089,0.190632,0.01634,...,0.831155,0.01634,0.854031,0.007625,0.09695,0.0,0.336601,0.62,5.47,1
21486,27,95.0,0.007625,0.213508,0.96841,0.008715,0.240741,0.001089,0.190632,0.01634,...,0.831155,0.01634,0.854031,0.006536,0.091503,0.017429,0.552288,0.759286,0.15,1
3985,39,81.9,0.007625,0.213508,0.96841,0.008715,0.240741,0.001089,0.190632,0.01634,...,0.831155,0.01634,0.854031,0.007625,0.09695,0.017429,0.552288,1.96,1.51,1
19365,113,76.5,0.007625,0.213508,0.96841,0.007625,0.643791,0.010893,0.497821,0.01634,...,0.831155,0.01634,0.854031,0.007625,0.09695,0.0,0.06427,0.91,4.1,0
15690,0,95.0,0.009804,0.764706,0.96841,0.007625,0.643791,0.001089,0.190632,0.01634,...,0.831155,0.01634,0.854031,0.0,0.01634,0.017429,0.552288,0.282889,1.650667,1
24660,3,110.0,0.007625,0.213508,0.96841,0.007625,0.643791,0.010893,0.497821,0.01634,...,0.151416,0.01634,0.854031,0.007625,0.09695,0.017429,0.552288,2.39,2.327,1
8981,52,63.75,0.007625,0.213508,0.96841,0.007625,0.643791,0.0,0.009804,0.0,...,0.831155,0.01634,0.854031,0.007625,0.09695,0.017429,0.552288,1.04,3.46,0


In [149]:
X_train_best_selected.corr()

Unnamed: 0,lead time,average price,number of adults canceled,number of adults not canceled,number of children not canceled,number of weekend nights canceled,number of weekend nights not canceled,number of week nights canceled,number of week nights not canceled,type of meal canceled,type of meal not canceled,car parking space not canceled,room type canceled,room type not canceled,market segment type canceled,market segment type not canceled,special requests canceled,special requests not canceled,P-C,P-not-C
lead time,1.0,-0.057738,-0.047887,-0.126408,0.049747,-0.027705,-0.05287,-0.191744,-0.185104,-0.044479,-0.057454,0.062035,0.097022,0.096846,0.10009,-0.183392,0.099975,0.101787,0.326352,-0.139664
average price,-0.057738,1.0,-0.332221,-0.243555,-0.323204,0.00531,-9.1e-05,-0.021715,-0.020447,0.013968,0.006771,-0.064278,-0.401989,-0.405256,0.369809,-0.146948,-0.157961,-0.191944,-0.121965,-0.100132
number of adults canceled,-0.047887,-0.332221,1.0,0.695774,-0.010723,0.066567,0.093973,0.065196,0.082033,-0.056315,-0.055458,0.019169,0.374703,0.367873,-0.230699,0.184251,0.151913,0.181581,-0.060111,-0.269194
number of adults not canceled,-0.126408,-0.243555,0.695774,1.0,0.085914,0.090628,0.136149,0.110101,0.126791,0.063437,0.063081,0.004661,0.228964,0.226627,-0.360778,0.328967,0.193673,0.192202,-0.010538,-0.097067
number of children not canceled,0.049747,-0.323204,-0.010723,0.085914,1.0,0.036061,0.030164,-0.009744,0.002899,-0.081122,-0.079746,0.038045,0.171338,0.184468,-0.098255,0.058603,0.105379,0.147353,0.11813,-0.010078
number of weekend nights canceled,-0.027705,0.00531,0.066567,0.090628,0.036061,1.0,0.614201,-0.024454,0.034538,-0.025139,-0.024544,-0.026649,0.054053,0.053506,-0.100815,0.084692,0.04797,0.050874,0.024736,-0.109722
number of weekend nights not canceled,-0.05287,-9.1e-05,0.093973,0.136149,0.030164,0.614201,1.0,0.048172,0.091264,-0.053077,-0.055177,-0.029258,0.06232,0.061482,-0.1392,0.11155,0.069764,0.067709,0.009655,-0.132409
number of week nights canceled,-0.191744,-0.021715,0.065196,0.110101,-0.009744,-0.024454,0.048172,1.0,0.94707,-0.062242,-0.059933,-0.055729,0.044486,0.043226,-0.117466,0.121486,0.013549,0.019443,-0.015101,0.052294
number of week nights not canceled,-0.185104,-0.020447,0.082033,0.126791,0.002899,0.034538,0.091264,0.94707,1.0,-0.077208,-0.076187,-0.054531,0.070961,0.069439,-0.137386,0.128725,0.031884,0.0377,0.002845,0.042911
type of meal canceled,-0.044479,0.013968,-0.056315,0.063437,-0.081122,-0.025139,-0.053077,-0.062242,-0.077208,1.0,0.998573,-0.025897,-0.224764,-0.224061,-0.13393,0.127309,0.005382,-0.022422,-0.016982,0.021995


In [152]:
y_tmp = X_train_best_selected[["P-C", "P-not-C"]]
X_tmp = X_train_best_selected.drop(columns=["P-C", "P-not-C"])

In [None]:
from sklearn.ensemble import RandomForestRegressor


forest_