In [1]:
import datetime

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline, make_pipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, precision_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import (
    MinMaxScaler,
    OneHotEncoder,
    OrdinalEncoder,
    PowerTransformer,
)
from sklearn.tree import DecisionTreeClassifier

%matplotlib inline

In [2]:
# Column groups used by the cleaning step and by the ColumnTransformers below.

# Columns removed at the end of clean_df(). The identifiers srch_id / prop_key
# and the target prop_booking_bool are deliberately NOT listed here, so they
# survive cleaning.
drop_cols = [
    "srch_date_time", "srch_ci", "srch_co",
    "srch_visitor_id",
    "srch_local_date",
    "srch_currency",
    "prop_submarket_id",
    "srch_mobile_app",
    "srch_visitor_wr_member",
    "prop_market_id",
    "srch_visitor_loc_city",
]

# Listed prices, with and without discount, in local currency and in USD.
# NOTE: the first column was seen to yield some NaN after price imputing;
# the cause has not been identified yet.
price_cols = [
    "prop_price_without_discount_local", "prop_price_without_discount_usd",
    "prop_price_with_discount_local", "prop_price_with_discount_usd",
]

# Columns encoded as categories. The visitor/POSA country columns and the
# visitor city are intentionally left out of this group.
category_cols = [
    "srch_visitor_loc_region",
    "srch_posa_continent",
    "srch_hcom_destination_id",
    "srch_rm_cnt", "prop_starrating",
    "prop_super_region", "prop_continent", "prop_country",
    "srch_device",
    "month", "week",
]

# Country-valued columns.
country_cols = ["srch_visitor_loc_country", "srch_posa_country", "prop_country"]

# Numeric columns that are median-imputed and power-transformed.
positive_cols = [
    "srch_visitor_visit_nbr",
    "srch_los", "srch_bw",
    "srch_adults_cnt", "srch_children_cnt", "srch_rm_cnt",
    "prop_room_capacity",
    "prop_review_score", "prop_review_count",
    "discount_perc_local", "discount_perc_usd",
]

# Columns rescaled with MinMaxScaler (prop_review_score is intentionally excluded).
ranged_cols = ["srch_ci_day", "srch_co_day", "prop_starrating"]

# Flag-like columns, including the ones derived in clean_df().
bool_cols = [
    "srch_mobile_bool",
    "prop_travelad_bool", "prop_dotd_bool", "prop_imp_drr",
    "prop_brand_bool", "prop_hostel_bool",
    "is_rated_less_than_2",
    "is_weekend_ci", "is_weekend_co",
]

# Location columns for the visitor, the point of sale and the property.
geography_cols = [
    "srch_visitor_loc_country", "srch_visitor_loc_region",
    "srch_posa_continent", "srch_posa_country",
    "prop_super_region", "prop_continent",
]

In [3]:
# Raw training data; it is cleaned further down in this cell.
df = pd.read_csv(filepath_or_buffer="../data/train.csv")


def _discount_fraction(discounted, undiscounted):
    """Return ``1 - discounted / undiscounted`` with infinite results set to NaN.

    A zero undiscounted price makes the ratio +/-inf; NaN is used instead so
    that a downstream imputer can fill the value rather than fail on it.
    """
    fraction = 1 - discounted / undiscounted
    return fraction.replace([np.inf, -np.inf], np.nan)


def clean_df(df_raw, cols_to_drop=None):
    """Return a cleaned, feature-engineered copy of ``df_raw``.

    The input frame is never modified. The function:

    * fills missing ``srch_visitor_loc_region`` with ``"Unknown"`` and sets
      ``srch_posa_continent`` to ``"NORTHAMERICA"`` where ``srch_posa_country``
      is ``"US"`` or ``"CANADA"``;
    * splits ``srch_visitor_wr_member`` on ``"|"`` into ``signin_status``,
      ``wr_membership`` and ``fc_membership``, filling absent parts with
      default labels;
    * parses the date columns and derives ``month`` / ``week`` from
      ``srch_date_time``;
    * derives ``is_weekend_ci`` / ``is_weekend_co``, ``is_rated_less_than_2``
      and the two ``discount_perc_*`` columns;
    * replaces the ``-9998`` sentinel in ``prop_room_capacity`` with NaN.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Raw frame containing the columns referenced above.
    cols_to_drop : list of str, optional
        Columns removed from the result. Defaults to the module-level
        ``drop_cols`` list; names that are not present are ignored.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame without ``cols_to_drop``.
    """
    if cols_to_drop is None:
        # Resolved at call time so the notebook-level list stays the default.
        cols_to_drop = drop_cols

    df = df_raw.copy(deep=True)
    df["srch_visitor_loc_region"] = df.srch_visitor_loc_region.fillna("Unknown")
    df.loc[
        df.srch_posa_country.isin(["US", "CANADA"]), "srch_posa_continent"
    ] = "NORTHAMERICA"

    split_col_names = ["signin_status", "wr_membership", "fc_membership"]
    # Force exactly three parts: `n` caps the number of splits and `reindex`
    # adds all-NaN columns when every row has fewer parts, so the assignment
    # below cannot fail with "Columns must be same length as key".
    membership_parts = df.srch_visitor_wr_member.str.split(
        "|", n=len(split_col_names) - 1, expand=True
    ).reindex(columns=range(len(split_col_names)))
    membership_parts.columns = split_col_names
    df[split_col_names] = membership_parts
    df["wr_membership"] = df.wr_membership.fillna("Unknown")
    df["fc_membership"] = df.fc_membership.fillna("Not FC Member")
    df["signin_status"] = df.signin_status.fillna("Not Signed In")

    # Convert date strings into datetime objects
    date_cols = ["srch_date_time", "srch_ci", "srch_co", "srch_local_date"]
    for col in date_cols:
        df[col] = pd.to_datetime(df[col])

    # No "year" feature: we only have a little over a year of data.
    df["month"] = df.srch_date_time.dt.month
    df["week"] = df.srch_date_time.dt.isocalendar().week
    # NOTE(review): assumes srch_*_day encodes Saturday/Sunday as 5/6 - confirm
    # against the data dictionary.
    df["is_weekend_ci"] = df.srch_ci_day.isin([5, 6])
    df["is_weekend_co"] = df.srch_co_day.isin([5, 6])

    df["prop_room_capacity"] = df.prop_room_capacity.replace(-9998, np.nan)
    df["is_rated_less_than_2"] = df.prop_review_score.lt(2).astype(int)
    df["discount_perc_local"] = _discount_fraction(
        df.prop_price_with_discount_local, df.prop_price_without_discount_local
    )
    df["discount_perc_usd"] = _discount_fraction(
        df.prop_price_with_discount_usd, df.prop_price_without_discount_usd
    )
    return df.loc[:, ~df.columns.isin(cols_to_drop)]


# Clean the full training frame; chain `.sample(frac=0.2)` onto the call for a
# quicker experiment on a subsample.
df = clean_df(df_raw=df)

In [4]:
def _numeric_branch():
    """Build a fresh median-imputation -> power-transform pipeline."""
    return make_pipeline(SimpleImputer(strategy="median"), PowerTransformer())


def _ordinal_encoder():
    """Build a fresh ordinal encoder that maps unseen categories to -1."""
    return OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)


# Preprocessing variant 1: numeric columns imputed and power-transformed,
# ranged columns min-max scaled, categorical columns ordinal-encoded and then
# one-hot encoded. Every other column is dropped.
column_transform1 = ColumnTransformer(
    transformers=[
        ("positive_cols_imputer", _numeric_branch(), positive_cols + price_cols),
        ("min_max_scaling", MinMaxScaler(), ranged_cols),
        (
            "one_hot_encoder",
            make_pipeline(
                _ordinal_encoder(),
                OneHotEncoder(handle_unknown="ignore", sparse=False),
            ),
            category_cols,
        ),
    ],
    remainder="drop",
    n_jobs=-1,
)

# Preprocessing variant 2: same numeric branch, but categorical columns are
# only ordinal-encoded (no one-hot expansion, no min-max scaled branch).
column_transform2 = ColumnTransformer(
    transformers=[
        ("positive_cols_imputer", _numeric_branch(), positive_cols + price_cols),
        ("category_encoder", _ordinal_encoder(), category_cols),
    ],
    remainder="drop",
    n_jobs=-1,
)


# One seed shared by every stochastic step so that Restart & Run All
# reproduces the same resampling, projection and models.
RANDOM_STATE = 1

# Default pipeline; the "columns", "PCA" and "model" steps are overridden by
# the parameter grids below.
# NOTE(review): with default sampling strategies SMOTE should already balance
# the classes, which would leave the under-sampler with nothing to remove -
# confirm whether explicit sampling_strategy values were intended.
steps = [
    ("columns", column_transform1),
    ("over", SMOTE(n_jobs=-1, random_state=RANDOM_STATE)),
    ("under", RandomUnderSampler(random_state=RANDOM_STATE)),
    ("PCA", PCA(n_components=23, random_state=RANDOM_STATE)),
    (
        "model",
        RandomForestClassifier(max_depth=2, n_jobs=-1, random_state=RANDOM_STATE),
    ),
]
pipeline = Pipeline(steps=steps, verbose=True)

X = df.loc[:, df.columns != "prop_booking_bool"]
# 1-D target: a single-column DataFrame makes the estimators warn about a
# column-vector y and convert it on every fit.
y = df["prop_booking_bool"]

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# pipeline.fit(X_train, y_train)
# y_pred = pipeline.predict(X_test)

# Random forest grids, one per PCA size: max_features cannot usefully exceed
# the number of PCA components fed to the forest, so the candidates are capped
# per size instead of crossing every max_features with every n_components.
forest_grids = [
    {
        "model": [
            RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
        ],
        "columns": [column_transform2],
        "PCA__n_components": [n_components],
        "model__criterion": ["gini", "entropy"],
        "model__max_depth": [2, 3, 5, 10],
        "model__max_features": [
            n_features for n_features in range(2, 11, 2) if n_features <= n_components
        ],
    }
    for n_components in (3, 5, 10)
]

param_grid = forest_grids + [
    {
        "model": [LogisticRegression()],
        "columns": [column_transform1],
        "PCA__n_components": [3, 5, 10],
        "model__C": [0.001, 0.01, 0.1, 1, 10, 100],
    },
    {
        # The elastic-net penalty is only implemented by the "saga" solver; with
        # the default solver every fit of this grid fails.
        "model": [
            LogisticRegression(
                penalty="elasticnet", solver="saga", random_state=RANDOM_STATE
            )
        ],
        "columns": [column_transform1],
        "PCA__n_components": [3, 5, 10, 15],
        "model__l1_ratio": [0.001, 0.01, 0.1, 0.5, 0.75, 1],
    },
]

In [None]:
# Exhaustive search over param_grid, scored by precision with 3-fold
# cross-validation, using all available cores.
gs = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="precision",
    cv=3,
    n_jobs=-1,
)
gs.fit(X, y)

In [None]:
# Persist the fitted grid search so it can be reloaded without re-running it.
joblib.dump(value=gs, filename="../data/gridsearch.pkl")

In [None]:
# Score the held-out file with the best estimator found by the grid search and
# write the submission (identifiers plus predicted booking flag).
test = pd.read_csv("../data/test.csv")

# Assign through a plain column label: the predictions are a 1-D array, and a
# list key (test[["..."]] = ...) expects one value per listed column, which
# raises "Columns must be same length as key" on current pandas.
test["prop_booking_bool"] = gs.predict(clean_df(test))

test[["srch_id", "prop_key", "prop_booking_bool"]].to_csv(
    "../data/submission.csv", index=False
)

In [None]:
# Load the `watermark` IPython extension and print its environment report.
# NOTE(review): the flags presumably select date name, last-updated stamp,
# Python version, versions of imported packages and the watermark version -
# confirm against the watermark documentation.
%load_ext watermark
%watermark -n -u -v -iv -w