# Exp 5. Target Encoding 

In [5]:
# Import Modules
import os
import sys
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
# Enable plotly offline plotting
import plotly
import plotly.graph_objects as go
import seaborn as sns
from IPython.core.display import HTML, display
from ipywidgets import interact


# Ignore sklearn warnings by swapping a no-op in for warnings.warn
def warn(*args, **kwargs):
    """Accept any positional/keyword arguments and do nothing.

    Installed below as a replacement for ``warnings.warn`` so that calls
    routed through that attribute produce no output.
    """
    return None


warnings.warn = warn


plotly.offline.init_notebook_mode(connected=True)

# Init project path
PROJECT_DIR = os.getcwd() + "/../../"
sys.path.insert(0, PROJECT_DIR)
%load_ext autoreload
%autoreload 2

display(HTML("<style>.container { width:100% !important; }</style>"))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [41]:
from src.exp.common.setup import *
from src.exp.common.load_data import load_data
from src.exp.common.correct_invalid_data import correct_invalid_data
from src.exp.common.preprocessing import preprocessing
from src.exp.common.fill_missing_data import fill_missing_data
from src.exp.common.one_hot_encoding import one_hot_encoding
from src.exp.common.create_cv_df import create_cv_df
from src.exp.common.target_encoding import target_encoding
from src.exp.common.drop_unused_columns import drop_unused_columns
from src.exp.common.rename_non_ascii_cols import rename_non_ascii_cols

# Step 1. Load train.csv, test.csv
with timer("load data"):
    # Working frames: these go through every later step of the pipeline.
    train_df, test_df = load_data(DATA_DIR=DATA_DIR)
    # A second, separate load; original_train_df only receives the
    # invalid-data correction and is later handed to plot_result.
    original_train_df, original_test_df = load_data(DATA_DIR=DATA_DIR)
    # Apply the correction to both train frames (working frame first).
    train_df, original_train_df = (
        correct_invalid_data(frame) for frame in (train_df, original_train_df)
    )

# Step 2. Preprocessing
with timer("preprocessing"):
    # Train and test go through the same preprocessing function.
    train_df = preprocessing(df=train_df)
    test_df = preprocessing(df=test_df)

with timer("fill missing data"):
    # Each frame is filled on its own with the "most_frequent" method;
    # NOTE(review): presumably that means each frame's own most frequent
    # value -- confirm in fill_missing_data.
    for col_name in ("direction", "material", "layout"):
        train_df = fill_missing_data(method="most_frequent", col_name=col_name, df=train_df)
        test_df = fill_missing_data(method="most_frequent", col_name=col_name, df=test_df)

# Step 3. Feature Engineering
# (no feature-engineering code in this experiment)

# Step 4. One-Hot Encoding
with timer("one hot encoding"):
    # NOTE(review): train and test are encoded independently; if a category
    # appears in only one of them the resulting columns may differ -- confirm
    # how one_hot_encoding handles that.
    for col_name in ("direction", "layout", "material"):
        train_df = one_hot_encoding(col_name=col_name, df=train_df)
        test_df = one_hot_encoding(col_name=col_name, df=test_df)

# Step 5. Create dataframe for CV
with timer("create dataframe for CV"):
    # Built from the training frame only, using the N_FOLDS / SEED constants.
    cv_df = create_cv_df(train_df=train_df, n_splits=N_FOLDS, random_state=SEED)

# Step 6. Target Encoding
with timer("target encoding"):
    col_names = ["address_1", "address_1_2"]
    methods = {"mean": np.mean, "median": np.median}

    # Encode every CV fold on its own: the fold's "train" rows and "val" rows
    # are passed to target_encoding as tr_df / te_df.
    # NOTE(review): presumably the statistics are computed from tr_df only and
    # then mapped onto te_df -- confirm in target_encoding.
    cv_df_tmp = cv_df.copy()
    fold_frames = []
    for n_fold in cv_df_tmp["n_fold"].unique():
        tr_df = cv_df_tmp.query("n_fold == {} and data_type == '{}'".format(n_fold, "train"))
        te_df = cv_df_tmp.query("n_fold == {} and data_type == '{}'".format(n_fold, "val"))
        for col_name in col_names:
            tr_df, te_df = target_encoding(
                tr_df=tr_df, te_df=te_df, col_name=col_name, methods=methods
            )
        fold_frames.extend([tr_df, te_df])
    # Concatenate once instead of growing cv_df inside the loop: this avoids
    # re-copying the accumulated frame on every fold and avoids concatenating
    # an empty DataFrame. The fallback keeps the previous result (an empty
    # frame) when there are no folds at all.
    cv_df = pd.concat(fold_frames) if fold_frames else pd.DataFrame()

    # Same encoding for the full train / test frames.
    for col_name in col_names:
        train_df, test_df = target_encoding(
            tr_df=train_df, te_df=test_df, col_name=col_name, methods=methods
        )

# Step 7. Drop unused columns
with timer("drop unused columns"):
    train_df = drop_unused_columns(df=train_df)
    test_df = drop_unused_columns(df=test_df)
    cv_df = drop_unused_columns(df=cv_df)

# Step 8. Rename non-ascii columns as lightGBM doesn't support them.
with timer("rename non-ascii cols"):
    train_df = rename_non_ascii_cols(df=train_df)
    # test_df was the only frame skipped here, which left its column names out
    # of sync with train_df; rename it too so the frames stay consistent.
    # NOTE(review): assumes rename_non_ascii_cols maps a given column name to
    # the same new name regardless of the frame -- confirm.
    test_df = rename_non_ascii_cols(df=test_df)
    cv_df = rename_non_ascii_cols(df=cv_df)

2019-11-02 04:26:48,561 - INFO - [load data] done in 1.67 s
2019-11-02 04:27:04,673 - INFO - [preprocessing] done in 16.11 s
2019-11-02 04:27:04,732 - INFO - [fill missing data] done in 0.06 s
2019-11-02 04:27:05,050 - INFO - [one hot encoding] done in 0.32 s
2019-11-02 04:27:07,885 - INFO - [create dataframe for CV] done in 2.83 s
2019-11-02 04:27:11,303 - INFO - [target encoding] done in 3.4 s
2019-11-02 04:27:11,584 - INFO - [drop unused columns] done in 0.28 s
2019-11-02 04:27:11,773 - INFO - [rename non-ascii cols] done in 0.18 s


In [42]:
# Fill NaNs in each address_1_2 target-encoding column with the value of the
# matching address_1 column (same statistic suffix).
for stat in ("mean", "median"):
    detailed_col = "target_address_1_2_{}".format(stat)
    fallback_col = "target_address_1_{}".format(stat)
    cv_df[detailed_col] = cv_df[detailed_col].fillna(cv_df[fallback_col])

In [43]:
# LightGBM parameters (passed to cv() for the "lightgbm" model)
lgb_params = dict(
    task="train",
    boosting_type="gbdt",
    objective="regression",
    metric={"l2"},
    num_leaves=1000,
    learning_rate=0.1,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=20,
    verbose=-1,
)

# xgb parameters (passed to cv() for the "xgboost" model)
xgb_params = dict(objective="reg:squarederror", eval_metric="rmse")

In [44]:
# Step 9. Cross Validation
from src.exp.common.cv import cv

# Per-model CV scores keyed by the model_name passed to cv(). Previously every
# call rebound the single `scores` variable, so only the last model's scores
# were still available after this cell.
cv_scores = {}

with timer("cv with linear regression"):
    lr_result, cv_scores["linearRegression"] = cv(
        cv_df=cv_df, model_name="linearRegression", params=None
    )
with timer("cv with xgboost"):
    xgb_result, cv_scores["xgboost"] = cv(
        cv_df=cv_df, model_name="xgboost", params=xgb_params
    )
with timer("cv with lightgbm"):
    lgb_result, cv_scores["lightgbm"] = cv(
        cv_df=cv_df, model_name="lightgbm", params=lgb_params
    )

# Backward compatibility: `scores` still ends up holding the lightgbm scores,
# exactly as it did when each call overwrote it.
scores = cv_scores["lightgbm"]

---------------- linearRegression ----------------
n_fold: 0 Score: 23273.53680897503
n_fold: 1 Score: 24062.868121736898
n_fold: 2 Score: 25379.067101171648
n_fold: 3 Score: 30143.79641843235


2019-11-02 04:27:13,996 - INFO - [cv with linear regression] done in 1.93 s


n_fold: 4 Score: 23211.032628115423
---------------- linearRegression  END ----------------

---------------- xgboost ----------------
n_fold: 0 Score: 18863.39807657058
n_fold: 1 Score: 15022.565533978819
n_fold: 2 Score: 16850.679528791912
n_fold: 3 Score: 23211.523725406394


2019-11-02 04:27:33,995 - INFO - [cv with xgboost] done in 19.99 s


n_fold: 4 Score: 18590.426580973
---------------- xgboost  END ----------------

---------------- lightgbm ----------------
n_fold: 0 Score: 19162.298338085253
n_fold: 1 Score: 16981.23522445959
n_fold: 2 Score: 18451.199662966595
n_fold: 3 Score: 24710.5588001783


2019-11-02 04:27:48,518 - INFO - [cv with lightgbm] done in 14.52 s


n_fold: 4 Score: 17948.662646566958
---------------- lightgbm  END ----------------



In [45]:
from src.exp.common.plot_result import plot_result


@interact(n_fold=list(range(N_FOLDS)))
def plot_results(n_fold):
    """Call plot_result for each model's CV result on the selected fold.

    Args:
        n_fold: Fold index chosen in the widget (one of 0 .. N_FOLDS - 1).
    """
    # Same order as the CV cell: linear regression, xgboost, lightgbm.
    labelled_results = (
        (lr_result, "lr"),
        (xgb_result, "xgb"),
        (lgb_result, "lgb"),
    )
    for model_result, label in labelled_results:
        plot_result(original_train_df, model_result, n_fold, label)

interactive(children=(Dropdown(description='n_fold', options=(0, 1, 2, 3, 4), value=0), Output()), _dom_classe…