# Exp 5. Target Encoding 

In [1]:
# Import Modules
import os
import sys
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
# Enable plotly offline plotting
import plotly
import plotly.graph_objects as go
import seaborn as sns
from IPython.core.display import HTML, display
from ipywidgets import interact


# Ignore sklearn warnings
def warn(*args, **kwargs):
    pass


warnings.warn = warn


plotly.offline.init_notebook_mode(connected=True)

# Init project path
PROJECT_DIR = os.getcwd() + "/../../"
sys.path.insert(0, PROJECT_DIR)
%load_ext autoreload
%autoreload 2

display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
use_cols = [
    "target",
    "target_address_1_mean",
    "target_address_1_2_mean",
    "target_station_1_mean",
    "duration_1",
    "ohe_bathroom_専用バス",
    "ohe_bathroom_専用トイレ",
    "ohe_bathroom_バス・トイレ別",
    "ohe_bathroom_シャワー",
    "ohe_bathroom_追焚機能",
    "ohe_bathroom_温水洗浄便座",
    "ohe_bathroom_洗面台独立",
    "ohe_bathroom_脱衣所",
    "ohe_bathroom_浴室乾燥機",
    "ohe_bathroom_バスなし",
    "ohe_bathroom_共同バス",
    "ohe_bathroom_共同トイレ",
    "ohe_equipment_エアコン付",
    "ohe_equipment_シューズボックス",
    "ohe_equipment_バルコニー",
    "ohe_equipment_フローリング",
    "ohe_equipment_エレベーター",
    "ohe_equipment_公営水道",
    "ohe_equipment_下水",
    "ohe_equipment_都市ガス",
    "ohe_equipment_室内洗濯機置場",
    "ohe_equipment_2面採光",
    "ohe_equipment_タイル張り",
    "ohe_equipment_室外洗濯機置場",
    "ohe_equipment_冷房",
    "ohe_equipment_敷地内ごみ置き場",
    "ohe_equipment_ロフト付き",
    "ohe_equipment_3面採光",
    "ohe_equipment_24時間換気システム",
    "ohe_equipment_水道その他",
    "ohe_equipment_プロパンガス",
    "ohe_equipment_ウォークインクローゼット",
    "ohe_equipment_トランクルーム",
    "ohe_equipment_出窓",
    "ohe_equipment_ペアガラス",
    "ohe_equipment_専用庭",
    "ohe_equipment_バリアフリー",
    "ohe_equipment_床暖房",
    "ohe_equipment_床下収納",
    "ohe_equipment_クッションフロア",
    "ohe_equipment_オール電化",
    "ohe_equipment_二重サッシ",
    "ohe_equipment_ルーフバルコニー",
    "ohe_equipment_ガス暖房",
    "ohe_equipment_ガスその他",
    "ohe_equipment_洗濯機置場なし",
    "ohe_equipment_防音室",
    "ohe_equipment_排水その他",
    "ohe_equipment_地下室",
    "ohe_equipment_浄化槽",
    "ohe_equipment_汲み取り",
    "ohe_equipment_石油暖房",
    "ohe_equipment_井戸",
    "ohe_equipment_二世帯住宅",
    "ohe_internet_インターネット対応",
    "ohe_internet_CSアンテナ",
    "ohe_internet_BSアンテナ",
    "ohe_internet_インターネット使用料無料",
    "ohe_internet_光ファイバー",
    "ohe_internet_CATV",
    "ohe_internet_高速インターネット",
    "ohe_internet_有線放送",
    "ohe_kitchen_ガスコンロ",
    "ohe_kitchen_コンロ3口",
    "ohe_kitchen_システムキッチン",
    "ohe_kitchen_給湯",
    "ohe_kitchen_L字キッチン",
    "ohe_kitchen_コンロ2口",
    "ohe_kitchen_コンロ設置可（口数不明）",
    "ohe_kitchen_コンロ1口",
    "ohe_kitchen_カウンターキッチン",
    "ohe_kitchen_独立キッチン",
    "ohe_kitchen_IHコンロ",
    "ohe_kitchen_冷蔵庫あり",
    "ohe_kitchen_コンロ設置可（コンロ1口）",
    "ohe_kitchen_コンロ設置可（コンロ2口）",
    "ohe_kitchen_電気コンロ",
    "ohe_kitchen_コンロ設置可（コンロ3口）",
    "ohe_kitchen_コンロ4口以上",
    "ohe_kitchen_コンロ設置可（コンロ4口以上）",
    "ohe_parking_駐車場",
    "ohe_parking_駐輪場",
    "ohe_direction_北",
    "ohe_direction_北東",
    "ohe_direction_北西",
    "ohe_direction_南",
    "ohe_direction_南東",
    "ohe_direction_南西",
    "ohe_direction_東",
    "ohe_direction_西",
    "ohe_layout_1DK",
    "ohe_layout_1DK+S(納戸)",
    "ohe_layout_1K",
    "ohe_layout_1K+S(納戸)",
    "ohe_layout_1LDK",
    "ohe_layout_1LDK+S(納戸)",
    "ohe_layout_1R",
    "ohe_layout_2DK",
    "ohe_layout_2DK+S(納戸)",
    "ohe_layout_2K",
    "ohe_layout_2K+S(納戸)",
    "ohe_layout_2LDK",
    "ohe_layout_2LDK+S(納戸)",
    "ohe_layout_3DK",
    "ohe_layout_3DK+S(納戸)",
    "ohe_layout_3K",
    "ohe_layout_3K+S(納戸)",
    "ohe_layout_3LDK",
    "ohe_layout_3LDK+S(納戸)",
    "ohe_layout_4DK",
    "ohe_layout_4K",
    "ohe_layout_4LDK",
    "ohe_layout_4LDK+S(納戸)",
    "ohe_layout_5DK",
    "ohe_layout_5DK+S(納戸)",
    "ohe_layout_5K",
    "ohe_layout_5LDK",
    "ohe_layout_5LDK+S(納戸)",
    "ohe_layout_6LDK",
    "ohe_material_ALC（軽量気泡コンクリート）",
    "ohe_material_HPC（プレキャスト・コンクリート（重量鉄骨））",
    "ohe_material_PC（プレキャスト・コンクリート（鉄筋コンクリート））",
    "ohe_material_RC（鉄筋コンクリート）",
    "ohe_material_SRC（鉄骨鉄筋コンクリート）",
    "ohe_material_その他",
    "ohe_material_ブロック",
    "ohe_material_木造",
    "ohe_material_軽量鉄骨",
    "ohe_material_鉄骨造",
    "area",
    "contract",
    "age_year",
    "is_bus_1",
]

In [43]:
from src.exp.common.setup import *
from src.exp.common.load_data import load_data
from src.exp.common.correct_invalid_data import correct_invalid_data
from src.exp.common.preprocessing import preprocessing
from src.exp.common.fill_missing_data import fill_missing_data
from src.exp.common.one_hot_encoding import one_hot_encoding
from src.exp.common.create_cv_df import create_cv_df
from src.exp.common.target_encoding import target_encoding
from src.exp.common.drop_unused_columns import drop_unused_columns
from src.exp.common.rename_non_ascii_cols import rename_non_ascii_cols

# Step 1. Load train.csv, test.csv
with timer("load data"):
    train_df, test_df = load_data(DATA_DIR=DATA_DIR)
    original_train_df, original_test_df = load_data(DATA_DIR=DATA_DIR)
    train_df = correct_invalid_data(train_df)
    original_train_df = correct_invalid_data(original_train_df)
    
# Step 2. Preprocessing
with timer("preprocessing"):
    train_df = preprocessing(df=train_df)
    test_df = preprocessing(df=test_df)
    
    # fill missing data
    for col_name in ["direction", "material", "layout"]:
        train_df = fill_missing_data(
            df=train_df, col_name=col_name, method="most_frequent"
        )
        test_df = fill_missing_data(
            df=test_df, col_name=col_name, method="most_frequent"
        )

# Step 3. Feature Engineering

# Step 4. One-Hot Encoding
with timer("one hot encoding"):
    for col_name in ["direction", "layout", "material"]:
        train_df = one_hot_encoding(df=train_df, col_name=col_name)
        test_df = one_hot_encoding(df=test_df, col_name=col_name)
    
# Step 5. Create dataframe for CV
with timer("create dataframe for CV"):
    cv_df = create_cv_df(n_splits=N_FOLDS, random_state=SEED, train_df=train_df)

# Step 6. Target Encoding
with timer("target encoding"):
    col_names = ["address_1", "address_1_2", "station_1"]
    methods = { "mean": np.mean }

    cv_df_tmp = cv_df.copy()
    cv_df = pd.DataFrame()
    for n_fold in cv_df_tmp["n_fold"].unique():
        tr_df = cv_df_tmp.query("n_fold == {} and data_type == '{}'".format(n_fold, "train"))
        te_df = cv_df_tmp.query("n_fold == {} and data_type == '{}'".format(n_fold, "val"))
        for col_name in col_names:
            tr_df, te_df = target_encoding(
                tr_df=tr_df, te_df=te_df, col_name=col_name, methods=methods
            )
        cv_df = pd.concat([cv_df, tr_df, te_df])

    for col_name in col_names:
        train_df, test_df = target_encoding(
            tr_df=train_df, te_df=test_df, col_name=col_name, methods=methods
        )

    def fill_target_nan(df=pd.DataFrame()):
        df["target_address_1_2_mean"] = df["target_address_1_2_mean"].fillna(df["target_address_1_mean"])
        df["target_station_1_mean"] = df["target_station_1_mean"].fillna(df["target_address_1_mean"])
#         df["target_building_mean"] = df["target_building_mean"].fillna(df["target_address_1_2_mean"])
#         df["target_building_mean"] = df["target_building_mean"].fillna(df["target_address_1_mean"])
        return df
    cv_df = fill_target_nan(cv_df)
    train_df = fill_target_nan(train_df)
    test_df = fill_target_nan(test_df)
    
train_df = train_df[use_cols]
test_df = test_df[use_cols[1:]]
    
# Step 7. Drop unused columns
with timer("drop unused columns"):
#     train_df = drop_unused_columns(df=train_df)
#     test_df = drop_unused_columns(df=test_df)
    cv_df = drop_unused_columns(df=cv_df)

# Step 8. Rename non-ascii columns as lightGBM doesn't support them.
with timer("rename non-ascii cols"):
    train_df = rename_non_ascii_cols(df=train_df)
    test_df = rename_non_ascii_cols(df=test_df)
    cv_df, rename_cols= rename_non_ascii_cols(df=cv_df)

2019-11-05 13:58:28,720 - INFO - [load data] done in 4.11 s
2019-11-05 13:58:46,704 - INFO - [preprocessing] done in 17.98 s
2019-11-05 13:58:47,030 - INFO - [one hot encoding] done in 0.32 s
2019-11-05 13:58:51,094 - INFO - [create dataframe for CV] done in 4.06 s
2019-11-05 13:58:54,113 - INFO - [target encoding] done in 3.02 s
2019-11-05 13:58:54,439 - INFO - [drop unused columns] done in 0.25 s
2019-11-05 13:58:54,580 - INFO - [rename non-ascii cols] done in 0.14 s


In [4]:
# LightGBM parameters
lgb_params = {
    "task": "train",
    "boosting_type": "gbdt",
    "objective": "regression",
    "metric": {"l2"},
    "num_leaves": 5000,
    "learning_rate": 0.1,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 20,
    "verbose": -1,
}

# xgb parameters
xgb_params = {
    "eta": 0.1,
    "max_depth": 10,
    "gamma": 0.1,
    "objective": "reg:squarederror", 
    "eval_metric": "rmse"
}

In [37]:
# Step 5. Cross Validation
from src.exp.common.cv import cv

with timer("cv with linear regression"):
    lr_result, scores, lr_models = cv(
        cv_df=cv_df, model_name="linearRegression", params=None
    )

    # xgb parameters
    xgb_params = {
        "eta": 0.1,
        "max_depth": 10,
        "gamma": 0.1,
        "objective": "reg:squarederror", 
        "eval_metric": "rmse"
    }
    xgb_result, scores, xgb_models = cv(
        cv_df=cv_df, model_name="xgboost", params=xgb_params
    )
    
    lgb_result, scores, lgb_models = cv(
        cv_df=cv_df, model_name="lightgbm", params=lgb_params
    )
    

---------------- linearRegression ----------------
n_fold: 0 Score: 24204.406814096343
n_fold: 1 Score: 25483.477744869924
n_fold: 2 Score: 32998.60334824085
n_fold: 3 Score: 69320.05774867587
n_fold: 4 Score: 24142.687022008864
---------------- linearRegression  END ----------------

---------------- xgboost ----------------
n_fold: 0 Score: 15681.055929085938
n_fold: 1 Score: 14568.094672630716
n_fold: 2 Score: 14738.773062612672
n_fold: 3 Score: 22189.63507711768
n_fold: 4 Score: 16224.962027074054
---------------- xgboost  END ----------------

---------------- lightgbm ----------------
n_fold: 0 Score: 17728.63342248937
n_fold: 1 Score: 16781.34795666189
n_fold: 2 Score: 16188.905809549706
n_fold: 3 Score: 22799.498901698706
n_fold: 4 Score: 16415.262065999
---------------- lightgbm  END ----------------



2019-11-05 13:55:47,413 - INFO - [cv with linear regression] done in 60.05 s


In [11]:
from src.exp.common.plot_result import plot_result


@interact(n_fold=list(range(N_FOLDS)))
def plot_results(n_fold):
    plot_result(original_train_df, lr_result, n_fold, "lr")
    plot_result(original_train_df, xgb_result, n_fold, "xgb")
    plot_result(original_train_df, lgb_result, n_fold, "lgb")

interactive(children=(Dropdown(description='n_fold', options=(0, 1, 2, 3, 4), value=0), Output()), _dom_classe…

In [12]:
original_train_df.loc[[1771]]

Unnamed: 0,id,target,address,access,layout,age,direction,area,floor_stories,bathroom,kitchen,internet,equipment,parking,neighbor,material,contract
1771,1772,1450000,東京都大田区田園調布３丁目,東急東横線\t田園調布駅\t徒歩5分\t\t東急目黒線\t田園調布駅\t徒歩5分,4LDK+S(納戸),9年1ヶ月,南西,232.01m2,2階建,専用バス／\t専用トイレ／\tシャワー／\t温水洗浄便座／\t洗面台独立,ガスコンロ／\tシステムキッチン\t／\t給湯,光ファイバー／\tCATV,エアコン付\tウォークインクローゼット\tフローリング／\t室内洗濯機置場,駐車場\t空有,,木造,4年間\t※この物件は\t定期借家\tです。


In [8]:
train_df.loc[[1771]]

Unnamed: 0,target,target_address_1_mean,target_address_1_2_mean,target_station_1_mean,duration_1,ohe_bathroom_1,ohe_bathroom_2,ohe_bathroom_3,ohe_bathroom_4,ohe_bathroom_5,...,ohe_material_124,ohe_material_125,ohe_material_126,ohe_material_127,ohe_material_128,ohe_material_129,area,contract,age_year,is_bus_1
1771,1450000,106981.675,162842.857143,152416.666667,5,1,1,0,1,0,...,0,0,0,1,0,0,232.01,4.0,9,0


In [9]:
# train and predict
from src.exp.common.train_and_predict import train_and_predict

y_pred, model = train_and_predict( 
    train_x=train_df.drop(["target"], axis=1),
    train_y=train_df["target"].values,
    val_x=test_df,
    model_name="xgboost",
    params=xgb_params,
)

result_df = pd.DataFrame({
    "id": test_df.reset_index()["index"]+31471,
    "predicted": y_pred
})
result_df.to_csv(
    "submission_{}_{}.csv".format(pd.to_datetime("today").strftime("%Y-%m-%d"), 1),
    header=False,
    index=False,
)