# Exp3: xgboost With Almost no PP/FE 

In [1]:
# import packages
import gc
import os
import re
import sys
import time
from contextlib import contextmanager
from pathlib import Path

import lightgbm as lgb
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly
import plotly.graph_objects as go
import seaborn as sns
import xgboost as xgb
from IPython.core.display import HTML, display
from ipywidgets import interact
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from tqdm import tqdm_notebook as tqdm

PROJECT_DIR = "/app"
INPUT_FILE_PATH = Path("../..")
sys.path.insert(0, PROJECT_DIR)

# enable plotly offline plotting
plotly.offline.init_notebook_mode(connected=True)

# change figsize of seaborn
sns.set(rc={"figure.figsize": (8, 6)})

display(HTML("<style>.container { width:88% !important; }</style>"))

In [2]:
# import modules
from src.logger.main import LOGGER, setup_logger
from src.preprocess.main import pp
from src.preprocess.read_file import read_file

In [3]:
#####
# settings
#####
EXP_ID = "exp3"
DATA_DIR = "../../input"
SEED = 42
ID_COLUMNS = "id"
TARGET_COLUMNS = ["target"]
N_CLASSES = len(TARGET_COLUMNS)
N_FOLDS = 5 # n_folds for Cross-Validation

# logger set up
LOGGER_PATH = "{}_log.txt".format(EXP_ID)
if os.path.isfile(LOGGER_PATH):
    os.remove(LOGGER_PATH)
setup_logger(out_file=LOGGER_PATH)
LOGGER.info("seed={}".format(SEED))

2019-10-28 11:23:20,541 - INFO - logger set up
2019-10-28 11:23:20,544 - INFO - seed=42


In [4]:
# timer set up 
@contextmanager
def timer(name):
    t0 = time.time()
    yield
    LOGGER.info('[{}] done in {} s'.format(name, round(time.time() - t0, 2)))

In [5]:
with timer('load data'):
    df, _ = read_file(DATA_DIR=DATA_DIR)
    y = df[TARGET_COLUMNS].values
    ids = df[[ID_COLUMNS]] 
    gc.collect()

2019-10-28 11:23:21,754 - INFO - [load data] done in 1.19 s


In [6]:
with timer("preprocessing"):
    df = pp(df)

    # fill NaN with most frequent values
    df.loc[:, "direction"] = df["direction"].fillna(
        df["direction"].value_counts().index[0]
    )
    df.loc[:, "material"] = df["material"].fillna(
        df["material"].value_counts().index[0]
    )
    df.loc[:, "layout"] = df["layout"].fillna(df["layout"].value_counts().index[0])
    df.loc[:, "address_1"] = df["address_1"].fillna(
        df["address_1"].value_counts().index[0]
    )

    # One-Hot Encoding
    df = pd.concat(
        [df, pd.get_dummies(df["direction"], prefix="ohe_direction")], axis=1
    )
    df = pd.concat([df, pd.get_dummies(df["layout"], prefix="ohe_layout")], axis=1)
    df = pd.concat([df, pd.get_dummies(df["material"], prefix="ohe_material")], axis=1)
    df = pd.concat(
        [df, pd.get_dummies(df["address_1"], prefix="ohe_adddress1")], axis=1
    )

    # Drop unused columns
    df = df.drop(
        [
            "id",
            "layout",
            "material",
            "direction",
            "neighbor",
            "address_1",
            "line_1",
            "station_1",
            "duration_1",
            "line_2",
            "station_2",
            "duration_2",
            "is_bus_2",
            "line_3",
            "station_3",
            "duration_3",
            "is_bus_3",
            "address_2",
            "address_3",
        ],
        axis=1,
    )

    df.loc[:, "target_log"] = np.log(df["target"])

2019-10-28 11:24:16,539 - INFO - [preprocessing] done in 54.76 s


In [7]:
# show correlation of one variable 
target_col_name = "target"
corr = df.corr()
index_sort_by_corr = (
    np.abs(corr.drop(["target", "target_log"])[target_col_name])
    .sort_values(ascending=False)
    .index
)

@interact(col_name=index_sort_by_corr)
def show_ohe_boxplot(col_name):
    sns.set(rc={"figure.figsize": (7, 5)})
    sns.scatterplot(x=col_name, y=target_col_name, data=df).set_title(
        "corr_coef: {}".format(corr[col_name][target_col_name])
    )

interactive(children=(Dropdown(description='col_name', options=('area', 'ohe_kitchen_コンロ3口', 'stories', 'floor…

In [8]:
# xgb parameters
params = {
     # 回帰問題
    'objective': 'reg:squarederror',
    # 学習用の指標 (RMSE)
    'eval_metric': 'rmse',
}

In [9]:
with timer("Cross Validation"):
    result = pd.DataFrame()
    scores = []
    model = LinearRegression()
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0).split(df, y)
    for n_fold, (train_index, val_index) in enumerate(folds):
        train_df = df.loc[train_index]
        val_df = df.loc[val_index]
        y_train = y[train_index]
        y_val = y[val_index]

        dtrain = xgb.DMatrix(train_df.drop(["target", "target_log"], axis=1), label=y_train.flatten())
        dtest = xgb.DMatrix(val_df.drop(["target", "target_log"], axis=1), label=y_val.flatten())

        gbm = xgb.train(
            params,
            dtrain,
            num_boost_round=100,  # 学習ラウンド数は適当
        )
        
        y_pred = gbm.predict(dtest)
        
        score = np.sqrt(
            ((y_pred.flatten() - y_val.flatten()) ** 2).sum() / len(y_val.flatten())
        )
        scores.append(score)
        print("No.{} Score: {}".format(n_fold, score))

        result = pd.concat(
            [
                result,
                pd.DataFrame(
                    {
                        "index": val_index,
                        "predicted": y_pred.flatten(),
                        "real": y_val.flatten(),
                        "difference": y_pred.flatten() - y_val.flatten(),
                        "n_fold": n_fold,
                    }
                ),
            ]
        )


The least populated class in y has only 1 members, which is too few. The minimum number of members in any class cannot be less than n_splits=5.



No.0 Score: 24757.262606995544
No.1 Score: 21314.254189367835
No.2 Score: 28340.92489733778
No.3 Score: 18449.231681568883


2019-10-28 11:25:07,020 - INFO - [Cross Validation] done in 46.91 s


No.4 Score: 21694.853633174596


In [10]:
def create_text_from_result(df, r):
    def decode_ohe(row):
        hot_columns = []
        for i in range(len(df.columns)):
            if row[i] == 1 and "ohe_" in df.columns[i]:
                hot_columns.append("_".join(df.columns[i].split("_")[2:]))
        return ",".join(hot_columns)

    text = (
        "pred: ¥"
        + np.ceil(r["predicted"]).map(lambda x: "{:,}".format(x)).astype(str).reset_index()["predicted"]
        + "<br>"
    )
    text += "real: ¥" + r["real"].map(lambda x: "{:,}".format(x)).astype(str).reset_index()["real"] + "<br>"
    text += (
        "difference: ¥"
        + np.ceil(r["difference"]).map(lambda x: "{:,}".format(x)).astype(str).reset_index()["difference"]
        + "<br>"
    )
    text += (
        "age: "
        + np.ceil(df.loc[r["index"]]["age"]).astype(str).reset_index()["age"]
        + "<br>"
    )
    text += (
        "area: "
        + np.ceil(df.loc[r["index"]]["area"]).astype(str).reset_index()["area"]
        + "<br>"
    )
    text += (
        "ohe: "
        + df.loc[r["index"]]
        .apply(decode_ohe, axis=1)
        .reset_index()
        .iloc[:, 1]
        .map(lambda x: re.sub("(.{32})", "\\1<br>", x, 0, re.DOTALL))
        + "<br>"
    )
    return text


def plot_result(df, result, n_fold):
    r = result[result.n_fold==n_fold].reset_index()
    print("n_fold:", n_fold)
    print("RMSE:", np.sqrt((r["difference"]**2).sum() / r["difference"].size))
    text = create_text_from_result(df, r)
    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=r["predicted"],
            y=r["real"],
            text=text,
            hoverinfo="text",
            mode="markers",
            name="markers",
        )
    )
    fig.show()

In [None]:
for n_fold in range(N_FOLDS):
    plot_result(df, result, n_fold)