In [1]:
import itertools
import sys

import pandas as pd
import catboost as cb
import xgboost as xgb
import imblearn
import matplotlib.pyplot as plt
import numpy as np

sys.path.append("..")
import global_vars as gv
from utils import model_utils as mu
from utils import data_utils as du
from utils import data_process_utils as dpu
from utils import visualize as viz

%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import importlib

# Re-import the project helper modules so edits made on disk take effect
# without restarting the kernel. The reload order (gv, du, mu) is unchanged and
# the final call stays a bare expression so the cell still echoes that module.
for edited_module in (gv, du):
    importlib.reload(edited_module)
importlib.reload(mu)

<module 'utils.model_utils' from '../utils/model_utils.py'>

<IPython.core.display.Javascript object>

In [3]:
#### read in data
# Project root relative to this notebook. The CSV reads below are built from it
# (instead of repeating a hard-coded "../data" prefix) so that changing
# `wids_path` relocates the inputs as well as anything else derived from it.
wids_path = ".."
data_dir = f"{wids_path}/data"

test_df = pd.read_csv(f"{data_dir}/test.csv")
print(f"Test dimension: {test_df.shape}")
train_df = pd.read_csv(f"{data_dir}/train.csv")
print(f"Train dimension: {train_df.shape}")
sample_solution_df = pd.read_csv(f"{data_dir}/sample_solution.csv")
print(f"Sample solution dimension: {sample_solution_df.shape}")

# Lower-case the column names of both modelling frames so they can be
# referenced with one consistent spelling.
train_df.columns = train_df.columns.str.lower()
test_df.columns = test_df.columns.str.lower()

Test dimension: (9705, 63)
Train dimension: (75757, 64)
Sample solution dimension: (9705, 2)


<IPython.core.display.Javascript object>

### Add backfilled and processed columns to the data

In [4]:
# Run the facility-type parser on train and then test. Each call receives its
# own copy of the raw frame rather than the raw frame itself.
train_w_parsed_facility_type_df, test_w_parsed_facility_type_df = (
    dpu.parse_facility_type(
        input_df=raw_df.copy(),
        facility_type_colname="facility_type",
    )
    for raw_df in (train_df, test_df)
)

<IPython.core.display.Javascript object>

In [5]:
# Backfill the energy star rating, grouping on the raw facility type.
# Train is processed first, then test; in both cases the mapping frame is the
# training data and the aggregation function is the NaN-aware median.
groupby_list = ["state_factor", "building_class", "facility_type"]
col = "energy_star_rating"
train_backfill_energy_star_rating_df, test_backfill_energy_star_rating_df = (
    dpu.backfill_energy_star_rating(
        input_df=frame_to_fill,
        mapping_df=train_df,
        groupby_list=groupby_list,
        energy_star_rating_colname=col,
        agg_approach_func=np.nanmedian,
    )
    for frame_to_fill in (train_df, test_df)
)

<IPython.core.display.Javascript object>

In [6]:
# Second backfill variant ("v1"): same procedure, but grouped on the parsed
# facility type. The resulting column is renamed with a `_v1` suffix so it can
# sit next to the first variant. The mapping frame is again the training data.
groupby_list = ["state_factor", "building_class", "facility_type_parsed"]
col = "energy_star_rating"
train_backfill_energy_star_rating_df_v1, test_backfill_energy_star_rating_df_v1 = (
    dpu.backfill_energy_star_rating(
        input_df=frame_to_fill,
        mapping_df=train_w_parsed_facility_type_df,
        groupby_list=groupby_list,
        energy_star_rating_colname=col,
        agg_approach_func=np.nanmedian,
    ).rename(
        columns={"backfilled_energy_star_rating": "backfilled_energy_star_rating_v1"}
    )
    for frame_to_fill in (
        train_w_parsed_facility_type_df,
        test_w_parsed_facility_type_df,
    )
)

<IPython.core.display.Javascript object>

In [7]:
# add back to train
# Attach each derived column to the raw training frame with a left join on
# "id", in this order: backfilled rating, parsed facility type, v1 rating.
train_filled_df = train_df
for source_df, added_col in (
    (train_backfill_energy_star_rating_df, "backfilled_energy_star_rating"),
    (train_w_parsed_facility_type_df, "facility_type_parsed"),
    (train_backfill_energy_star_rating_df_v1, "backfilled_energy_star_rating_v1"),
):
    train_filled_df = train_filled_df.merge(
        source_df[["id", added_col]], on="id", how="left"
    )

print(train_df.shape)
print(train_filled_df.shape)

# Non-null counts show how much each backfill variant fills in.
train_summary_cols = [
    "id",
    "energy_star_rating",
    "backfilled_energy_star_rating",
    "backfilled_energy_star_rating_v1",
    "facility_type",
    "facility_type_parsed",
]
display(train_filled_df[train_summary_cols].notnull().sum())

# The joins must not add or remove rows.
assert train_filled_df.shape[0] == train_df.shape[0]

(75757, 64)
(75757, 67)


id                                  75757
energy_star_rating                  49048
backfilled_energy_star_rating       73491
backfilled_energy_star_rating_v1    75239
facility_type                       75757
facility_type_parsed                75757
dtype: int64

<IPython.core.display.Javascript object>

In [8]:
# add back to test
# Attach each derived column to the raw test frame with a left join on "id",
# in this order: backfilled rating, parsed facility type, v1 rating.
test_filled_df = test_df
for source_df, added_col in (
    (test_backfill_energy_star_rating_df, "backfilled_energy_star_rating"),
    (test_w_parsed_facility_type_df, "facility_type_parsed"),
    (test_backfill_energy_star_rating_df_v1, "backfilled_energy_star_rating_v1"),
):
    test_filled_df = test_filled_df.merge(
        source_df[["id", added_col]], on="id", how="left"
    )

print(test_df.shape)
print(test_filled_df.shape)

# Non-null counts show how much each backfill variant fills in.
test_summary_cols = [
    "id",
    "energy_star_rating",
    "backfilled_energy_star_rating",
    "backfilled_energy_star_rating_v1",
    "facility_type",
    "facility_type_parsed",
]
display(test_filled_df[test_summary_cols].notnull().sum())

# The joins must not add or remove rows.
assert test_filled_df.shape[0] == test_df.shape[0]

(9705, 63)
(9705, 66)


id                                  9705
energy_star_rating                  7451
backfilled_energy_star_rating       9153
backfilled_energy_star_rating_v1    9546
facility_type                       9705
facility_type_parsed                9705
dtype: int64

<IPython.core.display.Javascript object>

### Model configs

In [9]:
# Model name -> value passed as `model_type` to the leave-year-out runner.
model_type_dict = dict(catboost="catboost", xgboost="sklearn")

# Feature-engineering settings, keyed by config name. For each config:
#   cols_to_reduce_dict / reduce_number_dict  column groups to reduce and the
#       number of components to keep per group
#   log10_transform_cols                      columns to log10-transform
#   if_one_hot / if_scale                     flags forwarded to the model utils
#   replace_original_feature_col_dict         original feature -> replacement
#   drop_data                                 column -> levels to filter out
feature_dict = dict(
    xgb_tune=dict(
        cols_to_reduce_dict=dict(temp=viz.temp_col_list),
        reduce_number_dict=dict(temp=9),
        log10_transform_cols=["floor_area"],
        if_one_hot=True,
        if_scale=True,
        replace_original_feature_col_dict=dict(
            energy_star_rating="backfilled_energy_star_rating",
            facility_type="facility_type_parsed",
        ),
        drop_data=dict(),
    )
)

<IPython.core.display.Javascript object>

In [10]:
# tuning
# https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
# Candidate values for the grid search, grouped as:
#   max_depth, min_child_weight and gamma
#   subsample and colsample_bytree
seed = 0  # used as the model's random_state

eta_list = [0.1, 0.2, 0.3]  # used as the model's learning_rate

max_depth_list, min_child_weight_list, gamma_list = (
    [3, 6, 9],
    [1, 5, 10],
    [0, 2, 6],
)

# Two separate list objects on purpose, so editing one grid never edits the other.
subsample_list, colsample_bytree_list = [0.5, 1], [0.5, 1]

<IPython.core.display.Javascript object>

In [11]:
config_name = "xgb_tune"
model_name = "xgboost"

# get model config
model_config = feature_dict[config_name]
cols_to_reduce_dict = model_config["cols_to_reduce_dict"]
reduce_number_dict = model_config["reduce_number_dict"]
log10_transform_cols = model_config["log10_transform_cols"]
if_one_hot = model_config["if_one_hot"]
if_scale = model_config["if_scale"]
replace_original_feature_col_dict = model_config["replace_original_feature_col_dict"]
drop_data_dict = model_config["drop_data"]

# check if need to drop data
# Filter into a separate frame instead of overwriting `train_filled_df`, so
# re-running this cell (possibly with another config) always starts from the
# full merged training data rather than from a previously filtered one.
# An empty `drop_data_dict` leaves the frame untouched.
train_model_input_df = train_filled_df
for one_col, drop_level_list in drop_data_dict.items():
    train_model_input_df = train_model_input_df.query(
        f"{one_col} not in {drop_level_list}"
    )

# process data
train_filter_df, test_filter_df, pca_cols = du.process_data_v1(
    train_model_input_df.drop_duplicates(),
    test_filled_df.drop_duplicates(),
    reduce_col_dict=cols_to_reduce_dict,
    cols_to_log_transform=log10_transform_cols,
    reduce_number_dict=reduce_number_dict,
)

# Set feature columns after data transformations
# Columns that are superseded by a transformed or replacement column.
all_cols_to_reduce = [
    reduced_col
    for one_set_col_to_reduce in cols_to_reduce_dict.values()
    for reduced_col in one_set_col_to_reduce
]
all_cols_to_drop = list(replace_original_feature_col_dict.keys())
all_cols_to_replace_from_drop = list(replace_original_feature_col_dict.values())

# Keep the untouched base features in their declared order. Building this list
# from a set difference would make the column order depend on per-process
# string hashing, so the feature order (and any column-subsampled model fit on
# it) could change between kernel sessions. `dict.fromkeys` de-duplicates
# while preserving order.
superseded_cols = (
    set(all_cols_to_reduce) | set(log10_transform_cols) | set(all_cols_to_drop)
)
base_feature_cols = [
    feature_col
    for feature_col in dict.fromkeys(gv.all_feature_columns)
    if feature_col not in superseded_cols
]

features_columns = (
    base_feature_cols
    + pca_cols
    + all_cols_to_replace_from_drop
    + [f"log10_{col}" for col in log10_transform_cols]
)
print(config_name, features_columns, if_one_hot)


## predict on test data
# Split features from the response (train only), then apply the shared
# scaling / one-hot processing to the train and test features together.
train_filter_x_df, train_filter_y_df = mu.split_model_feature_response(
    train_filter_df, features_columns
)
test_filter_x_df = mu.split_model_feature_response(
    test_filter_df, features_columns, if_with_response=False
)
processed_train_x_df, processed_test_x_df = mu.process_train_test_data(
    train_filter_x_df, test_filter_x_df, if_scale, if_one_hot, full_data_df = train_filter_x_df
)

Fitting PCA with 9 components
xgb_tune ['max_wind_speed', 'days_above_80f', 'days_below_20f', 'heating_degree_days', 'snowdepth_inches', 'days_below_0f', 'days_above_110f', 'days_above_100f', 'days_with_fog', 'building_class', 'days_above_90f', 'direction_peak_wind_speed', 'precipitation_inches', 'elevation', 'days_below_30f', 'direction_max_wind_speed', 'days_below_10f', 'year_built', 'cooling_degree_days', 'state_factor', 'snowfall_inches', 'temp_pca1', 'temp_pca2', 'temp_pca3', 'temp_pca4', 'temp_pca5', 'temp_pca6', 'temp_pca7', 'temp_pca8', 'temp_pca9', 'backfilled_energy_star_rating', 'facility_type_parsed', 'log10_floor_area'] True


<IPython.core.display.Javascript object>

In [None]:
# Grid search over the XGBoost hyper-parameters. For each combination:
#   1. score the model with leave-one-year-out validation,
#   2. refit on the full training data and predict the test set,
#   3. write the validation table and the test predictions to CSV.

# The fitting helper depends only on the model family, not on the
# hyper-parameters, so it is resolved once outside the loop.
run_model_dict = {"xgboost": mu.run_sklearn_model, "catboost": mu.run_catboost_model}
fit_and_predict = run_model_dict[model_name]

# itertools.product varies the right-most list fastest, i.e. the same visiting
# order as six nested loops with `eta` outermost.
hyperparameter_grid = itertools.product(
    eta_list,
    max_depth_list,
    min_child_weight_list,
    gamma_list,
    subsample_list,
    colsample_bytree_list,
)
for eta, max_depth, min_child_weight, gamma, subsample, colsample_bytree in hyperparameter_grid:
    updated_config_name = f"{config_name}_eta{eta}_max_depth{max_depth}_min_child_weight{min_child_weight}_gamma{gamma}_subsample{subsample}_colsample_bytree{colsample_bytree}"
    print(updated_config_name)
    model = xgb.XGBRegressor(
        eval_metric="rmse",
        max_depth=max_depth,
        random_state=seed,
        learning_rate=eta,
        min_child_weight=min_child_weight,
        gamma=gamma,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
    )

    # run model
    ## Run LOY model
    model_rmse = mu.run_leave_year_out(
        model_df=train_filter_df,
        ml_model=model,
        features_columns=features_columns,
        if_scale_data=if_scale,
        if_one_hot=if_one_hot,
        model_type=model_type_dict[model_name],
    )
    print(f"Average RMSE:\n{model_rmse.mean()}")

    # predict on test
    train_predict, test_predict, fitted_model = fit_and_predict(
        model, processed_train_x_df, train_filter_y_df, processed_test_x_df
    )
    training_rmse = mu.calculate_rmse(train_filter_y_df, train_predict)
    print(f"Whole data train RMSE: {training_rmse}")

    ## output save result
    # Append the whole-train score as an extra row; 0 in the first position is
    # the sentinel that the "method" labelling below maps to "whole train".
    # NOTE(review): assumes `model_rmse` has exactly three columns with
    # `left_out_year` first and a default integer index - confirm against
    # `mu.run_leave_year_out`.
    model_rmse.loc[model_rmse.shape[0], :] = [0, training_rmse, np.nan]
    model_rmse["method"] = model_rmse["left_out_year"].apply(
        lambda x: "loyo" if x > 0 else "whole train"
    )
    display(model_rmse)
    model_rmse.to_csv(
        f"{wids_path}/validation_result/hannah/{updated_config_name}.csv", index=False
    )

    # Take an explicit copy: adding a column to a bare `test_df[["id"]]`
    # selection triggers pandas' SettingWithCopyWarning.
    test_prediction_result = test_df[["id"]].copy()
    # Predictions come from the processed test frame; fail loudly if its row
    # count no longer matches the ids instead of writing misaligned results.
    assert len(test_predict) == len(test_prediction_result), (
        "number of test predictions does not match number of test ids"
    )
    test_prediction_result["site_eui"] = test_predict
    test_prediction_result.to_csv(
        f"{wids_path}/prediction_result/hannah/{updated_config_name}.csv", index=False
    )
