In [7]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
%autoreload

import time
import warnings
import os.path
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import geopy
import xgboost as xgb
import os
import shutil
import geopandas as gpd

from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import make_scorer, mean_squared_log_error
from sklearn.model_selection import RandomizedSearchCV, KFold, GridSearchCV
from shapely import wkt
from retail_revenue_xgb import generate_features, create_buffer
from sklearn.model_selection import train_test_split
from xgboost import plot_importance, to_graphviz
from utils import squared_log, rmsle_xgb, add_city_centre_dist, group_ages

pd.options.mode.chained_assignment = None  # default='warn'

spatial = pd.read_csv('data/grunnkrets_norway_stripped.csv')
age = pd.read_csv('data/grunnkrets_age_distribution.csv')
income = pd.read_csv('data/grunnkrets_income_households.csv')
households = pd.read_csv('data/grunnkrets_households_num_persons.csv')
submission = pd.read_csv('data/sample_submission.csv')
plaace = pd.read_csv('data/plaace_hierarchy.csv')
busstops = pd.read_csv('data/busstops_norway.csv')

train = pd.read_csv('data/stores_train.csv')
test = pd.read_csv('data/stores_test.csv')

submission = pd.read_csv('data/sample_submission.csv')
model_to_load = "modeling/0002.model"


In [9]:
def generate_features(df: pd.DataFrame):
    features = ['store_id', 'year', 'store_name', 'mall_name', 'chain_name', 'address', 'lat', 'lon',
                'plaace_hierarchy_id', 'grunnkrets_id']
    _X = df[features]

    _X['store_name'] = _X['store_name'].astype('category')
    _X['address'] = _X['address'].astype('category')
    _X['mall_name'] = _X['mall_name'].astype('category')
    _X['chain_name'] = _X['chain_name'].astype('category')
    _X['plaace_hierarchy_id'] = _X['plaace_hierarchy_id'].astype('category')

    # Merge spatial data
    _X = _X.merge(spatial.drop(columns=['year']).drop_duplicates(subset=['grunnkrets_id']), on='grunnkrets_id', how='left')
    _X['grunnkrets_name'] = _X['grunnkrets_name'].astype('category')
    _X['district_name'] = _X['district_name'].astype('category')
    _X['municipality_name'] = _X['municipality_name'].astype('category')
    _X = _X.drop(columns=['geometry'])

    # Merge age data
    age_ranges = [
        (0, 19),
        (20, 39),
        (40, 59),
        (60, 79),
        (80, 90),
    ]
    grouped_ages = group_ages(age, age_ranges)
    _X = _X.merge(grouped_ages, on='grunnkrets_id', how='left')

    # Merge income data
    _X = _X.merge(income.drop(columns=['year']).drop_duplicates(subset='grunnkrets_id'), how='left')

    # Merge household data
    _X = _X.merge(households.drop(columns=['year']).drop_duplicates(subset='grunnkrets_id'), how='left')

    # Merge plaace data
    _X = _X.merge(plaace.drop_duplicates(subset='plaace_hierarchy_id'), how='left')
    _X['plaace_hierarchy_id'] = _X['plaace_hierarchy_id'].astype('category')
    _X['sales_channel_name'] = _X['sales_channel_name'].astype('category')
    _X = _X.drop(columns=['lv1', 'lv2', 'lv3', 'lv4'])
    _X['lv1_desc'] = _X['lv1_desc'].astype('category')
    _X['lv2_desc'] = _X['lv2_desc'].astype('category')
    _X['lv3_desc'] = _X['lv3_desc'].astype('category')
    _X['lv4_desc'] = _X['lv4_desc'].astype('category')

    _X = add_city_centre_dist(_X)

    # Merge bus data
    bus_data_train = gpd.read_parquet('derived_data/stores_bus_stops_lt_1km_train')
    _X = _X.merge(bus_data_train.drop(columns=['geometry']), on='store_id', how='left')

    _X = _X.drop(columns=['grunnkrets_id', 'plaace_hierarchy_id', 'year', 'store_id'])

    return _X


label_name = 'revenue'
X = train.drop(columns=[label_name])
y = train[label_name]

X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=.8)

X = generate_features(X)
X_train, X_val = generate_features(X_train), generate_features(X_val)


In [10]:
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

enc = OrdinalEncoder()
X_cat = X_train.select_dtypes(include=['category'])


X_train[X_cat.columns] = enc.fit_transform(X_cat)

X_train

Unnamed: 0,store_name,mall_name,chain_name,address,lat,lon,grunnkrets_name,district_name,municipality_name,area_km2,...,lat_center,lon_center,dist_to_center,bus_stops_count,Mangler viktighetsnivå,Standard holdeplass,Lokalt knutepunkt,Nasjonalt knutepunkt,Regionalt knutepunkt,Annen viktig holdeplass
0,51.0,,,6676.0,59.272181,10.205974,2887.0,826.0,305.0,3.602712,...,59.234474,10.305348,7056.012122,0,0,0,0,0,0,0
1,6624.0,,,3595.0,60.804312,11.044654,1750.0,303.0,107.0,0.060065,...,60.797233,11.076513,1905.513060,15,12,0,3,0,0,0
2,7329.0,,,2948.0,60.547957,5.293707,1201.0,464.0,173.0,0.516122,...,60.599523,5.271073,5878.115610,2,0,2,0,0,0,0
3,6605.0,,,4447.0,59.131373,10.221403,2132.0,800.0,269.0,0.146314,...,59.132313,10.223197,146.654168,15,6,0,8,0,1,0
4,8615.0,,,6123.0,59.819962,10.780731,1528.0,466.0,231.0,0.344246,...,59.802242,10.804080,2369.364617,7,7,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10282,7030.0,,,5798.0,59.916263,10.744044,2142.0,802.0,235.0,0.090191,...,59.920051,10.754027,699.967814,19,14,0,5,0,0,0
10283,4482.0,,157.0,,60.897642,4.836826,276.0,595.0,101.0,1.170937,...,60.967501,5.165386,19441.581464,0,0,0,0,0,0,0
10284,7206.0,,,5769.0,59.986149,11.238964,1262.0,964.0,330.0,0.573606,...,60.014351,11.191379,4113.733474,3,2,0,0,0,1,0
10285,529.0,,,646.0,65.837390,13.190532,1604.0,604.0,361.0,0.195105,...,65.839514,13.194638,302.096316,6,4,0,1,0,1,0


In [11]:
def rmsle(X, y):
    y[y < -1] = -1 + 1e-6
    elements = np.power(np.log1p(y) - np.log1p(y), 2)
    return 'RMSLE', float(np.sqrt(np.sum(elements) / len(y)))


def rmsle_vanilla(y_pred, y_true):
    elements = np.power(np.log1p(y_pred) - np.log1p(y_true), 2)
    return float(np.sqrt(np.sum(elements) / len(y_true)))


rmsle_scorer = make_scorer(rmsle_vanilla, greater_is_better=False)


In [34]:
from scipy.stats import uniform, randint

# Parameter grid for XGBoost
params = {"colsample_bytree": uniform(0.3, 0.7),
          "gamma": uniform(0, 0.5),
          "learning_rate": uniform(0.003, 0.3),  # default 0.1
          "max_depth": randint(2, 6),  # default 3
          "n_estimators": randint(100, 400),  # default 100
          "subsample": uniform(0.6, 0.4),
          'objective': ['reg:squaredlogerror'],
          'eval_metric': [rmsle],
          'min_child_weight': randint(3, 8),
          'max_depth': randint(5, 10)}

folds = 5
n_iter_search = 20
kfold = KFold(n_splits=folds, shuffle=True)

xgb_model = XGBRegressor()
xgb_clf = RandomizedSearchCV(xgb_model, param_distributions=params, n_iter=n_iter_search, 
                             scoring=rmsle_scorer, verbose=10, n_jobs=-1,
                             cv=kfold.split(X_train, y_train))

start = time.time()
model = xgb_clf.fit(X_train, y_train)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time.time() - start), n_iter_search))
print(model.best_estimator_)


Fitting 5 folds for each of 20 candidates, totalling 100 fits
RandomizedSearchCV took 318.95 seconds for 20 candidates parameter settings.
XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1,
             colsample_bytree=0.49103709007158713, early_stopping_rounds=None,
             enable_categorical=False,
             eval_metric=<function rmsle at 0x000002665B8C9700>,
             gamma=0.24146702971443568, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.1751174828099928, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=5, max_leaves=0, min_child_weight=3,
             missing=nan, monotone_constraints='()', n_estimators=289, n_jobs=0,
             num_parallel_tree=1, objective='reg:squaredlogerror',
             predictor='auto', random_state=0, reg_alpha=0, ...)


In [37]:
print(model.best_index_)

16


In [36]:
print(model.best_index_, model.best_score_, model.best_params_)


16 -0.7696916305157515 {'colsample_bytree': 0.49103709007158713, 'eval_metric': <function rmsle at 0x000002665B8C9700>, 'gamma': 0.24146702971443568, 'learning_rate': 0.1751174828099928, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 289, 'objective': 'reg:squaredlogerror', 'subsample': 0.9756657761007418}


In [24]:
print(model.best_index_, model.best_score_, model.best_params_)


6 -0.7781992901782682 {'subsample': 1.0, 'objective': 'reg:squaredlogerror', 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 4, 'gamma': 0.5, 'eval_metric': <function rmsle_xgb at 0x000002665900FD30>, 'eta': 0.3, 'colsample_bytree': 0.8, 'booster': 'gbtree'}


In [None]:
# # Various hyper-parameters to tune
xgb1 = XGBRegressor(eval_metric=mean_squared_log_error)
parameters = {'nthread': [4],  # when use hyperthread, xgboost may become slower
              'objective': ['reg:squaredlogerror'],
              'learning_rate': [.03, 0.05, .07],  # so called `eta` value
              'max_depth': [5, 10, 15],
              'min_child_weight': [4],
              'silent': [1],
              'subsample': [0.7],
              'colsample_bytree': [0.7],
              'n_estimators': [500]}

xgb_grid = GridSearchCV(xgb1,
                        parameters,
                        cv=kfold,
                        n_jobs=-1,
                        scoring=rmsle_scorer,
                        verbose=3)

xgb_grid.fit(X_train, y_train)

print(xgb_grid.best_score_)
print(xgb_grid.best_params_)

model = xgb_grid.best_estimator_


Fitting 5 folds for each of 9 candidates, totalling 45 fits


ValueError: 
All the 45 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\oskar\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\oskar\AppData\Local\Programs\Python\Python39\lib\site-packages\xgboost\core.py", line 575, in inner_f
    return f(**kwargs)
  File "c:\Users\oskar\AppData\Local\Programs\Python\Python39\lib\site-packages\xgboost\sklearn.py", line 931, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
  File "c:\Users\oskar\AppData\Local\Programs\Python\Python39\lib\site-packages\xgboost\sklearn.py", line 401, in _wrap_evaluation_matrices
    train_dmatrix = create_dmatrix(
  File "c:\Users\oskar\AppData\Local\Programs\Python\Python39\lib\site-packages\xgboost\sklearn.py", line 945, in <lambda>
    create_dmatrix=lambda **kwargs: DMatrix(nthread=self.n_jobs, **kwargs),
  File "c:\Users\oskar\AppData\Local\Programs\Python\Python39\lib\site-packages\xgboost\core.py", line 575, in inner_f
    return f(**kwargs)
  File "c:\Users\oskar\AppData\Local\Programs\Python\Python39\lib\site-packages\xgboost\core.py", line 686, in __init__
    handle, feature_names, feature_types = dispatch_data_backend(
  File "c:\Users\oskar\AppData\Local\Programs\Python\Python39\lib\site-packages\xgboost\data.py", line 889, in dispatch_data_backend
    return _from_pandas_df(data, enable_categorical, missing, threads,
  File "c:\Users\oskar\AppData\Local\Programs\Python\Python39\lib\site-packages\xgboost\data.py", line 344, in _from_pandas_df
    data, feature_names, feature_types = _transform_pandas_df(
  File "c:\Users\oskar\AppData\Local\Programs\Python\Python39\lib\site-packages\xgboost\data.py", line 282, in _transform_pandas_df
    _invalid_dataframe_dtype(data)
  File "c:\Users\oskar\AppData\Local\Programs\Python\Python39\lib\site-packages\xgboost\data.py", line 246, in _invalid_dataframe_dtype
    raise ValueError(msg)
ValueError: DataFrame.dtypes for data must be int, float, bool or category.  When
categorical type is supplied, DMatrix parameter `enable_categorical` must
be set to `True`. Invalid columns:store_name, mall_name, chain_name, address, grunnkrets_name, district_name, municipality_name, sales_channel_name, lv1_desc, lv2_desc, lv3_desc, lv4_desc


In [None]:
for c in X_val.columns:
    if X_val[c].dtype == 'object':
        lbl.fit(list(X_val[c].values))
        X_val[c] = lbl.transform(list(X_val[c].values))
        # x_test.drop(c,axis=1,inplace=True)


y_pred_val = xgb_grid.best_estimator_.predict(X_val)
print(len(y_val), len(y_pred_val), len(X_val))
print(mean_squared_log_error(y_val, y_pred_val, squared=False))

# y_pred_train = xgb_grid.best_estimator_.predict(X_train)
# print(mean_squared_log_error(y_train, y_pred_train, squared=False))


2572 2572 2572
1.085321068283053
