In [88]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [89]:
%autoreload

import warnings
import os.path
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import geopy
import xgboost as xgb
import os
import shutil

from shapely import wkt
from retail_revenue_xgb import generate_features, create_buffer
from sklearn.model_selection import train_test_split
from utils import squared_log, rmsle
from xgboost import plot_importance, to_graphviz

pd.options.mode.chained_assignment = None  # default='warn'


In [90]:
spatial = pd.read_csv('data/grunnkrets_norway_stripped.csv')
age = pd.read_csv('data/grunnkrets_age_distribution.csv')
income = pd.read_csv('data/grunnkrets_income_households.csv')
households = pd.read_csv('data/grunnkrets_households_num_persons.csv')
submission = pd.read_csv('data/sample_submission.csv')
plaace = pd.read_csv('data/plaace_hierarchy.csv')
busstops = pd.read_csv('data/busstops_norway.csv')


def generate_features(df: pd.DataFrame):
    features = ['year', 'store_name', 'mall_name', 'chain_name', 'address', 'lat', 'lon',
                'plaace_hierarchy_id', 'grunnkrets_id']
    df = df[features]
    df['store_name'] = df['store_name'].astype('category')
    # df['store_id'] = df['store_id'].astype('category')
    df['address'] = df['address'].astype('category')
    df['chain_name+mall_name'] = (df['chain_name'] + df['mall_name']).astype('category')
    df['mall_name'] = df['mall_name'].astype('category')
    df['chain_name'] = df['chain_name'].astype('category')
    df['plaace_hierarchy_id'] = df['plaace_hierarchy_id'].astype('category')

    # attempt to difference the lat and lon values, as they seem to be somewhat trending negatively.
    df['lon'] = df['lon'].diff()
    df['lat'] = df['lat'].diff()

    # remove duplicates and merge with the spatial data.
    spatial.drop_duplicates(subset=['grunnkrets_id'])
    df = pd.merge(df, spatial.drop_duplicates(subset=['grunnkrets_id']), how='left')
    df['grunnkrets_name'] = df['grunnkrets_name'].astype('category')
    df['district_name'] = df['district_name'].astype('category')
    df['municipality_name'] = df['municipality_name'].astype('category')
    df['geometry'] = df['geometry'].astype('category')

    # age.drop_duplicates(subset=['grunnkrets_id'])
    # df = pd.merge(df, age.drop_duplicates(subset=['grunnkrets_id']), how='left')

    income.drop_duplicates(subset=['grunnkrets_id'])
    df = pd.merge(df, income.drop_duplicates(subset=['grunnkrets_id']), how='left')

    households.drop_duplicates(subset=['grunnkrets_id'])
    df = pd.merge(df, households.drop_duplicates(subset=['grunnkrets_id']), how='left')

    plaace.drop_duplicates(subset=['plaace_hierarchy_id'])
    df = pd.merge(df, plaace.drop_duplicates(subset=['plaace_hierarchy_id']), how='left')
    df['plaace_hierarchy_id'] = df['plaace_hierarchy_id'].astype('category')
    df['sales_channel_name'] = df['sales_channel_name'].astype('category')
    df['lv1_desc'] = df['lv1_desc'].astype('category')
    df['lv2_desc'] = df['lv2_desc'].astype('category')
    df['lv3'] = df['lv3'].astype('category')
    df['lv3_desc'] = df['lv3_desc'].astype('category')
    df['lv4'] = df['lv4'].astype('category')
    df['lv4_desc'] = df['lv4_desc'].astype('category')

    df = df.drop(columns=['grunnkrets_id', 'plaace_hierarchy_id'])

    return df


train = pd.read_csv('data/stores_train.csv')
test = pd.read_csv('data/stores_test.csv')

label_name = 'revenue'
X = train.drop(columns=[label_name])
y = train[label_name]

submission = pd.read_csv('data/sample_submission.csv')
model_to_load = "modeling/0002.model"

X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=.8)

X_train, X_val = generate_features(X_train), generate_features(X_val)


In [91]:
# Clear buffers
folder = os.path.join(os.getcwd(), 'modeling')
for filename in os.listdir(folder):
    file_path = os.path.join(folder, filename)
    if os.path.isfile(file_path):
        os.unlink(file_path)
        print(f'Deleted file: {file_path}')

train_buffer_path = 'modeling/train.buffer'
test_buffer_path = 'modeling/test.buffer'

dtrain = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=True)
dtrain.save_binary(train_buffer_path)
print(f'--> {train_buffer_path} created and saved.')

dvalid = xgb.DMatrix(data=X_val, label=y_val, enable_categorical=True)
dvalid.save_binary(test_buffer_path)
print(f'--> {test_buffer_path} created and saved.')


--> modeling/train.buffer created and saved.
--> modeling/test.buffer created and saved.


In [127]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold
from xgboost import XGBClassifier

# # check if there already exists a model.
# if os.path.exists(model_to_load):
#     print("\nModel found, attempting to load.")
#     model = xgb.Booster({'nthread': 4, 'disable_default_eval_metric': True})  # init model
#     model.load_model(model_to_load)  # load data
#     print("--> model successfully loaded.")
# else:
print("\nNo model found. Attempt at creating a new one will now start:")
print("Attempting to initialize parameters for training...")
params = {
    'max_depth': 9, 
    'eta': 0.05, 
    'min_child_weight': 7,
    'disable_default_eval_metric': True,
}
num_round = 999

# xgbmodel = XGBClassifier()
# kfold = KFold(n_splits=5,shuffle=True, random_state=420)
# xgb_clf = RandomizedSearchCV(xgbmodel, param_distributions=param, n_iter=100, )

watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

# print("--> parameters for training initialized.")

# print("Attempting to start training...")
# bst = xgb.train(
#     params=params, 
#     dtrain=dtrain, 
#     num_boost_round=num_round, 
#     obj=squared_log,
#     custom_metric=rmsle,
#     evals=watchlist, 
#     early_stopping_rounds=10, 
#     verbose_eval=5)
# print("--> model trained.")

# print("Attempting to save model...")
# bst.save_model(model_to_load)
# print("--> model saved.")

print("Starting k-fold cross validaition...")
xgb_param = xgb.cv(params, dtrain, num_boost_round=200, nfold=10, seed=420, obj=squared_log, 
                custom_metric=rmsle, verbose_eval=True, 
                callbacks=[xgb.callback.EvaluationMonitor(show_stdv=False),
                           xgb.callback.EarlyStopping(5)])



No model found. Attempt at creating a new one will now start:
Attempting to initialize parameters for training...
Starting k-fold cross validaition...
[0]	train-RMSLE:1.54266+0.00486	test-RMSLE:1.54265+0.04401
[0]	train-RMSLE:1.54266	test-RMSLE:1.54265
[1]	train-RMSLE:1.52145+0.00482	test-RMSLE:1.52207+0.04370
[1]	train-RMSLE:1.52145	test-RMSLE:1.52207
[2]	train-RMSLE:1.50054+0.00478	test-RMSLE:1.50181+0.04340
[2]	train-RMSLE:1.50054	test-RMSLE:1.50181
[3]	train-RMSLE:1.47994+0.00474	test-RMSLE:1.48189+0.04308
[3]	train-RMSLE:1.47994	test-RMSLE:1.48189
[4]	train-RMSLE:1.45964+0.00470	test-RMSLE:1.46228+0.04276
[4]	train-RMSLE:1.45964	test-RMSLE:1.46228
[5]	train-RMSLE:1.43965+0.00466	test-RMSLE:1.44298+0.04243
[5]	train-RMSLE:1.43965	test-RMSLE:1.44298
[6]	train-RMSLE:1.41996+0.00462	test-RMSLE:1.42397+0.04212
[6]	train-RMSLE:1.41996	test-RMSLE:1.42397
[7]	train-RMSLE:1.40057+0.00458	test-RMSLE:1.40529+0.04180
[7]	train-RMSLE:1.40057	test-RMSLE:1.40529
[8]	train-RMSLE:1.38149+0.00453	

In [106]:
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(9, 12)
    for min_child_weight in range(5, 8)
]

min_rmsle = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
        max_depth,
        min_child_weight))
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=100,
        seed=42,
        nfold=5,
        custom_metric=rmsle,
        early_stopping_rounds=10
    )
    # Update best MAE
    mean_rmsle = cv_results['test-RMSLE-mean'].min()
    boost_rounds = cv_results['test-RMSLE-mean'].argmin()
    print("\tRMSLE {} for {} rounds".format(mean_rmsle, boost_rounds))
    if mean_rmsle < min_rmsle:
        min_rmsle = mean_rmsle
        best_params = (max_depth, min_child_weight)
print("Best params: {}, {}, RMSLE: {}".format(best_params[0], best_params[1], min_rmsle))


CV with max_depth=9, min_child_weight=5
\RMSLE 0.765178 for 21 rounds
CV with max_depth=9, min_child_weight=6
\RMSLE 0.7640579999999999 for 22 rounds
CV with max_depth=9, min_child_weight=7
\RMSLE 0.7630676000000001 for 22 rounds
CV with max_depth=10, min_child_weight=5
\RMSLE 0.7687875999999999 for 21 rounds
CV with max_depth=10, min_child_weight=6
\RMSLE 0.766042 for 21 rounds
CV with max_depth=10, min_child_weight=7
\RMSLE 0.7636806 for 23 rounds
CV with max_depth=11, min_child_weight=5
\RMSLE 0.7707318 for 22 rounds
CV with max_depth=11, min_child_weight=6
\RMSLE 0.7708202 for 23 rounds
CV with max_depth=11, min_child_weight=7
\RMSLE 0.7668524 for 22 rounds
Best params: 9, 7, RMSLE: 0.7630676000000001


In [146]:
from utils import xgb_cross_validation

params = {
    'max_depth': range(9, 12),
    'eta': [0.05, 0.1, 0.2, 0.3],
    'min_child_weight': range(5, 8),
    # 'disable_default_eval_metric': True,
}

xgb_cross_validation(params, dtrain)


{'max_depth': 9, 'eta': 0.05, 'min_child_weight': 5}
	RMSLE 0.765178 for 21 rounds 	(New best)
{'max_depth': 9, 'eta': 0.05, 'min_child_weight': 6}
	RMSLE 0.7640579999999999 for 22 rounds 	(New best)
{'max_depth': 9, 'eta': 0.05, 'min_child_weight': 7}
	RMSLE 0.7630676000000001 for 22 rounds 	(New best)
{'max_depth': 9, 'eta': 0.1, 'min_child_weight': 5}
	RMSLE 0.7667822 for 10 rounds 	
{'max_depth': 9, 'eta': 0.1, 'min_child_weight': 6}
	RMSLE 0.765438 for 10 rounds 	
{'max_depth': 9, 'eta': 0.1, 'min_child_weight': 7}
	RMSLE 0.7644392 for 10 rounds 	


KeyboardInterrupt: 

In [139]:
import itertools

params = {
    'max_depth': [9, 10, 11],
    'eta': [0.05],
    'min_child_weight': [7, 11, 33],
    # 'disable_default_eval_metric': True,
}

def product_dict(**kwargs):
    keys = kwargs.keys()
    vals = kwargs.values()
    lst = []
    for instance in itertools.product(*vals):
        # yield dict(zip(keys, instance))
        lst.append(dict(zip(keys, instance)))
    return lst


product_dict(**params)


[{'max_depth': 9, 'eta': 0.05, 'min_child_weight': 7},
 {'max_depth': 9, 'eta': 0.05, 'min_child_weight': 11},
 {'max_depth': 9, 'eta': 0.05, 'min_child_weight': 33},
 {'max_depth': 10, 'eta': 0.05, 'min_child_weight': 7},
 {'max_depth': 10, 'eta': 0.05, 'min_child_weight': 11},
 {'max_depth': 10, 'eta': 0.05, 'min_child_weight': 33},
 {'max_depth': 11, 'eta': 0.05, 'min_child_weight': 7},
 {'max_depth': 11, 'eta': 0.05, 'min_child_weight': 11},
 {'max_depth': 11, 'eta': 0.05, 'min_child_weight': 33}]

In [103]:
# This can take some time…
min_mae = float("Inf")
best_params = None
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))
    # We update our parameters
    params['eta'] = eta
    # Run and time CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=100,
        seed=42,
        nfold=5,
        custom_metric=rmsle,
        # verbose_eval=True,
        early_stopping_rounds=10
    )
    # Update best score
    mean_mae = cv_results['test-RMSLE-mean'].min()
    boost_rounds = cv_results['test-RMSLE-mean'].argmin()
    print("\tRMSLE {} for {} rounds\n".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = eta

print("Best params: {}, RMSLE: {}".format(best_params, min_mae))


# print("Starting k-fold cross validaition...")
# xgb_param = xgb.cv(params, dtrain, num_boost_round=100, nfold=10, seed=0, obj=squared_log,
#                    custom_metric=rmsle, verbose_eval=True,
#                    callbacks=[xgb.callback.EvaluationMonitor(show_stdv=False),
#                               xgb.callback.EarlyStopping(3)])


CV with eta=0.3
	MAE 0.7847501999999998 for 2 rounds

CV with eta=0.2
	MAE 0.7792326 for 4 rounds

CV with eta=0.1
	MAE 0.7743976 for 11 rounds

CV with eta=0.05
	MAE 0.7719683999999999 for 25 rounds

CV with eta=0.01
	MAE 0.7767526 for 99 rounds

CV with eta=0.005
	MAE 0.8555772000000001 for 99 rounds

Best params: 0.05, RMSLE: 0.7719683999999999


In [118]:
X_test = generate_features(test)
dtest = xgb.DMatrix(data=X_test, enable_categorical=True)

print("\nAttempting to start prediction...")
y_pred = bst.predict(dtest, ntree_limit=bst.best_iteration)
print("--> Prediction finished.")

print("\nAttempting to save prediction...")
submission['predicted'] = np.array(y_pred)
submission.to_csv('submissions/submission.csv', index=False)
print("--> prediction saved with features as name in submission folder.")


Attempting to start prediction...
--> Prediction finished.

Attempting to save prediction...
--> prediction saved with features as name in submission folder.


In [None]:
from xgboost import plot_tree

plot_importance(model)
xgb.to_graphviz(model, num_trees=1)
# fig, ax = plt.subplots(figsize=(30, 30))
# xgb.plot_tree(bst, num_trees=1, ax=ax)
# plt.savefig("temp.pdf")
# plt.show()


XGBoostError: [17:49:29] c:\users\administrator\workspace\xgboost-win64_release_1.6.0\src\c_api\c_api_utils.h:222: Check failed: feature_names.size() == n_features (34 vs. 36) : Incorrect number of feature names.