In [1]:
%load_ext autoreload

In [2]:
%autoreload

import warnings
import os.path
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import geopy
import xgboost as xgb
import os
import shutil

from shapely import wkt
from retail_revenue_xgb import generate_features, create_buffer
from sklearn.model_selection import train_test_split
from utils import squared_log, rmsle
from xgboost import plot_importance, to_graphviz

pd.options.mode.chained_assignment = None  # default='warn'


In [3]:
spatial = pd.read_csv('data/grunnkrets_norway_stripped.csv')
age = pd.read_csv('data/grunnkrets_age_distribution.csv')
income = pd.read_csv('data/grunnkrets_income_households.csv')
households = pd.read_csv('data/grunnkrets_households_num_persons.csv')
submission = pd.read_csv('data/sample_submission.csv')
plaace = pd.read_csv('data/plaace_hierarchy.csv')
busstops = pd.read_csv('data/busstops_norway.csv')


def generate_features(df: pd.DataFrame):
    features = ['year', 'store_name', 'mall_name', 'chain_name', 'address', 'lat', 'lon',
                'plaace_hierarchy_id', 'grunnkrets_id']
    df = df[features]
    df['store_name'] = df['store_name'].astype('category')
    # df['store_id'] = df['store_id'].astype('category')
    df['address'] = df['address'].astype('category')
    df['chain_name+mall_name'] = (df['chain_name'] + df['mall_name']).astype('category')
    df['mall_name'] = df['mall_name'].astype('category')
    df['chain_name'] = df['chain_name'].astype('category')
    df['plaace_hierarchy_id'] = df['plaace_hierarchy_id'].astype('category')

    # attempt to difference the lat and lon values, as they seem to be somewhat trending negatively.
    df['lon'] = df['lon'].diff()
    df['lat'] = df['lat'].diff()

    # remove duplicates and merge with the spatial data.
    spatial.drop_duplicates(subset=['grunnkrets_id'])
    df = pd.merge(df, spatial.drop_duplicates(subset=['grunnkrets_id']), how='left')
    df['grunnkrets_name'] = df['grunnkrets_name'].astype('category')
    df['district_name'] = df['district_name'].astype('category')
    df['municipality_name'] = df['municipality_name'].astype('category')
    df['geometry'] = df['geometry'].astype('category')

    # age.drop_duplicates(subset=['grunnkrets_id'])
    # df = pd.merge(df, age.drop_duplicates(subset=['grunnkrets_id']), how='left')

    income.drop_duplicates(subset=['grunnkrets_id'])
    df = pd.merge(df, income.drop_duplicates(subset=['grunnkrets_id']), how='left')

    households.drop_duplicates(subset=['grunnkrets_id'])
    df = pd.merge(df, households.drop_duplicates(subset=['grunnkrets_id']), how='left')

    plaace.drop_duplicates(subset=['plaace_hierarchy_id'])
    df = pd.merge(df, plaace.drop_duplicates(subset=['plaace_hierarchy_id']), how='left')
    df['plaace_hierarchy_id'] = df['plaace_hierarchy_id'].astype('category')
    df['sales_channel_name'] = df['sales_channel_name'].astype('category')
    df['lv1_desc'] = df['lv1_desc'].astype('category')
    df['lv2_desc'] = df['lv2_desc'].astype('category')
    df['lv3'] = df['lv3'].astype('category')
    df['lv3_desc'] = df['lv3_desc'].astype('category')
    df['lv4'] = df['lv4'].astype('category')
    df['lv4_desc'] = df['lv4_desc'].astype('category')

    df = df.drop(columns=['grunnkrets_id', 'plaace_hierarchy_id'])

    return df


train = pd.read_csv('data/stores_train.csv')
test = pd.read_csv('data/stores_test.csv')

label_name = 'revenue'
X = train.drop(columns=[label_name])
y = train[label_name]

submission = pd.read_csv('data/sample_submission.csv')
model_to_load = "modeling/0002.model"

X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=.8)

X_train, X_val = generate_features(X_train), generate_features(X_val)


In [4]:
# Clear buffers
folder = os.path.join(os.getcwd(), 'modeling')
for filename in os.listdir(folder):
    file_path = os.path.join(folder, filename)
    if os.path.isfile(file_path):
        os.unlink(file_path)
        print(f'Deleted file: {file_path}')

train_buffer_path = 'modeling/train.buffer'
test_buffer_path = 'modeling/test.buffer'

dtrain = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=True)
dtrain.save_binary(train_buffer_path)
print(f'--> {train_buffer_path} created and saved.')

dvalid = xgb.DMatrix(data=X_val, label=y_val, enable_categorical=True)
dvalid.save_binary(test_buffer_path)
print(f'--> {test_buffer_path} created and saved.')


Deleted file: c:\dev\maskin\maskinprosjekt\modeling\0002.model
Deleted file: c:\dev\maskin\maskinprosjekt\modeling\test.buffer
Deleted file: c:\dev\maskin\maskinprosjekt\modeling\train.buffer
--> modeling/train.buffer created and saved.
--> modeling/test.buffer created and saved.


In [12]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold
from xgboost import XGBClassifier


print("Attempting to initialize parameters for training...")
# params = {
#     'max_depth': 9, 
#     'eta': 0.05, 
#     'min_child_weight': 7,
#     'disable_default_eval_metric': True,
# }
# params = {'colsample_bytree': 0.5478255177656529, 'learning_rate': 0.02853786979050646, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 195, 'subsample': 0.9733176496063143}
params = {'colsample_bytree': 0.7717138210314867, 'learning_rate': 0.047506668950627134, 'max_depth': 8, 'min_child_weight': 3, 'n_estimators': 223, 'subsample': 0.9929036803032936}
params['disable_default_eval_metric'] = True
# print("--> parameters for training initialized.")

num_round = 999
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

print("Attempting to start training...")
bst = xgb.train(
    params=params, 
    dtrain=dtrain, 
    num_boost_round=num_round, 
    obj=squared_log,
    custom_metric=rmsle,
    evals=watchlist, 
    early_stopping_rounds=10, 
    verbose_eval=5)
print("--> model trained.")

print("Attempting to save model...")
bst.save_model(model_to_load)
print("--> model saved.")


No model found. Attempt at creating a new one will now start:
Attempting to initialize parameters for training...
{'colsample_bytree': 0.7717138210314867, 'learning_rate': 0.047506668950627134, 'max_depth': 8, 'min_child_weight': 3, 'n_estimators': 223, 'subsample': 0.9929036803032936, 'disable_default_eval_metric': True}
Attempting to start training...
Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-RMSLE:1.54144	valid-RMSLE:1.55857
[5]	train-RMSLE:1.44422	valid-RMSLE:1.46343
[10]	train-RMSLE:1.35469	valid-RMSLE:1.37933
[15]	train-RMSLE:1.27224	valid-RMSLE:1.30187
[20]	train-RMSLE:1.19735	valid-RMSLE:1.23826
[25]	train-RMSLE:1.12937	valid-RMSLE:1.17536
[30]	train-RMSLE:1.06831	valid-RMSLE:1.12261
[35]	train-

KeyboardInterrupt: 

In [6]:
from utils import xgb_cross_validation

params = {
    'max_depth': range(9, 12),
    'eta': [0.05, 0.1, 0.2, 0.3],
    'min_child_weight': range(5, 8),
    # 'disable_default_eval_metric': True,
}

# xgb_cross_validation(params, dtrain)


In [7]:
X_test = generate_features(test)
dtest = xgb.DMatrix(data=X_test, enable_categorical=True)

print("\nAttempting to start prediction...")
y_pred = bst.predict(dtest, ntree_limit=bst.best_iteration)
print("--> Prediction finished.")

print("\nAttempting to save prediction...")
submission['predicted'] = np.array(y_pred)
submission.to_csv('submissions/submission.csv', index=False)
print("--> prediction saved with features as name in submission folder.")


Attempting to start prediction...
--> Prediction finished.

Attempting to save prediction...
--> prediction saved with features as name in submission folder.




In [8]:
from xgboost import plot_tree

plot_importance(model)
xgb.to_graphviz(model, num_trees=1)
# fig, ax = plt.subplots(figsize=(30, 30))
# xgb.plot_tree(bst, num_trees=1, ax=ax)
# plt.savefig("temp.pdf")
# plt.show()


NameError: name 'model' is not defined