In [69]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [70]:
%autoreload

import warnings
import os.path
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import geopy
import xgboost as xgb
import os
import shutil

from shapely import wkt
from retail_revenue_xgb import generate_features, create_buffer
from sklearn.model_selection import train_test_split
from utils import squared_log, rmsle
from xgboost import plot_importance, to_graphviz

pd.options.mode.chained_assignment = None  # default='warn'


In [91]:
spatial = pd.read_csv('data/grunnkrets_norway_stripped.csv')
age = pd.read_csv('data/grunnkrets_age_distribution.csv')
income = pd.read_csv('data/grunnkrets_income_households.csv')
households = pd.read_csv('data/grunnkrets_households_num_persons.csv')
submission = pd.read_csv('data/sample_submission.csv')
plaace = pd.read_csv('data/plaace_hierarchy.csv')
busstops = pd.read_csv('data/busstops_norway.csv')


def generate_features1(df: pd.DataFrame):
    features = ['year', 'store_name', 'mall_name', 'chain_name', 'address', 'lat', 'lon',
                'plaace_hierarchy_id', 'grunnkrets_id']
    df = df[features]
    df['store_name'] = df['store_name'].astype('category')
    # df['store_id'] = df['store_id'].astype('category')
    df['address'] = df['address'].astype('category')
    df['chain_name+mall_name'] = (df['chain_name'] + df['mall_name']).astype('category')
    df['mall_name'] = df['mall_name'].astype('category')
    df['chain_name'] = df['chain_name'].astype('category')
    df['plaace_hierarchy_id'] = df['plaace_hierarchy_id'].astype('category')

    # attempt to difference the lat and lon values, as they seem to be somewhat trending negatively.
    df['lon'] = df['lon'].diff()
    df['lat'] = df['lat'].diff()

    # remove duplicates and merge with the spatial data.
    spatial.drop_duplicates(subset=['grunnkrets_id'])
    df = pd.merge(df, spatial.drop_duplicates(subset=['grunnkrets_id']), how='left')
    df['grunnkrets_name'] = df['grunnkrets_name'].astype('category')
    df['district_name'] = df['district_name'].astype('category')
    df['municipality_name'] = df['municipality_name'].astype('category')
    df['geometry'] = df['geometry'].astype('category')

    # age.drop_duplicates(subset=['grunnkrets_id'])
    # df = pd.merge(df, age.drop_duplicates(subset=['grunnkrets_id']), how='left')

    income.drop_duplicates(subset=['grunnkrets_id'])
    df = pd.merge(df, income.drop_duplicates(subset=['grunnkrets_id']), how='left')

    households.drop_duplicates(subset=['grunnkrets_id'])
    df = pd.merge(df, households.drop_duplicates(subset=['grunnkrets_id']), how='left')

    plaace.drop_duplicates(subset=['plaace_hierarchy_id'])
    df = pd.merge(df, plaace.drop_duplicates(subset=['plaace_hierarchy_id']), how='left')
    df['plaace_hierarchy_id'] = df['plaace_hierarchy_id'].astype('category')
    df['sales_channel_name'] = df['sales_channel_name'].astype('category')
    df['lv1_desc'] = df['lv1_desc'].astype('category')
    df['lv2_desc'] = df['lv2_desc'].astype('category')
    df['lv3'] = df['lv3'].astype('category')
    df['lv3_desc'] = df['lv3_desc'].astype('category')
    df['lv4'] = df['lv4'].astype('category')
    df['lv4_desc'] = df['lv4_desc'].astype('category')

    df = df.drop(columns=['grunnkrets_id', 'plaace_hierarchy_id'])

    return df


def generate_features(df: pd.DataFrame):
    features = ['year', 'store_name', 'mall_name', 'chain_name', 'address', 'lat', 'lon',
                'plaace_hierarchy_id', 'grunnkrets_id']
    df = df[features]
    df['store_name'] = df['store_name']
    # df['store_id'] = df['store_id']
    df['address'] = df['address']
    df['chain_name+mall_name'] = (df['chain_name'] + df['mall_name'])
    df['mall_name'] = df['mall_name']
    df['chain_name'] = df['chain_name']
    df['plaace_hierarchy_id'] = df['plaace_hierarchy_id']

    # attempt to difference the lat and lon values, as they seem to be somewhat trending negatively.
    df['lon'] = df['lon'].diff()
    df['lat'] = df['lat'].diff()

    # remove duplicates and merge with the spatial data.
    spatial.drop_duplicates(subset=['grunnkrets_id'])
    df = pd.merge(df, spatial.drop_duplicates(subset=['grunnkrets_id']), how='left')
    df['grunnkrets_name'] = df['grunnkrets_name']
    df['district_name'] = df['district_name']
    df['municipality_name'] = df['municipality_name']
    df['geometry'] = df['geometry']

    # age.drop_duplicates(subset=['grunnkrets_id'])
    # df = pd.merge(df, age.drop_duplicates(subset=['grunnkrets_id']), how='left')

    income.drop_duplicates(subset=['grunnkrets_id'])
    df = pd.merge(df, income.drop_duplicates(subset=['grunnkrets_id']), how='left')

    households.drop_duplicates(subset=['grunnkrets_id'])
    df = pd.merge(df, households.drop_duplicates(subset=['grunnkrets_id']), how='left')

    plaace.drop_duplicates(subset=['plaace_hierarchy_id'])
    df = pd.merge(df, plaace.drop_duplicates(subset=['plaace_hierarchy_id']), how='left')
    df['plaace_hierarchy_id'] = df['plaace_hierarchy_id']
    df['sales_channel_name'] = df['sales_channel_name']
    df['lv1_desc'] = df['lv1_desc']
    df['lv2_desc'] = df['lv2_desc']
    df['lv3'] = df['lv3']
    df['lv3_desc'] = df['lv3_desc']
    df['lv4'] = df['lv4']
    df['lv4_desc'] = df['lv4_desc']

    df = df.drop(columns=['grunnkrets_id', 'plaace_hierarchy_id'])

    return df


train = pd.read_csv('data/stores_train.csv')
test = pd.read_csv('data/stores_test.csv')

label_name = 'revenue'
X = train.drop(columns=[label_name])
y = train[label_name]

submission = pd.read_csv('data/sample_submission.csv')
model_to_load = "modeling/0002.model"

X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=.8)

X = generate_features(X)
X_train, X_val = generate_features(X_train), generate_features(X_val)


In [72]:
# # Clear buffers
# folder = os.path.join(os.getcwd(), 'modeling')
# for filename in os.listdir(folder):
#     file_path = os.path.join(folder, filename)
#     if os.path.isfile(file_path):
#         os.unlink(file_path)
#         print(f'Deleted file: {file_path}')

# train_buffer_path = 'modeling/train.buffer'
# test_buffer_path = 'modeling/test.buffer'

# dtrain = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=True)
# dtrain.save_binary(train_buffer_path)
# print(f'--> {train_buffer_path} created and saved.')

# dvalid = xgb.DMatrix(data=X_val, label=y_val, enable_categorical=True)
# dvalid.save_binary(test_buffer_path)
# print(f'--> {test_buffer_path} created and saved.')


In [99]:
from sklearn.model_selection import RandomizedSearchCV, KFold, GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_log_error
from xgboost.sklearn import XGBRegressor
from sklearn.preprocessing import LabelEncoder

lbl = LabelEncoder()

# Parameter grid for XGBoost
params = {
    # 'min_child_weight': [1, 5, 10],
    # 'gamma': [0.5, 1, 1.5, 2, 5],
    'subsample': [0.6, 0.8, 1.0],
    # 'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [4, 7, 10]
}
folds = 5
param_comb = 5

# Convert columns that are not numeric to a numeric value
for c in X_train.columns:
    if X_train[c].dtype == 'object':
        lbl.fit(list(X_train[c].values))
        X_train[c] = lbl.transform(list(X_train[c].values))
        # x_train_full.drop(c,axis=1,inplace=True)

X_val = generate_features(test)

for c in X_val.columns:
    if X_val[c].dtype == 'object':
        lbl.fit(list(X_val[c].values))
        X_val[c] = lbl.transform(list(X_val[c].values))
        # x_test.drop(c,axis=1,inplace=True)


# xgb_model = XGBRegressor()
# xgb_clf = RandomizedSearchCV(xgb_model, param_distributions=params, n_iter=100, 
#                                 scoring=mean_squared_log_error, n_jobs=4, 
#                                 cv=kfold.split(X_train, y_train), verbose=3, 
#                                 random_state=420)

# model = xgb_clf.fit(X_train, y_train)
# print(model.best_estimator_)

# Various hyper-parameters to tune
kfold = KFold(n_splits=folds, shuffle=True, random_state=420)
xgb1 = XGBRegressor(eval_metric=mean_squared_log_error)
parameters = {'nthread': [4],  # when use hyperthread, xgboost may become slower
              'objective': ['reg:squaredlogerror'],
              'learning_rate': [.03, 0.05, .07],  # so called `eta` value
              'max_depth': [5, 10, 15],
              'min_child_weight': [4],
              'silent': [1],
              'subsample': [0.7],
              'colsample_bytree': [0.7],
              'n_estimators': [500]}

def rmsle(X, y):
    y[y < -1] = -1 + 1e-6
    elements = np.power(np.log1p(y) - np.log1p(y), 2)
    return 'RMSLE', float(np.sqrt(np.sum(elements) / len(y)))


def rmsle_vanilla(y_pred, y_true):
    elements = np.power(np.log1p(y_pred) - np.log1p(y_true), 2)
    return float(np.sqrt(np.sum(elements) / len(y_true)))


rmsle_scorer = make_scorer(rmsle_vanilla, greater_is_better=False)


xgb_grid = GridSearchCV(xgb1,
                        parameters,
                        cv=kfold,
                        n_jobs=-1,
                        scoring=rmsle_scorer,
                        verbose=3)

xgb_grid.fit(X_train, y_train)

print(xgb_grid.best_score_)
print(xgb_grid.best_params_)

model = xgb_grid.best_estimator_
# y_predict_train = xgb_grid.best_estimator_.predict(X_train)


# y_predict_test = xgb_grid.best_estimator_.predict(X_test)

# X_test = generate_features(test)
# dtest = xgb.DMatrix(data=X_test, enable_categorical=True)

# y_pred = model.predict_proba(X_test)
# print("\nAttempting to save prediction...")
# submission['predicted'] = np.array(y_pred)
# submission.to_csv('submissions/submission.csv', index=False)
# print("--> prediction saved with features as name in submission folder.")


Fitting 5 folds for each of 9 candidates, totalling 45 fits
Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


-0.7734579532975809
{'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_depth': 5, 'min_child_weight': 4, 'n_estimators': 500, 'nthread': 4, 'objective': 'reg:squaredlogerror', 'silent': 1, 'subsample': 0.7}


In [97]:

def rmsle_vanilla(y_pred, y_true):
    elements = np.power(np.log1p(y_pred) - np.log1p(y_true), 2)
    return 'RMSLE', float(np.sqrt(np.sum(elements) / len(y_true)))

rmsle_scorer = make_scorer(rmsle_vanilla, greater_is_better=False)

rmsle(
    np.array([1, 2, 3]),
    np.array([1, 2, 2])
)

('RMSLE', 0.16609332197106508)

In [87]:
X_test = generate_features(test)

for c in X_test.columns:
    if X_test[c].dtype == 'object':
        lbl.fit(list(X_test[c].values))
        X_test[c] = lbl.transform(list(X_test[c].values))
        # x_test.drop(c,axis=1,inplace=True)

y_pred_test = model.predict(X_test)
# print(mean_squared_log_error(y_test, y_pred_test, squared=False))

print("\nAttempting to save prediction...")
submission['predicted'] = np.array(y_pred_test)
submission.to_csv('submissions/submission.csv', index=False)
print("--> prediction saved with features as name in submission folder.")


KeyboardInterrupt: 

In [92]:
for c in X_val.columns:
    if X_val[c].dtype == 'object':
        lbl.fit(list(X_val[c].values))
        X_val[c] = lbl.transform(list(X_val[c].values))
        # x_test.drop(c,axis=1,inplace=True)


y_pred_val = xgb_grid.best_estimator_.predict(X_val)
print(len(y_val), len(y_pred_val), len(X_val))
print(mean_squared_log_error(y_val, y_pred_val, squared=False))

# y_pred_train = xgb_grid.best_estimator_.predict(X_train)
# print(mean_squared_log_error(y_train, y_pred_train, squared=False))


2572 2572 2572
1.085321068283053


In [None]:
# X_test = generate_features(test)
# dtest = xgb.DMatrix(data=X_test, enable_categorical=True)

# print("\nAttempting to start prediction...")
# y_pred = model.predict(dtest, ntree_limit=model.best_iteration)
# print("--> Prediction finished.")

# print("\nAttempting to save prediction...")
# submission['predicted'] = np.array(y_pred)
# submission.to_csv('submissions/submission.csv', index=False)
# print("--> prediction saved with features as name in submission folder.")


Attempting to start prediction...
--> Prediction finished.

Attempting to save prediction...
--> prediction saved with features as name in submission folder.




In [None]:
from xgboost import plot_tree

plot_importance(model)
xgb.to_graphviz(model, num_trees=1)
# fig, ax = plt.subplots(figsize=(30, 30))
# xgb.plot_tree(bst, num_trees=1, ax=ax)
# plt.savefig("temp.pdf")
# plt.show()


XGBoostError: [17:49:29] c:\users\administrator\workspace\xgboost-win64_release_1.6.0\src\c_api\c_api_utils.h:222: Check failed: feature_names.size() == n_features (34 vs. 36) : Incorrect number of feature names.