In [46]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [47]:
%autoreload

import warnings
import os.path
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import geopy
import xgboost as xgb
import os
import shutil

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from shapely import wkt
from retail_revenue_xgb import generate_features, create_buffer
from sklearn.model_selection import train_test_split
from utils import squared_log, rmsle
from xgboost import plot_importance, to_graphviz

pd.options.mode.chained_assignment = None  # default='warn'


In [48]:
spatial = pd.read_csv('data/grunnkrets_norway_stripped.csv')
age = pd.read_csv('data/grunnkrets_age_distribution.csv')
income = pd.read_csv('data/grunnkrets_income_households.csv')
households = pd.read_csv('data/grunnkrets_households_num_persons.csv')
submission = pd.read_csv('data/sample_submission.csv')
plaace = pd.read_csv('data/plaace_hierarchy.csv')
busstops = pd.read_csv('data/busstops_norway.csv')


def generate_features1(df: pd.DataFrame):
    features = ['year', 'store_name', 'mall_name', 'chain_name', 'address', 'lat', 'lon',
                'plaace_hierarchy_id', 'grunnkrets_id']
    df = df[features]
    df['store_name'] = df['store_name'].astype('category')
    # df['store_id'] = df['store_id'].astype('category')
    df['address'] = df['address'].astype('category')
    df['chain_name+mall_name'] = (df['chain_name'] + df['mall_name']).astype('category')
    df['mall_name'] = df['mall_name'].astype('category')
    df['chain_name'] = df['chain_name'].astype('category')
    df['plaace_hierarchy_id'] = df['plaace_hierarchy_id'].astype('category')

    # attempt to difference the lat and lon values, as they seem to be somewhat trending negatively.
    df['lon'] = df['lon'].diff()
    df['lat'] = df['lat'].diff()

    # remove duplicates and merge with the spatial data.
    spatial.drop_duplicates(subset=['grunnkrets_id'])
    df = pd.merge(df, spatial.drop_duplicates(subset=['grunnkrets_id']), how='left')
    df['grunnkrets_name'] = df['grunnkrets_name'].astype('category')
    df['district_name'] = df['district_name'].astype('category')
    df['municipality_name'] = df['municipality_name'].astype('category')
    df['geometry'] = df['geometry'].astype('category')

    # age.drop_duplicates(subset=['grunnkrets_id'])
    # df = pd.merge(df, age.drop_duplicates(subset=['grunnkrets_id']), how='left')

    income.drop_duplicates(subset=['grunnkrets_id'])
    df = pd.merge(df, income.drop_duplicates(subset=['grunnkrets_id']), how='left')

    households.drop_duplicates(subset=['grunnkrets_id'])
    df = pd.merge(df, households.drop_duplicates(subset=['grunnkrets_id']), how='left')

    plaace.drop_duplicates(subset=['plaace_hierarchy_id'])
    df = pd.merge(df, plaace.drop_duplicates(subset=['plaace_hierarchy_id']), how='left')
    df['plaace_hierarchy_id'] = df['plaace_hierarchy_id'].astype('category')
    df['sales_channel_name'] = df['sales_channel_name'].astype('category')
    df['lv1_desc'] = df['lv1_desc'].astype('category')
    df['lv2_desc'] = df['lv2_desc'].astype('category')
    df['lv3'] = df['lv3'].astype('category')
    df['lv3_desc'] = df['lv3_desc'].astype('category')
    df['lv4'] = df['lv4'].astype('category')
    df['lv4_desc'] = df['lv4_desc'].astype('category')

    df = df.drop(columns=['grunnkrets_id', 'plaace_hierarchy_id'])

    return df


def generate_features(df: pd.DataFrame):
    features = ['year', 'store_name', 'mall_name', 'chain_name', 'address', 'lat', 'lon',
                'plaace_hierarchy_id', 'grunnkrets_id']
    df = df[features]
    df['store_name'] = df['store_name']
    # df['store_id'] = df['store_id']
    df['address'] = df['address']
    df['chain_name+mall_name'] = (df['chain_name'] + df['mall_name'])
    df['mall_name'] = df['mall_name']
    df['chain_name'] = df['chain_name']
    df['plaace_hierarchy_id'] = df['plaace_hierarchy_id']

    # attempt to difference the lat and lon values, as they seem to be somewhat trending negatively.
    df['lon'] = df['lon'].diff()
    df['lat'] = df['lat'].diff()

    # remove duplicates and merge with the spatial data.
    spatial.drop_duplicates(subset=['grunnkrets_id'])
    df = pd.merge(df, spatial.drop_duplicates(subset=['grunnkrets_id']), how='left')
    df['grunnkrets_name'] = df['grunnkrets_name']
    df['district_name'] = df['district_name']
    df['municipality_name'] = df['municipality_name']
    df['geometry'] = df['geometry']

    # age.drop_duplicates(subset=['grunnkrets_id'])
    # df = pd.merge(df, age.drop_duplicates(subset=['grunnkrets_id']), how='left')

    income.drop_duplicates(subset=['grunnkrets_id'])
    df = pd.merge(df, income.drop_duplicates(subset=['grunnkrets_id']), how='left')

    households.drop_duplicates(subset=['grunnkrets_id'])
    df = pd.merge(df, households.drop_duplicates(subset=['grunnkrets_id']), how='left')

    plaace.drop_duplicates(subset=['plaace_hierarchy_id'])
    df = pd.merge(df, plaace.drop_duplicates(subset=['plaace_hierarchy_id']), how='left')
    df['plaace_hierarchy_id'] = df['plaace_hierarchy_id']
    df['sales_channel_name'] = df['sales_channel_name']
    df['lv1_desc'] = df['lv1_desc']
    df['lv2_desc'] = df['lv2_desc']
    df['lv3'] = df['lv3']
    df['lv3_desc'] = df['lv3_desc']
    df['lv4'] = df['lv4']
    df['lv4_desc'] = df['lv4_desc']

    df = df.drop(columns=['grunnkrets_id', 'plaace_hierarchy_id'])

    return df


train = pd.read_csv('data/stores_train.csv')
test = pd.read_csv('data/stores_test.csv')

label_name = 'revenue'
X = train.drop(columns=[label_name])
y = train[label_name]

submission = pd.read_csv('data/sample_submission.csv')
model_to_load = "modeling/0002.model"

X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=.8)

X_train, X_val = generate_features(X_train), generate_features(X_val)


In [49]:
# # Clear buffers
# folder = os.path.join(os.getcwd(), 'modeling')
# for filename in os.listdir(folder):
#     file_path = os.path.join(folder, filename)
#     if os.path.isfile(file_path):
#         os.unlink(file_path)
#         print(f'Deleted file: {file_path}')

# train_buffer_path = 'modeling/train.buffer'
# test_buffer_path = 'modeling/test.buffer'

# dtrain = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=True)
# dtrain.save_binary(train_buffer_path)
# print(f'--> {train_buffer_path} created and saved.')

# dvalid = xgb.DMatrix(data=X_val, label=y_val, enable_categorical=True)
# dvalid.save_binary(test_buffer_path)
# print(f'--> {test_buffer_path} created and saved.')


In [50]:
from sklearn.preprocessing import LabelEncoder

lbl = LabelEncoder()

# Convert columns that are not numeric to a numeric value
for c in X_train.columns:
    if X_train[c].dtype == 'object':
        lbl.fit(list(X_train[c].values))
        X_train[c] = lbl.transform(list(X_train[c].values))

for c in X_val.columns:
    if X_val[c].dtype == 'object':
        lbl.fit(list(X_val[c].values))
        X_val[c] = lbl.transform(list(X_val[c].values))

In [51]:
X_train.dtypes

year                                      int64
store_name                                int32
mall_name                                 int32
chain_name                                int32
address                                   int32
lat                                     float64
lon                                     float64
chain_name+mall_name                      int32
grunnkrets_name                           int32
district_name                             int32
municipality_name                         int32
geometry                                  int32
area_km2                                float64
all_households                          float64
singles                                 float64
couple_without_children                 float64
couple_with_children                    float64
other_households                        float64
single_parent_with_children             float64
couple_children_0_to_5_years            float64
couple_children_18_or_above             

In [71]:
from typing import Dict
from sklearn.metrics import accuracy_score, mean_squared_log_error, mean_squared_error


def objective(space: Dict):
    clf = xgb.XGBRegressor(n_estimators=space['n_estimators'], max_depth=int(space['max_depth']), gamma=space['gamma'],
                           reg_alpha=int(space['reg_alpha']), min_child_weight=int(space['min_child_weight']),
                           colsample_bytree=int(space['colsample_bytree']), eval_metric=mean_squared_log_error,
                           early_stopping_rounds=20, learning_rate=space['learning_rate'])

    evals = [(X_train, y_train), (X_val, y_val)]

    clf.fit(X_train, y_train, eval_set=evals, verbose=False)
    
    pred = clf.predict(X_val)
    score = mean_squared_log_error(y_val, pred)
    print("SCORE:", score)
    return {'loss': score, 'status': STATUS_OK}
    

space = {'max_depth': hp.quniform("max_depth", 3, 18, 1),
         'gamma': hp.uniform('gamma', 1, 9),
         'reg_alpha': hp.quniform('reg_alpha', 40, 180, 1),
         'reg_lambda': hp.uniform('reg_lambda', 0, 1),
         'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
         'min_child_weight': hp.quniform('min_child_weight', 0, 10, 1),
         'n_estimators': 1000,
         'learning_rate': hp.uniform('learning_rate', 0.05, 0.3),
         'seed': 0}

trials = Trials()

best_hyperparams = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=100, trials=trials)
print("The best hyperparameters are : ", "\n")
print(best_hyperparams)


SCORE:                                                 
1.0196920160954843                                     
SCORE:                                                                           
1.0190749639111631                                                               
SCORE:                                                                           
1.0209513313431031                                                               
SCORE:                                                                           
1.024639098118048                                                                
SCORE:                                                                           
1.0264536765379544                                                               
SCORE:                                                                           
1.031165398684851                                                                
SCORE:                                                              

KeyboardInterrupt: 

In [None]:
# X_test = generate_features(test)
# dtest = xgb.DMatrix(data=X_test, enable_categorical=True)

# print("\nAttempting to start prediction...")
# y_pred = model.predict(dtest, ntree_limit=model.best_iteration)
# print("--> Prediction finished.")

# print("\nAttempting to save prediction...")
# submission['predicted'] = np.array(y_pred)
# submission.to_csv('submissions/submission.csv', index=False)
# print("--> prediction saved with features as name in submission folder.")


Attempting to start prediction...
--> Prediction finished.

Attempting to save prediction...
--> prediction saved with features as name in submission folder.


