In [55]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [56]:
%autoreload

import warnings
import os.path
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import geopy
import xgboost as xgb
import os
import shutil
import geopandas as gpd
import catboost as cb

from xgboost import XGBClassifier, plot_importance, to_graphviz, plot_tree
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold
from k_fold import random_k_fold
from shapely import wkt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from utils import squared_log, rmsle_xgb, add_city_centre_dist, group_ages, to_categorical, nan_to_string, object_encoder
from k_fold import random_k_fold, _rmsle
from scipy.stats import uniform, randint

# Silences pandas' SettingWithCopyWarning for the whole notebook.
pd.options.mode.chained_assignment = None  # default='warn'

# Lookup tables that generate_features() left-joins onto the store rows.
spatial = pd.read_csv('data/grunnkrets_norway_stripped.csv')
age = pd.read_csv('data/grunnkrets_age_distribution.csv')
income = pd.read_csv('data/grunnkrets_income_households.csv')
households = pd.read_csv('data/grunnkrets_households_num_persons.csv')
plaace = pd.read_csv('data/plaace_hierarchy.csv')
busstops = pd.read_csv('data/busstops_norway.csv')

# Store-level data: `train` carries the 'revenue' label, `test` is what gets predicted.
train = pd.read_csv('data/stores_train.csv')
test = pd.read_csv('data/stores_test.csv')

# Submission template, read once (it used to be loaded twice in this cell);
# xgb_prediction() fills in its 'predicted' column.
submission = pd.read_csv('data/sample_submission.csv')
# Path the trained booster is saved to by train_xgb_model().
model_name = "modeling/0002.model"


In [57]:
def generate_features(df: pd.DataFrame, predictor: str = '',
                      bus_stops_path: str = 'derived_data/stores_bus_stops_lt_1km_train'):
    """Assemble the modelling feature frame for a set of stores.

    Selects the raw store columns from ``df``, left-joins the module-level
    lookup tables (``spatial``, grouped ``age``, ``income``, ``households``,
    ``plaace``) and a pre-computed bus-stop table, adds the output of
    ``add_city_centre_dist``, drops the identifier columns and finally applies
    the predictor-specific encoding.

    Args:
        df: Store rows; must contain every column listed in ``features`` below.
        predictor: ``'xgb'`` passes the frame through ``object_encoder``;
            ``'catboost'`` passes it through ``nan_to_string`` and prints the
            per-column NaN count; any other value returns the merged frame
            without encoding.
        bus_stops_path: Parquet file with per-store bus-stop features keyed on
            ``store_id``. The default is the train-derived file this function
            used to hard-code.
            NOTE(review): stores missing from that file get NaN bus features
            through the left join, so test-set callers presumably need to pass
            the matching test-derived file - confirm that one exists.

    Returns:
        The feature DataFrame without ``grunnkrets_id``,
        ``plaace_hierarchy_id``, ``year`` and ``store_id``.
    """
    features = ['store_id', 'year', 'store_name', 'mall_name', 'chain_name', 'address', 'lat', 'lon',
                'plaace_hierarchy_id', 'grunnkrets_id']
    _X = df[features]

    # Merge spatial data
    spatial_unique = spatial.drop(columns=['year']).drop_duplicates(subset=['grunnkrets_id'])
    _X = _X.merge(spatial_unique, on='grunnkrets_id', how='left')
    _X = _X.drop(columns=['geometry'])

    # Merge age data
    age_ranges = [
        (0, 19),
        (20, 39),
        (40, 59),
        (60, 79),
        (80, 90),
    ]
    grouped_ages = group_ages(age, age_ranges)
    _X = _X.merge(grouped_ages, on='grunnkrets_id', how='left')

    # The join key is spelled out for the remaining merges as well: without
    # `on=`, pandas joins on *every* column name the two frames share, so a
    # measure column present in two tables would silently become a key.

    # Merge income data
    income_unique = income.drop(columns=['year']).drop_duplicates(subset='grunnkrets_id')
    _X = _X.merge(income_unique, on='grunnkrets_id', how='left')

    # Merge household data
    households_unique = households.drop(columns=['year']).drop_duplicates(subset='grunnkrets_id')
    _X = _X.merge(households_unique, on='grunnkrets_id', how='left')

    # Merge plaace data; only the textual level descriptions are kept.
    plaace_unique = plaace.drop_duplicates(subset='plaace_hierarchy_id')
    _X = _X.merge(plaace_unique, on='plaace_hierarchy_id', how='left')
    _X = _X.drop(columns=['lv1', 'lv2', 'lv3', 'lv4'])

    # The helper's centre coordinates are dropped again right away.
    _X = add_city_centre_dist(_X).drop(columns=['lon_center', 'lat_center'])

    # Merge bus data
    bus_data = gpd.read_parquet(bus_stops_path)
    _X = _X.merge(bus_data.drop(columns=['geometry']), on='store_id', how='left')

    # Identifiers were only needed as join keys.
    _X = _X.drop(columns=['grunnkrets_id', 'plaace_hierarchy_id', 'year', 'store_id'])

    if predictor == 'xgb':
        _X = object_encoder(_X)
    elif predictor == 'catboost':
        _X = nan_to_string(_X)
        print(_X.isna().sum())

    return _X


# Split the training table into the regression target and the raw store attributes.
label_name = 'revenue'
y = train[label_name]
X = train.drop(columns=label_name)


In [58]:
def plot_corr(data, max_dist_to_center=70_000):
  """Draw a seaborn pair plot of revenue against location features.

  Plots 'revenue', 'dist_to_center', 'lat', 'lon' and a derived 'knutepunkt'
  column (row-wise sum of the three '... knutepunkt' bus-stop columns), after
  discarding rows whose 'dist_to_center' is not below `max_dist_to_center`.

  Args:
    data: DataFrame holding the columns named above.
    max_dist_to_center: Exclusive upper bound on 'dist_to_center'; the default
      is the cut-off that used to be hard-coded (presumably metres - confirm
      against add_city_centre_dist).

  Returns:
    The seaborn PairGrid, so the caller can tweak or save the figure.
  """
  hub_columns = ['Lokalt knutepunkt', 'Nasjonalt knutepunkt', 'Regionalt knutepunkt']
  # .assign builds a new frame, so nothing is written into a slice of `data`.
  df = data[['revenue', 'dist_to_center', 'lat', 'lon']].assign(
    knutepunkt=data[hub_columns].sum(axis=1)
  )
  df = df[df.dist_to_center < max_dist_to_center]

  # pairplot is figure-level and creates its own figure, so a preceding
  # plt.figure(figsize=...) only leaves an empty extra figure behind. Sizing
  # each facet via `height` gives the intended ~15x15 inch grid.
  return sns.pairplot(df, height=15 / df.shape[1])


# data_full =  pd.merge(X_train, y_train, left_index=True, right_index=True) 
# plot_corr(data_full)


In [59]:
def clear_buffers(X_train, y_train, X_val, y_val):
    """Empty the ``modeling/`` folder and build the train/validation DMatrix pair.

    Every regular file directly inside ``<cwd>/modeling`` is deleted - not only
    the ``*.buffer`` files, so a previously saved model stored there is removed
    too. Both matrices are then built and written to ``modeling/train.buffer``
    and ``modeling/test.buffer``.

    Labels are passed through ``np.log1p`` for *both* matrices. The validation
    labels used to be left on the raw scale while the training labels were
    transformed, which put the 'valid' metric (the one early stopping watches)
    on a different scale than the target the model is fitted to.

    Args:
        X_train, X_val: Feature frames accepted by ``xgb.DMatrix``.
        y_train, y_val: Non-negative targets on the raw scale.

    Returns:
        ``(dtrain, dvalid)``.
    """
    folder = os.path.join(os.getcwd(), 'modeling')
    # os.listdir raises FileNotFoundError on a fresh checkout without the folder.
    os.makedirs(folder, exist_ok=True)

    for filename in os.listdir(folder):
        file_path = os.path.join(folder, filename)
        if os.path.isfile(file_path):
            os.unlink(file_path)
            print(f'Deleted file: {file_path}')

    train_buffer_path = 'modeling/train.buffer'
    test_buffer_path = 'modeling/test.buffer'

    dtrain = xgb.DMatrix(data=X_train, label=np.log1p(y_train), enable_categorical=True)
    dtrain.save_binary(train_buffer_path)
    print(f'--> {train_buffer_path} created and saved.')

    # Same label transform as dtrain so train/valid metrics are comparable.
    dvalid = xgb.DMatrix(data=X_val, label=np.log1p(y_val), enable_categorical=True)
    dvalid.save_binary(test_buffer_path)
    print(f'--> {test_buffer_path} created and saved.')

    return dtrain, dvalid

In [60]:
def train_xgb_model(X_train, y_train, X_val, y_val, params=None):
    """Train an XGBoost booster with early stopping and save it to ``model_name``.

    Builds the DMatrix pair via ``clear_buffers`` (which also wipes the
    ``modeling/`` folder), obtains hyper-parameters, trains with the custom
    ``squared_log`` objective and ``rmsle_xgb`` metric, and saves the booster.

    Args:
        X_train, y_train: Training features and raw targets.
        X_val, y_val: Hold-out features and raw targets. This set is the last
            entry of the watch list, which is the one xgboost uses for early
            stopping.
        params: Optional booster parameters. When omitted, a 20-iteration
            ``random_k_fold`` search is run on the training data and its
            ``best_params_`` are used (the previous behaviour). Pass a dict to
            skip that search, which is the slow part of this function.

    Returns:
        The trained ``xgb.Booster``.
    """
    print('Clearing and creating buffers...')
    dtrain, dvalid = clear_buffers(X_train, y_train, X_val, y_val)
    print(dtrain, dvalid)

    print("Attempting to initialize parameters for training...")
    if params is None:
        rand_src_model = random_k_fold(X_train, y_train, verbose=0, n_iter=20)
        params = rand_src_model.best_params_
    # Copy before extending: the dict belongs to the caller / the search object.
    params = dict(params)
    print(params)
    # NOTE(review): if the search space contains scikit-learn-only keys such as
    # 'n_estimators', xgb.train does not use them (the number of rounds comes
    # from num_boost_round) - confirm against random_k_fold.

    # The custom metric below replaces xgboost's built-in one.
    params['disable_default_eval_metric'] = True
    print("--> parameters for training initialized.")

    # Upper bound only; early stopping normally ends training much sooner.
    num_round = 999
    watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

    print("Attempting to start training...")
    model = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=num_round,
        obj=squared_log,
        custom_metric=rmsle_xgb,
        evals=watchlist,
        early_stopping_rounds=10,
        verbose_eval=20)
    print("--> model trained.")
    print('Best score:', model.best_score)

    print("Attempting to save model...")
    model.save_model(model_name)
    print("--> model saved.")

    return model


# Fixed seed so the hold-out split, and with it the validation score, is the
# same on every Restart & Run All.
SPLIT_SEED = 42
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=.8, random_state=SPLIT_SEED)

# NOTE(review): each frame is encoded by its own generate_features() call. If
# object_encoder derives its mapping from the frame it is given, the codes will
# not line up between train, validation and test - confirm in utils.
X_train = generate_features(X_train, predictor='xgb')
X_val = generate_features(X_val, predictor='xgb')

model = train_xgb_model(X_train, y_train, X_val, y_val)

Clearing and creating buffers...
Deleted file: c:\dev\maskin\maskinprosjekt\modeling\0002.model
Deleted file: c:\dev\maskin\maskinprosjekt\modeling\test.buffer
Deleted file: c:\dev\maskin\maskinprosjekt\modeling\train.buffer
--> modeling/train.buffer created and saved.
--> modeling/test.buffer created and saved.
<xgboost.core.DMatrix object at 0x0000023DBB0D93D0> <xgboost.core.DMatrix object at 0x0000023DBB8E5C40>
Attempting to initialize parameters for training...


KeyboardInterrupt: 

In [None]:
def xgb_prediction(X_test, model):
    """Predict with the early-stopped booster and write the submission CSV.

    Stores the predictions in the module-level ``submission`` frame (column
    'predicted') and writes it to ``submissions/submission.csv``.

    Args:
        X_test: Feature frame accepted by ``xgb.DMatrix``.
        model: Booster trained with early stopping, so that ``best_iteration``
            is set.

    NOTE(review): clear_buffers() fits the model on ``np.log1p`` of the
    revenue, while the predictions are written here unchanged. They presumably
    need ``np.expm1`` before submission - confirm against the objective in
    utils.squared_log.
    """
    dtest = xgb.DMatrix(data=X_test, enable_categorical=True)

    print("\nAttempting to start prediction...")
    # best_iteration is a 0-based round index and iteration_range is half-open,
    # hence the +1. The old ntree_limit=best_iteration dropped the best round's
    # tree, and ntree_limit itself is deprecated (removed in xgboost 2.0).
    y_pred = model.predict(dtest, iteration_range=(0, model.best_iteration + 1))
    print("--> Prediction finished.")

    print("\nAttempting to save prediction...")
    # to_csv does not create missing parent directories.
    os.makedirs('submissions', exist_ok=True)
    submission['predicted'] = np.array(y_pred)
    submission.to_csv('submissions/submission.csv', index=False)
    print("--> prediction saved with features as name in submission folder.")


# Build the test-set features through the same 'xgb' encoding path as the
# training data, then predict and write the submission file.
X_test = generate_features(df=test, predictor='xgb')
xgb_prediction(X_test=X_test, model=model)


Attempting to start prediction...
--> Prediction finished.

Attempting to save prediction...
--> prediction saved with features as name in submission folder.




In [None]:
# Feature importances of the trained booster, then a graphviz rendering of the
# tree with ordinal number 1.
xgb.plot_importance(model)
to_graphviz(model, num_trees=1)

### Prepare features for the CatBoost predictor

In [None]:
# X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=.8)
# X_train, X_val = generate_features(X_train, predictor='catboost'), generate_features(X_val, predictor='catboost')
# X_test = generate_features(test, predictor='catboost')

# cat_features = list(X_train.select_dtypes(include=[object]).columns)

# train_pool = cb.Pool(X_train, y_train, cat_features=cat_features)
# test_pool = cb.Pool(X_test, cat_features=cat_features)

# # X_train[X_train.columns[X_train.isna().any()].tolist()]


In [None]:
# params = {
#     'depth': randint(2, 20),
#     'learning_rate': uniform(0.01, 0.4),
#     'iterations': randint(10, 1000)
# }

# model = cb.CatBoostRegressor(loss_function='RMSE')

# model.randomized_search(train_pool, param_distributions=params, cv=5)

# pred = model.predict(X_val)
# rmse = (np.sqrt(mean_squared_log_error(y_val, pred)))
# print('Testing performance')
# print('RMSE: {:.2f}'.format(rmse))