# 1. Install and import libraries and modules

In [3]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
import sys
!{sys.executable} -m pip install -r requirements_nogeo.txt

Defaulting to user installation because normal site-packages is not writeable


In [5]:
%autoreload

import warnings
import os.path
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
# import geopy
import xgboost as xgb
import os
import shutil
# import geopandas as gpd
import catboost as cb
import optuna
from pyproj import Geod
import joblib

from xgboost import XGBRegressor, plot_importance, to_graphviz, plot_tree
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold
from sklearn.cluster import KMeans
from k_fold import random_k_fold
from shapely import wkt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from utils import squared_log, rmsle_xgb, add_city_centre_dist, group_ages, to_categorical, nan_to_string, object_encoder, only_2016_data
from k_fold import random_k_fold, xgb_cross_validation
from objectives_and_metrics import rmsle, RmsleMetric, RmsleObjective, LogTargetsRmsleMetric, RmseObjective
from scipy.stats import uniform, randint

warnings.filterwarnings('ignore')
pd.options.mode.chained_assignment = None  # default='warn'

# Auxiliary grunnkrets-level tables (paths are relative to the notebook's working directory).
spatial = pd.read_csv('data/grunnkrets_norway_stripped.csv')
age = pd.read_csv('data/grunnkrets_age_distribution.csv')
# Prefix every income column with 'income_' while leaving the two key columns untouched.
income = pd.read_csv('data/grunnkrets_income_households.csv').set_index(['grunnkrets_id', 'year']).add_prefix('income_').reset_index()
households = pd.read_csv('data/grunnkrets_households_num_persons.csv')
plaace = pd.read_csv('data/plaace_hierarchy.csv')
busstops = pd.read_csv('data/busstops_norway.csv')

# Store-level data.
train = pd.read_csv('data/stores_train.csv')
test = pd.read_csv('data/stores_test.csv')

# Submission template; it was previously read twice in this cell, once is enough.
submission = pd.read_csv('data/sample_submission.csv')
model_name = "modeling/0002.model"

FileNotFoundError: [Errno 2] No such file or directory: '../data/grunnkrets_norway_stripped.csv'

In [92]:
def clean(df: pd.DataFrame, min_val=0, max_val=100):
    """Drop rows whose revenue lies outside the open interval (min_val, max_val).

    Prints the row count before and after filtering and shows a 30-bin
    histogram of log1p(revenue) for the rows that were kept.

    Args:
        df: Frame containing a 'revenue' column.
        min_val: Exclusive lower bound on revenue.
        max_val: Exclusive upper bound on revenue.

    Returns:
        A tuple (features, revenue): the filtered frame without the 'revenue'
        column, and the filtered 'revenue' series.
    """
    print('Length of data frame:', len(df))
    df = df[(df.revenue > min_val) & (df.revenue < max_val)]
    print('Length after removing extreme values and zero revenue retail stores:',  len(df))
    # Plot the frame that was passed in and filtered; the previous version
    # plotted the module-level `train` frame and ignored the argument.
    plt.hist(np.log1p(df.revenue), 30)
    plt.show()
    return df.drop(columns=['revenue']), df.revenue


# Target column name, log1p-transformed target, and the feature frame without the target.
label_name = 'revenue'
y = np.log1p(train[label_name])
X = train.drop(columns=[label_name])


# 2. EDA

## 2.x Data cleaning

The train and test data only contain data from 2016, so for the other datasets with a year column
we only use the values from 2016, where possible.

In [None]:
# Run each auxiliary table through only_2016_data (presumably keeps the 2016 rows,
# judging by the helper's name; verify in utils).
spatial_2016, income_2016, households_2016 = (
    only_2016_data(frame) for frame in (spatial, income, households)
)

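Furthermore, we noticed that the datasets in the cell above do not have a matching row for every store, so the next cell keeps only the rows whose merged columns are not null.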

In [None]:
# Keep only rows where one representative column from each merged dataset is non-null.
# NOTE(review): train_spatial, train_income and train_house are not defined in any
# visible cell; presumably they are `train` merged with the spatial, income and
# household tables. Confirm, otherwise this cell fails on a fresh kernel.
train_spatial_no_nan = train_spatial[pd.notnull(train_spatial.grunnkrets_name)]
train_income_no_nan = train_income[pd.notnull(train_income.income_all_households)]
train_house_no_nan = train_house[pd.notnull(train_house.couple_children_0_to_5_years)]

In [None]:
def clean(df: pd.DataFrame, min_val=0, max_val=100):
    """Drop rows whose revenue lies outside the open interval (min_val, max_val).

    Prints the row count before and after filtering and shows a 30-bin
    histogram of log1p(revenue) for the rows that were kept.

    Args:
        df: Frame containing a 'revenue' column.
        min_val: Exclusive lower bound on revenue.
        max_val: Exclusive upper bound on revenue.

    Returns:
        A tuple (features, revenue): the filtered frame without the 'revenue'
        column, and the filtered 'revenue' series.
    """
    print('Length of data frame:', len(df))
    df = df[(df.revenue > min_val) & (df.revenue < max_val)]
    print('Length after removing extreme values and zero revenue retail stores:',  len(df))
    # Plot the frame that was passed in and filtered; the previous version
    # plotted the module-level `train` frame and ignored the argument.
    plt.hist(np.log1p(df.revenue), 30)
    plt.show()
    return df.drop(columns=['revenue']), df.revenue


def clean_out_nan_heavy_rows(df: pd.DataFrame, age_ranges=None):
    """Cleans out rows that have no match in the age, spatial, income or household datasets.

    The frame is left-merged on 'grunnkrets_id' with the grouped age table and
    the 2016 spatial, income and household tables. A row is dropped when any
    of the probe columns 'age_0_19', 'couple_children_0_to_5_years',
    'grunnkrets_name' or 'income_all_households' is NaN after the merge.

    Args:
        df: Store-level frame with a 'grunnkrets_id' column.
        age_ranges: (low, high) tuples forwarded to group_ages. Defaults to the
            same five ranges generate_features uses. The name was previously
            read from an undefined global, which raised NameError.

    Returns:
        The rows of `df` (original columns only) that survived the check.
    """
    if age_ranges is None:
        age_ranges = [(0, 19), (20, 39), (40, 59), (60, 79), (80, 90)]

    merged = df.merge(group_ages(age, age_ranges), on='grunnkrets_id', how='left')
    merged = merged.merge(spatial_2016.drop(columns=['year']), on='grunnkrets_id', how='left')
    merged = merged.merge(income_2016.drop(columns=['year']), on='grunnkrets_id', how='left')
    merged = merged.merge(households_2016.drop(columns=['year']), on='grunnkrets_id', how='left')

    has_gap = (
        merged.age_0_19.isna()
        | merged.couple_children_0_to_5_years.isna()
        | merged.grunnkrets_name.isna()
        | merged.income_all_households.isna()
    )

    # NOTE(review): the mask is aligned to `df` by index label; this assumes `df`
    # has a default RangeIndex and that each right-hand table has one row per
    # grunnkrets_id, so the merges keep row count and order. Confirm.
    df_cleaned = df[~has_gap]

    print(f'Cleaned out {len(df) - len(df_cleaned)} out of {len(df)} rows.')

    return df_cleaned


# Drop stores that lack a match in the auxiliary tables, then split off the target.
train = clean_out_nan_heavy_rows(train)
label_name = 'revenue'
X = train.drop(columns=[label_name])
revenue_raw = train[label_name]
# Keep the global `y` on the log1p scale: later cells split `X, y` again and
# undo the transform with np.expm1, so rebinding `y` to raw revenue here would
# silently put those cells on the wrong scale.
y = np.log1p(revenue_raw)

# Split on raw revenue because clean() thresholds on raw values.
X_train, X_val, y_train, y_val = train_test_split(X, revenue_raw, train_size=.8) # , random_state=SEED
# Only the training part is filtered for extreme / zero revenue; validation is left as is.
X_train, y_train = clean(pd.merge(X_train, y_train, left_index=True, right_index=True))

y_train = np.log1p(y_train)
y_val = np.log1p(y_val)

# Feature generation

In [116]:
def generate_features(df: pd.DataFrame, data_origin: str, predictor: str = ''):
    """Merge the auxiliary tables onto a store frame and return the model features.

    Args:
        df: Store-level frame (train or test rows).
        data_origin: Suffix of the pre-computed parquet files under
            'derived_data/' to merge in, e.g. 'train' or 'test'.
        predictor: 'xgb' (object columns go through object_encoder) or
            'catboost' (frame goes through nan_to_string).

    Returns:
        A frame restricted to the selected feature columns.

    Raises:
        ValueError: If `predictor` is neither 'xgb' nor 'catboost'.
    """
    # Fail fast: validate before doing any file reads or merges.
    if predictor not in ('xgb', 'catboost'):
        raise ValueError('Invalid predictor')

    age_ranges = [
        (0, 19),
        (20, 39),
        (40, 59),
        (60, 79),
        (80, 90),
    ]

    # Define datasets to be merged
    age_groups_merge = group_ages(age, age_ranges)
    spatial_merge = spatial_2016.drop(columns=['year'])
    income_merge = income_2016.drop(columns=['year'])
    households_merge = households_2016.drop(columns=['year'])
    plaace_merge = plaace.drop_duplicates(subset='plaace_hierarchy_id')
    # geopandas is not imported in this notebook (its import is commented out), so
    # the previous gpd.read_parquet calls raised NameError. The geometry column is
    # dropped immediately, so plain pandas is enough to read these files.
    bus_data_train_merge = pd.read_parquet(f'derived_data/stores_bus_stops_lt_1km_{data_origin}').drop(columns=['geometry'])
    stores_vicinity_merge = pd.read_parquet(f'derived_data/stores_count_lt_1km_{data_origin}').drop(columns=['geometry'])

    # Merge datasets
    df = df.merge(spatial_merge, on='grunnkrets_id', how='left')
    df = df.merge(age_groups_merge, on='grunnkrets_id', how='left')
    df = df.merge(income_merge, on='grunnkrets_id', how='left')
    df = df.merge(households_merge, on='grunnkrets_id', how='left')
    # No `on=` here: pandas joins on every column the two frames share.
    df = df.merge(plaace_merge, how='left')
    df = df.merge(bus_data_train_merge, on='store_id', how='left')
    df = df.merge(stores_vicinity_merge, on='store_id', how='left')
    df = add_city_centre_dist(df).drop(columns=['lon_center', 'lat_center'])

    # Transformations
    # NOTE(review): a count of 0 becomes -inf under log; confirm counts are >= 1.
    df['stores_count_lt_1km'] = np.log(df['stores_count_lt_1km'])

    # Handle categories for different predictors
    if predictor == 'xgb':
        df = object_encoder(df)
    else:  # 'catboost'
        df = nan_to_string(df)

    features = [
        'store_name',
        'mall_name',
        'chain_name',
        'address',
        'lat', 'lon',

        *age_groups_merge.drop(columns=['grunnkrets_id']).columns,
        *income_merge.drop(columns=['grunnkrets_id']).columns,
        *households_merge.drop(columns=['grunnkrets_id']).columns,
        'lv1_desc', 'lv2_desc', 'sales_channel_name',
        *bus_data_train_merge.drop(columns=['store_id']).columns,
        *stores_vicinity_merge.drop(columns=['store_id']).columns,
        'dist_to_center'
    ]

    return df[features]

In [6]:
def generate_kmeans(df: pd.DataFrame, clusters: int, filter: str):
    """Fit k-means on the ('lat', 'lon') columns and persist each model with joblib.

    Args:
        df: Frame with 'lat' and 'lon' columns.
        clusters: Number of clusters for every fitted model.
        filter: Column to group by. When truthy and present in `df`, one model is
            fitted per distinct value of that column; otherwise a single model
            is fitted on all rows.

    Returns:
        A list of fitted models in the order of ``df[filter].unique()`` when
        grouping, otherwise the single fitted model.
    """
    # joblib.dump does not create parent directories; make sure the target exists.
    os.makedirs("kmeans", exist_ok=True)

    if filter and filter in df:
        kmeans = []
        # Compute the category list once instead of on every iteration.
        for category in df[filter].unique():
            subset = df[df[filter] == category]
            k = KMeans(n_clusters=clusters, random_state=0, verbose=False, max_iter=300).fit(
                np.column_stack((subset['lat'], subset['lon']))
            )
            joblib.dump(k, "kmeans/kmeans"+str(clusters)+"_"+str(filter)+"_"+str(category)+".joblib")
            kmeans.append(k)
    else:
        kmeans = KMeans(n_clusters=clusters, random_state=0, verbose=False, max_iter=300).fit(
            np.column_stack((df['lat'], df['lon']))
        )
        joblib.dump(kmeans, "kmeans/kmeans"+str(clusters)+"_"+str(filter)+".joblib")
    return kmeans

In [7]:
def predict_kmeans(df:pd.DataFrame, kmeans: KMeans, filter: str, clusters: int):
    """Write a 'cluster' column into `df` from fitted k-means model(s) and return `df`.

    When `filter` is truthy and names a column of `df`, `kmeans` is indexed by
    position: the i-th distinct value of ``df[filter].unique()`` is predicted
    with ``kmeans[i]``. Otherwise `kmeans` is used as one model for all rows.
    `clusters` is accepted but not read. `df` is modified in place.

    NOTE(review): the positional pairing only matches the models to the right
    categories if this frame lists its categories in the same order as the
    frame the models were fitted on. Confirm.
    """
    if not (filter and filter in df):
        df['cluster'] = kmeans.predict(np.column_stack((df['lat'], df['lon'])))
        return df

    for position, category in enumerate(df[filter].unique()):
        in_category = df[filter] == category
        members = df[in_category]
        coordinates = np.column_stack((members['lat'], members['lon']))
        df.loc[in_category, 'cluster'] = kmeans[position].predict(coordinates)
    return df

In [8]:
def plot_corr(data):
    """Draw a seaborn pair plot of revenue against location-related features.

    Uses 'revenue', 'dist_to_center', 'lat', 'lon' and a derived 'knutepunkt'
    column (row-wise sum of the three '* knutepunkt' columns), restricted to
    rows with dist_to_center below 70_000.

    Args:
        data: Frame holding the columns listed above.
    """
    # Copy the selection so adding a column does not write into a slice of `data`.
    df = data[['revenue', 'dist_to_center', 'lat', 'lon']].copy()
    df['knutepunkt'] = data[['Lokalt knutepunkt', 'Nasjonalt knutepunkt', 'Regionalt knutepunkt']].sum(axis=1)
    df = df[df.dist_to_center < 70_000]

    # pairplot is figure-level and creates its own figure, so the earlier
    # plt.figure(figsize=(15, 15)) call only left an empty figure behind.
    sns.pairplot(df)


# data_full =  pd.merge(X_train, y_train, left_index=True, right_index=True) 
# plot_corr(data_full)


In [95]:
def clear_buffers(X_train, y_train, X_val, y_val):
    """Empty the 'modeling' folder and write fresh XGBoost DMatrix buffers into it.

    Every regular file directly inside '<cwd>/modeling' is deleted, including
    previously saved models. Then train and validation DMatrix objects are built
    and saved as 'modeling/train.buffer' and 'modeling/test.buffer'.

    Returns:
        A tuple (dtrain, dvalid) of the two DMatrix objects.
    """
    folder = os.path.join(os.getcwd(), 'modeling')
    # Create the folder on first use; os.listdir raises if it does not exist.
    os.makedirs(folder, exist_ok=True)

    # Clear buffers
    for filename in os.listdir(folder):
        file_path = os.path.join(folder, filename)
        if os.path.isfile(file_path):
            os.unlink(file_path)
            print(f'Deleted file: {file_path}')

    train_buffer_path = 'modeling/train.buffer'
    test_buffer_path = 'modeling/test.buffer'

    dtrain = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=True)
    dtrain.save_binary(train_buffer_path)
    print(f'--> {train_buffer_path} created and saved.')

    dvalid = xgb.DMatrix(data=X_val, label=y_val, enable_categorical=True)
    dvalid.save_binary(test_buffer_path)
    print(f'--> {test_buffer_path} created and saved.')

    return dtrain, dvalid

In [96]:
# print(model.best_score_)
# y_pred_train = model.predict(X_train)
# y_pred_val = model.predict(X_val)
# print(rmsle(y_train, y_pred_train))
# print(rmsle(y_val, y_pred_val))

In [97]:
def train_xgb_model(X_train, y_train, X_val, y_val):
    """Run the random hyper-parameter search from k_fold and return its result object.

    Side effect: clear_buffers wipes the 'modeling' folder and rewrites the
    DMatrix buffers. The returned DMatrix objects are only used by the disabled
    xgb.train code below.

    Returns:
        The object returned by random_k_fold (it exposes best_params_ and best_score_).
    """
    # NOTE(review): this literal is overwritten by model.best_params_ below before it is read.
    params = {'colsample_bytree': 0.7717138210314867, 'learning_rate': 0.047506668950627134, 'max_depth': 8, 'min_child_weight': 3, 'n_estimators': 223, 'subsample': 0.9929036803032936}
    print('Clearing and creating buffers...')
    dtrain, dvalid = clear_buffers(X_train, y_train, X_val, y_val)
    
    rand_search_model = random_k_fold(X_train, y_train, verbose=1, n_iter=100)
    model = rand_search_model
    params = model.best_params_
    print(rand_search_model.best_score_, params)
    
    # params = {'colsample_bytree': 0.8601277878899238, 'eval_metric': 'rmsle', 'gamma': 0.12760202929262826, 'learning_rate': 0.07356461924449906, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 306, 'objective': 'reg:squaredlogerror', 'subsample': 0.8993341396761092}
    
    # Only relevant to the disabled xgb.train path; note that it writes into the
    # dict referenced by model.best_params_.
    params['disable_default_eval_metric'] = True
    # model = XGBRegressor()
    # model.set_params(**params)
    # model.fit(X_train, y_train)
    # y_pred_train = model.predict(X_train)
    # y_pred_val = model.predict(X_val)
    # print(rmsle(y_train, y_pred_train))
    # print(rmsle(y_val, y_pred_val))

    # num_round = 999
    # watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

    # print("Attempting to start training...")
    # model = xgb.train(
    #     params=params, 
    #     dtrain=dtrain, 
    #     num_boost_round=num_round, 
    #     evals=watchlist, 
    #     early_stopping_rounds=10, 
    #     verbose_eval=20)
    # print("--> model trained.")
    # print('Best score:', model.best_score)

    # print("Attempting to save model...")
    # model.save_model(model_name)
    # print("--> model saved.")

    return model


# X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=.8)
# X_train, X_val = generate_features(X_train, predictor='xgb'), generate_features(X_val, predictor='xgb')

# model = train_xgb_model(X_train, y_train, X_val, y_val)

In [98]:
# y_pred_train = model.predict(X_train)
# y_pred_val = model.predict(X_val)
# print(rmsle(y_train, y_pred_train))
# print(rmsle(y_val, y_pred_val))

In [99]:
def xgb_prediction(X_test, model):
    """Predict with a trained XGBoost booster and write the submission CSV.

    Args:
        X_test: Feature frame for the test stores.
        model: Booster trained with early stopping (must expose best_iteration).

    Side effects:
        Sets the 'predicted' column of the module-level `submission` frame and
        writes it to 'submissions/submission.csv'.

    NOTE(review): predictions are written as returned by the model; if the
    model was trained on log1p targets they still need np.expm1. Confirm.
    """
    dtest = xgb.DMatrix(data=X_test, enable_categorical=True)

    print("\nAttempting to start prediction...")
    # best_iteration is zero-based, so the best model uses trees [0, best_iteration].
    # The old ntree_limit=best_iteration dropped the last of those trees, and
    # ntree_limit itself is no longer accepted by recent XGBoost releases.
    y_pred = model.predict(dtest, iteration_range=(0, model.best_iteration + 1))
    print("--> Prediction finished.")

    print("\nAttempting to save prediction...")
    submission['predicted'] = np.array(y_pred)
    submission.to_csv('submissions/submission.csv', index=False)
    print("--> prediction saved with features as name in submission folder.")


# X_test = generate_features(test, predictor='xgb')
# xgb_prediction(X_test, model)

In [100]:
# xgb_model = model.best_estimator_ if model.best_estimator_ is not None else model
# xgb_model = model
# plot_importance(xgb_model)
# xgb.to_graphviz(xgb_model, num_trees=1)

### Prepare features for Catboost predictor

In [117]:
# K-means settings used for the cluster models fitted below.
filter = 'lv1_desc'
clusters = 10

# NOTE(review): `extra` is not defined in any visible cell, and this call omits
# the required `data_origin` argument of generate_features. Confirm the intended
# source frame and origin.
X_train_extra = pd.concat([train, extra])
X_train_extra = generate_features(X_train_extra, predictor='catboost')
kmeans = generate_kmeans(X_train_extra, clusters=clusters, filter=filter)

# Fresh, unseeded 80/20 split; it replaces any earlier X_train / X_val.
# NOTE(review): the final prediction cell applies np.expm1 to model output, so `y`
# is presumably expected on the log1p scale here. Confirm which cell last set `y`.
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=.8)
# X_train, y_train = clean(pd.merge(X_train, y_train, left_index=True, right_index=True))

X_train = generate_features(X_train, data_origin='train', predictor='catboost')
X_val = generate_features(X_val, data_origin='train', predictor='catboost')
X_test = generate_features(test, data_origin='test', predictor='catboost')

# auxillary_columns = ['address']
# Columns CatBoost should treat as free text and as categorical, respectively.
text_features = ['store_name', 'address', 'sales_channel_name'] 
cat_features = ['mall_name', 'chain_name', 'lv1_desc', 'lv2_desc']

# Dump the training features to disk for inspection.
X_train.to_csv('xtrain.csv', index=False)
train_pool = cb.Pool(
    X_train,
    y_train,
    cat_features=cat_features,
    text_features=text_features,
    feature_names=list(X_train)
)

valid_pool = cb.Pool(
    X_val,
    y_val,
    cat_features=cat_features,
    text_features=text_features,
    feature_names=list(X_train)
)

print(len(X_train), len(X_val))



 store_id                                    0
year                                        0
store_name                                  0
plaace_hierarchy_id                         0
sales_channel_name                          0
grunnkrets_id                               0
address                                  1424
lat                                         0
lon                                         0
chain_name                               7328
mall_name                                8437
grunnkrets_name                         10287
district_name                           10287
municipality_name                       10287
geometry                                10287
area_km2                                10287
age_0_19                                  609
age_20_39                                 609
age_40_59                                 609
age_60_79                                 609
age_80_90                                 609
income_all_households          

In [102]:
from catboost.utils import get_gpu_device_count

gpu_count = get_gpu_device_count()

# CatBoost settings shared by every trial and by the final model.
non_tunable_params = {
    'objective': 'RMSE',
    'eval_metric': 'RMSE',
    'task_type': 'GPU' if gpu_count else 'CPU',
    # CatBoost reads 'devices' as ':'-separated zero-based ids. The previous
    # f'0:{gpu_count}' therefore named one id too many (e.g. '0:1' with a single
    # GPU). List exactly the detected devices; fall back to '0' when none exist.
    'devices': ':'.join(str(device_id) for device_id in range(gpu_count)) or '0',
}

def objective(trial: optuna.Trial) -> float:
    """Optuna objective: fit CatBoost with sampled settings and return validation RMSLE.

    Each trial samples CatBoost parameters and k-means settings, refits the
    k-means models on the module-level `X_train_extra`, draws a new 80/20 split
    of the module-level `X`, `y`, builds features plus a 'cluster' column, and
    trains with early stopping.

    Args:
        trial: Optuna trial used to sample the parameters.

    Returns:
        rmsle between expm1 of the validation targets and expm1 of the
        predictions (lower is better).
    """
    tunable_params = {
        'depth': trial.suggest_int('depth', 4, 9),
        'boosting_type': trial.suggest_categorical('boosting_type', ['Ordered', 'Plain']),
        'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']),
        # 'iterations': trial.suggest_int('iterations', 1000, 2000),
        # 'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 2, 4),
    }

    kmeans_param = {
        'clusters': trial.suggest_int('clusters', 10, 50),
        'filter': trial.suggest_categorical('filter', ['lv1_desc', 'lv2_desc', False])
    }

    # The sampling parameter that applies depends on the chosen bootstrap type.
    if tunable_params['bootstrap_type'] == 'Bayesian':
        tunable_params['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0, 10)
    elif tunable_params['bootstrap_type'] == 'Bernoulli':
        tunable_params['subsample'] = trial.suggest_float('subsample', 0.1, 1, log=True)

    kmeans = generate_kmeans(X_train_extra, **kmeans_param)
    # NOTE(review): the split is unseeded, so trials are scored on different
    # validation sets. Confirm this is intended.
    X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=.8)
    # generate_features requires data_origin; both parts come from the training stores.
    X_train = generate_features(X_train, data_origin='train', predictor='catboost')
    X_val = generate_features(X_val, data_origin='train', predictor='catboost')
    X_train = predict_kmeans(X_train, kmeans, **kmeans_param)
    X_val = predict_kmeans(X_val, kmeans, **kmeans_param)

    train_pool = cb.Pool(
        X_train,
        y_train,
        cat_features=cat_features,
        text_features=text_features,
        feature_names=list(X_train)
    )

    valid_pool = cb.Pool(
        X_val,
        y_val,
        cat_features=cat_features,
        text_features=text_features,
        feature_names=list(X_train)
    )

    # A leftover duplicate block that read an undefined `param` dict and built a
    # throwaway regressor used to sit here; it raised NameError and is removed.
    cbr = cb.CatBoostRegressor(**non_tunable_params, **tunable_params)
    cbr.fit(
        train_pool,
        # Use the pool built above so evaluation carries the same cat/text feature spec.
        eval_set=valid_pool,
        verbose=True,
        early_stopping_rounds=50,
    )

    y_pred = cbr.predict(valid_pool)
    # Targets are expected on the log1p scale; score on the original scale.
    score = rmsle(np.expm1(y_val), np.expm1(y_pred))

    return score


def tuned_hyperparameters():
    """Run the 'catboost-tuning' Optuna study and return the best trial's parameters.

    The study minimises `objective` for at most 100 trials or 900 seconds,
    prints a short summary, and returns the best trial's params dict.
    """
    median_pruner = optuna.pruners.MedianPruner(n_warmup_steps=5)
    study = optuna.create_study(
        direction='minimize',
        pruner=median_pruner,
        study_name='catboost-tuning',
    )
    study.optimize(objective, n_trials=100, timeout=900, show_progress_bar=True)

    print(f'Number of finished trials: {len(study.trials)}')

    print('Best trial:')
    best = study.best_trial

    print('Value:', best.value)
    print('Params:')
    print(best.params)

    return best.params


# Runs the Optuna study (up to 100 trials or 900 s). To skip tuning, comment this
# line out and use the recorded parameter set on the next line instead.
tuned_params = tuned_hyperparameters()
# tuned_params = {'depth': 9, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.7494756089749968}


[32m[I 2022-11-09 13:40:52,812][0m A new study created in memory with name: catboost-tuning[0m


  0%|          | 0/100 [00:00<?, ?it/s]

[32m[I 2022-11-09 13:41:18,283][0m Trial 0 finished with value: 0.726526253877934 and parameters: {'depth': 9, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.3982817713250909}. Best is trial 0 with value: 0.726526253877934.[0m
[32m[I 2022-11-09 13:41:20,352][0m Trial 1 finished with value: 0.7501095022059352 and parameters: {'depth': 6, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 0 with value: 0.726526253877934.[0m
[32m[I 2022-11-09 13:41:24,963][0m Trial 2 finished with value: 0.7736112551215502 and parameters: {'depth': 5, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 8.117373101773806}. Best is trial 0 with value: 0.726526253877934.[0m
[32m[I 2022-11-09 13:41:29,446][0m Trial 3 finished with value: 0.7282037013062744 and parameters: {'depth': 9, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.9849527582125648}. Best is trial 0 with value: 0.726526253877934.[0m
[32m[

In [103]:
# Slower, but due to an issue with Catboost, training on the CPU often yields a better result than on the GPU 
non_tunable_params['task_type'] = 'CPU'

# The Optuna study also samples the k-means settings 'clusters' and 'filter'.
# They are not CatBoostRegressor arguments and would raise TypeError if forwarded.
catboost_tuned_params = {
    name: value
    for name, value in tuned_params.items()
    if name not in ('clusters', 'filter')
}

model = cb.CatBoostRegressor(**non_tunable_params, **catboost_tuned_params, iterations=1000)
model.fit(train_pool, eval_set=valid_pool, verbose=50, plot=True)

# Predictions are undone with expm1 before they are written to the submission file.
y_pred = np.expm1(model.predict(X_test))
submission['predicted'] = y_pred
submission.to_csv('submissions/submission.csv', index=False)

# model.save_model(f'models/{model.best_score_}')

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.073422
0:	learn: 1.0040598	test: 0.9895524	best: 0.9895524 (0)	total: 21.9ms	remaining: 21.9s
50:	learn: 0.7651243	test: 0.7574132	best: 0.7574132 (50)	total: 1.1s	remaining: 20.5s
100:	learn: 0.7463352	test: 0.7454019	best: 0.7454019 (100)	total: 2.09s	remaining: 18.6s
150:	learn: 0.7324013	test: 0.7385452	best: 0.7385452 (150)	total: 3.07s	remaining: 17.3s
200:	learn: 0.7199191	test: 0.7348295	best: 0.7348295 (200)	total: 4.07s	remaining: 16.2s
250:	learn: 0.7109668	test: 0.7330781	best: 0.7330781 (250)	total: 5.05s	remaining: 15.1s
300:	learn: 0.7036817	test: 0.7321571	best: 0.7321332 (291)	total: 6.03s	remaining: 14s
350:	learn: 0.6964346	test: 0.7303929	best: 0.7303607 (349)	total: 7.1s	remaining: 13.1s
400:	learn: 0.6914414	test: 0.7292969	best: 0.7292529 (397)	total: 8.18s	remaining: 12.2s
450:	learn: 0.6861766	test: 0.7283429	best: 0.7283367 (449)	total: 9.24s	remaining: 11.2s
500:	learn: 0.6812082	test: 0.7275097	best: 0.7275097 (500)	total: 10.3s	remain

In [104]:
# loaded_model = cb.CatBoostRegressor(**non_tunable_params, **tuned_params, iterations=1000).load_model('models/070034')
# y_pred = np.expm1(loaded_model.predict(X_val))
# rmsle(np.expm1(y_val), y_pred)
