In [365]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [366]:
import sys
!{sys.executable} -m pip install -r requirements.txt



In [367]:
%autoreload

import warnings
import os.path
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import geopy
import xgboost as xgb
import os
import shutil
import geopandas as gpd
import catboost as cb
import optuna

from xgboost import XGBRegressor, plot_importance, to_graphviz, plot_tree
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold
from k_fold import random_k_fold
from shapely import wkt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from utils import squared_log, rmsle_xgb, add_city_centre_dist, group_ages, to_categorical, nan_to_string, object_encoder
from k_fold import random_k_fold, xgb_cross_validation
from objectives_and_metrics import rmsle, RmsleMetric, RmsleObjective, LogTargetsRmsleMetric, RmseObjective
from scipy.stats import uniform, randint

warnings.filterwarnings('ignore')
pd.options.mode.chained_assignment = None  # default='warn'

spatial = pd.read_csv('data/grunnkrets_norway_stripped.csv')
age = pd.read_csv('data/grunnkrets_age_distribution.csv')
income = pd.read_csv('data/grunnkrets_income_households.csv').set_index(['grunnkrets_id', 'year']).add_prefix('income_').reset_index()
households = pd.read_csv('data/grunnkrets_households_num_persons.csv')
submission = pd.read_csv('data/sample_submission.csv')
plaace = pd.read_csv('data/plaace_hierarchy.csv')
busstops = pd.read_csv('data/busstops_norway.csv')


train = pd.read_csv('data/stores_train.csv')
test = pd.read_csv('data/stores_test.csv') 


submission = pd.read_csv('data/sample_submission.csv')
model_name = "modeling/0002.model"

In [368]:
def clean(df: pd.DataFrame, min_val=0, max_val=100):
    print('Length of data frame:', len(df))
    df = df[(df.revenue > min_val) & (df.revenue < max_val)]
    print('Length after removing extreme values and zero revenue retail stores:',  len(df))
    plt.hist(np.log1p(train.revenue), 30)
    plt.show()
    return df.drop(columns=['revenue']), df.revenue


label_name = 'revenue'
X = train.drop(columns=[label_name])
y = np.log1p(train[label_name])


In [369]:
def generate_features(df: pd.DataFrame, predictor: str = ''):
    age_ranges = [
        (0, 19),
        (20, 39),
        (40, 59),
        (60, 79),
        (80, 90),
    ]
    
    # Define datasets to be merged
    spatial_merge = spatial.drop(columns=['year']).drop_duplicates(subset=['grunnkrets_id'])
    age_groups_merge = group_ages(age, age_ranges)
    income_merge = income.drop(columns=['year']).drop_duplicates(subset='grunnkrets_id')
    households_merge = households.drop(columns=['year']).drop_duplicates(subset='grunnkrets_id')
    plaace_merge = plaace.drop_duplicates(subset='plaace_hierarchy_id')
    bus_data_train_merge = gpd.read_parquet('derived_data/stores_bus_stops_lt_1km_train').drop(columns=['geometry'])
    stores_vicinity_merge = gpd.read_parquet('derived_data/stores_count_lt_1km_train').drop(columns=['geometry'])

    # Merge datasets
    df = df.merge(spatial_merge, on='grunnkrets_id', how='left')
    df = df.merge(age_groups_merge, on='grunnkrets_id', how='left')
    df = df.merge(income_merge, on='grunnkrets_id', how='left')
    df = df.merge(households_merge, on='grunnkrets_id', how='left')
    df = df.merge(plaace_merge, how='left')
    df = df.merge(bus_data_train_merge, on='store_id', how='left')
    df = df.merge(stores_vicinity_merge, on='store_id', how='left')
    df = add_city_centre_dist(df).drop(columns=['lon_center', 'lat_center'])

    # Transformations
    df.stores_count_lt_1km = df.stores_count_lt_1km.apply(np.log)

    # Handle categories for different predictors
    if predictor == 'xgb':
        # df = to_categorical(df)
        df = object_encoder(df)
    elif predictor == 'catboost':
        df = nan_to_string(df)
    else: 
        raise ValueError('Invalid predictor')

    features = [
        'store_name', 
        'mall_name', 
        'chain_name',
        # 'address', 
        'lat', 'lon',
        
        *age_groups_merge.drop(columns=['grunnkrets_id']).columns,
        *income_merge.drop(columns=['grunnkrets_id']).columns,
        *households_merge.drop(columns=['grunnkrets_id']).columns,
        'lv1_desc', 'lv2_desc', # 'sales_channel_name', 
        *bus_data_train_merge.drop(columns=['store_id']).columns,
        *stores_vicinity_merge.drop(columns=['store_id']).columns,
        'dist_to_center'
    ]

    return df[features]

In [370]:
def plot_corr(data):
  df = data[['revenue', 
    # 'age_0_19', 'age_20_39', 'age_40_59', 'age_60_79', 'age_80_90', 
    # 'bus_stops_count', 'Mangler viktighetsnivå', 'Standard holdeplass', 'Lokalt knutepunkt', 'Nasjonalt knutepunkt', 'Regionalt knutepunkt', 'Annen viktig holdeplass', 
    'dist_to_center', 'lat','lon'
    ]]
  df['knutepunkt'] = data[['Lokalt knutepunkt', 'Nasjonalt knutepunkt', 'Regionalt knutepunkt']].sum(axis=1)
  # df.revenue = np.exp(df.revenue)
  # df.bus_stops_count = np.sqrt(df.bus_stops_count)
  df = df[df.dist_to_center < 70_000]
  # df.dist_to_center = np.log(df.dist_to_center)
  
  plt.figure(figsize=(15, 15))
  pairplot = sns.pairplot(df)
  # heatmap = sns.heatmap(df.corr(), vmin=-1, vmax=1, annot=True)


# data_full =  pd.merge(X_train, y_train, left_index=True, right_index=True) 
# plot_corr(data_full)


In [371]:
def clear_buffers(X_train, y_train, X_val, y_val):
    # Clear buffers
    folder = os.path.join(os.getcwd(), 'modeling')

    for filename in os.listdir(folder):
        file_path = os.path.join(folder, filename)
        if os.path.isfile(file_path):
            os.unlink(file_path)
            print(f'Deleted file: {file_path}')

    train_buffer_path = 'modeling/train.buffer'
    test_buffer_path = 'modeling/test.buffer'

    dtrain = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=True)
    dtrain.save_binary(train_buffer_path)
    print(f'--> {train_buffer_path} created and saved.')

    dvalid = xgb.DMatrix(data=X_val, label=y_val, enable_categorical=True)
    dvalid.save_binary(test_buffer_path)
    print(f'--> {test_buffer_path} created and saved.')

    return dtrain, dvalid

In [372]:
# print(model.best_score_)
# y_pred_train = model.predict(X_train)
# y_pred_val = model.predict(X_val)
# print(rmsle(y_train, y_pred_train))
# print(rmsle(y_val, y_pred_val))

In [373]:
def train_xgb_model(X_train, y_train, X_val, y_val):
    params = {'colsample_bytree': 0.7717138210314867, 'learning_rate': 0.047506668950627134, 'max_depth': 8, 'min_child_weight': 3, 'n_estimators': 223, 'subsample': 0.9929036803032936}
    print('Clearing and creating buffers...')
    dtrain, dvalid = clear_buffers(X_train, y_train, X_val, y_val)
    
    rand_search_model = random_k_fold(X_train, y_train, verbose=1, n_iter=100)
    model = rand_search_model
    params = model.best_params_
    print(rand_search_model.best_score_, params)
    
    # params = {'colsample_bytree': 0.8601277878899238, 'eval_metric': 'rmsle', 'gamma': 0.12760202929262826, 'learning_rate': 0.07356461924449906, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 306, 'objective': 'reg:squaredlogerror', 'subsample': 0.8993341396761092}
    
    params['disable_default_eval_metric'] = True
    # model = XGBRegressor()
    # model.set_params(**params)
    # model.fit(X_train, y_train)
    # y_pred_train = model.predict(X_train)
    # y_pred_val = model.predict(X_val)
    # print(rmsle(y_train, y_pred_train))
    # print(rmsle(y_val, y_pred_val))

    # num_round = 999
    # watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

    # print("Attempting to start training...")
    # model = xgb.train(
    #     params=params, 
    #     dtrain=dtrain, 
    #     num_boost_round=num_round, 
    #     evals=watchlist, 
    #     early_stopping_rounds=10, 
    #     verbose_eval=20)
    # print("--> model trained.")
    # print('Best score:', model.best_score)

    # print("Attempting to save model...")
    # model.save_model(model_name)
    # print("--> model saved.")

    return model


# X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=.8)
# X_train, X_val = generate_features(X_train, predictor='xgb'), generate_features(X_val, predictor='xgb')

# model = train_xgb_model(X_train, y_train, X_val, y_val)

In [374]:
# y_pred_train = model.predict(X_train)
# y_pred_val = model.predict(X_val)
# print(rmsle(y_train, y_pred_train))
# print(rmsle(y_val, y_pred_val))

In [375]:
def xgb_prediction(X_test, model):
    dtest = xgb.DMatrix(data=X_test, enable_categorical=True)

    print("\nAttempting to start prediction...")
    y_pred = model.predict(dtest, ntree_limit=model.best_iteration)
    print("--> Prediction finished.")

    print("\nAttempting to save prediction...")
    submission['predicted'] = np.array(y_pred)
    submission.to_csv('submissions/submission.csv', index=False)
    print("--> prediction saved with features as name in submission folder.")


# X_test = generate_features(test, predictor='xgb')
# xgb_prediction(X_test, model)

In [376]:
# xgb_model = model.best_estimator_ if model.best_estimator_ is not None else model
# xgb_model = model
# plot_importance(xgb_model)
# xgb.to_graphviz(xgb_model, num_trees=1)

### Prepare features for Catboost predictor

In [377]:
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=.8)
# X_train, y_train = clean(pd.merge(X_train, y_train, left_index=True, right_index=True))

X_train, X_val = generate_features(X_train, predictor='catboost'), generate_features(X_val, predictor='catboost')
X_test = generate_features(test, predictor='catboost')

# auxillary_columns = ['address']
text_features = ['store_name', 'address']
cat_features = ['mall_name', 'chain_name', 'lv1_desc', 'lv2_desc']

train_pool = cb.Pool(
    X_train,
    y_train,
    cat_features=cat_features,
    text_features=text_features,
    feature_names=list(X_train)
)

valid_pool = cb.Pool(
    X_val,
    y_val,
    cat_features=cat_features,
    text_features=text_features,
    feature_names=list(X_train)
)

In [382]:
from catboost.utils import get_gpu_device_count

gpu_count = get_gpu_device_count()

non_tunable_params = {
    'objective': 'RMSE',
    'eval_metric': 'RMSE',
    'task_type': 'GPU' if gpu_count else 'CPU', 
    'devices': f'0:{gpu_count}'
}

def objective(trial: optuna.Trial) -> float:
    tunable_params = {
        'depth': trial.suggest_int('depth', 4, 9),
        'boosting_type': trial.suggest_categorical('boosting_type', ['Ordered', 'Plain']),
        'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']),
        # 'iterations': trial.suggest_int('iterations', 1000, 2000),
        # 'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 2, 4),
    }

    if tunable_params['bootstrap_type'] == 'Bayesian': 
        tunable_params['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0, 10)
    elif tunable_params['bootstrap_type'] == 'Bernoulli':
        tunable_params['subsample'] = trial.suggest_float('subsample', 0.1, 1, log=True)

    cbr = cb.CatBoostRegressor(**non_tunable_params, **tunable_params) 
    cbr.fit(
        train_pool,
        eval_set=[(X_val, y_val)],
        verbose=0,
        early_stopping_rounds=50,
    )

    y_pred = cbr.predict(X_val)
    score = rmsle(np.expm1(y_val), np.expm1(y_pred))

    return score


def tuned_hyperparameters():
    study = optuna.create_study(
        study_name='catboost-tuning',
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=5), 
        direction='minimize'
    )
    study.optimize(objective, n_trials=100, timeout=900, show_progress_bar=True) 

    print('Number of finished trials: {}'.format(len(study.trials)))

    print('Best trial:')
    trial = study.best_trial

    print('Value:', trial.value)
    print('Params:')
    print(trial.params)

    return trial.params


tuned_params = tuned_hyperparameters()
# tuned_params = {'depth': 9, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.7494756089749968}


[32m[I 2022-11-08 13:47:54,152][0m A new study created in memory with name: catboost-tuning[0m


  0%|          | 0/100 [00:00<?, ?it/s]

[32m[I 2022-11-08 13:48:01,040][0m Trial 0 finished with value: 0.7585999404434786 and parameters: {'depth': 4, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.7161555362368563}. Best is trial 0 with value: 0.7585999404434786.[0m
[32m[I 2022-11-08 13:48:02,897][0m Trial 1 finished with value: 0.7763539411493736 and parameters: {'depth': 5, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 0 with value: 0.7585999404434786.[0m
[32m[I 2022-11-08 13:48:12,865][0m Trial 2 finished with value: 0.7592575016730437 and parameters: {'depth': 8, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.7509522044690908}. Best is trial 0 with value: 0.7585999404434786.[0m
[32m[I 2022-11-08 13:48:15,784][0m Trial 3 finished with value: 0.7750021957784125 and parameters: {'depth': 8, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 0 with value: 0.7585999404434786.[0m
[32m[I 2022-11-08 13:48:24,

KeyboardInterrupt: 

In [379]:
model = cb.CatBoostRegressor(**non_tunable_params, **tuned_params, iterations=324)
model.fit(train_pool, eval_set=valid_pool, verbose=50, plot=True)

y_pred = np.expm1(model.predict(X_test))
submission['predicted'] = y_pred
submission.to_csv('submissions/submission.csv', index=False)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.110137
0:	learn: 0.9778262	test: 0.9892557	best: 0.9892557 (0)	total: 20.3ms	remaining: 6.55s
50:	learn: 0.7182412	test: 0.7672415	best: 0.7672415 (50)	total: 707ms	remaining: 3.79s
100:	learn: 0.6889272	test: 0.7611773	best: 0.7611773 (100)	total: 1.24s	remaining: 2.74s
150:	learn: 0.6708856	test: 0.7588583	best: 0.7588583 (150)	total: 1.67s	remaining: 1.92s
200:	learn: 0.6544238	test: 0.7586469	best: 0.7582753 (172)	total: 2.12s	remaining: 1.3s
250:	learn: 0.6407412	test: 0.7580009	best: 0.7579397 (236)	total: 2.58s	remaining: 751ms
300:	learn: 0.6313647	test: 0.7572846	best: 0.7571980 (296)	total: 3.03s	remaining: 232ms
323:	learn: 0.6245028	test: 0.7569111	best: 0.7568690 (311)	total: 3.26s	remaining: 0us
bestTest = 0.7568689977
bestIteration = 311
Shrink model to first 312 iterations.


In [380]:
y_pred = np.expm1(model.predict(X_val))
print(rmsle(np.expm1(y_val), y_pred))

0.7568689626865944


In [381]:
y_pred = np.expm1(model.predict(X_val))
print(rmsle(np.expm1(y_val), y_pred))

best_model = model.ge
# y_pred_best = np.expm1(best_model.pre)
best_model

0.7568689626865944


AttributeError: 'CatBoostRegressor' object has no attribute 'ge'