In [1]:
%load_ext autoreload

In [2]:
import sys
!{sys.executable} -m pip install -r requirements_nogeo.txt

Defaulting to user installation because normal site-packages is not writeable


In [3]:
%autoreload

import warnings
import os.path
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
# import geopy
import xgboost as xgb
import os
import shutil
# import geopandas as gpd
import catboost as cb
import optuna
from pyproj import Geod
import joblib

from xgboost import XGBRegressor, plot_importance, to_graphviz, plot_tree
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold
from sklearn.cluster import KMeans
from k_fold import random_k_fold
from shapely import wkt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from utils import squared_log, rmsle_xgb, add_city_centre_dist, group_ages, to_categorical, nan_to_string, object_encoder
from k_fold import random_k_fold, xgb_cross_validation
from objectives_and_metrics import _rmsle, RmsleMetric, RmsleObjective, LogTargetsRmsleMetric, RmseObjective
from scipy.stats import uniform, randint

warnings.filterwarnings('ignore')
pd.options.mode.chained_assignment = None  # default='warn'

spatial = pd.read_csv('../data/grunnkrets_norway_stripped.csv')
age = pd.read_csv('../data/grunnkrets_age_distribution.csv')
income = pd.read_csv('../data/grunnkrets_income_households.csv').set_index(['grunnkrets_id', 'year']).add_prefix('income_').reset_index()
households = pd.read_csv('../data/grunnkrets_households_num_persons.csv')
submission = pd.read_csv('../data/sample_submission.csv')
plaace = pd.read_csv('../data/plaace_hierarchy.csv')
busstops = pd.read_csv('../data/busstops_norway.csv')

train = pd.read_csv('../data/stores_train.csv')
extra = pd.read_csv('../data/stores_extra.csv')
test = pd.read_csv('../data/stores_test.csv')

submission = pd.read_csv('../data/sample_submission.csv')
model_name = "modeling/0002.model"


In [4]:
def generate_features(df: pd.DataFrame, predictor: str = ''):
    age_ranges = [
        (0, 19),
        (20, 39),
        (40, 59),
        (60, 79),
        (80, 90),
    ]
    
    # Define datasets to be merged
    spatial_merge = spatial.drop(columns=['year']).drop_duplicates(subset=['grunnkrets_id'])
    age_groups_merge = group_ages(age, age_ranges)
    income_merge = income.drop(columns=['year']).drop_duplicates(subset='grunnkrets_id')
    households_merge = households.drop(columns=['year']).drop_duplicates(subset='grunnkrets_id')
    plaace_merge = plaace.drop_duplicates(subset='plaace_hierarchy_id')
    bus_data_train_merge = pd.read_parquet('derived_data/stores_bus_stops_lt_1km_train').drop(columns=['geometry'])

    # Merge datasets
    df = df.merge(spatial_merge, on='grunnkrets_id', how='left')
    df = df.merge(age_groups_merge, on='grunnkrets_id', how='left')
    df = df.merge(income_merge, on='grunnkrets_id', how='left')
    df = df.merge(households_merge, on='grunnkrets_id', how='left')
    df = df.merge(plaace_merge, how='left')
    df = df.merge(bus_data_train_merge, on='store_id', how='left')
    # df = add_city_centre_dist(df).drop(columns=['lon_center', 'lat_center'])
    
    # Handle categories for different predictors
    if predictor == 'xgb':
        # df = to_categorical(df)
        df = object_encoder(df)
    elif predictor == 'catboost':
        df = nan_to_string(df)
    else: 
        raise ValueError('Invalid predictor')

    features = [
        'store_name', 
        'mall_name', 
        'chain_name',
        'address', 
        'lat', 'lon',
        
        *age_groups_merge.drop(columns=['grunnkrets_id']).columns,
        *income_merge.drop(columns=['grunnkrets_id']).columns,
        *households_merge.drop(columns=['grunnkrets_id']).columns,
        'lv1_desc', 'lv2_desc',
        *bus_data_train_merge.drop(columns=['store_id']).columns
    ]

    return df[features]


label_name = 'revenue'
X = train.drop(columns=[label_name])
y = np.log1p(train[label_name])


In [5]:
def generate_kmeans(df: pd.DataFrame, clusters: int, filter: str):
    if filter and filter in df:
        data = []
        for column in df[filter].unique():
            data.append(df[df[filter] == column])
        kmeans = []
        for i in range(len(data)):
            k = KMeans(n_clusters=clusters, random_state=0, verbose=False, max_iter=300).fit(np.column_stack((data[i]['lat'], data[i]['lon'])))
            joblib.dump(k, "kmeans/kmeans"+str(clusters)+"_"+str(filter)+"_"+str(df[filter].unique()[i])+".joblib")
            kmeans.append(k)
    else: 
        kmeans = KMeans(n_clusters=clusters, random_state=0, verbose=False, max_iter=300).fit(np.column_stack((df['lat'], df['lon'])))
        joblib.dump(kmeans, "kmeans/kmeans"+str(clusters)+"_"+str(filter)+".joblib")
    return kmeans

In [6]:
def predict_kmeans(df:pd.DataFrame, kmeans: KMeans, filter: str, clusters: int):
    if filter and filter in df:
        columns = df[filter].unique()
        for i in range(len(df[filter].unique())):
            column = columns[i]
            df.loc[df[filter]== column, 'cluster'] = kmeans[i].predict(np.column_stack((df[df[filter] == column]['lat'], df[df[filter] == column]['lon'])))
            #df[df[filter] == column]['cluster'] = kmeans[i].predict(np.column_stack((df[df[filter] == column]['lat'], df[df[filter] == column]['lon'])))
    else:
        df['cluster'] = kmeans.predict(np.column_stack((df['lat'], df['lon'])))
    return df

In [7]:
def plot_corr(data):
  df = data[['revenue', 
    # 'age_0_19', 'age_20_39', 'age_40_59', 'age_60_79', 'age_80_90', 
    # 'bus_stops_count', 'Mangler viktighetsnivå', 'Standard holdeplass', 'Lokalt knutepunkt', 'Nasjonalt knutepunkt', 'Regionalt knutepunkt', 'Annen viktig holdeplass', 
    'dist_to_center', 'lat','lon'
    ]]
  df['knutepunkt'] = data[['Lokalt knutepunkt', 'Nasjonalt knutepunkt', 'Regionalt knutepunkt']].sum(axis=1)
  # df.revenue = np.exp(df.revenue)
  # df.bus_stops_count = np.sqrt(df.bus_stops_count)
  df = df[df.dist_to_center < 70_000]
  # df.dist_to_center = np.log(df.dist_to_center)
  
  plt.figure(figsize=(15, 15))
  pairplot = sns.pairplot(df)
  # heatmap = sns.heatmap(df.corr(), vmin=-1, vmax=1, annot=True)


# data_full =  pd.merge(X_train, y_train, left_index=True, right_index=True) 
# plot_corr(data_full)


In [8]:
def clear_buffers(X_train, y_train, X_val, y_val):
    # Clear buffers
    folder = os.path.join(os.getcwd(), 'modeling')

    for filename in os.listdir(folder):
        file_path = os.path.join(folder, filename)
        if os.path.isfile(file_path):
            os.unlink(file_path)
            print(f'Deleted file: {file_path}')

    train_buffer_path = 'modeling/train.buffer'
    test_buffer_path = 'modeling/test.buffer'

    dtrain = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=True)
    dtrain.save_binary(train_buffer_path)
    print(f'--> {train_buffer_path} created and saved.')

    dvalid = xgb.DMatrix(data=X_val, label=y_val, enable_categorical=True)
    dvalid.save_binary(test_buffer_path)
    print(f'--> {test_buffer_path} created and saved.')

    return dtrain, dvalid

In [9]:
# print(model.best_score_)
# y_pred_train = model.predict(X_train)
# y_pred_val = model.predict(X_val)
# print(_rmsle(y_train, y_pred_train))
# print(_rmsle(y_val, y_pred_val))

In [10]:
def train_xgb_model(X_train, y_train, X_val, y_val):
    params = {'colsample_bytree': 0.7717138210314867, 'learning_rate': 0.047506668950627134, 'max_depth': 8, 'min_child_weight': 3, 'n_estimators': 223, 'subsample': 0.9929036803032936}
    print('Clearing and creating buffers...')
    dtrain, dvalid = clear_buffers(X_train, y_train, X_val, y_val)
    
    rand_search_model = random_k_fold(X_train, y_train, verbose=1, n_iter=100)
    model = rand_search_model
    params = model.best_params_
    print(rand_search_model.best_score_, params)
    
    # params = {'colsample_bytree': 0.8601277878899238, 'eval_metric': 'rmsle', 'gamma': 0.12760202929262826, 'learning_rate': 0.07356461924449906, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 306, 'objective': 'reg:squaredlogerror', 'subsample': 0.8993341396761092}
    
    params['disable_default_eval_metric'] = True
    # model = XGBRegressor()
    # model.set_params(**params)
    # model.fit(X_train, y_train)
    # y_pred_train = model.predict(X_train)
    # y_pred_val = model.predict(X_val)
    # print(_rmsle(y_train, y_pred_train))
    # print(_rmsle(y_val, y_pred_val))

    # num_round = 999
    # watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

    # print("Attempting to start training...")
    # model = xgb.train(
    #     params=params, 
    #     dtrain=dtrain, 
    #     num_boost_round=num_round, 
    #     evals=watchlist, 
    #     early_stopping_rounds=10, 
    #     verbose_eval=20)
    # print("--> model trained.")
    # print('Best score:', model.best_score)

    # print("Attempting to save model...")
    # model.save_model(model_name)
    # print("--> model saved.")

    return model


# X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=.8)
# X_train, X_val = generate_features(X_train, predictor='xgb'), generate_features(X_val, predictor='xgb')

# model = train_xgb_model(X_train, y_train, X_val, y_val)

In [11]:
# y_pred_train = model.predict(X_train)
# y_pred_val = model.predict(X_val)
# print(_rmsle(y_train, y_pred_train))
# print(_rmsle(y_val, y_pred_val))

In [12]:
def xgb_prediction(X_test, model):
    dtest = xgb.DMatrix(data=X_test, enable_categorical=True)

    print("\nAttempting to start prediction...")
    y_pred = model.predict(dtest, ntree_limit=model.best_iteration)
    print("--> Prediction finished.")

    print("\nAttempting to save prediction...")
    submission['predicted'] = np.array(y_pred)
    submission.to_csv('submissions/submission.csv', index=False)
    print("--> prediction saved with features as name in submission folder.")


# X_test = generate_features(test, predictor='xgb')
# xgb_prediction(X_test, model)

In [13]:
# xgb_model = model.best_estimator_ if model.best_estimator_ is not None else model
# xgb_model = model
# plot_importance(xgb_model)
# xgb.to_graphviz(xgb_model, num_trees=1)

### Prepare features for Catboost predictor

In [14]:
filter = 'lv1_desc'
X_train_extra = pd.concat([train, extra])
X_train_extra = generate_features(X_train_extra, predictor='catboost')
kmeans = generate_kmeans(X_train_extra, clusters=110, filter=filter)

X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=.8)
X_train, X_val = generate_features(X_train, predictor='catboost'), generate_features(X_val, predictor='catboost')
X_train, X_val = predict_kmeans(X_train, kmeans, filter, 0), predict_kmeans(X_val, kmeans, filter, 0)

X_test = generate_features(test, predictor='catboost')

auxillary_columns = ['address']
text_features = ['store_name', 'address']
cat_features = ['mall_name', 'chain_name', 'lv1_desc', 'lv2_desc']

train_pool = cb.Pool(
    X_train,
    y_train,
    cat_features=cat_features,
    text_features=text_features,
    feature_names=list(X_train)
)

valid_pool = cb.Pool(
    X_val,
    y_val,
    cat_features=cat_features,
    text_features=text_features,
    feature_names=list(X_train)
)

In [15]:
def objective(trial: optuna.Trial) -> float:
    param = {
        # 'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.01, 0.1, log=True),
        'depth': trial.suggest_int('depth', 3, 9),
        'boosting_type': trial.suggest_categorical('boosting_type', ['Ordered', 'Plain']),
        'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']),
        'iterations': 1000,
        # 'objective': RmseObjective(),
        # 'eval_metric': LogTargetsRmsleMetric(),
        'objective': 'RMSE',
        'eval_metric': 'RMSE'
    }

    kmeans_param = {
        'clusters': trial.suggest_int('clusters', 10, 50),
        'filter': trial.suggest_categorical('filter', ['lv1_desc', 'lv2_desc', False])
    }
    
    kmeans = generate_kmeans(X_train_extra, **kmeans_param)
    X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=.8)
    X_train, X_val = generate_features(X_train, predictor='catboost'), generate_features(X_val, predictor='catboost')
    X_train, X_val = predict_kmeans(X_train, kmeans, **kmeans_param), predict_kmeans(X_val, kmeans, **kmeans_param)
    y_train, 
    train_pool = cb.Pool(
        X_train,
        y_train,
        cat_features=cat_features,
        text_features=text_features,
        feature_names=list(X_train)
    )

    valid_pool = cb.Pool(
        X_val,
        y_val,
        cat_features=cat_features,
        text_features=text_features,
        feature_names=list(X_train)
    )

    if param['bootstrap_type'] == 'Bayesian': 
        param['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0, 10)
    elif param['bootstrap_type'] == 'Bernoulli':
        param['subsample'] = trial.suggest_float('subsample', 0.1, 1, log=True)

    cbr = cb.CatBoostRegressor(**param, task_type='CPU', devices='0:1')
    
    # pruning_callback = optuna.integration.CatBoostPruningCallback(trial, 'LogTargetsRmsleMetric')
    cbr.fit(
        train_pool,
        eval_set=[(X_val, y_val)],
        verbose=True,
        early_stopping_rounds=50,
        # callbacks=[pruning_callback]
    )

    # pruning_callback.check_pruned()

    y_pred = np.expm1(cbr.predict(X_val))
    score = _rmsle(np.expm1(y_val), y_pred)[1]

    # score = np.sqrt(mean_squared_log_error(np.expm1(y_val), y_pred))

    return score


study = optuna.create_study(pruner=optuna.pruners.MedianPruner(n_warmup_steps=5), direction='minimize')
study.optimize(objective, n_trials=100) # timeout=600

print('Number of finished trials: {}'.format(len(study.trials)))

print('Best trial:')
trial = study.best_trial

print('Value:', trial.value)
print('Params:')
print(trial.params)

[32m[I 2022-11-08 18:05:51,697][0m A new study created in memory with name: no-name-d7a69c0f-2d6c-4c1c-a390-974264871b61[0m


Learning rate set to 0.073422
0:	learn: 0.9980261	test: 0.9980990	best: 0.9980990 (0)	total: 117ms	remaining: 1m 56s
1:	learn: 0.9741560	test: 0.9738523	best: 0.9738523 (1)	total: 165ms	remaining: 1m 22s
2:	learn: 0.9559006	test: 0.9531556	best: 0.9531556 (2)	total: 217ms	remaining: 1m 12s
3:	learn: 0.9391284	test: 0.9360453	best: 0.9360453 (3)	total: 277ms	remaining: 1m 8s
4:	learn: 0.9254616	test: 0.9198258	best: 0.9198258 (4)	total: 331ms	remaining: 1m 5s
5:	learn: 0.9114981	test: 0.9055813	best: 0.9055813 (5)	total: 380ms	remaining: 1m 3s
6:	learn: 0.8979668	test: 0.8916075	best: 0.8916075 (6)	total: 438ms	remaining: 1m 2s
7:	learn: 0.8873657	test: 0.8801784	best: 0.8801784 (7)	total: 486ms	remaining: 1m
8:	learn: 0.8781435	test: 0.8696457	best: 0.8696457 (8)	total: 536ms	remaining: 59s
9:	learn: 0.8682499	test: 0.8595251	best: 0.8595251 (9)	total: 586ms	remaining: 58s
10:	learn: 0.8596888	test: 0.8505189	best: 0.8505189 (10)	total: 636ms	remaining: 57.1s
11:	learn: 0.8524720	test:

[32m[I 2022-11-08 18:06:49,237][0m Trial 0 finished with value: 0.7349941800744053 and parameters: {'depth': 3, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'clusters': 48, 'filter': False, 'subsample': 0.6383365038996822}. Best is trial 0 with value: 0.7349941800744053.[0m


953:	learn: 0.7010786	test: 0.7352007	best: 0.7349942 (906)	total: 49.5s	remaining: 2.39s
954:	learn: 0.7010266	test: 0.7351794	best: 0.7349942 (906)	total: 49.6s	remaining: 2.34s
955:	learn: 0.7009794	test: 0.7351564	best: 0.7349942 (906)	total: 49.6s	remaining: 2.28s
956:	learn: 0.7009441	test: 0.7351411	best: 0.7349942 (906)	total: 49.7s	remaining: 2.23s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.7349941794
bestIteration = 906

Shrink model to first 907 iterations.
Learning rate set to 0.073422
0:	learn: 0.9980066	test: 1.0023169	best: 1.0023169 (0)	total: 299ms	remaining: 4m 58s
1:	learn: 0.9735186	test: 0.9781233	best: 0.9781233 (1)	total: 558ms	remaining: 4m 38s
2:	learn: 0.9534283	test: 0.9575877	best: 0.9575877 (2)	total: 809ms	remaining: 4m 28s
3:	learn: 0.9347107	test: 0.9376843	best: 0.9376843 (3)	total: 1.06s	remaining: 4m 23s
4:	learn: 0.9164059	test: 0.9196677	best: 0.9196677 (4)	total: 1.34s	remaining: 4m 27s
5:	learn: 0.9016713	test: 0.9051277	b

[32m[I 2022-11-08 18:07:59,163][0m Trial 1 finished with value: 0.7633772910697593 and parameters: {'depth': 8, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS', 'clusters': 22, 'filter': 'lv2_desc'}. Best is trial 0 with value: 0.7349941800744053.[0m


249:	learn: 0.6995839	test: 0.7642900	best: 0.7633773 (199)	total: 59.6s	remaining: 2m 58s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.7633772925
bestIteration = 199

Shrink model to first 200 iterations.
Learning rate set to 0.073422
0:	learn: 0.9991036	test: 0.9910500	best: 0.9910500 (0)	total: 39.5ms	remaining: 39.4s
1:	learn: 0.9754931	test: 0.9672052	best: 0.9672052 (1)	total: 76.6ms	remaining: 38.2s
2:	learn: 0.9582807	test: 0.9495992	best: 0.9495992 (2)	total: 112ms	remaining: 37.3s
3:	learn: 0.9390797	test: 0.9302330	best: 0.9302330 (3)	total: 151ms	remaining: 37.5s
4:	learn: 0.9231286	test: 0.9133009	best: 0.9133009 (4)	total: 189ms	remaining: 37.5s
5:	learn: 0.9083536	test: 0.8978257	best: 0.8978257 (5)	total: 230ms	remaining: 38s
6:	learn: 0.8951512	test: 0.8848764	best: 0.8848764 (6)	total: 279ms	remaining: 39.5s
7:	learn: 0.8838811	test: 0.8735126	best: 0.8735126 (7)	total: 317ms	remaining: 39.3s
8:	learn: 0.8721000	test: 0.8613408	best: 0.8613408 (

[32m[I 2022-11-08 18:08:32,675][0m Trial 2 finished with value: 0.7439490117169535 and parameters: {'depth': 5, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'clusters': 11, 'filter': 'lv1_desc', 'bagging_temperature': 2.883478449693647}. Best is trial 0 with value: 0.7349941800744053.[0m


660:	learn: 0.7266505	test: 0.7439607	best: 0.7439490 (613)	total: 24.4s	remaining: 12.5s
661:	learn: 0.7266127	test: 0.7439659	best: 0.7439490 (613)	total: 24.4s	remaining: 12.5s
662:	learn: 0.7266013	test: 0.7439865	best: 0.7439490 (613)	total: 24.5s	remaining: 12.4s
663:	learn: 0.7265495	test: 0.7439808	best: 0.7439490 (613)	total: 24.5s	remaining: 12.4s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.7439490124
bestIteration = 613

Shrink model to first 614 iterations.
Learning rate set to 0.073422
0:	learn: 1.0005938	test: 0.9863701	best: 0.9863701 (0)	total: 500ms	remaining: 8m 19s
1:	learn: 0.9747241	test: 0.9615405	best: 0.9615405 (1)	total: 961ms	remaining: 7m 59s
2:	learn: 0.9517954	test: 0.9404211	best: 0.9404211 (2)	total: 1.45s	remaining: 8m 2s
3:	learn: 0.9315736	test: 0.9223047	best: 0.9223047 (3)	total: 1.96s	remaining: 8m 7s
4:	learn: 0.9135092	test: 0.9064777	best: 0.9064777 (4)	total: 2.45s	remaining: 8m 6s
5:	learn: 0.8977501	test: 0.8923848	best

[32m[I 2022-11-08 18:12:34,098][0m Trial 3 finished with value: 0.7464903603753933 and parameters: {'depth': 9, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS', 'clusters': 26, 'filter': False}. Best is trial 0 with value: 0.7349941800744053.[0m


503:	learn: 0.6373741	test: 0.7468202	best: 0.7464904 (453)	total: 3m 55s	remaining: 3m 51s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.746490362
bestIteration = 453

Shrink model to first 454 iterations.
Learning rate set to 0.073422
0:	learn: 1.0010219	test: 0.9820227	best: 0.9820227 (0)	total: 115ms	remaining: 1m 54s
1:	learn: 0.9758404	test: 0.9563042	best: 0.9563042 (1)	total: 248ms	remaining: 2m 3s
2:	learn: 0.9541441	test: 0.9338048	best: 0.9338048 (2)	total: 368ms	remaining: 2m 2s
3:	learn: 0.9348404	test: 0.9144716	best: 0.9144716 (3)	total: 483ms	remaining: 2m
4:	learn: 0.9183133	test: 0.8978737	best: 0.8978737 (4)	total: 601ms	remaining: 1m 59s
5:	learn: 0.9036602	test: 0.8829246	best: 0.8829246 (5)	total: 719ms	remaining: 1m 59s
6:	learn: 0.8892177	test: 0.8687243	best: 0.8687243 (6)	total: 837ms	remaining: 1m 58s
7:	learn: 0.8779499	test: 0.8579136	best: 0.8579136 (7)	total: 956ms	remaining: 1m 58s
8:	learn: 0.8674258	test: 0.8477134	best: 0.8477134

In [None]:
print(trial.value, trial.params)

0.7270089548135714 {'colsample_bylevel': 0.07329097359385545, 'depth': 9, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.1734504015536018}


In [None]:
def print_cv_summary(cv_data):
    cv_data.head(10)

    best_value = cv_data['test-Logloss-mean'].min()
    best_iter = cv_data['test-Logloss-mean'].values.argmin()

    print('Best validation Logloss score : {:.4f}±{:.4f} on step {}'.format(
        best_value,
        cv_data['test-Logloss-std'][best_iter],
        best_iter)
    )

param_dist = {
    'learning_rate': uniform(0.01, 0.3),
    'max_depth': randint(3, 9),
    'iterations': [1000]
}

cbr = cb.CatBoostRegressor(
    objective=RmseObjective(), 
    eval_metric=LogTargetsRmsleMetric(),
    verbose=False,
)

# cbr.randomized_search(param_dist, X=train_pool, cv=3, n_iter=20, shuffle=True, stratified=True, plot=True)

# feature_importance = cbr.get_feature_importance(prettified=True)
# print('Feature importance:', feature_importance)

In [None]:
params = {
    'iterations': 2000,
    'learning_rate': 0.03,
    # 'objective': RmsleObjective(),
    # 'eval_metric': RmsleMetric()
    'objective': RmseObjective(),
    'eval_metric': LogTargetsRmsleMetric()
}

params = {'iterations': 1000,
 'loss_function': 'RMSE',
 'eval_metric': LogTargetsRmsleMetric(),
 'objective': RmseObjective(),
 'depth': 7.0,
 'learning_rate': 0.0494195212108839}

params = {'colsample_bylevel': 0.07329097359385545, 'depth': 9, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.1734504015536018}
params = {'depth': 9, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.635180352520854}

model = cb.CatBoostRegressor(**params, objective=RmseObjective(), eval_metric=LogTargetsRmsleMetric())
model.fit(train_pool, eval_set=valid_pool, verbose=50, plot=True)

y_pred = np.expm1(model.predict(X_test))
submission['predicted'] = y_pred
submission.to_csv('submissions/submission.csv', index=False)


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 1.8410715	test: 1.8806790	best: 1.8806790 (0)	total: 283ms	remaining: 4m 42s
50:	learn: 0.9037483	test: 0.9133867	best: 0.9133867 (50)	total: 15s	remaining: 4m 39s
100:	learn: 0.7946943	test: 0.7925219	best: 0.7925219 (100)	total: 30s	remaining: 4m 27s
150:	learn: 0.7723114	test: 0.7744926	best: 0.7744926 (150)	total: 44.6s	remaining: 4m 10s
200:	learn: 0.7627387	test: 0.7686614	best: 0.7686614 (200)	total: 59.3s	remaining: 3m 55s
250:	learn: 0.7560784	test: 0.7651931	best: 0.7651931 (250)	total: 1m 13s	remaining: 3m 40s
300:	learn: 0.7503074	test: 0.7628645	best: 0.7628645 (300)	total: 1m 28s	remaining: 3m 25s
350:	learn: 0.7410254	test: 0.7602637	best: 0.7602335 (348)	total: 1m 43s	remaining: 3m 12s
400:	learn: 0.7316951	test: 0.7579294	best: 0.7579294 (400)	total: 1m 59s	remaining: 2m 57s
450:	learn: 0.7225471	test: 0.7555986	best: 0.7555986 (450)	total: 2m 14s	remaining: 2m 43s
500:	learn: 0.7148193	test: 0.7534449	best: 0.7534237 (499)	total: 2m 29s	remaining: 2m 28s
550

In [None]:
y_pred = np.expm1(model.predict(X_test))
submission['predicted'] = y_pred
submission.to_csv('submissions/submission.csv', index=False)

In [None]:
# y_pred = model.predict(test)
# y_pred

a = 10
log_a = np.log1p(a)
exp_log_a = np.expm1(log_a)

print(a, exp_log_a)

model.best_score_

y_pred = np.expm1()
submission['predicted'] = np.array(y_pred)
submission.to_csv('submissions/submission.csv', index=False)


10 10.000000000000002


In [None]:
params = {
    'depth': randint(2, 20),
    'learning_rate': uniform(0.01, 0.4),
    'iterations': randint(10, 1000)
}

model = cb.CatBoostRegressor(loss_function=RmsleObjective, eval_metric=RmsleMetric)


# model.randomized_search(train_pool, param_distributions=params, cv=5)

pred = model.predict(X_val)
rmsle = (np.sqrt(mean_squared_log_error(y_val, pred)))
print('Testing performance')
print('RMSLE: {:.2f}'.format(rmsle))

CatBoostError: There is no trained model to use predict(). Use fit() to train model. Then use this method.