# Submission

In [468]:
# Magic to automatically update imports if functions in utils are changed
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from tqdm import tqdm
from pathlib import Path

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [473]:
submission_with_whole_train = False

# Feature engineer (option 1)

In [474]:
from sklearn.model_selection import train_test_split

stores_train = pd.read_csv("data/stores_train.csv")
stores_test = pd.read_csv("data/stores_test.csv")
stores_extra = pd.read_csv("data/stores_extra.csv")
if not submission_with_whole_train:
    stores_train, stores_val = train_test_split(stores_train, test_size=0.2, random_state=0)

In [475]:
from utils import split_plaace_cat


store_dataframes = {
    "train": stores_train, 
    "extra": stores_extra, 
    "test": stores_test, 
    }

if not submission_with_whole_train:
    store_dataframes["val"] = stores_val

for df_name, df in store_dataframes.items():
    df = split_plaace_cat(df)

In [477]:
store_dataframes["train"].shape

(10287, 16)

In [478]:
store_dataframes["train"]["log_revenue"] = store_dataframes["train"].revenue.apply(lambda x: np.log1p(x))
if not submission_with_whole_train:
    store_dataframes["val"]["log_revenue"] = store_dataframes["val"].revenue.apply(lambda x: np.log1p(x))

In [479]:
from utils import create_geographical_columns, create_chain_and_mall_columns, generate_rev_dict, generate_chain_rev_dict, create_mean_chain_rev_col

chain_count = stores_train["chain_name"].value_counts().to_dict()
lower_limit = 1
rev_dict, mean_revenue = generate_rev_dict(store_dataframes["train"])

for df_name, df in tqdm(store_dataframes.items()):
    store_dataframes[df_name] = create_geographical_columns(df)
    store_dataframes[df_name] = create_chain_and_mall_columns(df, chain_count, lower_limit=lower_limit)

chain_rev_dict = generate_chain_rev_dict(store_dataframes["train"])
for df_name, df in tqdm(store_dataframes.items()):
    store_dataframes[df_name] = create_mean_chain_rev_col(df, bounded_chain_revs=chain_rev_dict)


100%|██████████| 4/4 [00:00<00:00, 48.54it/s]
100%|██████████| 4/4 [00:00<00:00, 115.27it/s]


In [480]:
from utils import concat_df_keep_unq_index
concat_df = concat_df_keep_unq_index(stores_train, stores_extra)

!NB next cell takes 3 minutes to run (if comp_plaace_gran. = [1, 2, 3, 4]) (on M1 Mac with 16GB RAM)

(outdated) If you have the .csv files temp_data/closest_comp_\{df_name\}, skip running this cell and run the cell below it instead

In [481]:
from utils import find_dist_to_nearest_comp

nearest_comp_plaace_cat_gran = [1, 2, 3, 4]
n_nearest_comp = [1, 2, 3, 4, 5, 7, 10]

store_dataframes["train"] = find_dist_to_nearest_comp(
    store_dataframes["train"], 
    nearest_comp_plaace_cat_gran, 
    n_nearest_comp, 
    training=True, 
    training_df=concat_df,
    )

In [482]:
if not submission_with_whole_train:
    store_dataframes["val"] = find_dist_to_nearest_comp(
        store_dataframes["val"], 
        nearest_comp_plaace_cat_gran, 
        n_nearest_comp, 
        training=True, 
        training_df=concat_df,
        )

In [483]:
store_dataframes["test"] = find_dist_to_nearest_comp(
    store_dataframes["test"], 
    nearest_comp_plaace_cat_gran, 
    n_nearest_comp, 
    training=False,
    training_df=concat_df
)

In [484]:
comp_plaace_cols = list(store_dataframes["train"].columns[-56:])

In [485]:
dist_dict = store_dataframes["train"][['log_revenue'] + comp_plaace_cols].corr().iloc[0].to_dict()
sorted_relevant_dist_cols = [[k, v] for k, v in sorted(dist_dict.items(), key=lambda item: abs(item[1]), reverse=True)]
sorted_relevant_dist_cols

[['log_revenue', 1.0],
 ['sum_dist_to_nearest_10_comp_plaace_1', -0.11318923733863769],
 ['mean_dist_to_nearest_10_comp_plaace_1', -0.11318923733863753],
 ['mean_dist_to_nearest_7_comp_plaace_1', -0.10947896974172534],
 ['sum_dist_to_nearest_7_comp_plaace_1', -0.10947896974172525],
 ['sum_dist_to_nearest_5_comp_plaace_1', -0.10512637590277266],
 ['mean_dist_to_nearest_5_comp_plaace_1', -0.1051263759027725],
 ['sum_dist_to_nearest_4_comp_plaace_1', -0.10176508975028223],
 ['mean_dist_to_nearest_4_comp_plaace_1', -0.10176508975028223],
 ['sum_dist_to_nearest_3_comp_plaace_1', -0.09554149149809875],
 ['mean_dist_to_nearest_3_comp_plaace_1', -0.09554149149809853],
 ['sum_dist_to_nearest_2_comp_plaace_1', -0.09178689133186166],
 ['mean_dist_to_nearest_2_comp_plaace_1', -0.09178689133186166],
 ['sum_dist_to_nearest_1_comp_plaace_1', -0.07893162376330974],
 ['mean_dist_to_nearest_1_comp_plaace_1', -0.07893162376330974],
 ['sum_dist_to_nearest_2_comp_plaace_2', -0.0522650006821979],
 ['mean_di

In [486]:
comp_relevant_cols = sorted_relevant_dist_cols[1:14:2]
comp_relevant_cols = [r[0] for r in comp_relevant_cols]

In [487]:
comp_relevant_cols

['sum_dist_to_nearest_10_comp_plaace_1',
 'mean_dist_to_nearest_7_comp_plaace_1',
 'sum_dist_to_nearest_5_comp_plaace_1',
 'sum_dist_to_nearest_4_comp_plaace_1',
 'sum_dist_to_nearest_3_comp_plaace_1',
 'sum_dist_to_nearest_2_comp_plaace_1',
 'sum_dist_to_nearest_1_comp_plaace_1']

!NB next cell takes 15 minutes to run. Grab a coffee or somtething while you wait :) (on M1 Mac with 16GB RAM)

If you have the .csv files temp_data/closest_bus_stop_\{df_name\}, skip running this cell and run the cell below it instead

In [488]:
from bus_utils import find_closest_bus_stop

bus_stop_n = [1, 3, 5, 7]
bus_mean = True
bus_sum = True

bus_stop_columns = []

if(bus_sum):
    bus_stop_columns += [f"closest_bus_stop_sum_{i}" for i in bus_stop_n]

if(bus_mean):
    bus_stop_columns += [f"closest_bus_stop_mean_{i}" for i in bus_stop_n]


for df_name, df in tqdm(store_dataframes.items()):
    store_dataframes[df_name] = find_closest_bus_stop(df, bus_stop_n, _sum=bus_sum, _mean=bus_mean)    

100%|██████████| 4/4 [1:12:41<00:00, 1090.36s/it]  


In [489]:
bus_dict = store_dataframes["train"][['log_revenue'] + bus_stop_columns].corr().iloc[0].to_dict()
bus_sorted_relevant_dist_cols = [[k, v] for k, v in sorted(bus_dict.items(), key=lambda item: abs(item[1]), reverse=True)]
bus_sorted_relevant_dist_cols

[['log_revenue', 1.0],
 ['closest_bus_stop_mean_7', -0.04357740220925073],
 ['closest_bus_stop_sum_7', -0.04357740220925051],
 ['closest_bus_stop_mean_5', -0.04101326738679946],
 ['closest_bus_stop_sum_5', -0.04101326738679933],
 ['closest_bus_stop_sum_3', -0.03871598741831423],
 ['closest_bus_stop_mean_3', -0.03871598741831408],
 ['closest_bus_stop_sum_1', -0.031865730956687065],
 ['closest_bus_stop_mean_1', -0.031865730956687065]]

In [490]:
bus_relevant_cols = bus_sorted_relevant_dist_cols[1:9:2]
bus_relevant_cols = [r[0] for r in bus_relevant_cols]

In [491]:
bus_relevant_cols

['closest_bus_stop_mean_7',
 'closest_bus_stop_mean_5',
 'closest_bus_stop_sum_3',
 'closest_bus_stop_sum_1']

In [492]:
from grunnkrets import make_grunnkrets_df

full_population_dataframes = {}
full_pop_columns = []

for df_name, df in tqdm(store_dataframes.items()):
    full_population_dataframes[df_name] = make_grunnkrets_df(df)
    full_pop_columns = full_population_dataframes[df_name].columns

  full_population_df[f'{level}.pop_density_log'] = np.log1p(full_population_df[f'{level}.pop_density'])
  full_population_df[f'{level}.pop_density'] = full_population_df[f'{level}.tot_pop']/full_population_df[f'{level}.area_km2']
  full_population_df[f'{level}.pop_density_log'] = np.log1p(full_population_df[f'{level}.pop_density'])
100%|██████████| 4/4 [00:26<00:00,  6.60s/it]


In [493]:
full_pop_columns = list(full_pop_columns[-184:])

In [494]:
for df_name, df in store_dataframes.items():
    store_dataframes[df_name] = df.merge(
        full_population_dataframes[df_name], 
        left_index=True,
        right_index=True,
        how="outer", 
        suffixes=('', '_redundant')
    )
    store_dataframes[df_name].drop(store_dataframes[df_name].filter(regex='_redundant$').columns, axis=1, inplace=True)

In [495]:
full_pop_dict = store_dataframes["train"][['log_revenue'] + full_pop_columns].corr().iloc[0].to_dict()
full_pop_sorted_relevant_dist_cols = [[k, v] for k, v in sorted(full_pop_dict.items(), key=lambda item: abs(item[1]), reverse=True)]
full_pop_sorted_relevant_dist_cols

[['log_revenue', 1.0],
 ['grunnkrets_id.income_density', -0.045215448175719486],
 ['kommune.c_age_19-30_ratio', 0.04472970219247787],
 ['delomrade.mean_age', -0.042862752871162244],
 ['grunnkrets_id.pop_density', -0.0424516238143989],
 ['delomrade.c_age_56-90_ratio', -0.04160424303133776],
 ['delomrade.c_age_19-30_ratio', 0.040181313913752716],
 ['grunnkrets_id.c_age_19-30_ratio', 0.03751481046775187],
 ['kommune.mean_age', -0.031377899056215804],
 ['kommune.c_age_56-90_ratio', -0.029925231776778072],
 ['grunnkrets_id.income_density_log', -0.029019830465243063],
 ['grunnkrets_id.pop_density_log', -0.026966666588406994],
 ['grunnkrets_id.tot_pop_log', 0.026369164197539173],
 ['grunnkrets_id.total_income_log', 0.023533902577560918],
 ['fylke.couple_with_children_income', -0.023220318307684916],
 ['grunnkrets_id.c_age_56-90_ratio', -0.021586672144664712],
 ['kommune.tot_pop_log', 0.01983545484580259],
 ['kommune.total_income_log', 0.019724976569824165],
 ['fylke.single_parent_with_childre

In [496]:
full_pop_relevant_cols = full_pop_sorted_relevant_dist_cols[1:8]
full_pop_relevant_cols = [r[0] for r in full_pop_relevant_cols]

In [497]:
full_pop_relevant_cols

['grunnkrets_id.income_density',
 'kommune.c_age_19-30_ratio',
 'delomrade.mean_age',
 'grunnkrets_id.pop_density',
 'delomrade.c_age_56-90_ratio',
 'delomrade.c_age_19-30_ratio',
 'grunnkrets_id.c_age_19-30_ratio']

In [498]:
fylke_relevant_features = [col_name for col_name in store_dataframes["train"].columns if col_name.startswith("fylke.")]
kommune_relevant_features = [col_name for col_name in store_dataframes["train"].columns if col_name.startswith("kommune.")]
delomrade_relevant_features = [col_name for col_name in store_dataframes["train"].columns if col_name.startswith("delomrade.")]
grunnkrets_relevant_features = [col_name for col_name in store_dataframes["train"].columns if col_name.startswith("grunnkrets_id.")]

In [499]:
from utils import mean_func_rev, generate_rev_dict

for i in range(1, 5):
    rev_dict, mean_revenue = generate_rev_dict(stores_train, i)
    for df_name, df in store_dataframes.items():
        df["mean_revenue_" + str(i)] = df["plaace_cat_" + str(i)].apply(lambda x: mean_func_rev(x, rev_dict, mean_revenue))
        df["log_mean_revenue_" + str(i)] = df["mean_revenue_" + str(i)].apply(lambda x: np.log1p(x))

In [500]:
from num_stores import add_num_stores_info

for df_name, df in store_dataframes.items():
    store_dataframes[df_name] = add_num_stores_info(df)

  grouped = geo_df.groupby([ level , cat]).sum()['count'].to_frame()
  grouped = geo_df.groupby([ level , cat]).sum()['count'].to_frame()
  grouped = geo_df.groupby([ level , cat]).sum()['count'].to_frame()
  grouped = geo_df.groupby([ level , cat]).sum()['count'].to_frame()
  grouped = geo_df.groupby([ level , cat]).sum()['count'].to_frame()
  grouped = geo_df.groupby([ level , cat]).sum()['count'].to_frame()
  grouped = geo_df.groupby([ level , cat]).sum()['count'].to_frame()
  grouped = geo_df.groupby([ level , cat]).sum()['count'].to_frame()
  grouped = geo_df.groupby([ level , cat]).sum()['count'].to_frame()
  grouped = geo_df.groupby([ level , cat]).sum()['count'].to_frame()
  grouped = geo_df.groupby([ level , cat]).sum()['count'].to_frame()
  grouped = geo_df.groupby([ level , cat]).sum()['count'].to_frame()
  grouped = geo_df.groupby([ level , cat]).sum()['count'].to_frame()
  grouped = geo_df.groupby([ level , cat]).sum()['count'].to_frame()
  grouped = geo_df.groupby([ level

In [501]:
num_store_cols = list(store_dataframes["train"].columns[-64:])

In [502]:
num_store_dict = store_dataframes["train"][['log_revenue'] + num_store_cols].corr().iloc[0].to_dict()
num_store_sorted_relevant_cols = [[k, v] for k, v in sorted(num_store_dict.items(), key=lambda item: abs(item[1]), reverse=True)]
num_store_sorted_relevant_cols

[['log_revenue', 1.0],
 ['fylke.plaace_cat_1_per_capita', 0.3036462055602537],
 ['fylke.plaace_cat_1_per_tot_income', 0.29950542010049447],
 ['kommune.plaace_cat_1_per_capita', 0.25661756586529566],
 ['kommune.plaace_cat_1_per_tot_income', 0.245370813652734],
 ['fylke.plaace_cat_3_per_capita', 0.18362019261601314],
 ['fylke.plaace_cat_1_count', 0.18265093811288075],
 ['fylke.plaace_cat_3_per_tot_income', 0.1797567860196848],
 ['fylke.plaace_cat_4_per_capita', 0.16374528532317106],
 ['fylke.plaace_cat_4_per_tot_income', 0.16261122183977433],
 ['grunnkrets_id.plaace_cat_1_count', 0.1230872564366986],
 ['delomrade.plaace_cat_1_count', 0.11156949047890931],
 ['fylke.plaace_cat_3_count', 0.10050161427908715],
 ['kommune.plaace_cat_3_per_capita', 0.09570990088677313],
 ['kommune.plaace_cat_1_per_km2', 0.08386190923409534],
 ['kommune.plaace_cat_3_per_tot_income', 0.08377451789326158],
 ['fylke.plaace_cat_4_count', 0.07989315137361924],
 ['grunnkrets_id.plaace_cat_1_per_capita', 0.06542117839

In [503]:
num_store_relevant_cols = num_store_sorted_relevant_cols[1:15]
num_store_relevant_cols = [r[0] for r in num_store_relevant_cols]

In [504]:
num_store_relevant_cols

['fylke.plaace_cat_1_per_capita',
 'fylke.plaace_cat_1_per_tot_income',
 'kommune.plaace_cat_1_per_capita',
 'kommune.plaace_cat_1_per_tot_income',
 'fylke.plaace_cat_3_per_capita',
 'fylke.plaace_cat_1_count',
 'fylke.plaace_cat_3_per_tot_income',
 'fylke.plaace_cat_4_per_capita',
 'fylke.plaace_cat_4_per_tot_income',
 'grunnkrets_id.plaace_cat_1_count',
 'delomrade.plaace_cat_1_count',
 'fylke.plaace_cat_3_count',
 'kommune.plaace_cat_3_per_capita',
 'kommune.plaace_cat_1_per_km2']

# Removing outliers

In [505]:
from outlier_utils import remove_low_revenue

store_dataframes["train"] = remove_low_revenue(store_dataframes["train"], low_rev_limit=0.05)

# Read data directly from CSV (option 2)

In [356]:
stores_train = pd.read_csv("temp_data/full_features_train.csv", index_col=0)
stores_val = pd.read_csv("temp_data/full_features_val.csv", index_col=0)
stores_extra = pd.read_csv("temp_data/full_features_extra.csv", index_col=0)
stores_test = pd.read_csv("temp_data/full_features_test.csv", index_col=0)

store_dataframes = {
    "train": stores_train, 
    "extra": stores_extra, 
    "test": stores_test, 
    "val": stores_val
    }

In [31]:
with open("relevant_cols.txt", "r") as f:
    data = f.readlines()

comp_relevant_cols = data[0].strip().split(",")
bus_relevant_cols = data[1].strip().split(",")
num_stores_relevant_cols = data[2].strip().split(",")
full_pop_relevant_cols = data[3].strip().split(",")

In [102]:
with open("relevant_features.txt", "r") as f:
    fw_relevant_cols = f.read().strip().split(";")

In [123]:
fw_OH_cols = []
fw_OE_cols = []
fw_num_cols = []
fw_OE_cols.append(fw_relevant_cols.pop(0))
fw_OE_cols.append(fw_relevant_cols.pop(0))
fw_OE_cols.append(fw_relevant_cols.pop(0))
fw_OE_cols.append(fw_relevant_cols.pop(0))
fw_OE_cols.append(fw_relevant_cols.pop(0))
fw_OE_cols.append(fw_relevant_cols.pop(0))
fw_OE_cols.append(fw_relevant_cols.pop(6))
fw_num_cols = fw_relevant_cols

# Training the model

In [506]:
from RMSLE import rmsle
from pred_var_utils import reverse_log1p_transform_pred_var

In [507]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder


OE_categorical_features = ["bounded_chain_name", "kommune", "delomrade", "is_grocery", "plaace_cat_2", "plaace_cat_3", "plaace_cat_4"]
OE_categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(missing_values=np.nan, strategy="constant")),
        ("encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
    ]
)

OH_categorical_features = [] #["fylke", "plaace_cat_2"]
OH_categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(missing_values=np.nan, strategy="constant")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)


numerical_features = ["lat", "lon", 
#"mean_revenue_1", "mean_revenue_2", "mean_revenue_3", "mean_revenue_4", 
"log_mean_revenue_1", "log_mean_revenue_2", "log_mean_revenue_3", "log_mean_revenue_4", 
"log_chain_mean_revenue"
] + full_pop_relevant_cols + comp_relevant_cols + bus_relevant_cols + num_store_relevant_cols
numerical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")), 
        ("scaler", StandardScaler(with_mean=True, with_std=True))]
)


preprocessor = ColumnTransformer(
    transformers=[
        ("oe_cat", OE_categorical_transformer, OE_categorical_features),
        ("oh_cat", OH_categorical_transformer, OH_categorical_features),
        ("num", numerical_transformer, numerical_features),
    ],
    remainder='drop'
)


X_train = preprocessor.fit_transform(store_dataframes["train"])
X_val = preprocessor.transform(store_dataframes["val"])

In [508]:
y_train = np.array(store_dataframes["train"].log_revenue)
if not submission_with_whole_train:
    y_val = np.array(store_dataframes["val"].revenue)
mean_y = y_train.mean()
std_y = y_train.std()

y_train -= mean_y
y_train /= std_y

## Apply PCA 

In [None]:
from sklearn.decomposition import PCA
pca = PCA(.95)

pca.fit(stores_train)

pca.n_components_


ValueError: could not convert string to float: '990857423-990974489-88185'

## Random Forest

In [509]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(random_state=0, n_jobs=-1, n_estimators=250, max_features=8, min_samples_leaf=2, min_samples_split=16)

In [510]:
rf.fit(X_train, y_train)

In [511]:
rf_y_pred = reverse_log1p_transform_pred_var(rf.predict(X_val), std_y=std_y, mean_y=mean_y)


In [512]:
rmsle(y_val, rf_y_pred)

0.7190464855759933

## Linear Regression

In [513]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression(n_jobs=-1)

In [514]:
lr.fit(X_train, y_train)

In [515]:
lr_y_pred = reverse_log1p_transform_pred_var(lr.predict(X_val), std_y, mean_y)
lr_y_pred = np.array([max(0, xi) for xi in lr_y_pred])

In [516]:
rmsle(y_val, lr_y_pred)

0.7424995951555908

## Light GBM

In [517]:
from lightgbm import LGBMRegressor

lgbm = LGBMRegressor(random_state=0, n_jobs=-1, learning_rate=0.1, n_estimators=100, reg_lambda=0.01)

In [518]:
lgbm.fit(X_train, y_train)

In [519]:
lgbm_y_pred = reverse_log1p_transform_pred_var(lgbm.predict(X_val), std_y, mean_y)
lgbm_y_pred = np.array([max(0, xi) for xi in lgbm_y_pred])

In [520]:
rmsle(y_val, lgbm_y_pred)

0.7250122422511303

## CatBoost

In [528]:
from catboost import CatBoostRegressor
from sklearn.model_selection import GridSearchCV

cb = CatBoostRegressor(
    random_seed=0, 
    verbose=False, 
    eval_metric="RMSE"
    #depth= 4, 
    #l2_leaf_reg= 5, 
    #learning_rate= 0.1
    )

cb_params = grid = {
    'learning_rate': [0.05, 0.1, 0.3],
    'depth': [2, 3, 4, 5, 6],
    'l2_leaf_reg': [4, 5], 
    }

cb_clf = cb.grid_search(cb_params, X=X_train, y=y_train)


bestTest = 0.663774604
bestIteration = 985

0:	loss: 0.6637746	best: 0.6637746 (0)	total: 1.37s	remaining: 39.7s

bestTest = 0.6639610703
bestIteration = 435

1:	loss: 0.6639611	best: 0.6637746 (0)	total: 2.61s	remaining: 36.6s

bestTest = 0.6653933334
bestIteration = 226

2:	loss: 0.6653933	best: 0.6637746 (0)	total: 3.9s	remaining: 35.1s

bestTest = 0.6640689225
bestIteration = 990

3:	loss: 0.6640689	best: 0.6637746 (0)	total: 5.16s	remaining: 33.6s

bestTest = 0.6630711632
bestIteration = 558

4:	loss: 0.6630712	best: 0.6630712 (4)	total: 6.45s	remaining: 32.3s

bestTest = 0.6653917636
bestIteration = 139

5:	loss: 0.6653918	best: 0.6630712 (4)	total: 7.68s	remaining: 30.7s

bestTest = 0.662668854
bestIteration = 699

6:	loss: 0.6626689	best: 0.6626689 (6)	total: 9.18s	remaining: 30.2s

bestTest = 0.6641392009
bestIteration = 305

7:	loss: 0.6641392	best: 0.6626689 (6)	total: 10.7s	remaining: 29.3s

bestTest = 0.6668814401
bestIteration = 171

8:	loss: 0.6668814	best: 0.6626689 (6

In [529]:
cb_clf["params"]

{'depth': 6, 'l2_leaf_reg': 5, 'learning_rate': 0.05}

In [530]:
cb.fit(X_train, y_train)

<catboost.core.CatBoostRegressor at 0x17352a280>

In [531]:
cb_y_pred = reverse_log1p_transform_pred_var(cb.predict(X_val), std_y, mean_y)
cb_y_pred = np.array([max(0, xi) for xi in cb_y_pred])

In [532]:
rmsle(y_val, cb_y_pred)

0.7190096805219454

## Stacking classifiers

In [347]:
from sklearn.ensemble import StackingRegressor

estimators = [
    ('rf', rf), 
    ('cb', cb), 
    ('lgbm', lgbm), 
]

reg = StackingRegressor(
    estimators=estimators,
    final_estimator=CatBoostRegressor(random_state=42, verbose=False)
)

In [348]:
reg.fit(X_train, y_train)

In [349]:
reg_y_pred = reverse_log1p_transform_pred_var(reg.predict(X_val), std_y, mean_y)

In [350]:
rmsle(y_pred=reg_y_pred, y_true=y_val)

0.7327124215229028

## Creating the submission

In [466]:
# Predict on the test set 
X_test = preprocessor.transform(store_dataframes["test"])
y_test_pred = reverse_log1p_transform_pred_var(cb.predict(X_test), std_y, mean_y)

# Generate submission dataframe 
# NOTE: It is important that the ID and predicted values match
submission = pd.DataFrame()
submission['id'] = stores_test.store_id 
submission['predicted'] = np.asarray(y_test_pred)

# Save it to disk (`index=False` means don't save the index in the csv)
submission.to_csv('submission.csv', index=False)

# Create CSV of dataframes

In [467]:
for df_name, df in store_dataframes.items():
    filepath = Path(f"temp_data/full_features_{df_name}.csv")  
    filepath.parent.mkdir(parents=True, exist_ok=True)  
    df.to_csv(filepath, index=True)