# Submission

In [1]:
# Magic to automatically update imports if functions in utils are changed
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from tqdm import tqdm
from pathlib import Path

# Feature engineering (option 1)

In [2]:
from sklearn.model_selection import train_test_split

# Load the three raw store tables (train, test, extra) from the data/ folder,
# in that order.
stores_train, stores_test, stores_extra = map(
    pd.read_csv,
    ("data/stores_train.csv", "data/stores_test.csv", "data/stores_extra.csv"),
)

In [3]:
from utils import split_plaace_cat


# Registry of the three store frames keyed by split name. Every feature
# engineering step below reads a frame from this dict and writes its result
# back under the same key.
store_dataframes = {
    "train": stores_train,
    "extra": stores_extra,
    "test": stores_test,
}

# Store the helper's return value in the registry. The previous loop only
# rebound the loop variable (`df = split_plaace_cat(df)`), so the result was
# silently discarded unless the helper happened to mutate its argument in place.
# The None guard keeps the in-place-only case working as before.
for df_name, df in store_dataframes.items():
    split_df = split_plaace_cat(df)
    if split_df is not None:
        store_dataframes[df_name] = split_df

In [4]:
# Regression target: log(1 + revenue), computed on the whole column at once.
store_dataframes["train"]["log_revenue"] = np.log1p(store_dataframes["train"]["revenue"])

In [5]:
from utils import mean_rev_of_competitor, log_mean_rev_of_competitor, create_geographical_columns, create_chain_and_mall_columns, generate_chain_rev_dict, generate_plaace_rev_dict, create_mean_chain_rev_col

# Number of training stores per chain name.
chain_count = stores_train["chain_name"].value_counts().to_dict()

# Geographical columns first, then chain/mall columns on top of that result.
# The chain/mall step used to receive the pre-geography `df`, which threw the
# geography step's return value away unless that helper mutated its input in place.
for df_name, df in tqdm(store_dataframes.items()):
    df_with_geo = create_geographical_columns(df)
    store_dataframes[df_name] = create_chain_and_mall_columns(df_with_geo, chain_count, lower_limit=1)

# Per-chain revenue lookups, built from the training split only.
chain_rev_dict, log_bounded_chain_rev_dict = generate_chain_rev_dict(store_dataframes["train"], quantile=0)

# For each level i in 1..4 (presumably the plaace_cat_1..4 granularities — confirm
# in utils): build revenue lookups from train, then add the plain and the log
# competitor-revenue columns to every split.
for i in tqdm(range(1, 5)):
    rev_plaace_dict, mean_plaace_revenue, log_rev_plaace_dict, log_mean_plaace_revenue = generate_plaace_rev_dict(store_dataframes["train"], i, quantile=0)
    for df_name in store_dataframes:
        store_dataframes[df_name] = mean_rev_of_competitor(store_dataframes[df_name], i, rev_dict=rev_plaace_dict, mean_revenue=mean_plaace_revenue)
        store_dataframes[df_name] = log_mean_rev_of_competitor(store_dataframes[df_name], i, log_rev_dict=log_rev_plaace_dict, log_mean_revenue=log_mean_plaace_revenue)

# Mean chain revenue column (plain and log lookups) for every split.
for df_name, df in tqdm(store_dataframes.items()):
    store_dataframes[df_name] = create_mean_chain_rev_col(df, bounded_chain_revs=chain_rev_dict, log_bounded_chain_revs=log_bounded_chain_rev_dict)

100%|██████████| 3/3 [00:00<00:00, 23.68it/s]
100%|██████████| 4/4 [00:00<00:00, 10.67it/s]
100%|██████████| 3/3 [00:00<00:00, 165.62it/s]


In [7]:
from utils import concat_df_keep_unq_index
# Pool train, extra and test into one frame: train+extra first, then test.
concat_df = concat_df_keep_unq_index(
    concat_df_keep_unq_index(store_dataframes["train"], store_dataframes["extra"]),
    store_dataframes["test"],
)

In [8]:
from utils import find_dist_to_nearest_comp

# Settings passed to find_dist_to_nearest_comp: the plaace category levels and
# the numbers of nearest competitors to aggregate over.
nearest_comp_plaace_cat_gran = list(range(1, 5))
n_nearest_comp = [*range(1, 6), 7, 10]

# Competitor-distance features for the training stores, computed against the
# pooled train+extra+test frame.
store_dataframes["train"] = find_dist_to_nearest_comp(
    store_dataframes["train"], nearest_comp_plaace_cat_gran, n_nearest_comp,
    training=True, training_df=concat_df,
)

In [9]:
# Same competitor-distance features for the test stores, again against the pooled frame.
# NOTE(review): training=True is also passed for the test split — confirm in utils
# that this flag is meant to be set here.
store_dataframes["test"] = find_dist_to_nearest_comp(
    store_dataframes["test"], nearest_comp_plaace_cat_gran, n_nearest_comp,
    training=True, training_df=concat_df,
)

In [10]:
# Names of the last 56 columns of the train frame, selected purely by position.
# NOTE(review): presumably the columns just added by find_dist_to_nearest_comp — verify.
comp_plaace_cols = store_dataframes["train"].columns[-56:].tolist()

In [11]:
# Correlation of log_revenue with each competitor column (first row of the
# correlation matrix), then [column, correlation] pairs ordered by descending
# absolute correlation.
dist_dict = store_dataframes["train"][["log_revenue", *comp_plaace_cols]].corr().iloc[0].to_dict()
sorted_relevant_dist_cols = [
    [col, corr]
    for col, corr in sorted(dist_dict.items(), key=lambda kv: abs(kv[1]), reverse=True)
]

In [12]:
# Keep the column names at ranks 1, 3, 5, ..., 13 of the sorted list.
# NOTE(review): rank 0 is presumably log_revenue itself (correlation 1) — confirm.
comp_relevant_cols = [col for col, _ in sorted_relevant_dist_cols[1:14:2]]

In [13]:
from bus_utils import find_closest_bus_stop

# Neighbour counts and aggregate switches handed to find_closest_bus_stop.
bus_stop_n = [1, 2, 3, 5, 7, 10, 15, 25, 50, 100]
bus_mean = True
bus_sum = True

# Expected names of the generated columns: sums first, then means.
bus_stop_columns = []
if bus_sum:
    bus_stop_columns.extend(f"closest_bus_stop_sum_{n}" for n in bus_stop_n)
if bus_mean:
    bus_stop_columns.extend(f"closest_bus_stop_mean_{n}" for n in bus_stop_n)


# Add the bus-stop columns to every split. The recorded run took about 13 minutes
# for the three frames, so expect this cell to be slow.
for split_name, split_df in tqdm(store_dataframes.items()):
    store_dataframes[split_name] = find_closest_bus_stop(
        split_df, bus_stop_n, _sum=bus_sum, _mean=bus_mean
    )

100%|██████████| 3/3 [13:16<00:00, 265.54s/it]


In [14]:
# Correlation of log_revenue with each bus-stop column, ranked by absolute value
# (largest first) as [column, correlation] pairs.
bus_dict = store_dataframes["train"][["log_revenue", *bus_stop_columns]].corr().iloc[0].to_dict()
bus_sorted_relevant_dist_cols = list(
    map(list, sorted(bus_dict.items(), key=lambda kv: abs(kv[1]), reverse=True))
)

In [15]:
# Column names at every second rank of the sorted list, starting from rank 1.
bus_relevant_cols = [pair[0] for pair in bus_sorted_relevant_dist_cols[1::2]]

In [16]:
# Keep only the first five of the selected bus-stop columns.
bus_relevant_cols = bus_relevant_cols[:5]

In [17]:
from grunnkrets_old import make_grunnkrets_df

# One grunnkrets frame per split. full_pop_columns ends up holding the columns of
# the frame built last (it is overwritten on every iteration).
full_population_dataframes = {}
full_pop_columns = []

for split_name, split_df in tqdm(store_dataframes.items()):
    grunnkrets_df = make_grunnkrets_df(split_df)
    full_population_dataframes[split_name] = grunnkrets_df
    full_pop_columns = grunnkrets_df.columns

  full_population_df[f'{level}.income_density_log']  = np.log1p(full_population_df[f'{level}.income_density'])
  full_population_df[f'{level}.income_density']  = full_population_df[f'{level}.total_income']/full_population_df[f'{level}.area_km2']
  full_population_df[f'{level}.income_density_log']  = np.log1p(full_population_df[f'{level}.income_density'])
  full_population_df[f'{level}.pop_density'] = full_population_df[f'{level}.tot_pop']/full_population_df[f'{level}.area_km2']
  full_population_df[f'{level}.pop_density_log'] = np.log1p(full_population_df[f'{level}.pop_density'])
  full_population_df[f'{level}.pop_density'] = full_population_df[f'{level}.tot_pop']/full_population_df[f'{level}.area_km2']
  full_population_df[f'{level}.pop_density_log'] = np.log1p(full_population_df[f'{level}.pop_density'])
  full_population_df[f'{level}.pop_density'] = full_population_df[f'{level}.tot_pop']/full_population_df[f'{level}.area_km2']
  full_population_df[f'{level}.pop_density_log'] = np.log

In [18]:
# Keep the last 184 column names as a plain list (position-based selection).
full_pop_columns = [*full_pop_columns[-184:]]

In [19]:
# Join each split with its grunnkrets frame on the index. Columns present on both
# sides get the '_redundant' suffix on the right-hand copy, which is then dropped.
for split_name, split_df in store_dataframes.items():
    merged = split_df.merge(
        full_population_dataframes[split_name],
        how="outer",
        left_index=True,
        right_index=True,
        suffixes=('', '_redundant'),
    )
    duplicate_cols = merged.filter(regex='_redundant$').columns
    store_dataframes[split_name] = merged.drop(columns=duplicate_cols)

In [20]:
# Correlation of log_revenue with each population column, then pairs ranked by
# absolute correlation, strongest first.
train_pop_corr = store_dataframes["train"][["log_revenue", *full_pop_columns]].corr()
full_pop_dict = train_pop_corr.iloc[0].to_dict()
full_pop_sorted_relevant_dist_cols = [
    [col, corr]
    for col, corr in sorted(full_pop_dict.items(), key=lambda kv: abs(kv[1]), reverse=True)
]

In [21]:
# Column names at ranks 1 through 7 of the sorted population correlations.
full_pop_relevant_cols = [col for col, _ in full_pop_sorted_relevant_dist_cols[1:8]]

In [22]:
# Group the train columns by their geographic-level prefix.
train_columns = store_dataframes["train"].columns
fylke_relevant_features = [col for col in train_columns if col.startswith("fylke.")]
kommune_relevant_features = [col for col in train_columns if col.startswith("kommune.")]
delomrade_relevant_features = [col for col in train_columns if col.startswith("delomrade.")]
grunnkrets_relevant_features = [col for col in train_columns if col.startswith("grunnkrets_id.")]

In [23]:
from num_stores import add_num_stores_info

# Add the store-count columns to every split.
for split_name, split_df in store_dataframes.items():
    store_dataframes[split_name] = add_num_stores_info(split_df)

  grouped = geo_df.groupby([ level , cat]).sum()['count'].to_frame()
  grouped = geo_df.groupby([ level , cat]).sum()['count'].to_frame()
  grouped = geo_df.groupby([ level , cat]).sum()['count'].to_frame()
  grouped = geo_df.groupby([ level , cat]).sum()['count'].to_frame()
  grouped = geo_df.groupby([ level , cat]).sum()['count'].to_frame()
  grouped = geo_df.groupby([ level , cat]).sum()['count'].to_frame()
  grouped = geo_df.groupby([ level , cat]).sum()['count'].to_frame()
  grouped = geo_df.groupby([ level , cat]).sum()['count'].to_frame()
  grouped = geo_df.groupby([ level , cat]).sum()['count'].to_frame()
  grouped = geo_df.groupby([ level , cat]).sum()['count'].to_frame()
  grouped = geo_df.groupby([ level , cat]).sum()['count'].to_frame()
  grouped = geo_df.groupby([ level , cat]).sum()['count'].to_frame()
  grouped = geo_df.groupby([ level , cat]).sum()['count'].to_frame()
  grouped = geo_df.groupby([ level , cat]).sum()['count'].to_frame()
  grouped = geo_df.groupby([ level

In [24]:
# Names of the last 64 train columns, selected by position.
# NOTE(review): presumably the columns appended by add_num_stores_info — verify.
num_store_cols = store_dataframes["train"].columns[-64:].tolist()

In [25]:
# Correlation of log_revenue with each store-count column, ranked by absolute
# value in descending order.
num_store_dict = store_dataframes["train"][["log_revenue", *num_store_cols]].corr().iloc[0].to_dict()
num_store_sorted_relevant_cols = list(
    map(list, sorted(num_store_dict.items(), key=lambda kv: abs(kv[1]), reverse=True))
)

In [26]:
# Column names at ranks 1 through 14 of the sorted store-count correlations.
num_store_relevant_cols = [pair[0] for pair in num_store_sorted_relevant_cols[1:15]]

## Adding mean revenue (plaace_cat, level)

In [27]:
from avg_revenue import add_avg_revenue, create_avg_revenue_csvs

# NOTE(review): presumably writes the CSVs that add_avg_revenue reads — confirm in avg_revenue.py.
create_avg_revenue_csvs()

# Add the average-revenue columns to every split.
for split_name, split_df in store_dataframes.items():
    store_dataframes[split_name] = add_avg_revenue(split_df, total=True)

In [28]:
# Names of the last 20 train columns, selected by position.
mean_rev_cols = store_dataframes["train"].columns[-20:].tolist()

In [30]:
# Correlation of log_revenue with each mean-revenue column, as [column, correlation]
# pairs ordered from strongest to weakest absolute correlation.
mean_rev_corr_row = store_dataframes["train"][["log_revenue", *mean_rev_cols]].corr().iloc[0]
mean_rev_dict = mean_rev_corr_row.to_dict()
mean_rev_sorted_relevant_cols = [
    [col, corr]
    for col, corr in sorted(mean_rev_dict.items(), key=lambda kv: abs(kv[1]), reverse=True)
]

In [31]:
# Every ranked column name except the one at rank 0.
mean_rev_relevant_cols = [col for col, _ in mean_rev_sorted_relevant_cols[1:]]

In [32]:
# Add a log1p-transformed "<col>_log" twin of every selected mean-revenue column
# to each split, and remember the new column names.
for col in mean_rev_relevant_cols:
    log_col = col + "_log"
    for frame in store_dataframes.values():
        frame[log_col] = np.log1p(frame[col])
mean_rev_relevant_log_cols = [col + "_log" for col in mean_rev_relevant_cols]

In [33]:
from new_plaace_index import create_index_csv, add_new_plaace_index

# NOTE(review): presumably (re)generates the CSV that add_new_plaace_index reads —
# confirm in new_plaace_index.py.
create_index_csv()

In [34]:
# Add the new plaace index to every split.
for split_name in store_dataframes:
    store_dataframes[split_name] = add_new_plaace_index(store_dataframes[split_name])

In [35]:
from clustering import add_clusters, create_cluster_csv

# NOTE(review): presumably writes the cluster CSV consumed by add_clusters — confirm in clustering.py.
create_cluster_csv()

# Add the cluster columns to every split.
for split_name in store_dataframes:
    store_dataframes[split_name] = add_clusters(store_dataframes[split_name])

  geo_df = stores_total_train.append(stores_extra).append(stores_test)[['lat', 'lon', 'store_id']]


In [36]:
# chain_count feature: number of training stores in the store's chain, and 0 for
# the "OTHER" bucket or for any chain that is missing from chain_count.
# The previous condition was inverted (`x in chain_count.keys()` returned 0), so
# every known chain got 0 and every unknown chain hit chain_count[x] -> KeyError.
for df_name, df in store_dataframes.items():
    store_dataframes[df_name]["chain_count"] = store_dataframes[df_name].bounded_chain_name.apply(
        lambda x: 0 if x == "OTHER" else chain_count.get(x, 0)
    )

In [39]:
# Hand-picked competitor-distance columns. This assignment replaces the
# comp_relevant_cols list derived from the correlation ranking earlier on.
comp_relevant_cols = [
    "sum_dist_to_nearest_10_comp_plaace_1",
    "mean_dist_to_nearest_7_comp_plaace_1",
    "sum_dist_to_nearest_5_comp_plaace_1",
    "sum_dist_to_nearest_4_comp_plaace_1",
    "mean_dist_to_nearest_3_comp_plaace_1",
    "sum_dist_to_nearest_2_comp_plaace_1",
    "sum_dist_to_nearest_1_comp_plaace_1",
]

# Transforming the data

In [40]:
from RMSLE import rmsle
from pred_var_utils import reverse_log1p_transform_pred_var
from sklearn.model_selection import GridSearchCV

In [96]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder


# Categorical columns that are ordinal-encoded: missing values are filled with a
# constant first, and categories unseen during fit are encoded as -1.
OE_categorical_features = [
    "bounded_chain_name",
    "kommune",
    "delomrade",
    "is_grocery",
    "plaace_cat_2",
    "plaace_cat_3",
    "plaace_cat_4",
    "grunnkrets_id",
]
OE_categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(missing_values=np.nan, strategy="constant")),
    ("encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
])

# Categorical column that is one-hot encoded: constant imputation first, and
# categories unseen during fit are ignored by the encoder.
OH_categorical_features = ["plaace_cat_1"]
OH_categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(missing_values=np.nan, strategy="constant")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])


# Numeric model inputs: coordinates, the log mean-revenue features and the
# selected competitor-distance columns.
numerical_features = [
    "lat",
    "lon",
    "log_mean_revenue_1",
    "log_mean_revenue_2",
    "log_mean_revenue_3",
    "log_mean_revenue_4",
    "log_chain_mean_revenue",
    *comp_relevant_cols,
]
# Mean-impute missing values, then standardise (centre and scale) each column.
numerical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler(with_mean=True, with_std=True)),
])


# Route each feature group through its own pipeline; columns not listed in any
# group are dropped from the output.
preprocessor = ColumnTransformer(
    [
        ("oe_cat", OE_categorical_transformer, OE_categorical_features),
        ("oh_cat", OH_categorical_transformer, OH_categorical_features),
        ("num", numerical_transformer, numerical_features),
    ],
    remainder='drop',
)


# Fit the preprocessing on the training split, then apply the fitted
# transformation unchanged to the test split.
X_train = preprocessor.fit_transform(store_dataframes["train"])
X_test = preprocessor.transform(store_dataframes["test"])

In [97]:
# Standardise the log-revenue target. mean_y and std_y are kept so that the
# predictions can be mapped back to the original scale later.
y_train = np.array(store_dataframes["train"].log_revenue)
mean_y, std_y = y_train.mean(), y_train.std()
y_train = (y_train - mean_y) / std_y

# PCA

In [98]:
# Buckets of train column names by intended encoding:
#   "num" - int/float columns, "OE" - bool columns and object columns with more
#   than 10 distinct values, "OH" - object columns with at most 10 distinct values.
PCA_cols_dict = {
    "OE": [],
    "OH": [],
    "num": [],
}

# Identifier, free-text and target columns that must never become features.
reduntant_cols = ["store_id", "year", "store_name", "plaace_hierarchy_id", "sales_channel_name", "address", "revenue", "log_revenue", "point", "plaace_cat_0"]

for col_name, dtype in store_dataframes["train"].dtypes.to_dict().items():
    if col_name in reduntant_cols:
        continue
    if dtype == int or dtype == float:
        _type = "num"
    elif dtype == bool:
        _type = "OE"
    elif dtype == object:
        _type = "OH" if store_dataframes["train"][col_name].nunique() <= 10 else "OE"
    else:
        # Report and skip the column. Without this `continue` the column was
        # appended to whatever bucket the previous column used (or raised
        # NameError if it was the very first column).
        print(f"Unknown type {dtype} encountered for columns {col_name}")
        continue
    PCA_cols_dict[_type].append(col_name)
    

In [99]:
# First 339 numeric column names (copied slice), selected by position.
all_num_cols = PCA_cols_dict["num"][:339]
# Append the third- and second-to-last numeric columns to the ordinal bucket, in place.
# NOTE(review): position-based; confirm which columns these are.
PCA_cols_dict["OE"].extend(PCA_cols_dict["num"][-3:-1])

In [100]:
# Alias (same list object, not a copy) for the one-hot bucket of PCA_cols_dict.
all_OH_cols = PCA_cols_dict["OH"]

In [101]:
# Alias (same list object, not a copy) for the ordinal bucket of PCA_cols_dict.
all_OE_cols = PCA_cols_dict["OE"]

In [102]:
import numpy as np
from sklearn.decomposition import PCA

### Full population cols

In [103]:
# 184 numeric columns (positions 92 up to, not including, 276), selected by position.
# NOTE(review): presumably the grunnkrets population columns, going by the name —
# verify the positions whenever upstream feature steps change.
full_pop_cols = all_num_cols[92:276]

In [104]:
# Only the population columns go through the numeric pipeline here; every other
# column is dropped.
full_pop_preprocessor = ColumnTransformer(
    [("num", numerical_transformer, full_pop_cols)],
    remainder='drop',
)

In [105]:
# Fit the population preprocessing on the training split, then apply the fitted
# transformation unchanged to the test split.
pre_PCA_X_train_full_pop_cols = full_pop_preprocessor.fit_transform(store_dataframes["train"])
pre_PCA_X_test_full_pop_cols = full_pop_preprocessor.transform(store_dataframes["test"])

In [106]:
# Reduce the population columns to 3 principal components: fitted on train and
# applied to test. random_state pins the result when PCA's automatic solver
# choice is a randomized one, matching the seed 0 used for the model.
pca = PCA(n_components=3, random_state=0)
PCA_X_train = pca.fit_transform(pre_PCA_X_train_full_pop_cols)
PCA_X_test = pca.transform(pre_PCA_X_test_full_pop_cols)

In [107]:
# Print, one per line: total explained variance ratio, the per-component ratios,
# and the singular values.
for pca_summary in (
    np.sum(pca.explained_variance_ratio_),
    pca.explained_variance_ratio_,
    pca.singular_values_,
):
    print(pca_summary)

0.7788648525545628
[0.41349462 0.26895047 0.09641976]
[989.11649004 797.71666919 477.6343187 ]


In [108]:
# Append the population principal components as extra feature columns.
# Re-running this cell appends them again, because X_train/X_test are overwritten
# with their own extension.
X_train = np.hstack((X_train, PCA_X_train))
X_test = np.hstack((X_test, PCA_X_test))

### Bus distance cols

In [109]:
# 20 numeric columns (positions 72 up to, not including, 92), selected by position.
# NOTE(review): presumably the closest_bus_stop_* columns, going by the name — verify.
bus_stop_cols = all_num_cols[72:92]

In [110]:
# Only the bus-stop columns go through the numeric pipeline here; every other
# column is dropped.
bus_stop_preprocessor = ColumnTransformer(
    [("num", numerical_transformer, bus_stop_cols)],
    remainder='drop',
)

In [111]:
# Fit the imputer/scaler on the training split only and reuse those statistics
# for the test split. Calling fit_transform on the test frame (as before)
# re-estimated the mean/std from test data, so the two splits were scaled
# differently before the train-fitted PCA was applied to them.
pre_PCA_X_train_bus_stop_cols = bus_stop_preprocessor.fit_transform(store_dataframes["train"])
pre_PCA_X_test_bus_stop_cols = bus_stop_preprocessor.transform(store_dataframes["test"])

In [112]:
# Reduce the bus-stop columns to 2 principal components: fitted on train and
# applied to test. random_state pins the result when PCA's automatic solver
# choice is a randomized one, matching the seed 0 used for the model.
pca = PCA(n_components=2, random_state=0)
PCA_X_train = pca.fit_transform(pre_PCA_X_train_bus_stop_cols)
PCA_X_test = pca.transform(pre_PCA_X_test_bus_stop_cols)

In [113]:
# Print, one per line: total explained variance ratio, the per-component ratios,
# and the singular values.
explained_ratio = pca.explained_variance_ratio_
print(np.sum(explained_ratio), explained_ratio, pca.singular_values_, sep="\n")

0.9445344957042299
[0.79420438 0.15033011]
[451.94411556 196.62629024]


In [114]:
# Append the bus-stop principal components as extra feature columns.
# Re-running this cell appends them again, because X_train/X_test are overwritten
# with their own extension.
X_train = np.hstack((X_train, PCA_X_train))
X_test = np.hstack((X_test, PCA_X_test))

# Training the model

## CatBoost

In [115]:
from catboost import CatBoostRegressor
from sklearn.model_selection import GridSearchCV


# best params (8.11)
# {'depth': 6, 'l2_leaf_reg': 10, 'learning_rate': 0.05, 'eval_metric': 'RMSE'}
# rmsle(on val) = 0.7148919867904334

# best params (12.11) (all features)
# {'depth': 6, 'l2_leaf_reg': 10, 'learning_rate': 0.05, 'eval_metric': 'RMSE'}
# rmsle(on val) = 0.7177413486698632

# Final model: fixed seed, silent training, RMSE metric.
cb = CatBoostRegressor(
    random_seed=0,
    verbose=False,
    eval_metric="RMSE",
    rsm=0.1,
    depth=8,
    l2_leaf_reg=8,
    learning_rate=0.03,
)

# Search space for the (currently disabled) hyper-parameter search below.
# depth stops at 16, CatBoost's maximum tree depth; the former candidate 20 would
# make the search fail as soon as it was sampled. `grid` is kept as an alias.
cb_params = grid = {
    'learning_rate': [0.01, 0.03, 0.05, 0.1, 0.3],
    'depth': [5, 6, 8, 10, 15, 16],
    'l2_leaf_reg': [3, 4, 5, 6, 7, 8, 10, 15],
}

#cb_clf = cb.randomized_search(cb_params, X=X_train, y=y_train)

In [74]:
# Show the tuned hyper-parameters as configured on the estimator. `cb_clf` only
# exists if the disabled randomized search has been run in this kernel, so
# referencing it raises NameError on a fresh Restart & Run All.
{name: cb.get_params()[name] for name in ("depth", "l2_leaf_reg", "learning_rate")}

{'depth': 8, 'l2_leaf_reg': 8, 'learning_rate': 0.03}

In [116]:
# Train the configured CatBoost model on the assembled feature matrix and the
# standardised log-revenue target.
cb.fit(X_train, y_train)

<catboost.core.CatBoostRegressor at 0x1738079d0>

## Creating the submission

In [117]:
# Predict on the test set and map the predictions back through the helper, using
# the target's stored std and mean.
y_test_pred = reverse_log1p_transform_pred_var(cb.predict(X_test), std_y, mean_y)

# Build the submission frame.
# NOTE: ids come from stores_test, so the prediction order must match its row order.
submission = pd.DataFrame(
    {
        "id": stores_test.store_id,
        "predicted": np.asarray(y_test_pred),
    }
)

# Write to disk without the index column.
submission.to_csv('submission.csv', index=False)