# Submission

In [2]:
# Magic to automatically update imports if functions in utils are changed
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from tqdm import tqdm
from pathlib import Path

In [3]:
# Mode switch for the whole notebook:
#   True  -> train on the full training set (no validation split is created).
#   False -> hold out part of the training rows as a "val" split for evaluation.
submission_with_whole_train = False

# Feature engineering (option 1)

In [4]:
from sklearn.model_selection import train_test_split

# Load the raw train / test / extra store tables from the local data/ folder.
stores_train = pd.read_csv("data/stores_train.csv")
stores_test = pd.read_csv("data/stores_test.csv")
stores_extra = pd.read_csv("data/stores_extra.csv")
if not submission_with_whole_train:
    # Hold out 20% of the training rows for validation; the fixed seed keeps the split reproducible.
    stores_train, stores_val = train_test_split(stores_train, test_size=0.2, random_state=0)

In [5]:
from utils import split_plaace_cat


# Registry of every data split, keyed by name, so that each feature step can
# simply loop over all splits.
store_dataframes = {
    "train": stores_train,
    "extra": stores_extra,
    "test": stores_test,
}

if not submission_with_whole_train:
    store_dataframes["val"] = stores_val

# Apply split_plaace_cat to every split (presumably it adds the plaace_cat_*
# columns used further down — confirm in utils).
# The result is written back into the registry: rebinding only the loop
# variable would silently drop the new columns if the helper returns a new
# frame instead of mutating its argument.
for df_name, df in store_dataframes.items():
    split_df = split_plaace_cat(df)
    # Keep the original frame if the helper works in place and returns None.
    store_dataframes[df_name] = df if split_df is None else split_df

In [6]:
# Add the log1p-transformed revenue target to the train split and, when it
# exists, to the validation split.
labelled_splits = ("train",) if submission_with_whole_train else ("train", "val")
for split_name in labelled_splits:
    revenue = store_dataframes[split_name].revenue
    store_dataframes[split_name]["log_revenue"] = revenue.map(np.log1p)

In [7]:
from utils import mean_rev_of_competitor, log_mean_rev_of_competitor, create_geographical_columns, create_chain_and_mall_columns, generate_chain_rev_dict, generate_plaace_rev_dict, create_mean_chain_rev_col

# Number of training stores per chain name.
# NOTE(review): read from `stores_train`, not store_dataframes["train"] — confirm both hold the same rows.
chain_count = stores_train["chain_name"].value_counts().to_dict()

for df_name, df in tqdm(store_dataframes.items()):
    # Chain the two steps so the chain/mall step builds on the output of the
    # geographical step; feeding it the untouched `df` would lose the
    # geographical columns whenever the helper returns a new frame.
    df = create_geographical_columns(df)
    store_dataframes[df_name] = create_chain_and_mall_columns(df, chain_count, lower_limit=1)

# Revenue lookups per chain, computed from the training split only.
chain_rev_dict, log_bounded_chain_rev_dict = generate_chain_rev_dict(store_dataframes["train"], quantile=0)

# For each of the four plaace category levels: build revenue lookups from the
# training split and attach the (log) mean-revenue columns to every split.
for i in tqdm(range(1, 5)):
    rev_plaace_dict, mean_plaace_revenue, log_rev_plaace_dict, log_mean_plaace_revenue = generate_plaace_rev_dict(store_dataframes["train"], i, quantile=0)
    for df_name in store_dataframes:
        store_dataframes[df_name] = mean_rev_of_competitor(store_dataframes[df_name], i, rev_dict=rev_plaace_dict, mean_revenue=mean_plaace_revenue)
        store_dataframes[df_name] = log_mean_rev_of_competitor(store_dataframes[df_name], i, log_rev_dict=log_rev_plaace_dict, log_mean_revenue=log_mean_plaace_revenue)

for df_name, df in tqdm(store_dataframes.items()):
    store_dataframes[df_name] = create_mean_chain_rev_col(df, bounded_chain_revs=chain_rev_dict, log_bounded_chain_revs=log_bounded_chain_rev_dict)

100%|██████████| 4/4 [00:00<00:00, 46.78it/s]
100%|██████████| 4/4 [00:00<00:00, 19.74it/s]
100%|██████████| 4/4 [00:00<00:00, 236.17it/s]


In [8]:
from utils import concat_df_keep_unq_index
# Stack every available split into one frame of all known stores; it is passed
# as `training_df` to the nearest-competitor search below.
concat_df = store_dataframes["train"]
for split_name in ("val", "extra", "test"):
    # "val" only exists when a validation split was made, so skip missing splits
    # instead of failing with a KeyError in whole-train mode.
    if split_name in store_dataframes:
        concat_df = concat_df_keep_unq_index(concat_df, store_dataframes[split_name])

In [9]:
from utils import find_dist_to_nearest_comp

# Parameters for the nearest-competitor distance features.
# Presumably: plaace category levels to match competitors on, and the numbers
# of nearest competitors to aggregate over — confirm in utils.
nearest_comp_plaace_cat_gran = [1, 2, 3, 4]
n_nearest_comp = [1, 2, 3, 4, 5, 7, 10]

# Competitors are looked up in concat_df (all splits stacked together).
store_dataframes["train"] = find_dist_to_nearest_comp(
    store_dataframes["train"], 
    nearest_comp_plaace_cat_gran, 
    n_nearest_comp, 
    training=True, 
    training_df=concat_df,
    )

In [10]:
# Same nearest-competitor features for the validation split (only present
# when a validation split was made).
if not submission_with_whole_train:
    store_dataframes["val"] = find_dist_to_nearest_comp(
        store_dataframes["val"], 
        nearest_comp_plaace_cat_gran, 
        n_nearest_comp, 
        training=True, 
        training_df=concat_df,
        )

In [11]:
# Same nearest-competitor features for the test split.
# NOTE(review): `training=True` is also passed for the test set — confirm this
# is the intended mode of find_dist_to_nearest_comp here.
store_dataframes["test"] = find_dist_to_nearest_comp(
    store_dataframes["test"], 
    nearest_comp_plaace_cat_gran, 
    n_nearest_comp, 
    training=True,
    training_df=concat_df
)

In [12]:
# Takes the last 56 columns of the train frame by position.
# Assumes these are exactly the columns appended by find_dist_to_nearest_comp
# (56 would match 4 levels x 7 neighbour counts x 2 aggregates) — TODO confirm;
# the slice silently breaks if any column is added after that step.
comp_plaace_cols = list(store_dataframes["train"].columns[-56:])

In [13]:
# Correlation of every competitor-distance column with log revenue (first row
# of the correlation matrix), ranked by absolute value, strongest first.
# The first entry is log_revenue itself.
target_and_features = ["log_revenue", *comp_plaace_cols]
dist_dict = store_dataframes["train"][target_and_features].corr().iloc[0].to_dict()
sorted_relevant_dist_cols = [
    [name, corr]
    for name, corr in sorted(dist_dict.items(), key=lambda pair: abs(pair[1]), reverse=True)
]

In [14]:
# Skip entry 0 (log_revenue itself) and keep the names of every second ranked
# column among positions 1..13.
comp_relevant_cols = [name for name, _ in sorted_relevant_dist_cols[1:14:2]]

In [15]:
# Hand-picked alternative list of competitor-distance columns.
# Not referenced by any other cell visible in this notebook.
new_comp_relevant_cols = [
    'sum_dist_to_nearest_10_comp_plaace_1',
    'sum_dist_to_nearest_3_comp_plaace_1',
    'sum_dist_to_nearest_1_comp_plaace_1', 
    'sum_dist_to_nearest_3_comp_plaace_2',
    'sum_dist_to_nearest_1_comp_plaace_2',
    'sum_dist_to_nearest_1_comp_plaace_3',
    ]

In [16]:
from bus_utils import find_closest_bus_stop

# Neighbourhood sizes and aggregate switches handed to find_closest_bus_stop.
bus_stop_n = [1, 2, 3, 5, 7, 10, 15, 25, 50, 100]
bus_mean = True
bus_sum = True

# Names of the bus-stop feature columns for the enabled aggregates
# (sum columns first, then mean columns).
bus_stop_columns = []
if bus_sum:
    bus_stop_columns.extend(f"closest_bus_stop_sum_{i}" for i in bus_stop_n)
if bus_mean:
    bus_stop_columns.extend(f"closest_bus_stop_mean_{i}" for i in bus_stop_n)

# Slow step: the recorded run took about 12 minutes for the four splits.
for df_name, df in tqdm(store_dataframes.items()):
    store_dataframes[df_name] = find_closest_bus_stop(
        df,
        bus_stop_n,
        _sum=bus_sum,
        _mean=bus_mean,
    )

100%|██████████| 4/4 [12:07<00:00, 181.96s/it]


In [17]:
# Correlation of every bus-stop column with log revenue on the train split,
# ranked by absolute value, strongest first (entry 0 is log_revenue itself).
target_and_bus_cols = ["log_revenue", *bus_stop_columns]
bus_dict = store_dataframes["train"][target_and_bus_cols].corr().iloc[0].to_dict()
bus_sorted_relevant_dist_cols = [
    [name, corr]
    for name, corr in sorted(bus_dict.items(), key=lambda pair: abs(pair[1]), reverse=True)
]

In [18]:
# Skip entry 0 (log_revenue itself) and keep the name of every second ranked column.
bus_relevant_cols = [name for name, _ in bus_sorted_relevant_dist_cols[1::2]]

In [19]:
# Keep only the first five of the selected bus-stop columns.
bus_relevant_cols = bus_relevant_cols[:5]

In [21]:
from grunnkrets_old import make_grunnkrets_df

# Build one make_grunnkrets_df frame per split.
full_population_dataframes = {}
# Column index of the most recently built frame; stays an empty list when
# there are no splits.
full_pop_columns = []

for df_name, df in tqdm(store_dataframes.items()):
    population_df = make_grunnkrets_df(df)
    full_population_dataframes[df_name] = population_df
    full_pop_columns = population_df.columns

  full_population_df[f'{level}.income_density_log']  = np.log1p(full_population_df[f'{level}.income_density'])
  full_population_df[f'{level}.income_density']  = full_population_df[f'{level}.total_income']/full_population_df[f'{level}.area_km2']
  full_population_df[f'{level}.income_density_log']  = np.log1p(full_population_df[f'{level}.income_density'])
  full_population_df[f'{level}.pop_density'] = full_population_df[f'{level}.tot_pop']/full_population_df[f'{level}.area_km2']
  full_population_df[f'{level}.pop_density_log'] = np.log1p(full_population_df[f'{level}.pop_density'])
  full_population_df[f'{level}.pop_density'] = full_population_df[f'{level}.tot_pop']/full_population_df[f'{level}.area_km2']
  full_population_df[f'{level}.pop_density_log'] = np.log1p(full_population_df[f'{level}.pop_density'])
  full_population_df[f'{level}.pop_density'] = full_population_df[f'{level}.tot_pop']/full_population_df[f'{level}.area_km2']
  full_population_df[f'{level}.pop_density_log'] = np.log

In [22]:
# Keep the last 184 column names by position; assumes those are the features
# appended by make_grunnkrets_df — TODO confirm.
full_pop_columns = list(full_pop_columns[-184:])

In [23]:
full_pop_columns

['grunnkrets_id.age_0-6',
 'grunnkrets_id.age_7-13',
 'grunnkrets_id.age_14-20',
 'grunnkrets_id.age_21-27',
 'grunnkrets_id.age_28-34',
 'grunnkrets_id.age_35-41',
 'grunnkrets_id.age_42-48',
 'grunnkrets_id.age_49-55',
 'grunnkrets_id.age_56-62',
 'grunnkrets_id.age_63-69',
 'grunnkrets_id.age_70-76',
 'grunnkrets_id.age_77-83',
 'grunnkrets_id.age_84-90',
 'delomrade.age_0-6',
 'delomrade.age_7-13',
 'delomrade.age_14-20',
 'delomrade.age_21-27',
 'delomrade.age_28-34',
 'delomrade.age_35-41',
 'delomrade.age_42-48',
 'delomrade.age_49-55',
 'delomrade.age_56-62',
 'delomrade.age_63-69',
 'delomrade.age_70-76',
 'delomrade.age_77-83',
 'delomrade.age_84-90',
 'kommune.age_0-6',
 'kommune.age_7-13',
 'kommune.age_14-20',
 'kommune.age_21-27',
 'kommune.age_28-34',
 'kommune.age_35-41',
 'kommune.age_42-48',
 'kommune.age_49-55',
 'kommune.age_56-62',
 'kommune.age_63-69',
 'kommune.age_70-76',
 'kommune.age_77-83',
 'kommune.age_84-90',
 'fylke.age_0-6',
 'fylke.age_7-13',
 'fylke.ag

In [24]:
# Join each split with its population frame on the index. Columns present on
# both sides get the '_redundant' suffix on the right-hand copy, which is then
# dropped so only the left-hand version survives.
for df_name, df in store_dataframes.items():
    merged = df.merge(
        full_population_dataframes[df_name],
        how="outer",
        left_index=True,
        right_index=True,
        suffixes=('', '_redundant'),
    )
    duplicate_cols = merged.filter(regex='_redundant$').columns
    store_dataframes[df_name] = merged.drop(columns=duplicate_cols)

In [25]:
# Correlation of every population column with log revenue on the train split,
# ranked by absolute value, strongest first (entry 0 is log_revenue itself).
target_and_pop_cols = ["log_revenue", *full_pop_columns]
full_pop_dict = store_dataframes["train"][target_and_pop_cols].corr().iloc[0].to_dict()
full_pop_sorted_relevant_dist_cols = [
    [name, corr]
    for name, corr in sorted(full_pop_dict.items(), key=lambda pair: abs(pair[1]), reverse=True)
]

In [26]:
# Names of the seven strongest-correlated population columns (entry 0, the
# target itself, is skipped).
full_pop_relevant_cols = [name for name, _ in full_pop_sorted_relevant_dist_cols[1:8]]

In [27]:
# Group the train columns by the geographic-level prefix in their name.
train_columns = list(store_dataframes["train"].columns)
fylke_relevant_features = [name for name in train_columns if name.startswith("fylke.")]
kommune_relevant_features = [name for name in train_columns if name.startswith("kommune.")]
delomrade_relevant_features = [name for name in train_columns if name.startswith("delomrade.")]
grunnkrets_relevant_features = [name for name in train_columns if name.startswith("grunnkrets_id.")]

In [28]:
from num_stores import add_num_stores_info

# Run add_num_stores_info on every split and store the returned frame.
for df_name, df in store_dataframes.items():
    store_dataframes[df_name] = add_num_stores_info(df)

  grouped = geo_df.groupby([ level , cat]).sum()['count'].to_frame()
  grouped = geo_df.groupby([ level , cat]).sum()['count'].to_frame()
  grouped = geo_df.groupby([ level , cat]).sum()['count'].to_frame()
  grouped = geo_df.groupby([ level , cat]).sum()['count'].to_frame()
  grouped = geo_df.groupby([ level , cat]).sum()['count'].to_frame()
  grouped = geo_df.groupby([ level , cat]).sum()['count'].to_frame()
  grouped = geo_df.groupby([ level , cat]).sum()['count'].to_frame()
  grouped = geo_df.groupby([ level , cat]).sum()['count'].to_frame()
  grouped = geo_df.groupby([ level , cat]).sum()['count'].to_frame()
  grouped = geo_df.groupby([ level , cat]).sum()['count'].to_frame()
  grouped = geo_df.groupby([ level , cat]).sum()['count'].to_frame()
  grouped = geo_df.groupby([ level , cat]).sum()['count'].to_frame()
  grouped = geo_df.groupby([ level , cat]).sum()['count'].to_frame()
  grouped = geo_df.groupby([ level , cat]).sum()['count'].to_frame()
  grouped = geo_df.groupby([ level

In [29]:
# Last 64 train columns by position; assumes they are exactly the columns
# appended by add_num_stores_info — TODO confirm.
num_store_cols = list(store_dataframes["train"].columns[-64:])

In [30]:
# Correlation of every store-count column with log revenue on the train split,
# ranked by absolute value, strongest first (entry 0 is log_revenue itself).
target_and_count_cols = ["log_revenue", *num_store_cols]
num_store_dict = store_dataframes["train"][target_and_count_cols].corr().iloc[0].to_dict()
num_store_sorted_relevant_cols = [
    [name, corr]
    for name, corr in sorted(num_store_dict.items(), key=lambda pair: abs(pair[1]), reverse=True)
]
num_store_sorted_relevant_cols

[['log_revenue', 1.0],
 ['fylke.plaace_cat_1_per_capita', 0.3036462055602537],
 ['fylke.plaace_cat_1_per_tot_income', 0.29950542010049447],
 ['kommune.plaace_cat_1_per_capita', 0.25661756586529566],
 ['kommune.plaace_cat_1_per_tot_income', 0.245370813652734],
 ['fylke.plaace_cat_1_count', 0.18265093811288075],
 ['fylke.plaace_cat_4_per_capita', 0.16374528532317106],
 ['fylke.plaace_cat_4_per_tot_income', 0.16261122183977433],
 ['fylke.plaace_cat_3_per_capita', 0.1510299207483036],
 ['fylke.plaace_cat_3_per_tot_income', 0.14797391353258624],
 ['grunnkrets_id.plaace_cat_1_count', 0.1230872564366986],
 ['delomrade.plaace_cat_1_count', 0.11156949047890931],
 ['kommune.plaace_cat_1_per_km2', 0.08386190923409534],
 ['fylke.plaace_cat_4_count', 0.07989315137361924],
 ['fylke.plaace_cat_3_count', 0.07410923091607406],
 ['kommune.plaace_cat_3_per_capita', 0.06780469618921373],
 ['grunnkrets_id.plaace_cat_1_per_capita', 0.06542117839938809],
 ['delomrade.plaace_cat_1_per_capita', 0.0653411991997

In [31]:
# Names of the fourteen strongest-correlated store-count columns (entry 0, the
# target itself, is skipped).
num_store_relevant_cols = [name for name, _ in num_store_sorted_relevant_cols[1:15]]

## Adding mean revenue (plaace_cat, level)

In [32]:
from avg_revenue import add_avg_revenue

# Run add_avg_revenue on every split; `total` mirrors the whole-train switch.
for df_name, df in store_dataframes.items():
    store_dataframes[df_name] = add_avg_revenue(df, total=submission_with_whole_train)

In [33]:
# Last 20 train columns by position; assumes they are exactly the columns
# appended by add_avg_revenue — TODO confirm.
mean_rev_cols = list(store_dataframes["train"].columns)[-20:]

In [34]:
# Correlation of every mean-revenue column with log revenue, ranked by
# absolute value, strongest first (entry 0 is log_revenue itself).
# The validation split is used when it exists; in whole-train mode there is no
# "val" entry, so fall back to the train split instead of raising a KeyError.
# NOTE(review): presumably validation is preferred because these features are
# derived from training revenue — confirm.
corr_split = "val" if "val" in store_dataframes else "train"
mean_rev_dict = store_dataframes[corr_split][['log_revenue'] + mean_rev_cols].corr().iloc[0].to_dict()
mean_rev_sorted_relevant_cols = [[k, v] for k, v in sorted(mean_rev_dict.items(), key=lambda item: abs(item[1]), reverse=True)]

In [35]:
# All ranked mean-revenue column names, dropping entry 0 (the target itself).
mean_rev_relevant_cols = [name for name, _ in mean_rev_sorted_relevant_cols[1:]]

In [36]:
# Add a log1p-transformed "<col>_log" twin of every mean-revenue column to
# every split, and remember the new column names.
mean_rev_relevant_log_cols = [col + "_log" for col in mean_rev_relevant_cols]
for split_df in store_dataframes.values():
    for col, log_col in zip(mean_rev_relevant_cols, mean_rev_relevant_log_cols):
        split_df[log_col] = split_df[col].map(np.log1p)

In [37]:
from new_plaace_index import create_index_csv, add_new_plaace_index

# Called for its side effect only (no return value is used). Judging by the
# name it writes an index CSV that add_new_plaace_index reads — TODO confirm.
create_index_csv()

In [38]:
# Run add_new_plaace_index on every split and store the returned frame.
for df_name, df in store_dataframes.items():
    store_dataframes[df_name] = add_new_plaace_index(store_dataframes[df_name])

In [39]:
from clustering import add_clusters

# Run add_clusters on every split and store the returned frame.
for df_name, df in store_dataframes.items():
    store_dataframes[df_name] = add_clusters(store_dataframes[df_name])

In [40]:
# Add the number of training stores in each store's chain.
# Stores labelled "OTHER" and chains never seen in the training data get 0;
# every known chain gets its count from `chain_count`. The membership test
# must be negated: with a plain `in`, known chains would all receive 0 and
# unknown ones would raise a KeyError on the lookup.
for df_name, df in store_dataframes.items():
    store_dataframes[df_name]["chain_count"] = store_dataframes[df_name].bounded_chain_name.apply(
        lambda x: 0 if (x == "OTHER" or x not in chain_count) else chain_count[x]
    )

# Read data directly from CSV (option 2)

## Train, val split

In [851]:
# Alternative to the feature-engineering cells above: reload previously
# exported feature tables, using the first CSV column as the index.
# NOTE(review): the frames are collected in `old_store_dataframes`, while the
# cells below read from `store_dataframes` — confirm which one is intended.
if submission_with_whole_train:
    # Whole-train mode: no validation table exists.
    stores_train = pd.read_csv("temp_data/full_train_features_train.csv", index_col=0)
    stores_extra = pd.read_csv("temp_data/full_train_features_extra.csv", index_col=0)
    stores_test = pd.read_csv("temp_data/full_train_features_test.csv", index_col=0)

    old_store_dataframes = {
        "train": stores_train, 
        "extra": stores_extra, 
        "test": stores_test, 
        }
else:
    stores_train = pd.read_csv("temp_data/full_features_train.csv", index_col=0)
    stores_val = pd.read_csv("temp_data/full_features_val.csv", index_col=0)
    stores_extra = pd.read_csv("temp_data/full_features_extra.csv", index_col=0)
    stores_test = pd.read_csv("temp_data/full_features_test.csv", index_col=0)

    old_store_dataframes = {
        "train": stores_train, 
        "extra": stores_extra, 
        "test": stores_test, 
        "val": stores_val
        }

In [739]:
# Restore the selected feature-name lists from disk: one comma-separated list
# per line, in the order competitor / bus / store-count / population.
with open("relevant_cols.txt", "r") as cols_file:
    data = list(cols_file)

comp_relevant_cols, bus_relevant_cols, num_store_relevant_cols, full_pop_relevant_cols = (
    data[row].strip().split(",") for row in range(4)
)

In [740]:
# Keep every second entry. Not idempotent: re-running this cell halves the list again.
comp_relevant_cols = comp_relevant_cols[::2]

In [741]:
# Append hand-picked competitor-distance columns.
# Not idempotent: re-running this cell appends the same names again.
comp_relevant_cols += [
    'sum_dist_to_nearest_10_comp_plaace_2',
    'sum_dist_to_nearest_5_comp_plaace_2', 
    'sum_dist_to_nearest_10_comp_plaace_1',
    'sum_dist_to_nearest_5_comp_plaace_1', 
    'sum_dist_to_nearest_1_comp_plaace_1',
    'sum_dist_to_nearest_1_comp_plaace_2'
    ]

In [742]:
# Keep every second entry. Not idempotent: re-running this cell halves the list again.
bus_relevant_cols = bus_relevant_cols[::2]

In [743]:
num_store_relevant_cols

['fylke.plaace_cat_1_count',
 'fylke.plaace_cat_4_count',
 'fylke.plaace_cat_3_count',
 'delomrade.plaace_cat_1_count',
 'kommune.plaace_cat_1_count',
 'grunnkrets_id.plaace_cat_1_count']

# Transforming the data

In [41]:
from RMSLE import rmsle
from pred_var_utils import reverse_log1p_transform_pred_var
from sklearn.model_selection import GridSearchCV

In [72]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder


# Categorical columns that are ordinal-encoded.
OE_categorical_features = ["bounded_chain_name", "kommune", "delomrade", "is_grocery", "plaace_cat_2", "plaace_cat_3", "plaace_cat_4", "grunnkrets_id"]#"new_plaace"]
# Fill missing values with a constant, then map each category to an integer;
# categories not seen during fit are encoded as -1.
OE_categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(missing_values=np.nan, strategy="constant")),
        ("encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
    ]
)

# Categorical columns that are one-hot encoded.
OH_categorical_features = ["plaace_cat_1"] #["fylke", "plaace_cat_2"]
# Fill missing values with a constant, then one-hot encode; categories not
# seen during fit are ignored (handle_unknown="ignore").
OH_categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(missing_values=np.nan, strategy="constant")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)


# Numerical model inputs. The commented-out entries and the commented-out
# "+ ..." lines below are feature groups that are currently switched off.
numerical_features = ["lat", "lon", 
#"mean_revenue_1", "mean_revenue_2", "mean_revenue_3", "mean_revenue_4", 
"log_mean_revenue_1", 
"log_mean_revenue_2", 
"log_mean_revenue_3", 
"log_mean_revenue_4", 
#"fylke.plaace_cat_1_mean_revenue_log", #"fylke.plaace_cat_3_mean_revenue_log", 
#'grunnkrets_id.tot_pop',
#'delomrade.tot_pop',
#'kommune.tot_pop',
#'fylke.tot_pop',
"log_chain_mean_revenue"
] + comp_relevant_cols 
#+ bus_relevant_cols 
#+ num_store_relevant_cols 
#+ full_pop_relevant_cols 
#+ mean_rev_relevant_log_cols
# Mean-impute missing values, then standardise (centre and scale).
numerical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")), 
        ("scaler", StandardScaler(with_mean=True, with_std=True))]
)


# Apply each pipeline to its column group; every other column is dropped.
preprocessor = ColumnTransformer(
   transformers=[
       ("oe_cat", OE_categorical_transformer, OE_categorical_features),
       ("oh_cat", OH_categorical_transformer, OH_categorical_features),
       ("num", numerical_transformer, numerical_features),
   ],
   remainder='drop'
)

# Alternative preprocessor using the all_*_cols lists (switched off):
# preprocessor = ColumnTransformer(
#     transformers=[
#         ("oe_cat", OE_categorical_transformer, all_OE_cols),
#         ("oh_cat", OH_categorical_transformer, all_OH_cols),
#         ("num", numerical_transformer, all_num_cols[:16] + all_num_cols[72:276]),
#     ],
#     remainder='drop'
# )


# Fit on the train split only; validation is transformed with the fitted state.
X_train = preprocessor.fit_transform(store_dataframes["train"])
if not submission_with_whole_train:
    X_val = preprocessor.transform(store_dataframes["val"])

In [73]:
# Training target: log revenue, standardised with its own mean and standard
# deviation. Both statistics are kept so predictions can be mapped back.
train_log_revenue = np.array(store_dataframes["train"].log_revenue)
if not submission_with_whole_train:
    # The validation target stays on the raw revenue scale.
    y_val = np.array(store_dataframes["val"].revenue)
mean_y = train_log_revenue.mean()
std_y = train_log_revenue.std()

y_train = (train_log_revenue - mean_y) / std_y

In [74]:
X_train.shape

(10287, 29)

# Remove outliers

In [1170]:
from outlier_utils import remove_low_revenue

# Outlier-filtered training set, built from a copy so the "train" split stays
# untouched. The meaning of the -1 argument is defined in outlier_utils — TODO confirm.
store_dataframes["out_removed_train"] = remove_low_revenue(store_dataframes["train"].copy(), -1)

In [1116]:
from remove_outlier import remove_outliers

# Further outlier removal keyed on the "plaace_cat_4" column; the meaning of
# the 5 is defined in remove_outlier — TODO confirm.
store_dataframes["out_removed_train"] = remove_outliers(store_dataframes["out_removed_train"], "plaace_cat_4", 5)

In [1169]:
store_dataframes["out_removed_train"].shape[0]

10212

In [1118]:
# Recompute the nearest-competitor features for the outlier-filtered training
# set; competitors are still looked up in concat_df.
store_dataframes["out_removed_train"] = find_dist_to_nearest_comp(
    store_dataframes["out_removed_train"], 
    nearest_comp_plaace_cat_gran, 
    n_nearest_comp, 
    training=True, 
    training_df=concat_df,
    )

In [1119]:
# Recompute the (log) mean-revenue columns for the outlier-filtered training set.
# NOTE(review): the lookups are built from the unfiltered "train" split, while
# the chain lookups in the next cell use "out_removed_train" — confirm this
# difference is intended.
for i in tqdm(range(1, 5)):
    rev_plaace_dict, mean_plaace_revenue, log_rev_plaace_dict, log_mean_plaace_revenue = generate_plaace_rev_dict(store_dataframes["train"], i, quantile=0)
    store_dataframes["out_removed_train"] = mean_rev_of_competitor(store_dataframes["out_removed_train"], i, rev_dict=rev_plaace_dict, mean_revenue=mean_plaace_revenue)
    store_dataframes["out_removed_train"] = log_mean_rev_of_competitor(store_dataframes["out_removed_train"], i, log_rev_dict=log_rev_plaace_dict, log_mean_revenue=log_mean_plaace_revenue)

100%|██████████| 4/4 [00:00<00:00, 14.59it/s]


In [1120]:
# Rebuild the chain revenue lookups from the outlier-filtered training set.
# This overwrites the notebook-level chain_rev_dict / log_bounded_chain_rev_dict
# that were computed from the full training split earlier.
chain_rev_dict, log_bounded_chain_rev_dict = generate_chain_rev_dict(store_dataframes["out_removed_train"], quantile=0)
store_dataframes["out_removed_train"] = create_mean_chain_rev_col(store_dataframes["out_removed_train"], bounded_chain_revs=chain_rev_dict, log_bounded_chain_revs=log_bounded_chain_rev_dict)

# PCA

In [75]:
# Bucket every train column by how it should be encoded:
#   "num" - int/float columns,
#   "OE"  - bool columns and object columns with more than 10 distinct values,
#   "OH"  - object columns with at most 10 distinct values.
PCA_cols_dict = {
    "OE": [],
    "OH": [],
    "num": []
}

# Identifier, free-text and target columns that must not become features.
reduntant_cols = ["store_id", "year", "store_name", "plaace_hierarchy_id", "sales_channel_name", "address", "revenue", "log_revenue", "point", "plaace_cat_0"]

for col_name, dtype in store_dataframes["train"].dtypes.to_dict().items():
    if col_name in reduntant_cols:
        continue
    if dtype == int or dtype == float:
        _type = "num"
    elif dtype == bool:
        _type = "OE"
    elif dtype == object:
        if store_dataframes["train"][col_name].nunique() <= 10:
            _type = "OH"
        else:
            _type = "OE"
    else:
        print(f"Unknown type {dtype} encountered for columns {col_name}")
        # Skip the column: falling through would file it under the bucket of
        # the previous column (or raise NameError on the very first one).
        continue
    PCA_cols_dict[_type].append(col_name)
    

In [76]:
# First 339 numeric columns by position; later cells slice this list with
# hard-coded offsets, so it depends on the exact column order of the train frame.
all_num_cols = PCA_cols_dict["num"][:339]
# Move the second- and third-to-last numeric columns to the ordinal-encoded
# group. Not idempotent: re-running this cell appends them again.
PCA_cols_dict["OE"] += PCA_cols_dict["num"][-3:-1]

In [77]:
# Alias for the one-hot column list (same list object, not a copy); displayed for inspection.
all_OH_cols = PCA_cols_dict["OH"]
all_OH_cols

['plaace_cat_1']

In [78]:
# Alias for the ordinal-encoded column list (same list object, not a copy); displayed for inspection.
all_OE_cols = PCA_cols_dict["OE"]
all_OE_cols

['chain_name',
 'mall_name',
 'plaace_cat_2',
 'plaace_cat_3',
 'plaace_cat_4',
 'is_mall',
 'is_chain',
 'bounded_chain_name',
 'is_grocery',
 'old_plaace',
 'new_plaace_name',
 'new_plaace',
 'cluster']

In [79]:
import numpy as np
from sklearn.decomposition import PCA

### Count Cols

In [687]:
# Positional slice: numeric columns from index 276 onward. Assumes these are
# the per-area plaace-category store-count features — TODO confirm.
geo_plaace_cat_cols = all_num_cols[276:]

In [688]:
# Run only the store-count columns through numerical_transformer before PCA;
# all other columns are dropped.
count_cols_preprocessor = ColumnTransformer(
    transformers=[
        #("oe_cat", OE_categorical_transformer, PCA_cols_dict["OE"]),
        #("oh_cat", OH_categorical_transformer, PCA_cols_dict["OH"]),
        ("num", numerical_transformer, geo_plaace_cat_cols),
    ],
    remainder='drop'
)

In [689]:
# Fit the preprocessing on train only; validation reuses the fitted state.
pre_PCA_X_train_count_cols = count_cols_preprocessor.fit_transform(store_dataframes["train"])
if not submission_with_whole_train:
    pre_PCA_X_val_count_cols = count_cols_preprocessor.transform(store_dataframes["val"])

In [690]:
# Project the store-count columns onto 3 principal components (fitted on train only).
# `pca` and PCA_X_* are reused by every PCA section, so earlier fits are overwritten.
pca = PCA(n_components=3)
PCA_X_train = pca.fit_transform(pre_PCA_X_train_count_cols)
if not submission_with_whole_train:
    PCA_X_val = pca.transform(pre_PCA_X_val_count_cols)

In [691]:
# Total explained variance ratio of the kept components.
print(np.sum(pca.explained_variance_ratio_))

# Explained variance ratio per component.
print(pca.explained_variance_ratio_)

# Singular values of the kept components.
print(pca.singular_values_)

0.5916238986622017
[0.33331616 0.15396038 0.10434736]
[464.7750719  315.87782456 260.04911143]


In [692]:
# Append the PCA components as extra feature columns.
# Not idempotent: re-running this cell appends the same components again.
X_train = np.concatenate((X_train,PCA_X_train),axis=1)
if not submission_with_whole_train:
    X_val = np.concatenate((X_val,PCA_X_val),axis=1)

### Comp dist cols

In [598]:
# Positional slice: numeric columns 16..71. Assumes these are the 56
# competitor-distance features — TODO confirm.
PCA_comp_dist_cols = all_num_cols[16:72]

In [599]:
# Run only the competitor-distance columns through numerical_transformer
# before PCA; all other columns are dropped.
comp_dist_preprocessor = ColumnTransformer(
    transformers=[
        #("oe_cat", OE_categorical_transformer, PCA_cols_dict["OE"]),
        #("oh_cat", OH_categorical_transformer, PCA_cols_dict["OH"]),
        ("num", numerical_transformer, PCA_comp_dist_cols),
    ],
    remainder='drop'
)

In [600]:
# Fit the preprocessing on train only; validation reuses the fitted state.
pre_PCA_X_train_comp_cols = comp_dist_preprocessor.fit_transform(store_dataframes["train"])
if not submission_with_whole_train:
    pre_PCA_X_val_comp_cols = comp_dist_preprocessor.transform(store_dataframes["val"])

In [601]:
# Project the competitor-distance columns onto 5 principal components (fitted on train only).
# `pca` and PCA_X_* are reused by every PCA section, so earlier fits are overwritten.
pca = PCA(n_components=5)
PCA_X_train = pca.fit_transform(pre_PCA_X_train_comp_cols)
if not submission_with_whole_train:
    PCA_X_val = pca.transform(pre_PCA_X_val_comp_cols)

In [602]:
# Total explained variance ratio of the kept components.
print(np.sum(pca.explained_variance_ratio_))

# Explained variance ratio per component.
print(pca.explained_variance_ratio_)

# Singular values of the kept components.
print(pca.singular_values_)

0.9591197711131739
[0.52962483 0.26730788 0.10415261 0.03619452 0.02183993]
[552.3604204  392.41379081 244.94775403 144.39754555 112.16672492]


In [603]:
# Append the PCA components as extra feature columns.
# Not idempotent: re-running this cell appends the same components again.
X_train = np.concatenate((X_train,PCA_X_train),axis=1)
if not submission_with_whole_train:
    X_val = np.concatenate((X_val,PCA_X_val),axis=1)

### Full population cols

In [80]:
# Positional slice: numeric columns 144..275. Assumes these are the
# population features — TODO confirm.
full_pop_cols = all_num_cols[144:276]
# Bare expression in the middle of a cell: it has no effect and displays nothing.
full_pop_cols
# Positional sub-selection; not referenced by any other cell visible in this notebook.
full_pop_age_cols = full_pop_cols[:20] + full_pop_cols[25:40]

In [81]:
# Run only the population columns through numerical_transformer before PCA;
# all other columns are dropped.
full_pop_preprocessor = ColumnTransformer(
    transformers=[
        #("oe_cat", OE_categorical_transformer, PCA_cols_dict["OE"]),
        #("oh_cat", OH_categorical_transformer, PCA_cols_dict["OH"]),
        ("num", numerical_transformer, full_pop_cols),
    ],
    remainder='drop'
)

In [82]:
# Fit the preprocessing on train only; validation reuses the fitted state.
pre_PCA_X_train_full_pop_cols = full_pop_preprocessor.fit_transform(store_dataframes["train"])
if not submission_with_whole_train:
    pre_PCA_X_val_full_pop_cols = full_pop_preprocessor.transform(store_dataframes["val"])

In [83]:
# Project the population columns onto 3 principal components (fitted on train only).
# `pca` and PCA_X_* are reused by every PCA section, so earlier fits are overwritten.
pca = PCA(n_components=3)
PCA_X_train = pca.fit_transform(pre_PCA_X_train_full_pop_cols)
if not submission_with_whole_train:
    PCA_X_val = pca.transform(pre_PCA_X_val_full_pop_cols)

In [84]:
# Total explained variance ratio of the kept components.
print(np.sum(pca.explained_variance_ratio_))

# Explained variance ratio per component.
print(pca.explained_variance_ratio_)

# Singular values of the kept components.
print(pca.singular_values_)

0.7176714472755126
[0.40588052 0.20522538 0.10656555]
[742.38713818 527.89416982 380.39934023]


In [85]:
# Append the PCA components as extra feature columns.
# Not idempotent: re-running this cell appends the same components again.
X_train = np.concatenate((X_train,PCA_X_train),axis=1)
if not submission_with_whole_train:
    X_val = np.concatenate((X_val,PCA_X_val),axis=1)

### Bus distance cols

In [86]:
# Positional slice: numeric columns 72..91. Assumes these are the 20
# bus-stop distance features — TODO confirm.
bus_stop_cols = all_num_cols[72:92]

In [87]:
# Run only the bus-stop columns through numerical_transformer before PCA;
# all other columns are dropped.
bus_stop_preprocessor = ColumnTransformer(
    transformers=[
        #("oe_cat", OE_categorical_transformer, PCA_cols_dict["OE"]),
        #("oh_cat", OH_categorical_transformer, PCA_cols_dict["OH"]),
        ("num", numerical_transformer, bus_stop_cols),
    ],
    remainder='drop'
)

In [88]:
# Fit the preprocessing on train only; validation reuses the fitted state.
pre_PCA_X_train_bus_stop_cols = bus_stop_preprocessor.fit_transform(store_dataframes["train"])
if not submission_with_whole_train:
    pre_PCA_X_val_bus_stop_cols = bus_stop_preprocessor.transform(store_dataframes["val"])

In [89]:
# Project the bus-stop columns onto 2 principal components (fitted on train only).
# `pca` and PCA_X_* are reused by every PCA section, so earlier fits are overwritten.
pca = PCA(n_components=2)
PCA_X_train = pca.fit_transform(pre_PCA_X_train_bus_stop_cols)
if not submission_with_whole_train:
    PCA_X_val = pca.transform(pre_PCA_X_val_bus_stop_cols)

In [90]:
# Total explained variance ratio of the kept components.
print(np.sum(pca.explained_variance_ratio_))

# Explained variance ratio per component.
print(pca.explained_variance_ratio_)

# Singular values of the kept components.
print(pca.singular_values_)

0.9413717977937323
[0.78820564 0.15316616]
[402.69768744 177.51734059]


In [91]:
# Append the PCA components as extra feature columns.
# Not idempotent: re-running this cell appends the same components again.
X_train = np.concatenate((X_train,PCA_X_train),axis=1)
if not submission_with_whole_train:
    X_val = np.concatenate((X_val,PCA_X_val),axis=1)

### Mean revenue cols

In [277]:
# Positional slice of the numeric columns (overwrites the notebook-level
# `mean_rev_cols` defined in the feature-engineering section).
mean_rev_cols = PCA_cols_dict["num"][366:-3]

# Drop every column whose name mentions delomrade, cat_4 or cat_3, then also
# remove the country-level cat_0 column.
excluded_tokens = ("delomrade", "cat_4", "cat_3")
new_mean_rev_cols = [
    name
    for name in mean_rev_cols
    if not any(token in name for token in excluded_tokens)
]
new_mean_rev_cols.remove('country.plaace_cat_0_mean_revenue_log')
new_mean_rev_cols

['country.plaace_cat_2_mean_revenue_log',
 'fylke.plaace_cat_2_mean_revenue_log',
 'country.plaace_cat_1_mean_revenue_log',
 'fylke.plaace_cat_1_mean_revenue_log',
 'kommune.plaace_cat_2_mean_revenue_log',
 'kommune.plaace_cat_1_mean_revenue_log',
 'fylke.plaace_cat_0_mean_revenue_log',
 'kommune.plaace_cat_0_mean_revenue_log']

In [278]:
# Run only the mean-revenue columns through numerical_transformer before PCA;
# all other columns are dropped.
# NOTE(review): this uses the unfiltered `mean_rev_cols`, not the filtered
# `new_mean_rev_cols` built in the previous cell — confirm which is intended.
mean_rev_preprocessor = ColumnTransformer(
    transformers=[
        #("oe_cat", OE_categorical_transformer, PCA_cols_dict["OE"]),
        #("oh_cat", OH_categorical_transformer, PCA_cols_dict["OH"]),
        ("num", numerical_transformer, mean_rev_cols),
    ],
    remainder='drop'
)

In [279]:
# Fit the preprocessing on train only; validation reuses the fitted state.
pre_PCA_X_train_mean_rev_cols = mean_rev_preprocessor.fit_transform(store_dataframes["train"])
if not submission_with_whole_train:
    pre_PCA_X_val_mean_rev_cols = mean_rev_preprocessor.transform(store_dataframes["val"])

In [282]:
# Project the mean-revenue columns onto 7 principal components (fitted on train only).
# `pca` and PCA_X_* are reused by every PCA section, so earlier fits are overwritten.
pca = PCA(n_components=7)
PCA_X_train = pca.fit_transform(pre_PCA_X_train_mean_rev_cols)
if not submission_with_whole_train:
    PCA_X_val = pca.transform(pre_PCA_X_val_mean_rev_cols)

In [283]:
# Total explained variance ratio of the kept components.
print(np.sum(pca.explained_variance_ratio_))

# Explained variance ratio per component.
print(pca.explained_variance_ratio_)

# Singular values of the kept components.
print(pca.singular_values_)

0.94808787609428
[0.58917955 0.10935341 0.09790841 0.0548109  0.03787202 0.03114834
 0.02781524]
[339.34777294 146.19661838 138.33471162 103.50340937  86.03603807
  78.02587043  73.73311963]


In [284]:
# Append the PCA components as extra feature columns.
# Not idempotent: re-running this cell appends the same components again.
X_train = np.concatenate((X_train,PCA_X_train),axis=1)
if not submission_with_whole_train:
    X_val = np.concatenate((X_val,PCA_X_val),axis=1)

# Removing highly correlated features

In [969]:
# Absolute pairwise correlation between the numeric columns of the train split.
# The train frame also holds text columns, so numeric_only is passed explicitly
# instead of relying on pandas' version-dependent default.
cor_matrix = store_dataframes["train"].corr(numeric_only=True).abs()

  cor_matrix = store_dataframes["train"].corr().abs()


In [997]:
# Keep only the strict upper triangle (k=1 excludes the diagonal) so every
# column pair is considered once; all other cells become NaN.
upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape),k=1).astype(bool))

In [999]:
# Columns whose absolute correlation with some earlier column exceeds 0.999.
exceeds_threshold = (upper_tri > 0.999).any(axis=0).to_numpy()
to_drop = list(upper_tri.columns[exceeds_threshold])
to_drop

['fylke',
 'kommune',
 'delomrade',
 'mean_dist_to_nearest_1_comp_plaace_1',
 'mean_dist_to_nearest_2_comp_plaace_1',
 'mean_dist_to_nearest_3_comp_plaace_1',
 'mean_dist_to_nearest_4_comp_plaace_1',
 'mean_dist_to_nearest_5_comp_plaace_1',
 'mean_dist_to_nearest_7_comp_plaace_1',
 'mean_dist_to_nearest_10_comp_plaace_1',
 'mean_dist_to_nearest_1_comp_plaace_2',
 'mean_dist_to_nearest_2_comp_plaace_2',
 'mean_dist_to_nearest_3_comp_plaace_2',
 'mean_dist_to_nearest_4_comp_plaace_2',
 'mean_dist_to_nearest_5_comp_plaace_2',
 'mean_dist_to_nearest_7_comp_plaace_2',
 'mean_dist_to_nearest_10_comp_plaace_2',
 'mean_dist_to_nearest_1_comp_plaace_3',
 'mean_dist_to_nearest_2_comp_plaace_3',
 'mean_dist_to_nearest_3_comp_plaace_3',
 'mean_dist_to_nearest_4_comp_plaace_3',
 'mean_dist_to_nearest_5_comp_plaace_3',
 'mean_dist_to_nearest_7_comp_plaace_3',
 'mean_dist_to_nearest_10_comp_plaace_3',
 'mean_dist_to_nearest_1_comp_plaace_4',
 'mean_dist_to_nearest_2_comp_plaace_4',
 'mean_dist_to_nea

# Training the model

## Random Forest

In [1016]:
from sklearn.ensemble import RandomForestRegressor

# Recorded result of an earlier grid search:
# best params (8.11)
# {'max_features': 8, 'min_samples_leaf': 4, 'min_samples_split': 32, 'n_estimators': 500}
# rmsle(on val): 0.719099511243053

# Random forest configured with those best parameters.
rf = RandomForestRegressor(random_state=0, n_jobs=-1, max_features=8, min_samples_leaf=4, min_samples_split=32, n_estimators=500)
# Search space for the grid search below.
rf_params = {
    "n_estimators" : (100, 250, 500), 
    "max_features" : (2, 4, 8), 
    "min_samples_split" : (4, 8, 16, 32), 
    "min_samples_leaf" : (1, 2, 4), 
    }

# The search object is only constructed here; the visible cells fit `rf` directly.
rf_clf = GridSearchCV(rf, rf_params, verbose=2)

In [1017]:
# Fit on the standardised log target, map validation predictions back with
# reverse_log1p_transform_pred_var, and score them against raw validation revenue.
# Requires the validation split (X_val / y_val).
rf.fit(X_train, y_train)
rf_y_pred = reverse_log1p_transform_pred_var(rf.predict(X_val), std_y=std_y, mean_y=mean_y)
rmsle(y_val, rf_y_pred)

0.8990312771546934

In [932]:
#rf_clf.best_params_

In [933]:
#rf_clf.best_params_

In [934]:
# Recompute the random-forest validation predictions on the revenue scale.
rf_y_pred = reverse_log1p_transform_pred_var(rf.predict(X_val), std_y=std_y, mean_y=mean_y)


In [935]:
rmsle(y_val, rf_y_pred)

0.7431171369975103

## Linear Regression

In [479]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline.
lr = LinearRegression(n_jobs=-1)

In [480]:
# Fit on the standardised log-revenue target.
lr.fit(X_train, y_train)

In [481]:
# Map validation predictions back with reverse_log1p_transform_pred_var and
# replace everything that is not strictly positive with 0.
lr_y_pred = np.asarray(reverse_log1p_transform_pred_var(lr.predict(X_val), std_y, mean_y))
lr_y_pred = np.where(lr_y_pred > 0, lr_y_pred, 0)

In [482]:
rmsle(y_val, lr_y_pred)

0.7485071979848835

## Light GBM

In [1010]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import RandomizedSearchCV

# Recorded result of an earlier parameter search:
# best params (8.11)
# {'learning_rate': 0.05, 'min_child_samples': 16, 'min_split_gain': 0, 'n_estimators': 100, 'num_leaves': 25}
# rmsle(on val): 0.7208999886795342

# LightGBM regressor configured with those best parameters.
lgbm = LGBMRegressor(
    random_state=0, 
    n_jobs=-1, 
    learning_rate=0.05, 
    min_child_samples=16, 
    min_split_gain=0, 
    n_estimators=100, 
    num_leaves=25
    )

# Search space for the (currently disabled) randomized search below.
lgbm_params = {
    "num_leaves" : (10, 25, 31, 75), 
    "learning_rate" : (0.05, 0.1, 0.25),
    "n_estimators" : (50, 100, 250), 
    "min_split_gain" : (0, 0.01), 
    "min_child_samples" : (4, 8, 16, 32), 
    "reg_alpha" : (0, 0.01, 0.1), 
    "reg_lambda" : (0, 0.01, 0.1), 
    }

#lgbm_clf = RandomizedSearchCV(lgbm, lgbm_params, verbose=2)

In [1011]:
# Fit on the standardised log-revenue target.
lgbm.fit(X_train, y_train)

In [1012]:
#lgbm_clf.best_params_

In [1013]:
# Map validation predictions back with reverse_log1p_transform_pred_var and
# replace everything that is not strictly positive with 0.
lgbm_y_pred = np.asarray(reverse_log1p_transform_pred_var(lgbm.predict(X_val), std_y, mean_y))
lgbm_y_pred = np.where(lgbm_y_pred > 0, lgbm_y_pred, 0)

In [1014]:
rmsle(y_val, lgbm_y_pred)

0.722147096898195

## CatBoost

In [92]:
from catboost import CatBoostRegressor
from sklearn.model_selection import GridSearchCV


# Recorded results of earlier parameter searches:
# best params (8.11)
# {'depth': 6, 'l2_leaf_reg': 10, 'learning_rate': 0.05, 'eval_metric': 'RMSE'}
# rmsle(on val) = 0.7148919867904334

# best params (12.11) (all features)
# {'depth': 6, 'l2_leaf_reg': 10, 'learning_rate': 0.05, 'eval_metric': 'RMSE'}
# rmsle(on val) = 0.7177413486698632

# CatBoost regressor configured with those best parameters.
cb = CatBoostRegressor(
    random_seed=0, 
    verbose=False, 
    eval_metric="RMSE", 
    rsm=0.1,
    depth=6, 
    l2_leaf_reg= 10, 
    learning_rate= 0.05
    )

# Search space for the (currently disabled) randomized search below.
# `grid` is a second name bound to the same dict.
cb_params = grid = {
    'learning_rate': [0.01, 0.03, 0.05, 0.1, 0.3],
    'depth': [5, 6, 8, 10, 15, 20],
    'l2_leaf_reg': [3, 4, 5, 6, 7, 8, 10, 15], 
    }

#cb_clf = cb.randomized_search(cb_params, X=X_train, y=y_train, verbose=2)

In [93]:
#cb_clf["params"]

In [94]:
# Fit on the standardised log-revenue target.
cb.fit(X_train, y_train)

<catboost.core.CatBoostRegressor at 0x2ad480df0>

In [95]:
# Map validation predictions back with reverse_log1p_transform_pred_var and
# replace everything that is not strictly positive with 0.
cb_y_pred = np.asarray(reverse_log1p_transform_pred_var(cb.predict(X_val), std_y, mean_y))
cb_y_pred = np.where(cb_y_pred > 0, cb_y_pred, 0)

In [66]:
rmsle(y_val, cb_y_pred)

0.7140855949981019

In [96]:
rmsle(y_val, cb_y_pred)

0.7171520894142979

## Stacking regressors

In [None]:
from sklearn.ensemble import StackingRegressor

# Base models for the stack: the random forest, CatBoost and LightGBM
# regressors configured in the sections above.
estimators = [
    ('rf', rf), 
    ('cb', cb), 
    ('lgbm', lgbm), 
]

# Candidate search space for the final estimator; not used by any cell
# visible in this notebook.
rf_end_params = {
    "n_estimators" : (50, 100, 250), 
    "max_features" : (1, 2, 3), 
    "min_samples_split" : (16, 32), 
    "min_samples_leaf" : (2, 4, 8), 
    }

# Final estimator: a small random forest. max_features=3 equals the number of
# base models listed above.
rf_end = RandomForestRegressor(random_state=0, n_jobs=-1, n_estimators=50, max_features=3)

reg = StackingRegressor(
    estimators=estimators,
    final_estimator=rf_end
    )

In [None]:
# Fit the stacked model on the standardised log-revenue target.
reg.fit(X_train, y_train)

In [None]:
# Map the stacked model's validation predictions back with reverse_log1p_transform_pred_var.
# Unlike the lr / lgbm / cb cells, negative values are not clipped to 0 here.
reg_y_pred = reverse_log1p_transform_pred_var(reg.predict(X_val), std_y, mean_y)

In [None]:
rmsle(y_pred=reg_y_pred, y_true=y_val)

## Creating the submission

In [None]:
# Predict on the test set with the CatBoost model.
# NOTE(review): X_test only contains the `preprocessor` features. If PCA
# components were appended to X_train before `cb` was fitted, the same fitted
# PCA steps must be applied to the test set as well, otherwise the feature
# count will not match the model — confirm before submitting.
test_df = store_dataframes["test"]
X_test = preprocessor.transform(test_df)
y_test_pred = reverse_log1p_transform_pred_var(cb.predict(X_test), std_y, mean_y)
# Revenue cannot be negative; clip at 0 like the validation predictions so the
# submission stays valid for RMSLE scoring.
y_test_pred = np.clip(np.asarray(y_test_pred), 0, None)

# Generate submission dataframe.
# NOTE: It is important that the ID and predicted values match, so the ids are
# taken from the same frame (same row order) that produced the predictions
# rather than from the separately loaded raw test table.
submission = pd.DataFrame({
    "id": test_df["store_id"].to_numpy(),
    "predicted": y_test_pred,
})

# Save it to disk (`index=False` means don't save the index in the csv)
submission.to_csv('submission.csv', index=False)

# Create CSV of dataframes

In [1041]:
# Dump every split (including any extra entries added to store_dataframes,
# e.g. "out_removed_train") to temp_data/, keeping the index as the first column.
# The "_13_nov" suffix means these names differ from the
# temp_data/full_features_<split>.csv files read by the option-2 cell above.
for df_name, df in store_dataframes.items():
    filepath = Path(f"temp_data/full_features_{df_name}_13_nov.csv")  
    # Create temp_data/ on first use; a no-op when it already exists.
    filepath.parent.mkdir(parents=True, exist_ok=True)  
    df.to_csv(filepath, index=True)