# Submission

In [1]:
# Magic to automatically update imports if functions in utils are changed
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
import matplotlib.pyplot as plt
from tqdm import tqdm
from pathlib import Path

In [2]:
from sklearn.model_selection import train_test_split

# All raw tables are read relative to the notebook's working directory.
data_dir = Path("data")
stores_train_full = pd.read_csv(data_dir / "stores_train.csv")
stores_test = pd.read_csv(data_dir / "stores_test.csv")
stores_extra = pd.read_csv(data_dir / "stores_extra.csv")

# Hold out 20 % of the rows in stores_train.csv for validation.
# The fixed random_state keeps the split identical between runs.
stores_train, stores_val = train_test_split(
    stores_train_full,
    test_size=0.2,
    random_state=0,
)

In [3]:
from utils import split_plaace_cat

# Run the same plaace-category preprocessing on every store table
# (order: train, val, extra, test) and rebind the original names to the results.
stores_train, stores_val, stores_extra, stores_test = [
    split_plaace_cat(frame)
    for frame in (stores_train, stores_val, stores_extra, stores_test)
]

# Name -> frame lookup used by the feature-engineering loops further down.
store_dataframes = dict(
    train=stores_train,
    val=stores_val,
    extra=stores_extra,
    test=stores_test,
)

In [4]:
from utils import create_geographical_columns, create_chain_and_mall_columns, generate_rev_dict

# Statistics are derived from the training split only, then applied to every table.
# chain_count maps chain_name -> number of training stores carrying that name.
chain_count = stores_train["chain_name"].value_counts().to_dict()
# Passed to create_chain_and_mall_columns; presumably the minimum store count for a
# chain to keep its own name — TODO confirm against utils.
lower_limit = 10
rev_dict, mean_revenue = generate_rev_dict(stores_train)

for df_name, df in store_dataframes.items():
    df = create_geographical_columns(df)
    df = create_chain_and_mall_columns(df, chain_count, lower_limit=lower_limit)
    # Write the result back: rebinding the loop variable alone would discard the new
    # columns whenever the helpers return a new frame instead of mutating in place.
    # Only existing keys are reassigned, so mutating the dict while iterating is safe.
    store_dataframes[df_name] = df

In [5]:
from utils import concat_df_keep_unq_index

# Combine the training and extra stores into the reference table handed to
# find_dist_to_nearest_comp as `training_df`.
# The frames are taken from store_dataframes (the container every feature step
# updates) so this table cannot drift from the frames used everywhere else.
concat_df = concat_df_keep_unq_index(
    store_dataframes["train"],
    store_dataframes["extra"],
)

**NB!** The next cell takes about 15 minutes to run when `nearest_comp_plaace_cat_gran = [1, 2, 3, 4]` (measured on an M1 Mac with 16 GB RAM).

If you already have the cached files `temp_data/closest_comp_{df_name}.csv`, skip the next cell and run the cell that loads them instead.

In [7]:
from utils import find_dist_to_nearest_comp

# Plaace-category granularity levels and neighbour counts for the
# competitor-distance features.
nearest_comp_plaace_cat_gran = [3, 4]
n_nearest_comp = [1]

# The helper returns two values: the augmented frame and a second object kept as `cp`.
train_with_comp_dist, cp = find_dist_to_nearest_comp(
    store_dataframes["train"],
    nearest_comp_plaace_cat_gran,
    n_nearest_comp,
    training=True,
    training_df=concat_df,
    _mean=False,
)
store_dataframes["train"] = train_with_comp_dist

In [9]:
# Display the (rows, columns) shape of the current training frame.
store_dataframes["train"].shape

(10287, 26)

In [13]:
# Key column plus the last two columns of the training frame
# (the competitor-distance features, per the output shown below).
dist_cols = ["store_id", *store_dataframes["train"].columns[-2:]]

In [14]:
# Display the selected key and distance column names.
dist_cols

['store_id',
 'sum_dist_to_nearest_1_comp_plaace_3',
 'sum_dist_to_nearest_1_comp_plaace_4']

In [15]:
# Previously cached competitor-distance features for the training split.
closest_comp_train = pd.read_csv(Path("temp_data") / "closest_comp_train.csv")

In [16]:
# Freshly computed counterpart of the cached table: key column + distance columns only.
closest_comp_train_new = store_dataframes["train"].loc[:, dist_cols]

In [None]:
# Attach the cached competitor-distance features to every store table.
# The join is pandas' default inner join on store_id.
for df_name in list(store_dataframes):
    cached_comp = pd.read_csv(f"temp_data/closest_comp_{df_name}.csv")
    store_dataframes[df_name] = store_dataframes[df_name].merge(cached_comp, on="store_id")


**NB!** The next cell takes about 15 minutes to run (measured on an M1 Mac with 16 GB RAM). Grab a coffee or something while you wait :)

If you already have the cached files `temp_data/closest_bus_stop_{df_name}.csv`, skip the next cell and run the cell below it instead.

In [None]:
from bus_utils import find_closest_bus_stop

# Neighbour counts and aggregate switches forwarded to find_closest_bus_stop.
bus_stop_n = [1, 3, 5, 7]
bus_mean = True
bus_sum = True

# Names of the feature columns written to the cache below.
# Assumes find_closest_bus_stop names its output columns the same way — TODO confirm.
bus_stop_columns = []
if bus_sum:
    bus_stop_columns += [f"closest_bus_stop_sum_{i}" for i in bus_stop_n]
if bus_mean:
    bus_stop_columns += [f"closest_bus_stop_mean_{i}" for i in bus_stop_n]

# Create the cache directory once, before the expensive work starts.
cache_dir = Path("temp_data")
cache_dir.mkdir(parents=True, exist_ok=True)

for df_name, df in tqdm(store_dataframes.items()):
    df_with_bus = find_closest_bus_stop(df, bus_stop_n, _sum=bus_sum, _mean=bus_mean)
    store_dataframes[df_name] = df_with_bus
    # Write the *returned* frame: the loop variable `df` is the input frame and only
    # carries the new columns if the helper happens to mutate it in place.
    df_with_bus.to_csv(
        cache_dir / f"closest_bus_stop_{df_name}.csv",
        columns=["store_id"] + bus_stop_columns,
        index=False,
    )


In [None]:
# Load the cached bus-stop features (written by the computation cell above) and
# attach them to every store table via pandas' default inner join on store_id.
for df_name, df in store_dataframes.items():
    # Read the bus-stop cache, not the competitor-distance cache.
    merge_df = pd.read_csv(f"temp_data/closest_bus_stop_{df_name}.csv")
    # Feature columns only: store_id is the join key and must not end up in the
    # numerical feature list built from bus_stop_columns later on.
    bus_stop_columns = [col for col in merge_df.columns if col != "store_id"]
    store_dataframes[df_name] = df.merge(merge_df, left_on="store_id", right_on="store_id")

In [None]:
from grunnkrets import make_grunnkrets_df

# One grunnkrets-derived frame per store table, keyed like store_dataframes.
full_population_dataframes = {
    df_name: make_grunnkrets_df(df)
    for df_name, df in tqdm(store_dataframes.items())
}

In [None]:
# Join each store table with its grunnkrets frame on the index (outer join).
# Columns present on both sides keep the store-table version: the right-hand
# duplicates get the "_redundant" suffix and are dropped straight away.
for df_name in list(store_dataframes):
    merged = store_dataframes[df_name].merge(
        full_population_dataframes[df_name],
        how="outer",
        left_index=True,
        right_index=True,
        suffixes=("", "_redundant"),
    )
    redundant_cols = merged.filter(regex="_redundant$").columns
    store_dataframes[df_name] = merged.drop(columns=redundant_cols)

In [None]:
# Group the training frame's columns by the geographic level encoded in their prefix.
train_columns = store_dataframes["train"].columns

fylke_relevant_features = [name for name in train_columns if name.startswith("fylke.")]
kommune_relevant_features = [name for name in train_columns if name.startswith("kommune.")]
delomrade_relevant_features = [name for name in train_columns if name.startswith("delomrade.")]
grunnkrets_relevant_features = [name for name in train_columns if name.startswith("grunnkrets_id.")]

In [None]:
from utils import find_dist_to_nearest_comp

# Plaace-category granularity levels and neighbour counts for the
# competitor-distance features.
nearest_comp_plaace_cat_gran = [3, 4]
n_nearest_comp = [1, 2, 3]

comp_dist_result = find_dist_to_nearest_comp(
    store_dataframes["train"],
    nearest_comp_plaace_cat_gran,
    n_nearest_comp,
    training=True,
    training_df=store_dataframes["train"],
)
# The earlier call to this helper in the notebook unpacks two return values
# (frame, cp). Storing the raw tuple here would break every later step that expects
# a DataFrame, so keep only the frame. A plain DataFrame return is still accepted.
store_dataframes["train"] = (
    comp_dist_result[0] if isinstance(comp_dist_result, tuple) else comp_dist_result
)

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder


# --- Ordinal-encoded categoricals ------------------------------------------------
# Missing values are replaced by SimpleImputer's default constant, then each category
# is mapped to an integer code. Categories unseen during fit are encoded as -1.
OE_categorical_features = ["bounded_chain_name", "kommune", "delomrade", "is_grocery", "plaace_cat_3", "plaace_cat_4"]
OE_categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(missing_values=np.nan, strategy="constant")),
        ("encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
    ]
)

# --- One-hot-encoded categoricals ------------------------------------------------
# Same imputation as above. Categories unseen during fit yield an all-zero row.
OH_categorical_features = ["fylke", "plaace_cat_2"]
OH_categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(missing_values=np.nan, strategy="constant")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# --- Numerical features ----------------------------------------------------------
# bus_stop_columns may carry the "store_id" join key when it was rebuilt from a cached
# CSV header. The key is an identifier, not a feature, so keep it away from the numeric
# imputer and scaler.
numerical_features = [
    "lat", "lon",
    "mean_revenue_1", "mean_revenue_2", "mean_revenue_3", "mean_revenue_4",
] + delomrade_relevant_features + [col for col in bus_stop_columns if col != "store_id"]
# Mean-impute missing values, then standardise to zero mean and unit variance.
numerical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ]
)

# Columns not listed in any of the three groups are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ("oe_cat", OE_categorical_transformer, OE_categorical_features),
        ("oh_cat", OH_categorical_transformer, OH_categorical_features),
        ("num", numerical_transformer, numerical_features),
    ],
    remainder="drop",
)

# Fit on the training split only. The validation split is transformed with the
# statistics and category mappings learned from training.
X_train = preprocessor.fit_transform(store_dataframes["train"])
X_val = preprocessor.transform(store_dataframes["val"])