In [5]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import make_column_transformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_log_error, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeRegressor

In [6]:
# Load the store table without grunnkrets features, index it by
# (dataset, range_index) and discard the identifier / location columns that
# are removed before the feature matrix is built.
without_gk = (
    pd.read_csv("../../own_data/without_grunnkrets.csv")
    .set_index(["dataset", "range_index"])
    .drop(columns=['store_name', 'address', 'lat', 'lon', 'busstop_id', 'grunnkrets_id', 'importance_level', 'stopplace_type'])
)

# The membership flags are computed from the still-missing names first;
# only afterwards are the missing names replaced by the literal "None".
without_gk = without_gk.assign(
    in_mall=without_gk['mall_name'].notna(),
    in_chain=without_gk['chain_name'].notna(),
).fillna({'mall_name': "None", 'chain_name': "None"})

# Rows of the "train" partition, keyed by store id.
data_with_label_wo = without_gk.loc["train"].set_index('store_id')

# Features are every column except the target; the target is the revenue column.
X_ = data_with_label_wo.drop(columns='revenue')
y_ = data_with_label_wo['revenue']

# 80/20 hold-out split with a fixed seed.
X_train_, X_test_, y_train_, y_test_ = train_test_split(
    X_, y_, test_size=0.2, random_state=42
)
print(list(X_.columns))

['chain_name', 'mall_name', 'distance_closest_busstop', 'lv1', 'lv2', 'lv3', 'lv4', 'num_of_buss_stops_close', 'in_mall', 'in_chain']


In [14]:
# Load the fully merged table, index it by (dataset, range_index) and discard
# the identifier / location columns that are removed before the feature
# matrix is built.
all_data = (
    pd.read_csv("../../own_data/all_merged.csv")
    .set_index(["dataset", "range_index"])
    .drop(columns=['store_name', 'address', 'lat', 'lon', 'busstop_id', 'importance_level', 'stopplace_type', 'grunnkrets_id'])
)

# The membership flags are computed from the still-missing names first;
# only afterwards are the missing names replaced by the literal "None".
all_data = all_data.assign(
    in_mall=all_data['mall_name'].notna(),
    in_chain=all_data['chain_name'].notna(),
).fillna({'mall_name': "None", 'chain_name': "None"})

# Rows of the "train" partition, keyed by store id.
data_with_label = all_data.loc["train"].set_index('store_id')

# Features are every column except the target; the target is the revenue column.
X = data_with_label.drop(columns='revenue')
y = data_with_label['revenue']

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# log(1 + revenue) version of the training target.
y_train_scaled = np.log1p(y_train)
print(list(X.columns))

['chain_name', 'mall_name', 'distance_closest_busstop', 'district_name', 'municipality_name', 'area_km2', 'couple_children_0_to_5_years', 'couple_children_18_or_above', 'couple_children_6_to_17_years', 'couple_without_children_x', 'single_parent_children_0_to_5_years', 'single_parent_children_18_or_above', 'single_parent_children_6_to_17_years', 'singles_x', 'all_households', 'singles_y', 'couple_without_children_y', 'couple_with_children', 'other_households', 'single_parent_with_children', 'age_0', 'age_1', 'age_2', 'age_3', 'age_4', 'age_5', 'age_6', 'age_7', 'age_8', 'age_9', 'age_10', 'age_11', 'age_12', 'age_13', 'age_14', 'age_15', 'age_16', 'age_17', 'age_18', 'age_19', 'age_20', 'age_21', 'age_22', 'age_23', 'age_24', 'age_25', 'age_26', 'age_27', 'age_28', 'age_29', 'age_30', 'age_31', 'age_32', 'age_33', 'age_34', 'age_35', 'age_36', 'age_37', 'age_38', 'age_39', 'age_40', 'age_41', 'age_42', 'age_43', 'age_44', 'age_45', 'age_46', 'age_47', 'age_48', 'age_49', 'age_50', 'age

In [8]:
# Columns of the labelled data that still contain at least one missing value
# (an empty Index means there is nothing left to impute).
data_with_label.columns[data_with_label.isna().any(axis=0)]

Index([], dtype='object')

In [36]:
# One-hot encode the listed categorical columns of the full data set.
# Categories not seen during fitting are ignored rather than raising, and all
# columns that are not listed are forwarded unchanged.
column_trans = make_column_transformer(
    (
        OneHotEncoder(handle_unknown='ignore'),
        ['municipality_name', 'chain_name', 'mall_name', 'district_name', 'lv1', 'lv2', 'lv3', 'lv4'],
    ),
    remainder="passthrough",
)

# Gradient-boosted regressor used as the final estimator when the pipelines are built.
grad = GradientBoostingRegressor(
    loss="squared_error",
    learning_rate=.2,
    n_estimators=1000,
    random_state=42,
)

In [37]:
# Same encoding as above for the data set without grunnkrets features:
# one-hot encode the listed categorical columns, ignore categories not seen
# during fitting, and forward every other column unchanged.
column_trans_ = make_column_transformer(
    (
        OneHotEncoder(handle_unknown='ignore'),
        ['chain_name', 'mall_name', 'lv1', 'lv2', 'lv3', 'lv4'],
    ),
    remainder="passthrough",
)

In [38]:
# Sanity check: fit both column transformers on their respective training
# splits. Only the last expression is displayed by the notebook, i.e. the
# encoded matrix for X_train_; the result of the first call is discarded.
column_trans.fit_transform(X_train)
column_trans_.fit_transform(X_train_)

<10287x946 sparse matrix of type '<class 'numpy.float64'>'
	with 86880 stored elements in Compressed Sparse Row format>

In [39]:
# Pipeline for the full feature set: encode, then boost.
pipe = make_pipeline(column_trans, grad)

# Pipeline for the data set without grunnkrets features.
# make_pipeline stores the estimator object it is given (it does not copy it),
# so passing `grad` to both pipelines would make them share one regressor and
# fitting either pipeline would overwrite the fitted model inside the other.
# An unfitted clone with identical hyper-parameters keeps them independent.
pipe_ = make_pipeline(column_trans_, clone(grad))


In [40]:
# Train on the log1p-transformed revenue and predict the hold-out set.
# The predictions are therefore in log1p space as well; apply np.expm1 to
# get back to the revenue scale.
y_hat = pipe.fit(X_train, y_train_scaled).predict(X_test)

In [24]:
def rmsle(y_true, y_pred):
    """
    Computes the Root Mean Squared Logarithmic Error.

    Negative predictions are clipped to 0 before the logarithm is taken. The
    clipping is performed on a copy, so the caller's `y_pred` is never
    modified. Both inputs are converted to plain numpy arrays, which means the
    comparison is positional (pandas indexes are ignored).

    Args:
        y_true (array-like): n-dimensional vector of ground-truth values (>= 0)
        y_pred (array-like): n-dimensional vector of predicted values

    Returns:
        A scalar float with the rmsle value

    Raises:
        AssertionError: if `y_true` contains negative values, if `y_pred`
            contains values that are not >= 0 after clipping (e.g. NaN), or if
            the two inputs have different shapes.

    Note: You can alternatively use sklearn and just do:
        `sklearn.metrics.mean_squared_log_error(y_true, y_pred) ** 0.5`
    """
    y_true = np.asarray(y_true, dtype=float)
    # np.clip returns a new array, leaving the argument passed by the caller intact.
    y_pred = np.clip(np.asarray(y_pred, dtype=float), 0, None)
    assert (y_true >= 0).all(), 'Received negative y_true values'
    # Still needed after clipping: NaN compares False and must not slip through.
    assert (y_pred >= 0).all(), 'Received negative y_pred values'
    assert y_true.shape == y_pred.shape, 'y_true and y_pred have different shapes'
    y_true_log1p = np.log1p(y_true)  # log(1 + y_true)
    y_pred_log1p = np.log1p(y_pred)  # log(1 + y_pred)
    return np.sqrt(np.mean(np.square(y_pred_log1p - y_true_log1p)))

In [87]:
# linear error, no log before training
# `y_hat` from the fit/predict cell comes from a model trained on
# log1p(revenue), so scoring it directly against y_test would compare
# log-space predictions with raw revenue. Fit a fresh, independent copy of the
# pipeline on the raw target so this baseline is reproducible when the
# notebook is run top to bottom.
pipe_raw = clone(pipe).fit(X_train, y_train)
rmsle(y_test, pipe_raw.predict(X_test))



0.7273457588935354

In [42]:
# Squared-error loss, target log1p-transformed before training:
# map the predictions back to the revenue scale before scoring.
rmsle(y_true=y_test, y_pred=np.expm1(y_hat))

0.7347577275646373

In [29]:
# Wrap rmsle as a scikit-learn scorer. Because lower is better, the scorer
# reports the negated value so that model selection can still maximise it.
rmsle_scorer = make_scorer(score_func=rmsle, greater_is_better=False)
# Example usage (disabled):
# print(cross_val_score(pipe, X, y, cv=5, scoring=rmsle_scorer))
# print(cross_val_score(pipe, X_test, y_test, scoring=rmsle_scorer))

In [None]:
# NOTE(review): this cell repeats, statement for statement, the grid search
# in the executed cell further down; running the notebook top to bottom
# performs the same search twice.

# Candidate settings for the boosting step of the pipeline.
params = {
    'gradientboostingregressor__loss': ["absolute_error"],
    'gradientboostingregressor__n_estimators': [200, 500, 1000],
    'gradientboostingregressor__learning_rate': [.2, .1],
}

# 5-fold search over the data set without grunnkrets features.
grid_1 = GridSearchCV(estimator=pipe_, param_grid=params, scoring=rmsle_scorer, cv=5, verbose=3)
grid_1.fit(X_, y_)
print("No grunnpunkt data", grid_1.best_params_, grid_1.best_score_, sep="\n")

In [89]:
# Parameter grids from earlier experiments with other estimators (disabled):
# params = {'adaboostregressor__base_estimator__max_depth':[5],
#           'adaboostregressor__base_estimator__min_samples_leaf':[5,10],
#           'adaboostregressor__n_estimators':[20, 40],
#           'adaboostregressor__learning_rate':[0.0001,0.001],
#           'adaboostregressor__loss': ['linear']}
# params = {'randomforestregressor__max_depth': [20, 30]}

# Candidate settings for the boosting step of the pipeline.
params = {
    'gradientboostingregressor__loss': ["absolute_error"],
    'gradientboostingregressor__n_estimators': [200, 500, 1000],
    'gradientboostingregressor__learning_rate': [.2, .1],
}

# 5-fold search over the data set without grunnkrets features.
grid_1 = GridSearchCV(estimator=pipe_, param_grid=params, scoring=rmsle_scorer, cv=5, verbose=3)
grid_1.fit(X_, y_)
print("No grunnpunkt data", grid_1.best_params_, grid_1.best_score_, sep="\n")

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5] END gradientboostingregressor__learning_rate=0.2, gradientboostingregressor__loss=absolute_error, gradientboostingregressor__n_estimators=200;, score=-0.796 total time=   3.4s
[CV 2/5] END gradientboostingregressor__learning_rate=0.2, gradientboostingregressor__loss=absolute_error, gradientboostingregressor__n_estimators=200;, score=-0.732 total time=   3.0s
[CV 3/5] END gradientboostingregressor__learning_rate=0.2, gradientboostingregressor__loss=absolute_error, gradientboostingregressor__n_estimators=200;, score=-0.717 total time=   3.0s
[CV 4/5] END gradientboostingregressor__learning_rate=0.2, gradientboostingregressor__loss=absolute_error, gradientboostingregressor__n_estimators=200;, score=-0.769 total time=   3.1s
[CV 5/5] END gradientboostingregressor__learning_rate=0.2, gradientboostingregressor__loss=absolute_error, gradientboostingregressor__n_estimators=200;, score=-0.731 total time=   3.2s
[CV 1/5] END gr

In [None]:
# Candidate settings for the boosting step of the pipeline.
params = {
    'gradientboostingregressor__loss': ["absolute_error"],
    'gradientboostingregressor__n_estimators': [800, 1000, 1200],
    'gradientboostingregressor__learning_rate': [.2, .1],
}

# 5-fold search over the fully merged data set.
grid_2 = GridSearchCV(estimator=pipe, param_grid=params, scoring=rmsle_scorer, cv=5, verbose=3)
grid_2.fit(X, y)

print("With grunnpunkt data, but not all datapoints", grid_2.best_params_, grid_2.best_score_, sep="\n")