In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBRegressor, XGBClassifier 
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from category_encoders import TargetEncoder, LeaveOneOutEncoder, GLMMEncoder
from sklearn.metrics import mean_squared_log_error, make_scorer
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel, mutual_info_classif
from sklearn.svm import LinearSVR

  from pandas import MultiIndex, Int64Index


In [3]:
def rmsle(y_true, y_pred):
    """
    Computes the Root Mean Squared Logarithmic Error 
    
    Args:
        y_true (np.array): n-dimensional vector of ground-truth values 
        y_pred (np.array): n-dimensional vecotr of predicted values 
    
    Returns:
        A scalar float with the rmsle value 
    
    Note: You can alternatively use sklearn and just do: 
        `sklearn.metrics.mean_squared_log_error(y_true, y_pred) ** 0.5`
    """
    y_pred[y_pred < 0] = 0
    assert (y_true >= 0).all(), 'Received negative y_true values'
    assert (y_pred >= 0).all(), 'Received negative y_pred values'
    assert y_true.shape == y_pred.shape, 'y_true and y_pred have different shapes'
    y_true_log1p = np.log1p(y_true)  # log(1 + y_true)
    y_pred_log1p = np.log1p(y_pred)  # log(1 + y_pred)
    return np.sqrt(np.mean(np.square(y_pred_log1p - y_true_log1p)))

In [29]:
df = pd.read_csv("../../own_data/all_merged.csv").set_index(["dataset", "range_index"])
df.drop(columns=['store_name', 'address', 'lat', 'lon', 'busstop_id', 'importance_level', 'stopplace_type', 'grunnkrets_id', "side_placement"], inplace=True)
df['in_mall'] = df['mall_name'].notna()
df['in_chain'] = df['chain_name'].notna()
# df['stopplace_type'] = df['stopplace_type'].fillna("Mangler type")
df['mall_name'] = df['mall_name'].fillna("None")
#df['address'] = df['address'].fillna("None")
#df['stopplace_type'] = df['stopplace_type'].fillna("None")

df['chain_name'] = df['chain_name'].fillna("None")
# df['busstop_id'] = df['busstop_id'].map(str)
df['lv1'] = df['lv1'].map(str)
df['lv2'] = df['lv2'].map(str)
df['lv3'] = df['lv3'].map(str)
df['lv4'] = df['lv4'].map(str)

data_with_label = df.loc["train"]
data_with_label.set_index('store_id', inplace=True)

X, y = data_with_label.loc[:, data_with_label.columns != 'revenue'], data_with_label['revenue']

# Numeric feature selection
print(X.shape)
numeric_features = X.select_dtypes(include=[np.number]).columns
categorical_features = X.select_dtypes(include=[np.object0]).columns
print(numeric_features)
lsvr = LinearSVR().fit(X[numeric_features], y)
model = SelectFromModel(lsvr, prefit=True)
X_new = model.transform(X[numeric_features])
print(X_new.shape)
print(model.get_feature_names_out(input_features=numeric_features))
X_red = pd.DataFrame(data=X_new, columns=model.get_feature_names_out(input_features=numeric_features), index=X.index)
X = X[categorical_features].join(X_red)

# Combining categories
for cat_name in categorical_features:
    series = X[cat_name].value_counts()
    mask = (series/series.sum() * 100).lt(.05)
    # To replace df['column'] use np.where I.e 
    X[cat_name] = np.where(X[cat_name].isin(series[mask].index),'Other',X[cat_name])


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
y_train_log = np.log1p(y_train)
y_log = np.log1p(y)

(12038, 124)
Index(['other_stores_1000', 'other_stores_100', 'other_stores_50',
       'buss_stops_1000', 'buss_stops_300', 'distance_closest_busstop',
       'other_stores_250', 'area_km2', 'couple_children_0_to_5_years',
       'couple_children_18_or_above',
       ...
       'age_82', 'age_83', 'age_84', 'age_85', 'age_86', 'age_87', 'age_88',
       'age_89', 'age_90', 'num_of_buss_stops_close'],
      dtype='object', length=114)
(12038, 14)
['other_stores_1000' 'other_stores_100' 'other_stores_50'
 'distance_closest_busstop' 'other_stores_250'
 'couple_children_0_to_5_years' 'couple_children_18_or_above'
 'couple_children_6_to_17_years' 'couple_without_children_x'
 'single_parent_children_18_or_above' 'singles_x' 'singles_y'
 'couple_without_children_y' 'single_parent_with_children']




In [30]:
numeric_features = X.select_dtypes(include=[np.number]).columns
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())]
)

categorical_features = X.select_dtypes(include=[np.object0]).columns
categorical_transformer = Pipeline(
    steps = [
        # ("imputer", SimpleImputer(fill_value="missing", strategy="constant")),
        ("onehotencoding", OneHotEncoder(handle_unknown="ignore"))])

preprocessor = ColumnTransformer(
    transformers=[
        # ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="passthrough"
)

In [31]:
def error(y_true, y_pred):
    y_pred[y_pred < 0] = 0
    y_pred = np.expm1(y_pred)
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

In [32]:
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer
from copy import copy

X_train_ft = preprocessor.fit_transform(X_train)
X_test_ft = preprocessor.transform(X_test)
exported_pipeline = make_pipeline(
    make_union(
        FunctionTransformer(copy),
        FunctionTransformer(copy)
    ),
    SelectPercentile(score_func=f_regression, percentile=84),
    XGBRegressor(learning_rate=0.1, max_depth=6, min_child_weight=7, n_estimators=100, n_jobs=1, objective="reg:squarederror", subsample=1.0, verbosity=0)
)

exported_pipeline.fit(X_train_ft, y_train_log)
results = exported_pipeline.predict(X_test_ft)
print(max(results))
print(rmsle(y_test, np.expm1(results)))

4.338935
0.7210650806056396


In [12]:
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.model_selection import train_test_split
model_params = {'learning_rate': 0.1, 'max_depth': 6, 'min_child_weight': 7, 'subsample:': 1, 'n_estimators':100, 'objective': 'reg:squarederror'}

pipeline = Pipeline(
    steps=[("preprocessor", preprocessor), 
    ("select", SelectPercentile(score_func=f_regression, percentile=84)),
    ("xgbregressor", 
    XGBRegressor(**model_params))]
)
def kfold_cross_validate(pipeline, X, y, n_splits=5, shuffle=True, scoring=error, random_seed=None):
    kfold = KFold(n_splits=n_splits, shuffle=shuffle, random_state=random_seed)
    splits = kfold.split(X, y)
    scores = []
    for train, test in splits:
        temp = X.reset_index()
        X_train = temp.iloc[train]
        X_test = temp.iloc[test]
        temp_y = y.reset_index()
        y_train = temp_y.iloc[train]
        y_test = temp_y.iloc[test]
        
        X_train.set_index("store_id", inplace=True), X_test.set_index("store_id", inplace=True), y_train.set_index("store_id", inplace=True), y_test.set_index("store_id", inplace=True)
        y_train = np.log1p(y_train)

        pipeline.fit(X_train, y_train)
        score = scoring(y_test, pipeline.predict(X_test))
        score_alt = make_scorer(error, greater_is_better=False)
        print(score_alt(pipeline, X_test, y_test))
        scores.append(score)
    return scores
scores = kfold_cross_validate(pipeline, X_train, y_train, random_seed=0)
print(scores)
print(f"Mean: {np.mean(scores)}")
print(f"Std: {np.std(scores)}")

  y = column_or_1d(y, warn=True)


Parameters: { "subsample:" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


-0.745648964680766


  y = column_or_1d(y, warn=True)


Parameters: { "subsample:" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


-0.7465349926406004


  y = column_or_1d(y, warn=True)


Parameters: { "subsample:" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


-0.717256238410506
Parameters: { "subsample:" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




  y = column_or_1d(y, warn=True)


-0.7351424235834618
Parameters: { "subsample:" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




  y = column_or_1d(y, warn=True)


-0.710026059910413
[0.745648964680766, 0.7465349926406004, 0.717256238410506, 0.7351424235834618, 0.710026059910413]
Mean: 0.7309217358451494
Std: 0.014844893767403135


### Code for gridsearchCV

In [13]:
from sklearn.decomposition import PCA


df = pd.read_csv("../../own_data/all_merged.csv").set_index(["dataset", "range_index"])
df.drop(columns=['store_name', 'address', 'lat', 'lon', 'busstop_id', 'importance_level', 'stopplace_type', 'grunnkrets_id'], inplace=True)
df['in_mall'] = df['mall_name'].notna()
df['in_chain'] = df['chain_name'].notna()
# df['stopplace_type'] = df['stopplace_type'].fillna("Mangler type")
df['mall_name'] = df['mall_name'].fillna("None")
#df['address'] = df['address'].fillna("None")
#df['stopplace_type'] = df['stopplace_type'].fillna("None")

df['chain_name'] = df['chain_name'].fillna("None")
# df['busstop_id'] = df['busstop_id'].map(str)
df['lv1'] = df['lv1'].map(str)
df['lv2'] = df['lv2'].map(str)
df['lv3'] = df['lv3'].map(str)
df['lv4'] = df['lv4'].map(str)

data_with_label = df.loc["train"]
data_with_label.set_index('store_id', inplace=True)

seed = 0
X, y = data_with_label.loc[:, data_with_label.columns != 'revenue'], data_with_label['revenue']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=seed)
y_train = np.log1p(y_train)
y = np.log1p(y)

In [None]:
numeric_features = X.select_dtypes(include=[np.number]).columns
numeric_transformer = Pipeline(
    steps=[("scaler", StandardScaler())]
)

categorical_features = X.select_dtypes(include=[np.object0]).columns
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="passthrough"
)

pca = PCA()

In [None]:
pipeline = Pipeline(
    steps=[("preprocessor", preprocessor), 
    ("xgbregressor", XGBRegressor())])

param_grid = {
    #'pca__n_components': [5, 10, 15, 20, 25, 30],
    'xgbregressor__max_depth': [2, 3, 5, 7, 10],
    'xgbregressor__n_estimators': [10, 100],
    "xgbregressor__objective": ["squareerror"]
}

grid = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, scoring=make_scorer(error, greater_is_better=False), verbose=4)

In [57]:
%%time

grid.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
CPU times: total: 3.58 s
Wall time: 24 s


In [58]:
print("Best parameter (CV score=%0.3f):" % grid.best_score_)
print(grid.best_params_)

Best parameter (CV score=-0.828):
{'xgbregressor__max_depth': 2, 'xgbregressor__n_estimators': 10}


In [34]:
print(search.cv_results_["params"])

[{'xgbregressor__gamma': 0.1, 'xgbregressor__learning_rate': 0.1, 'xgbregressor__max_depth': 4, 'xgbregressor__min_child_weight': 1, 'xgbregressor__n_estimators': 100, 'xgbregressor__objective': 'reg:squarederror'}, {'xgbregressor__gamma': 0.1, 'xgbregressor__learning_rate': 0.1, 'xgbregressor__max_depth': 4, 'xgbregressor__min_child_weight': 1, 'xgbregressor__n_estimators': 100, 'xgbregressor__objective': 'reg:squaredlogerror'}, {'xgbregressor__gamma': 0.1, 'xgbregressor__learning_rate': 0.1, 'xgbregressor__max_depth': 4, 'xgbregressor__min_child_weight': 1, 'xgbregressor__n_estimators': 100, 'xgbregressor__objective': 'reg:pseudohubererror'}, {'xgbregressor__gamma': 0.1, 'xgbregressor__learning_rate': 0.1, 'xgbregressor__max_depth': 4, 'xgbregressor__min_child_weight': 1, 'xgbregressor__n_estimators': 100, 'xgbregressor__objective': 'reg:tweedie'}]
