In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBRegressor, XGBClassifier 
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from category_encoders import TargetEncoder, LeaveOneOutEncoder, GLMMEncoder
from sklearn.metrics import mean_squared_log_error

In [101]:
df = pd.read_csv("../../own_data/all_merged.csv").set_index(["dataset", "range_index"])
df.drop(columns=['store_name', 'address', 'lat', 'lon', 'busstop_id', 'importance_level', 'stopplace_type', 'grunnkrets_id'], inplace=True)
df['in_mall'] = df['mall_name'].notna()
df['in_chain'] = df['chain_name'].notna()
# df['stopplace_type'] = df['stopplace_type'].fillna("Mangler type")
df['mall_name'] = df['mall_name'].fillna("None")
#df['address'] = df['address'].fillna("None")
#df['stopplace_type'] = df['stopplace_type'].fillna("None")

df['chain_name'] = df['chain_name'].fillna("None")
# df['busstop_id'] = df['busstop_id'].map(str)
df['lv1'] = df['lv1'].map(str)
df['lv2'] = df['lv2'].map(str)
df['lv3'] = df['lv3'].map(str)
df['lv4'] = df['lv4'].map(str)

data_with_label = df.loc["train"]
data_with_label.set_index('store_id', inplace=True)
data_with_label.sort_values(by="revenue", inplace=True)

X, y = data_with_label.loc[:, data_with_label.columns != 'revenue'], data_with_label['revenue']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_with_label.sort_values(by="revenue", inplace=True)


In [124]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=41)

# Non log
CAT_SIZE = 4
y_train_class = y_train//CAT_SIZE
y_test_class = y_test//CAT_SIZE
le = LabelEncoder()
y_train_class = le.fit_transform(y_train_class)

# Log based
y_train_log = np.log1p(y_train)
y_test_log = np.log1p(y_test)

LOG_CAT_SIZE = .2
y_train_logclass = y_train_log//LOG_CAT_SIZE
y_test_logclass = y_test_log//LOG_CAT_SIZE
le = LabelEncoder()
y_train_logclass = le.fit_transform(y_train_logclass)


In [125]:
def rmsle(y_true, y_pred):
    """
    Computes the Root Mean Squared Logarithmic Error 
    
    Args:
        y_true (np.array): n-dimensional vector of ground-truth values 
        y_pred (np.array): n-dimensional vecotr of predicted values 
    
    Returns:
        A scalar float with the rmsle value 
    
    Note: You can alternatively use sklearn and just do: 
        `sklearn.metrics.mean_squared_log_error(y_true, y_pred) ** 0.5`
    """
    y_pred[y_pred < 0] = 0
    assert (y_true >= 0).all(), 'Received negative y_true values'
    assert (y_pred >= 0).all(), 'Received negative y_pred values'
    assert y_true.shape == y_pred.shape, 'y_true and y_pred have different shapes'
    y_true_log1p = np.log1p(y_true)  # log(1 + y_true)
    y_pred_log1p = np.log1p(y_pred)  # log(1 + y_pred)
    return np.sqrt(np.mean(np.square(y_pred_log1p - y_true_log1p)))

In [126]:
numeric_features = X.select_dtypes(include=[np.number]).columns
numeric_transformer = Pipeline(
    steps=[("scaler", StandardScaler())]
)

categorical_features = X.select_dtypes(include=[np.object0]).columns
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="passthrough"
)

In [127]:
xgboost = Pipeline(
    steps=[("preprocessor", preprocessor), 
    ("exgregressor", 
    XGBClassifier(n_estimators=100,
                    max_depth=4,
                    learning_rate=0.2,
                    objective="multi:softmax"))]
)

xgboost_log = Pipeline(
    steps=[("preprocessor", preprocessor), 
    ("exgregressor", 
    XGBClassifier(n_estimators=100,
                    max_depth=4,
                    learning_rate=0.2,
                    objective="multi:softmax"))]
)

In [128]:
xgboost.fit(X_train, y_train_class)

In [119]:
xgboost_log.fit(X_train, y_train_logclass)

### Non-log predictions

In [129]:
y_pred = xgboost.predict(X_test)

y_pred = y_pred*CAT_SIZE
y_pred = y_pred + CAT_SIZE/2

print(y_pred)
print(rmsle(y_test, y_pred))

y_pred_acc = xgboost.predict(X_test)
print(f" Accuracy is: {(y_pred_acc == np.asarray(y_test_class)).sum()/len(y_pred_acc):.2f}")

_df = pd.DataFrame(data={"test_true": y_test, "test_pred": y_pred})
_df.sort_values(by="test_pred", inplace=True)

[2. 2. 6. ... 2. 2. 2.]
0.8463190339884945
 Accuracy is: 0.58


In [199]:
START = 900
END = 1900
print(rmsle(_df.iloc[START:END].test_true, _df.iloc[START:END].test_pred))

0.8474044058371626


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_pred[y_pred < 0] = 0


In [201]:
for i in range(0,1000,100):
    for j in range(1000,2000,100):
        plt.scatter(i,j, c=rmsle(_df.iloc[i:j].test_true, _df.iloc[i:j].test_pred), cmap="viridis")
plt.colorbar()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_pred[y_pred < 0] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_pred[y_pred < 0] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_pred[y_pred < 0] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_pred[y_pred < 0] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the c

<matplotlib.colorbar.Colorbar at 0x24018e8c460>

In [79]:
%matplotlib qt
import seaborn as sns
fig, ax = plt.subplots(1,1)

df = pd.DataFrame(data={"Revenue": y_test, "Correct prediction": (y_pred_acc == np.asarray(y_test_class))})
sns.histplot(data=df, ax=ax, stat="percent", multiple="stack",
             x="Revenue", kde=False,
             palette=sns.color_palette("husl",2), hue="Correct prediction",
             element="bars", legend=True)
ax.set_title(f"Category size: {CAT_SIZE}")
ax.set_xlabel("Revenue")
ax.set_ylabel("Percent")

Text(0, 0.5, 'Percent')

### Log-predictions

In [120]:
y_pred_log = xgboost_log.predict(X_test)
print(f"Classes: {y_pred_log}")
y_pred_log = y_pred_log*LOG_CAT_SIZE
y_pred_log = y_pred_log + LOG_CAT_SIZE/2

print(f"Shift: {y_pred_log}")
print(f"Real value: {np.expm1(y_pred_log)}")
print(f"RMSLE: {rmsle(y_test, np.expm1(y_pred_log))}")

y_pred_acc_log = xgboost_log.predict(X_test)
print(f"Accuracy is: {(y_pred_acc_log == np.asarray(y_test_logclass)).sum()/len(y_pred_acc_log):.2f}")

log_df = pd.DataFrame(data={"test_true": y_test, "test_pred": np.expm1(y_pred_log)}, index=y_test.index)

Classes: [ 6  8 11 ... 23 20  0]
Shift: [1.3 1.7 2.3 ... 4.7 4.1 0.1]
Real value: [2.66929667e+00 4.47394739e+00 8.97418245e+00 ... 1.08947172e+02
 5.93402876e+01 1.05170918e-01]
RMSLE: 0.8933859158411874
Accuracy is: 0.13


In [70]:
START = 0
END = 500
print(rmsle(log_df.iloc[START:END].test_true, log_df.iloc[START:END].test_pred))

1.0672782547779422


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_pred[y_pred < 0] = 0


In [71]:
sns.scatterplot(x=log_df.iloc[START:END].test_true, y=log_df.iloc[START:END].test_pred)

<AxesSubplot:xlabel='test_true', ylabel='test_pred'>

In [72]:
%matplotlib qt
heatmap, xedges, yedges = np.histogram2d(log_df.iloc[START:END].test_true, log_df.iloc[START:END].test_pred, bins=(100,10))
extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]

#plt.clf()
plt.imshow(heatmap.T, extent=extent, origin='lower')
plt.show()

In [122]:
%matplotlib qt
import seaborn as sns
fig, ax = plt.subplots(1,1)

df_log = pd.DataFrame(data={"Revenue (Log)": y_test_logclass, "Correct prediction": (y_pred_acc_log == np.asarray(y_test_logclass))})
sns.histplot(data=df_log, ax=ax, stat="density", multiple="stack",
             x="Revenue (Log)", kde=False,
             palette=sns.color_palette("husl",2), hue="Correct prediction",
             element="bars", legend=True)
ax.set_title(f"Category size: {LOG_CAT_SIZE}")
ax.set_xlabel("Revenue (Log)")
ax.set_ylabel("Count")

Text(0, 0.5, 'Count')