In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
import torch
from torch import nn
import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from torch.utils.data import DataLoader, TensorDataset
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import pandas as pd
from sklearn.ensemble import  StackingRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from catboost import CatBoostRegressor
%matplotlib inline

  from pandas import MultiIndex, Int64Index


In [4]:
def rmsle(y_true, y_pred):
    """
    Computes the Root Mean Squared Logarithmic Error.

    Negative predictions are clipped to 0 before the logarithm is taken.
    The clipping is done on a copy, so the caller's ``y_pred`` is never
    modified. Both inputs are converted to plain numpy arrays first, which
    also means pandas objects are compared position by position rather than
    being aligned on their index.

    Args:
        y_true (np.array): n-dimensional vector of ground-truth values
        y_pred (np.array): n-dimensional vector of predicted values

    Returns:
        A scalar float with the rmsle value

    Raises:
        AssertionError: if ``y_true`` contains negative values, if ``y_pred``
            still fails the ``>= 0`` check after clipping (e.g. it contains
            NaN), or if the two inputs have different shapes.

    Note: You can alternatively use sklearn and just do:
        `sklearn.metrics.mean_squared_log_error(y_true, y_pred) ** 0.5`
    """
    y_true = np.asarray(y_true)
    # np.maximum allocates a new array, so the caller's predictions are left untouched.
    y_pred = np.maximum(np.asarray(y_pred), 0)
    assert (y_true >= 0).all(), 'Received negative y_true values'
    # Still meaningful after clipping: NaN propagates through np.maximum and fails here.
    assert (y_pred >= 0).all(), 'Received negative y_pred values'
    assert y_true.shape == y_pred.shape, 'y_true and y_pred have different shapes'
    y_true_log1p = np.log1p(y_true)  # log(1 + y_true)
    y_pred_log1p = np.log1p(y_pred)  # log(1 + y_pred)
    return np.sqrt(np.mean(np.square(y_pred_log1p - y_true_log1p)))

In [7]:
# Load the combined train/test table. Indexing by (dataset, range_index) lets the
# labelled ("train") and unlabelled ("test") rows be selected with .loc below.
all_data = pd.read_csv("../../own_data/all_with_stores_pop.csv").set_index(["dataset", "range_index"])

# Membership flags have to be derived before the NaNs are replaced with "None".
all_data['in_mall'] = all_data['mall_name'].notna()
all_data['in_chain'] = all_data['chain_name'].notna()
all_data['mall_name'] = all_data['mall_name'].fillna("None")
# Whole-word "AS" in the store name. The pattern deliberately has no capture group:
# pandas warns ("This pattern ... has match groups") when str.contains is given one,
# and the group did not affect which rows match.
all_data['as'] = all_data['store_name'].str.contains(r"\bAS\b", case=False, regex=True)
all_data['chain_name'] = all_data['chain_name'].fillna("None")
# Hierarchy codes are cast to str so they are picked up as categorical features.
# (busstop_id and lv1 are not cast because they are dropped right below.)
for level_col in ['lv2', 'lv3', 'lv4']:
    all_data[level_col] = all_data[level_col].map(str)

# Columns excluded from modelling.
dropped_columns = [
    'store_name',
    'address',
    'importance_level',
    'busstop_id',
    'other_stores_50',
    'buss_stops_300',
    'municipality_name',
    'lv1',
    'lat',
    'couple_children_6_to_17_years',
    'couple_without_children_x',
    'single_parent_children_0_to_5_years',
    'singles_x',
    'singles_y',
    'couple_without_children_y',
    'couple_with_children',
    'district_age_0-14_distribution',
    'district_age_65-90_distribution',
    'grunnkrets_population',
    'municipality_density',
    'all_households',
    'lv2_population_district_div_count_stores',
    'lv1_population_municipality_div_count_stores',
    'lv2_population_municipality_div_count_stores',
    'in_mall',
    'lv3_population_district_div_count_stores',
    'district_name',
    'num_of_buss_stops_closer_that_1000_to_busstop',
    'municipality_age_0-14_distribution',
    'municipality_age_35-64_distribution',
    'municipality_age_65-90_distribution',
]
all_data = all_data.drop(columns=dropped_columns)

# Build new frames instead of mutating .loc slices in place; this keeps the cell
# re-runnable and avoids writing through a view of all_data.
data_with_label = all_data.loc[["train"]].set_index('store_id')
data_without_label = all_data.loc[['test']].set_index('store_id').drop(columns=["revenue"])

X, y = data_with_label.drop(columns=['revenue']), data_with_label['revenue']

# Fixed seed so the hold-out split is the same on every run.
data_train, data_test = train_test_split(data_with_label, test_size=0.2, random_state=42)

X_train, y_train = data_train.drop(columns=['revenue']), data_train['revenue']
X_test, y_test = data_test.drop(columns=['revenue']), data_test['revenue']
# The models are trained on log1p(revenue); predictions are mapped back with expm1.
y_train_scaled = np.log1p(y_train)
y_test_scaled = np.log1p(y_test)

# y_scaled = np.log1p(y)


# Comment in this when testing on test dataset to kaggle
# X_train = X
# y_train = y
# y_train_scaled = y_scaled
# X_test = data_without_label

  all_data['as'] = all_data['store_name'].str.contains(r"\b(AS)\b", case=False, regex=True)


In [8]:
# Numeric columns: median-impute, then standardise.
numeric_features = X_train.select_dtypes(include=[np.number]).columns
numeric_features = list(numeric_features.to_numpy())

print(numeric_features)

numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())]
)

# Object-dtype columns: fill gaps with the literal "missing", then one-hot encode.
# The builtin `object` is used instead of `np.object0`, a deprecated alias that
# was removed in NumPy 2.0.
categorical_features = X_train.select_dtypes(include=[object]).columns
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(fill_value="missing", strategy="constant")),
        # Categories unseen during fit are encoded as all-zero rows instead of raising.
        ("onehotencoding", OneHotEncoder(handle_unknown="ignore"))
    ]
)


# Columns matched by neither selector (e.g. boolean flags) are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="passthrough",
)

# Only referenced by the commented-out ordinal-encoding variant below.
# The builtin `bool` replaces `np.bool8`, which was likewise removed in NumPy 2.0.
categorical_features_include_bool = list(categorical_features.to_numpy())
categorical_features_include_bool.extend(list(X.select_dtypes(include=[bool]).columns.to_numpy()))
# categorical_transformer_ordinal = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan)
# preprocessor_ordinal = ColumnTransformer(
#     transformers=[
#         ("num", numeric_transformer, numeric_features),
#         ("cat", categorical_transformer_ordinal, categorical_features_include_bool)
#     ],
#     remainder="passthrough",
# )

# Fit on the training split only; the hold-out split is transformed with the fitted statistics.
X_train_ft = preprocessor.fit_transform(X_train)
X_test_ft = preprocessor.transform(X_test)
# X_val_ft = preprocessor.transform(X_val)

print(X_train_ft.shape)

['grunnkrets_id', 'lon', 'other_stores_1000', 'other_stores_100', 'other_stores_250', 'buss_stops_1000', 'grunnkrets_1', 'distance_closest_busstop', 'area_km2', 'couple_children_0_to_5_years', 'couple_children_18_or_above', 'single_parent_children_18_or_above', 'single_parent_children_6_to_17_years', 'other_households', 'single_parent_with_children', 'district_age_15-34_distribution', 'district_age_35-64_distribution', 'municipality_age_15-34_distribution', 'district_population', 'municipality_population', 'district_area', 'municipality_area', 'district_density', 'lv1_population_district_div_count_stores', 'lv4_population_district_div_count_stores', 'lv3_population_municipality_div_count_stores', 'lv4_population_municipality_div_count_stores']
(10263, 998)


In [9]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel


# Tree ensemble fitted on the log1p target; its feature importances drive the
# SelectFromModel step that follows.
extra_trees = ExtraTreesRegressor(
    bootstrap=False,
    criterion='squared_error',
    max_depth=None,
    max_features=0.76315,
    max_leaf_nodes=None,
    min_samples_leaf=2,
    min_samples_split=2,
    min_weight_fraction_leaf=0,
    n_estimators=100,
    # Seeded (same value as the train/test split) so the importances, and therefore
    # the set of selected features, are reproducible across kernel restarts.
    random_state=42,
    )
extra_trees = extra_trees.fit(X_train_ft, y_train_scaled)


In [10]:
# Reuse the already-fitted forest (prefit=True). With threshold=-inf the importance
# cut-off is disabled, so max_features alone decides how many columns are kept.
select = SelectFromModel(
    extra_trees,
    prefit=True,
    threshold=-np.inf,
    max_features=500,
)
# Apply the same column mask to the training and the hold-out matrices.
X_train_fts, X_test_fts = (select.transform(matrix) for matrix in (X_train_ft, X_test_ft))
# X_val_fts = select.transform(X_val_ft)
print(X_train_fts.shape)


(10263, 500)


In [11]:
# Densify the selected feature matrices (via .toarray()) and wrap them in
# DataFrames with default integer column labels.
X_train_ftsr, X_test_ftsr = (
    pd.DataFrame(selected.toarray()) for selected in (X_train_fts, X_test_fts)
)
# X_val_ftsr = pd.DataFrame(X_val_fts.toarray())

# Column labels of the training frame, used later to index both frames identically.
categories = X_train_ftsr.columns.tolist()

In [12]:
class RMSLELoss(nn.Module):
    """Root Mean Squared Logarithmic Error as a differentiable loss module.

    Computes ``sqrt(MSE(log(1 + pred), log(1 + actual)))``. Negative
    predictions are treated as 0 before the logarithm is taken.
    """

    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()

    def forward(self, pred, actual):
        """Return the RMSLE between ``pred`` and ``actual`` as a scalar tensor.

        Args:
            pred (torch.Tensor): predicted values; entries below 0 are clamped to 0.
            actual (torch.Tensor): ground-truth values, same shape as ``pred``.

        Returns:
            torch.Tensor: 0-dimensional tensor holding the loss.
        """
        # Out-of-place clamp: the caller's tensor is left untouched, and it also
        # works when `pred` is a leaf tensor that requires grad (an in-place
        # masked write on such a tensor raises a RuntimeError).
        pred = torch.clamp(pred, min=0)
        return torch.sqrt(self.mse(torch.log1p(pred), torch.log1p(actual)))

class SoftOrdering1DCNN(pl.LightningModule):
    """1D-CNN regressor for tabular data.

    A weight-normalised dense layer expands the flat feature vector to
    ``cha_input * sign_size`` values, which are reshaped into a
    ``(cha_input, sign_size)`` signal and passed through a small stack of 1D
    convolutions with one residual connection, followed by a linear head.
    Training and validation use mean squared error.

    Args:
        input_dim (int): number of input features.
        output_dim (int): number of outputs of the linear head.
        sign_size (int): length of the signal produced by the first dense layer.
            The two pooling stages reduce it to ``sign_size // 4``.
        cha_input (int): number of channels the dense output is reshaped into.
        cha_hidden (int): number of channels in the later convolutions.
        K (int): channel multiplier of the first (grouped) convolution.
        dropout_input (float): dropout probability applied to the input features.
        dropout_hidden (float): dropout probability before the third convolution.
        dropout_output (float): dropout probability before the linear head.
    """

    def __init__(self, input_dim, output_dim, sign_size=32, cha_input=16, cha_hidden=32,
                 K=2, dropout_input=0.2, dropout_hidden=0.2, dropout_output=0.2):
        super().__init__()

        hidden_size = sign_size*cha_input
        sign_size1 = sign_size
        sign_size2 = sign_size//2
        output_size = (sign_size//4) * cha_hidden
        # Channel count leaving the first convolution. The optional second conv
        # stage of this architecture is disabled here, so this is also what the
        # third stage receives.
        conv1_channels = cha_input*K

        self.hidden_size = hidden_size
        self.cha_input = cha_input
        self.cha_hidden = cha_hidden
        self.K = K
        self.sign_size1 = sign_size1
        self.sign_size2 = sign_size2
        self.output_size = output_size
        self.dropout_input = dropout_input
        self.dropout_hidden = dropout_hidden
        self.dropout_output = dropout_output

        # Input block: normalise, drop out, expand to the signal representation.
        self.batch_norm1 = nn.BatchNorm1d(input_dim)
        self.dropout1 = nn.Dropout(dropout_input)
        dense1 = nn.Linear(input_dim, hidden_size, bias=False)
        self.dense1 = nn.utils.weight_norm(dense1)

        # 1st conv layer: grouped conv (one group per input channel) that
        # multiplies the channel count by K, then pooling halves the length.
        self.batch_norm_c1 = nn.BatchNorm1d(cha_input)
        conv1 = nn.Conv1d(
            cha_input,
            conv1_channels,
            kernel_size=5,
            stride=1,
            padding=2,
            groups=cha_input,
            bias=False)
        self.conv1 = nn.utils.weight_norm(conv1, dim=None)
        self.ave_po_c1 = nn.AdaptiveAvgPool1d(output_size=sign_size2)

        # 3rd conv layer: maps the conv1 output to cha_hidden channels. Sized from
        # conv1_channels rather than cha_hidden so configurations with
        # cha_input*K != cha_hidden also work; with the defaults both are equal.
        self.batch_norm_c3 = nn.BatchNorm1d(conv1_channels)
        self.dropout_c3 = nn.Dropout(dropout_hidden)
        conv3 = nn.Conv1d(
            conv1_channels,
            cha_hidden,
            kernel_size=3,
            stride=1,
            padding=1,
            bias=False)
        self.conv3 = nn.utils.weight_norm(conv3, dim=None)

        # 4th conv layer: depthwise conv whose output is added to the conv3
        # output (residual connection) in forward().
        self.batch_norm_c4 = nn.BatchNorm1d(cha_hidden)
        conv4 = nn.Conv1d(
            cha_hidden,
            cha_hidden,
            kernel_size=5,
            stride=1,
            padding=2,
            groups=cha_hidden,
            bias=False)
        self.conv4 = nn.utils.weight_norm(conv4, dim=None)

        # kernel 4 / stride 2 / padding 1 halves the signal length again.
        self.avg_po_c4 = nn.AvgPool1d(kernel_size=4, stride=2, padding=1)

        self.flt = nn.Flatten()

        # Output head.
        self.batch_norm2 = nn.BatchNorm1d(output_size)
        self.dropout2 = nn.Dropout(dropout_output)
        dense2 = nn.Linear(output_size, output_dim, bias=False)
        self.dense2 = nn.utils.weight_norm(dense2)

        self.loss = nn.MSELoss()

    def forward(self, x):
        """Map a ``(batch, input_dim)`` float tensor to ``(batch, output_dim)`` predictions."""
        x = self.batch_norm1(x)
        x = self.dropout1(x)
        x = nn.functional.celu(self.dense1(x))

        # Interpret the dense output as cha_input channels of length sign_size.
        x = x.reshape(x.shape[0], self.cha_input, self.sign_size1)

        x = self.batch_norm_c1(x)
        x = nn.functional.relu(self.conv1(x))

        x = self.ave_po_c1(x)

        x = self.batch_norm_c3(x)
        x = self.dropout_c3(x)
        x = nn.functional.relu(self.conv3(x))
        x_s = x  # kept for the residual connection below

        x = self.batch_norm_c4(x)
        x = self.conv4(x)
        x = x + x_s
        x = nn.functional.relu(x)

        x = self.avg_po_c4(x)

        x = self.flt(x)

        x = self.batch_norm2(x)
        x = self.dropout2(x)
        x = self.dense2(x)

        return x

    def training_step(self, batch, batch_idx):
        """Compute and log the MSE on one training batch; returns the loss."""
        X, y = batch
        y_hat = self.forward(X)
        loss = self.loss(y_hat, y)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the MSE on one validation batch as ``valid_loss``."""
        X, y = batch
        y_hat = self.forward(X)
        loss = self.loss(y_hat, y)
        self.log('valid_loss', loss)

    def test_step(self, batch, batch_idx):
        """Log the MSE and the RMSLE (on the original target scale) for one test batch."""
        X, y = batch
        y_hat = self.forward(X)
        loss = self.loss(y_hat, y)
        # This is a regression model, so the raw output is the prediction (no
        # sigmoid). NOTE(review): assumes the targets are log1p-transformed, as
        # the y_*_scaled tensors built in this notebook are; both sides are
        # mapped back with expm1 before scoring, matching the prediction cells.
        y_true = np.expm1(y.detach().cpu().numpy())
        y_pred = np.expm1(y_hat.detach().cpu().numpy())
        metric = rmsle(y_true, y_pred)
        self.log('test_loss', loss)
        self.log('test_metric', float(metric))

    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        """Return predictions for a batch that is either a feature tensor or an ``(X, ...)`` sequence."""
        # A DataLoader over a TensorDataset yields lists/tuples; a DataLoader over
        # a bare tensor yields the tensor itself. Accept both.
        features = batch[0] if isinstance(batch, (list, tuple)) else batch
        return self(features)

    def configure_optimizers(self):
        """SGD with momentum; the learning rate is halved when ``valid_loss`` plateaus."""
        optimizer = torch.optim.SGD(self.parameters(), lr=1e-2, momentum=0.9)
        scheduler = {
            'scheduler': ReduceLROnPlateau(
                optimizer,
                mode="min",
                factor=0.5,
                patience=5,
                min_lr=1e-5),
            'interval': 'epoch',
            'frequency': 1,
            'reduce_on_plateau': True,
            'monitor': 'valid_loss',
        }
        return [optimizer], [scheduler]

In [13]:
# Architecture hyper-parameters for this run, collected in one place.
cnn_hparams = dict(
    input_dim=X_test_ftsr.shape[1],
    output_dim=1,
    sign_size=4,
    cha_input=16,
    cha_hidden=32,
    K=2,
    dropout_input=0.2,
    dropout_hidden=0.,
    dropout_output=0,
)
model = SoftOrdering1DCNN(**cnn_hparams)

# Stop once 'valid_loss' (logged in validation_step) has not improved for 21 checks.
early_stop_callback = EarlyStopping(
    monitor='valid_loss',
    mode='min',
    min_delta=.0,
    patience=21,
    verbose=True,
)

trainer = pl.Trainer(
    max_epochs=25,
    min_epochs=1,
    callbacks=[early_stop_callback],
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [14]:
def _as_float_dataset(features, target):
    """Wrap a feature frame and a target series as a float32 TensorDataset.

    The target is reshaped to a single column so it matches the model's
    (batch, 1) output.
    """
    feature_tensor = torch.tensor(features[categories].values, dtype=torch.float)
    target_tensor = torch.tensor(target.values.reshape(-1, 1), dtype=torch.float)
    return TensorDataset(feature_tensor, target_tensor)


print(X_train_ftsr[categories].values.shape, y_train_scaled.values.reshape(-1,1).shape)
train_tensor_dset = _as_float_dataset(X_train_ftsr, y_train_scaled)
# The hold-out split doubles as the validation set during training.
val_tensor_dset = _as_float_dataset(X_test_ftsr, y_test_scaled)

(10263, 500) (10263, 1)


In [15]:
# Training batches are reshuffled every epoch; validation batches keep their order.
train_loader = DataLoader(train_tensor_dset, batch_size=64, shuffle=True, num_workers=4)
valid_loader = DataLoader(val_tensor_dset, batch_size=64, shuffle=False, num_workers=4)

trainer.fit(model, train_loader, valid_loader)


   | Name          | Type              | Params
-----------------------------------------------------
0  | batch_norm1   | BatchNorm1d       | 1.0 K 
1  | dropout1      | Dropout           | 0     
2  | dense1        | Linear            | 32.1 K
3  | batch_norm_c1 | BatchNorm1d       | 32    
4  | conv1         | Conv1d            | 161   
5  | ave_po_c1     | AdaptiveAvgPool1d | 0     
6  | batch_norm_c3 | BatchNorm1d       | 64    
7  | dropout_c3    | Dropout           | 0     
8  | conv3         | Conv1d            | 3.1 K 
9  | batch_norm_c4 | BatchNorm1d       | 64    
10 | conv4         | Conv1d            | 161   
11 | avg_po_c4     | AvgPool1d         | 0     
12 | flt           | Flatten           | 0     
13 | batch_norm2   | BatchNorm1d       | 64    
14 | dropout2      | Dropout           | 0     
15 | dense2        | Linear            | 33    
16 | loss          | MSELoss           | 0     
-----------------------------------------------------
36.7 K    Trainable params


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric valid_loss improved. New best score: 0.656


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Monitored metric valid_loss did not improve in the last 21 records. Best score: 0.656. Signaling Trainer to stop.


In [59]:
# Feature-only tensor for the hold-out split. It is built here because no earlier
# cell assigns `test_tensor_dset` (its definition is commented out), which made
# this cell fail with a NameError on a fresh kernel.
test_tensor_dset = torch.tensor(X_test_ftsr[categories].values, dtype=torch.float)
y_hat = trainer.predict(model, dataloaders=DataLoader(test_tensor_dset, batch_size=64, shuffle=False, num_workers=4))
# The network predicts log1p(revenue); expm1 maps the predictions back to the
# revenue scale before they are scored against the untransformed y_test.
y_pred = np.expm1(torch.cat(y_hat, 0).numpy().reshape(-1))
print(rmsle(y_test.to_numpy(), y_pred))



Predicting: 0it [00:00, ?it/s]

0.8367676700585218


In [55]:
# NOTE(review): `val_tensor` and `y_val` are not assigned in any visible cell. The only
# mentions of a separate validation split are commented-out lines, and this cell's
# execution count (55) is lower than the previous cell's (59), so it appears to rely
# on state from an earlier kernel session.
# TODO: either create a real validation split (features + `y_val`) or remove this cell;
# as written it cannot be reproduced with Restart & Run All.
# Predicts with the trained model and scores the expm1-transformed output against y_val.
y_hat = trainer.predict(model, dataloaders=DataLoader(val_tensor, batch_size=64, shuffle=False, num_workers=4))
print(rmsle(y_val, np.expm1(torch.cat(y_hat, 0).numpy().reshape(-1))))



Predicting: 0it [00:00, ?it/s]

0.8825786976846948


AssertionError: y_true and y_pred have different shapes