In [349]:
import os

import pandas as pd
import numpy as np
import matplotlib

In [350]:
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_log_error

In [351]:
home_dir = "./data"

# Preprocessing

In [352]:
test = pd.read_csv(os.path.join(home_dir, "test.csv"))
train = pd.read_csv(os.path.join(home_dir, "train.csv"))
oil = pd.read_csv(os.path.join(home_dir, "oil.csv"))
stores = pd.read_csv(os.path.join(home_dir, "stores.csv"))
transactions = pd.read_csv(os.path.join(home_dir, "transactions.csv"))
holidays_events = pd.read_csv(os.path.join(home_dir, "holidays_events.csv"))

In [353]:
train['date'] = pd.to_datetime(train['date'])
test['date'] = pd.to_datetime(test['date'])
oil['date'] = pd.to_datetime(oil['date'])
transactions['date'] = pd.to_datetime(transactions['date'])
holidays_events['date'] = pd.to_datetime(holidays_events['date'])

### train and test

In [354]:
train.drop('id', axis=1, inplace=True)
test.drop('id', axis=1, inplace=True)

### holidays_events

In [355]:
def set_holiday(x):
    if x['type'] == 'Holiday' or x['type'] == 'Event':
        return f"{x['locale']}_{x['type']}"
    else:
        return f"{x['locale']}_Holiday"

holidays_events['holiday_type'] = holidays_events.apply(set_holiday, axis=1)
holidays_events.drop(holidays_events[holidays_events.loc[:, 'transferred'] == True].index, inplace=True)
holidays_events.drop(holidays_events.columns.difference(['date', 'holiday_type']), axis=1, inplace=True)
holidays_events

Unnamed: 0,date,holiday_type
0,2012-03-02,Local_Holiday
1,2012-04-01,Regional_Holiday
2,2012-04-12,Local_Holiday
3,2012-04-14,Local_Holiday
4,2012-04-21,Local_Holiday
...,...,...
345,2017-12-22,National_Holiday
346,2017-12-23,National_Holiday
347,2017-12-24,National_Holiday
348,2017-12-25,National_Holiday


### oil

In [356]:
oil.ffill(inplace=True)
oil

Unnamed: 0,date,dcoilwtico
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.20
...,...,...
1213,2017-08-25,47.65
1214,2017-08-28,46.40
1215,2017-08-29,46.46
1216,2017-08-30,45.96


### merge

In [357]:
train_merged_data = pd.merge(train, stores, on='store_nbr', how='left')
train_merged_data = pd.merge(train_merged_data, oil, on='date', how='left')
train_merged_data = pd.merge(train_merged_data, holidays_events, on='date', how='left', suffixes=('', '_holiday'))
train_merged_data = pd.merge(train_merged_data, transactions, on=['date', 'store_nbr'], how='left')

train_merged_data['holiday_type'] = train_merged_data['holiday_type'].fillna('NA')
train_merged_data['holiday'] = train_merged_data.apply(lambda x: x['holiday_type'] if x['holiday_type'] != 'NA' else ('Weekdays' if x['date'].weekday() == 1 else 'Weekends'), axis=1)
train_merged_data

Unnamed: 0,date,store_nbr,family,sales,onpromotion,city,state,type,cluster,dcoilwtico,holiday_type,transactions,holiday
0,2013-01-01,1,AUTOMOTIVE,0.000,0,Quito,Pichincha,D,13,,National_Holiday,,National_Holiday
1,2013-01-01,1,BABY CARE,0.000,0,Quito,Pichincha,D,13,,National_Holiday,,National_Holiday
2,2013-01-01,1,BEAUTY,0.000,0,Quito,Pichincha,D,13,,National_Holiday,,National_Holiday
3,2013-01-01,1,BEVERAGES,0.000,0,Quito,Pichincha,D,13,,National_Holiday,,National_Holiday
4,2013-01-01,1,BOOKS,0.000,0,Quito,Pichincha,D,13,,National_Holiday,,National_Holiday
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3054343,2017-08-15,9,POULTRY,438.133,0,Quito,Pichincha,B,6,47.57,Local_Holiday,2155.0,Local_Holiday
3054344,2017-08-15,9,PREPARED FOODS,154.553,1,Quito,Pichincha,B,6,47.57,Local_Holiday,2155.0,Local_Holiday
3054345,2017-08-15,9,PRODUCE,2419.729,148,Quito,Pichincha,B,6,47.57,Local_Holiday,2155.0,Local_Holiday
3054346,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8,Quito,Pichincha,B,6,47.57,Local_Holiday,2155.0,Local_Holiday


In [358]:
test_merged_data = pd.merge(test, stores, on='store_nbr', how='left')
test_merged_data = pd.merge(test_merged_data, oil, on='date', how='left')
test_merged_data = pd.merge(test_merged_data, holidays_events, on='date', how='left', suffixes=('', '_holiday'))
test_merged_data = pd.merge(test_merged_data, transactions, on=['date', 'store_nbr'], how='left')

test_merged_data['holiday_type'] = test_merged_data['holiday_type'].fillna('NA')
test_merged_data['holiday'] = test_merged_data.apply(lambda x: x['holiday_type'] if x['holiday_type'] != 'NA' else ('Weekdays' if x['date'].weekday() == 1 else 'Weekends'), axis=1)
test_merged_data

Unnamed: 0,date,store_nbr,family,onpromotion,city,state,type,cluster,dcoilwtico,holiday_type,transactions,holiday
0,2017-08-16,1,AUTOMOTIVE,0,Quito,Pichincha,D,13,46.80,,,Weekends
1,2017-08-16,1,BABY CARE,0,Quito,Pichincha,D,13,46.80,,,Weekends
2,2017-08-16,1,BEAUTY,2,Quito,Pichincha,D,13,46.80,,,Weekends
3,2017-08-16,1,BEVERAGES,20,Quito,Pichincha,D,13,46.80,,,Weekends
4,2017-08-16,1,BOOKS,0,Quito,Pichincha,D,13,46.80,,,Weekends
...,...,...,...,...,...,...,...,...,...,...,...,...
28507,2017-08-31,9,POULTRY,1,Quito,Pichincha,B,6,47.26,,,Weekends
28508,2017-08-31,9,PREPARED FOODS,0,Quito,Pichincha,B,6,47.26,,,Weekends
28509,2017-08-31,9,PRODUCE,1,Quito,Pichincha,B,6,47.26,,,Weekends
28510,2017-08-31,9,SCHOOL AND OFFICE SUPPLIES,9,Quito,Pichincha,B,6,47.26,,,Weekends


In [359]:
train_merged_data = train_merged_data[train_merged_data['date'] >= '2013-01-02']

In [360]:
train_merged_data = train_merged_data.drop('holiday_type', axis=1)
test_merged_data = test_merged_data.drop('holiday_type', axis=1)

In [361]:
train_merged_data['dcoilwtico'] = train_merged_data['dcoilwtico'].ffill()
train_merged_data['transactions'] = train_merged_data['transactions'].fillna(0)

test_merged_data['dcoilwtico'] = test_merged_data['dcoilwtico'].ffill()
test_merged_data['transactions'] = test_merged_data['transactions'].fillna(0)

## one-hot encoding

In [362]:
train_data = pd.get_dummies(train_merged_data.set_index(['date', 'store_nbr', 'family']))
test_data = pd.get_dummies(test_merged_data.set_index(['date', 'store_nbr', 'family']))

## split train-validation-test

In [363]:
from sklearn.model_selection import train_test_split

In [375]:
X = train_data.loc[:,train_data.columns.difference(['sales'])]
y = train_data.loc[:, 'sales']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, shuffle=False)

# Train model

In [376]:
from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
import torch.functional as F
from torch.utils.data import DataLoader, TensorDataset

In [418]:
device = torch.device("cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu"))

In [419]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).to(device)
X_valid_tensor = torch.tensor(X_valid_scaled, dtype=torch.float32).to(device)
y_valid_tensor = torch.tensor(y_valid.values, dtype=torch.float32).to(device)

In [420]:
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
valid_dataset = TensorDataset(X_valid_tensor, y_valid_tensor)

In [421]:
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=False)
valid_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)

In [435]:
class SalesLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, output_dim):
        super(SalesLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.epsilon = 1e-6

    def forward(self, x):
        batch_size = x.size(0)
        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(x.device)
        c0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(x.device)
        out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))
        out = self.fc(out[:, -1, :])
        return torch.clamp(out, min=self.epsilon)


In [436]:
class RMSLELoss(nn.Module):
    def __init__(self, epsilon=1e-6):
        super().__init__()
        self.mse = nn.MSELoss()
        self.epsilon = epsilon
        
    def forward(self, pred, actual):
        pred = torch.clamp(pred, min=self.epsilon)
        actual = torch.clamp(actual, min=self.epsilon)
        return torch.sqrt(self.mse(torch.log(pred + 1), torch.log(actual + 1)))


In [437]:
input_dim = X_train_tensor.shape[1]
hidden_dim = 100
num_layers = 2
output_dim = 1

model = SalesLSTM(input_dim, hidden_dim, num_layers, output_dim).to(device)

In [438]:
criterion = RMSLELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        inputs = inputs.unsqueeze(1)
        
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels.view(-1, 1))
        loss.backward()
        optimizer.step()

    model.eval()
    val_loss = 0
    with torch.no_grad():
        for inputs, labels in valid_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            inputs = inputs.unsqueeze(1)

            outputs = model(inputs)
            loss = criterion(outputs, labels.view(-1, 1))
            val_loss += loss.item()

    print(f'[Epoch {epoch+1}] Loss: {loss.item()}, Validation Loss: {val_loss/len(valid_loader)}')


Epoch 1, Loss: 1.6368998289108276, Validation Loss: 1.6480070041598014
Epoch 2, Loss: 1.6860562562942505, Validation Loss: 1.6016288589584629
Epoch 3, Loss: 1.7354501485824585, Validation Loss: 1.6223667473475638
Epoch 4, Loss: 1.7303271293640137, Validation Loss: 1.6164248178621259
Epoch 5, Loss: 1.709689974784851, Validation Loss: 1.6369283117244577
Epoch 6, Loss: 1.7519888877868652, Validation Loss: 1.6289724308052402
Epoch 7, Loss: 1.7694123983383179, Validation Loss: 1.6115545124284127
Epoch 8, Loss: 1.7348017692565918, Validation Loss: 1.6238438303849858
Epoch 9, Loss: 1.807400107383728, Validation Loss: 1.6367179572988355
Epoch 10, Loss: 1.7946560382843018, Validation Loss: 1.6473685274365313
Epoch 11, Loss: 1.6694079637527466, Validation Loss: 1.6075419355004215
Epoch 12, Loss: 1.6901450157165527, Validation Loss: 1.5991669743019206
Epoch 13, Loss: 1.7582440376281738, Validation Loss: 1.625005352209699
Epoch 14, Loss: 1.641990303993225, Validation Loss: 1.6095906281402526
Epoch

In [440]:
test_data['holiday_National_Event'] = False
test_data['holiday_National_Holiday'] = False
test_data['holiday_Regional_Holiday'] = False

In [441]:
X_test_scaled = scaler.transform(test_data[X.columns])
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)

test_dataset = TensorDataset(X_test_tensor, torch.zeros(len(X_test_tensor)).to(device))
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

model.eval()
predictions = []
with torch.no_grad():
    for inputs, _ in test_loader:
        inputs = inputs.to(device)
        inputs = inputs.unsqueeze(1)
        outputs = model(inputs)
        predictions.append(outputs.cpu().numpy())

predictions = np.concatenate(predictions, axis=0)

In [442]:
predictions

array([[4.0126643e+00],
       [4.0126643e+00],
       [5.7845058e+01],
       ...,
       [8.2919903e+00],
       [2.0839487e+02],
       [1.0000000e-06]], dtype=float32)

In [443]:
output = pd.read_csv(os.path.join(home_dir, "test.csv"))[['id']]
output['sales'] = predictions
output

Unnamed: 0,id,sales
0,3000888,4.012664
1,3000889,4.012664
2,3000890,57.845058
3,3000891,771.738953
4,3000892,4.012664
...,...,...
28507,3029395,8.291990
28508,3029396,0.000001
28509,3029397,8.291990
28510,3029398,208.394867
