# Import required libraries

In [50]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optimizer
import lightgbm as lgbm

from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [8]:
# check for the cuda gpu status
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

device

  return torch._C._cuda_getDeviceCount() > 0


device(type='cpu')

# Visualize, transform and split data

In [9]:
# train and test data url
train_data_url = "./data/train.csv"
test_data_url = "./data/test.csv"

In [10]:
# convert training and testing dataframe
train_df = pd.read_csv(train_data_url)
test_df = pd.read_csv(test_data_url)

In [11]:
train_df.isna().sum()

id             0
date           0
country        0
store          0
product        0
num_sold    8871
dtype: int64

In [12]:
test_df.isna().sum()

id         0
date       0
country    0
store      0
product    0
dtype: int64

In [13]:
# date engineering for the training and testing set dataframes

def fill_nan(df):
    for column in df.columns:
        if df[column].dtype in ["int64", "float64"]:
            df[column].fillna(df[column].mean(), inplace=True)
        else:
            df[column].fillna(df[column].mode(), inplace=True)
    return df

def convert_date(df):
    df["date"] = pd.to_datetime(df["date"])
    df["year"] = df["date"].dt.year
    df["month"] = df["date"].dt.month
    df["weekYear"] = df["date"].dt.isocalendar().week
    df["dayWeek"] = df["date"].dt.dayofweek
    df["dayYear"] = df["date"].dt.dayofyear
    return df

train_df = fill_nan(train_df)

train_df = convert_date(train_df)
test_df = convert_date(test_df)

test_df.isna().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].mode(), inplace=True)


id          0
date        0
country     0
store       0
product     0
year        0
month       0
weekYear    0
dayWeek     0
dayYear     0
dtype: int64

In [14]:
# split out a single training set into multiple training set (train & validation dataframe)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

train_df = train_df.reset_index(drop=True)

val_df = val_df.reset_index(drop=True)
val_df = shuffle(val_df, random_state=42)

In [51]:
# preprocessing libraries ABV
scaler = StandardScaler()
le = LabelEncoder()

# filter out unnecessary features for training and validation set
def colLabelEnconder(X):
    columns = ["country", "store", "product"]
    for col in columns:
        X[col] = le.fit_transform(X[col])
    return X

# training
train_X = train_df.drop(columns=["id", "date", "num_sold"])
train_X_encoded = colLabelEnconder(train_X)
#train_X_encoded = pd.get_dummies(train_X, columns=["country", "store", "product"]).values.astype("float32")
train_X_scaled = scaler.fit_transform(train_X_encoded)
train_X_tensor = torch.tensor(train_X_scaled, dtype=torch.float32)

train_y = train_df["num_sold"].values.astype("float32")
train_y_tensor = torch.tensor(train_y, dtype=torch.float32)

train_td = TensorDataset(train_X_tensor, train_y_tensor)
train_loader = DataLoader(train_td, batch_size=16, shuffle=True)

# validation
val_X = val_df.drop(columns=["id", "date", "num_sold"])
val_X_encoded = colLabelEnconder(val_X)
#val_X_encoded = pd.get_dummies(val_X, columns=["country", "store", "product"]).values.astype("float32")
val_X_scaled = scaler.fit_transform(val_X_encoded)
val_X_tensor = torch.tensor(val_X_scaled, dtype=torch.float32)

val_y = val_df["num_sold"].values.astype("float32")
val_y_tensor = torch.tensor(val_y, dtype=torch.float32)

val_td = TensorDataset(val_X_tensor, val_y_tensor)
val_loader = DataLoader(val_td, batch_size=16, shuffle=False)

train_X_tensor.shape

torch.Size([184104, 8])

In [52]:
# align testing features with training set features
test_X = test_df.drop(columns=["id", "date"])
test_X_encoded = colLabelEnconder(test_X)
#test_X_encoded = pd.get_dummies(test_X, columns=["country", "store", "product"]).values.astype("float32")
test_X_scaled = scaler.fit_transform(test_X_encoded)
test_X_tensor = torch.tensor(test_X_scaled, dtype=torch.float32).to(device)

test_X_tensor.shape

torch.Size([98550, 8])

# Self construct and define AI Model

In [55]:
# self implement linear regression model
class lrModel(nn.Module):
    def __init__(self):
        super(lrModel, self).__init__()
        self.fc1 = nn.Linear(19, 8)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(8, 4)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(4, 1)
    def forward(self, x):
        x = self.relu1(self.fc1(x))
        x = self.relu2(self.fc2(x))
        return self.fc3(x)
    
# self implement gru model
class GRUCell(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(GRUCell, self).__init__()
        # gate
        self.Wz = nn.Linear(input_size, hidden_size)
        self.Uz = nn.Linear(hidden_size, hidden_size)
        self.Wr = nn.Linear(input_size, hidden_size)
        self.Ur = nn.Linear(hidden_size, hidden_size)
        # hidden
        self.Wh = nn.Linear(input_size, hidden_size)
        self.Uh = nn.Linear(hidden_size, hidden_size)
    def forward(self, x, h_prev):
        z = torch.sigmoid(self.Wz(x) + self.Uz(h_prev))
        r = torch.sigmoid(self.Wr(x) + self.Ur(h_prev))
        h_tilde = torch.tanh(self.Wh(x) + self.Uh(r * h_prev))
        h = (1 - z) * h_prev + z * h_tilde
        return h
    
class GRUNetwork(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super(GRUNetwork, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        # list of grucells
        self.grucells = nn.ModuleList([GRUCell(input_size, hidden_size) for _ in range(num_layers)])
        self.fc = nn.Linear(hidden_size, output_size)
    def forward(self, x):
        batch_size, seq_len, _ = x.size()
        h = [torch.zeros(batch_size, self.hidden_size).to(device) for _ in range(self.num_layers)]
        for t in range(seq_len):
            for i in range(self.num_layers):
                h[i] = self.grucells[i](x[:, t, :], h[i])
        out = self.fc(h[-1])
        return out
    
class GRUNetwork2(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super(GRUNetwork2, self).__init__()
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    def forward(self, x):
        out, _ = self.gru(x)
        out = out[:, -1, :]
        out = self.fc(out)
        return out
    
# self implement mape function
def mape(y_pred, y):
    y = torch.where(y == 0, torch.tensor(1e-10, dtype=torch.float32, device=y.device), y)
    loss = torch.mean(torch.abs((y - y_pred) / y)) * 100
    return loss

# self implement mae function
def mae(y_pred, y):
    return torch.mean(torch.abs(y - y_pred))

# self implement mse function
def mse(y_pred, y):
    return torch.mean((y - y_pred)**2)

# define model and its parameters
# model = lrModel().to(device)

input_size = 8
hidden_size = 64
output_size = 1
num_layers = 2
model = GRUNetwork2(input_size, hidden_size, output_size, num_layers)
optim = optimizer.SGD(model.parameters(), lr=0.001)

# Train model

In [56]:
# training set
for epoch in range(5):
    model.train()
    losses = 0
    for data, labels in train_loader:
        data, labels = data.to(device), labels.to(device)
        if len(data.size()) == 2:
            data = data.unsqueeze(1)
        preds = model(data)
        loss = mse(preds, labels)
        losses += loss.item()
        optim.zero_grad()
        loss.backward()
        optim.step()
    print(f"epoch {epoch}, losses {losses:.4f}")

epoch 0, losses 5294610947.9922
epoch 1, losses 5281498193.3359
epoch 2, losses 5273941064.5000
epoch 3, losses 5269566111.4922
epoch 4, losses 5267671804.4609


In [58]:
# validation set
model.eval()
correct_preds = 0
total_preds = 0
with torch.no_grad():
    for data, labels in val_loader:
        data, labels = data.to(device), labels.to(device)
        if len(data.size()) == 2:
            data = data.unsqueeze(1)
        preds = model(data)
        _, predictions = torch.max(preds, 1)
        correct_preds += (predictions == labels).sum().item()
        total_preds += labels.size(0)
    print(f"Total {total_preds} and correct {correct_preds}")

Total 46026 and correct 0


In [59]:
# lgbm model
model2 = lgbm.LGBMRegressor()
model2.fit(train_X, train_y)
y_pred2 = model2.predict(val_X)
loss = mean_squared_error(y_pred2, val_y)
prediction2 = np.sum(np.abs((val_y - y_pred2) / val_y) <= 0.05)
print(f"Total {len(val_y)} and correct {prediction2}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002391 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 347
[LightGBM] [Info] Number of data points in the train set: 184104, number of used features: 8
[LightGBM] [Info] Start training from score 752.192731
Total 46026 and correct 18235
