In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from ..utils.models.lstm import LSTM
from ..utils.models.cnn import ConvRNN
from ..utils.metrics import growth_metric
from torch.optim import SGD, Adam
import pandas as pd


# Model
We set the random seeds so that the runs are reproducible.

In [None]:
# Fix the RNG seeds (NumPy, PyTorch, PyTorch CUDA) and select the compute device
seed = 2
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device}")

## Load dataset

We load two different datasets: one with values from before 2022 and one with values from 2022 onwards. \
The idea is to first train the model on the pre-2022 data and then fine-tune it on the data from 2022 onwards.

The data is structured as follows:
* The data is normalized per column
* The data is sorted by "business_entity_doing_business_as_name" and "period_end_date"
* We scan the dataframe row by row with a window of window_length=10 and a horizon of K=1
* The rows inside the window are the input to our model, while the following K steps are what the model has to predict
* Over these K steps we evaluate a metric, which you can find under utils/metrics.py. In a nutshell, this metric is the ratio between interactions and number of posts. Applying it to the K weeks immediately after the window produces a label (a scalar value) on which the model can be trained

In [None]:
from utils.dataset import get_datasets

# CSV feature tables: the full history, and the two halves split around 2022.
path_Full = "../data/Full_Feature_data.csv"
path_Before2022 = "../data/Before2022_Feature_data.csv"
path_From2022 = "../data/From2022_Feature_data.csv"

# Build the (train, validation) dataset pair for each period with test_size=0.1
# (presumably the fraction held out for validation -- confirm in utils/dataset.py).
# The From2022 pair is created first, then the Before2022 pair.
(train_dataset_From2022, val_dataset_From2022), (train_dataset_Before2022, val_dataset_Before2022) = [
    get_datasets(csv_path, test_size=0.1)
    for csv_path in (path_From2022, path_Before2022)
]

In [None]:
BATCH_SIZE = 16

# Settings shared by all four loaders: fixed batch size, dataset order preserved,
# four worker processes.
# NOTE(review): shuffle=False also applies to the training loaders -- confirm that
# feeding the training windows in dataset order is intended.
loader_kwargs = dict(batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

# Loaders for the pre-2022 training stage.
train_loader_Before2022 = DataLoader(train_dataset_Before2022, **loader_kwargs)
val_loader_Before2022 = DataLoader(val_dataset_Before2022, **loader_kwargs)

# Loaders for the fine-tuning stage on data from 2022 onwards.
train_loader_From2022 = DataLoader(train_dataset_From2022, **loader_kwargs)
val_loader_From2022 = DataLoader(val_dataset_From2022, **loader_kwargs)

## Load models
The two models we try are a simple LSTM implementation and a ConvRNN. \
The models have to capture dependencies inside the window of weeks in order to predict the future metric value. \
Hence, we opt for two models which have the right inductive bias to capture this. \
We use the Mean Squared Error both as the training loss and as the evaluation metric.

In [None]:
# Model and optimizer
# The first two ConvRNN arguments are taken from the shape of one training input
# (element [0][0] of the dataset): shape[1] first, then shape[0].
# Presumably that shape is (window_length, n_features) -- confirm against ConvRNN.
window_shape = train_dataset_From2022[0][0].shape
model = ConvRNN(
    window_shape[1],
    window_shape[0],
    1,
    n_channels1=128,
    n_channels2=128,
    n_channels3=128,
    n_units1=128,
    n_units2=128,
    n_units3=128,
)
# Alternative architecture: replace the ConvRNN above with `LSTM()`.
optimizer = Adam(model.parameters(), lr=0.01)
loss = F.mse_loss

# Everything the Experiment wrapper needs, gathered in one configuration dict.
cfg = {
    "model": model,
    "setup": "train",
    "loss": loss,
    "optimizer": optimizer,
    "epochs": 10,
}

In [None]:
from utils.experiments import Experiment

# Wrap the configuration (network, loss, optimizer, epochs) in the project's Experiment class.
# NOTE: `model` is rebound here -- from this cell onwards it refers to the Experiment
# instance, while the underlying network stays reachable through cfg["model"].
# Presumably Experiment is a LightningModule, since it is later passed to
# `trainer.fit` -- confirm in utils/experiments.py.
model = Experiment(cfg)

In [None]:
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, Callback, EarlyStopping

# Checkpoint callback that monitors `val_loss` and writes into the current directory,
# with the epoch number and validation loss embedded in the file name.
# NOTE(review): this callback is not included in the `callbacks` list of the Trainer
# built further down in this notebook -- confirm whether that is intentional.
checkpoint_callback = ModelCheckpoint(
    dirpath='./',
    filename='{epoch:02d}-{val_loss:.2f}',
    monitor='val_loss',
)

class PrintCallback(pl.Callback):
    """Print the most recently logged losses while the Trainer runs.

    Values are read from ``trainer.callback_metrics``. A metric that has not been
    logged (yet) is printed as ``n/a`` instead of raising ``KeyError`` and aborting
    the whole training run from inside a purely informational callback.
    """

    @staticmethod
    def _metric(trainer, name):
        """Return the logged metric ``name``, or ``"n/a"`` when it is not available."""
        return trainer.callback_metrics.get(name, "n/a")

    def on_train_epoch_end(self, trainer, pl_module):
        """Print the training loss at the end of every training epoch."""
        print(f"Training loss: {self._metric(trainer, 'train_loss')}")

    def on_validation_end(self, trainer, pl_module):
        """Print the validation loss and MSE at the end of every validation run."""
        print(
            f"Validation loss: {self._metric(trainer, 'val_loss')}, "
            f"Mse: {self._metric(trainer, 'val_mse')}"
        )

In [None]:
# CPU trainer that runs for cfg["epochs"] epochs and reports losses through PrintCallback.
trainer = pl.Trainer(
    accelerator="cpu",
    max_epochs=cfg["epochs"],
    # Early stopping is currently disabled; to enable it, append
    # EarlyStopping(monitor="val_loss", mode="min") to this list.
    callbacks=[PrintCallback()],
)

# Train before 2022
Here we train the model for some epochs on the dataset before2022

In [None]:
# Stage 1: fit the model on the pre-2022 training loader, validating on the
# pre-2022 validation loader.
trainer.fit(model, train_loader_Before2022, val_loader_Before2022)

# Train after 2022
The same model is then trained on the dataset after2022 to make it more relevant for the future

In [None]:
# Stage 2: fine-tune the same model on the data from 2022 onwards.
# A Trainer keeps its epoch counter between `fit` calls: the trainer used for the
# pre-2022 stage has already reached `max_epochs`, so calling `fit` on it again
# would stop immediately without training. A fresh Trainer with the same settings
# is therefore created for the fine-tuning stage.
finetune_trainer = pl.Trainer(
    accelerator="cpu",
    max_epochs=cfg["epochs"],
    callbacks=[PrintCallback()],
)
finetune_trainer.fit(model, train_loader_From2022, val_loader_From2022)

# Inference

We choose a particular brand of interest on which to run inference.

In [None]:
# Read the post-2022 feature table and keep only the rows of the brand of interest.
df = pd.read_csv(path_From2022, sep=",").loc[
    lambda frame: frame["business_entity_doing_business_as_name"] == "Calvin Klein"
]

In [None]:
from utils.dataset import BrandDataset

# Wrap the single-brand frame in the project's dataset class and iterate over it
# one sample per batch, in order, using a single worker process.
infer_data_set = BrandDataset(df)
infer_loader = DataLoader(
    infer_data_set,
    num_workers=1,
    shuffle=False,
    batch_size=1,
)

## Important !
The prediction can be used inside a test to check whether the brand is an outlier compared to its previous trend. \
To do that we proceed as follows:
* We evaluate the metric over the window_length to obtain the *avg_growth* of the past period. Moreover, we compute *std_dev_growth*, the standard deviation of the metric growth over the weeks inside our window_length. Our model then predicts *growth*, the value of the metric for the next K=1 weeks
* We then check whether *growth* - *avg_growth* > z · *std_dev_growth*, where z is tunable (e.g. under a normality assumption, z=2 is exceeded only about 2.3% of the time), to detect whether we have a **POSITIVE OUTLIER**


In [None]:
from sklearn.metrics import mean_squared_error

# Put the model in evaluation mode so that any train-time-only behaviour
# (e.g. dropout or batch-norm statistics updates, if the network has such layers)
# is switched off during inference.
model.eval()

# Inference only: no gradients are needed, so skip building the autograd graph.
with torch.no_grad():
    for x, y in infer_loader:
        # Call the module itself rather than `.forward` so registered hooks also run.
        out = model(x)
        loss = torch.nn.functional.mse_loss(out.squeeze(), y.squeeze())

        # The "train_" prefixes are kept as in the original output, although these
        # values are computed on the inference data.
        print("train_loss {}".format(loss))
        mse = mean_squared_error(y.cpu().numpy(), out.cpu().numpy())
        print("train_mse {}".format(mse))
        print(out)
        print(y)