In [33]:
import sys
sys.path.insert(0, "../..")
from src.data import data_tools
from pathlib import Path
from src.models import rnn_models, train_model, metrics
import gin
import numpy as np
import torch
from typing import List
from torch.nn.utils.rnn import pad_sequence

  warn(


# 1 Iterators
We will be using an interesting dataset. [link](https://tev.fbk.eu/resources/smartwatch)

From the site:
> The SmartWatch Gestures Dataset has been collected to evaluate several gesture recognition algorithms for interacting with mobile applications using arm gestures. Eight different users performed twenty repetitions of twenty different gestures, for a total of 3200 sequences. Each sequence contains acceleration data from the 3-axis accelerometer of a first generation Sony SmartWatch™, as well as timestamps from the different clock sources available on an Android device. The smartwatch was worn on the user's right wrist. 

## 1.1 Iterator design pattern
This is a nice opportunity to create our own custom iterator. First, let's look at the simplified design pattern for an iterator.
It introduces two additional dunder methods, `__iter__` and `__next__`, which are used by `for` loops.

In [2]:
class BaseIterator:
  """Minimal iterator over the integers ``0 .. n-1``.

  Demonstrates the iterator protocol: ``__iter__`` resets the cursor and
  returns the object itself, ``__next__`` hands out one item per call and
  raises ``StopIteration`` once the data is exhausted.
  """

  def __init__(self, n: int):
    self.n = n
    # we generate some dummy data
    self.data = [*range(self.n)]
    # start in the same state __iter__ resets to, so that an explicit
    # next() on a fresh object works instead of raising AttributeError
    self.idx = -1

  def __iter__(self):
    # idx points at the item that was handed out last; -1 means
    # "nothing yet", so the first __next__ call moves it to 0
    self.idx = -1
    # the object is its own iterator
    return self

  def __next__(self):
    # called once per loop iteration; stop as soon as the last item
    # (index len(data) - 1) has been returned
    if self.idx >= len(self.data) - 1:
      raise StopIteration
    self.idx += 1
    return self.data[self.idx]

myclass = BaseIterator(n=5)
myiter = iter(myclass) # this calls the __iter__ method and sets idx to -1

for x in myiter: # every pass of the loop calls the __next__ method
  print(x)

0
1
2
3
4


Note how `__iter__` returns `self` with the start settings (`idx=-1`).
Every time we call `__next__` (implicitly in a for loop or explicitly with `next()`), the idx is increased by 1.

If you keep calling `next()`, you will eventually get a `StopIteration` exception; a for loop catches that exception and ends the loop gracefully.

## 1.2 Iterator implementation
Now, we extend this pattern to our dataset. We will use what we have seen before with the images (lesson 1, 02_datagenerators.ipynb and 03_dataloader.ipynb): we load the paths, and use these to generate the data.

In [3]:
from __future__ import annotations
from typing import Tuple
from tqdm import tqdm
import random
Tensor = torch.Tensor

# the base dataset that provides the boilerplate code
class BaseDataset:
    """Boilerplate for an in-memory dataset built from a list of file paths.

    Subclasses implement ``process_data``, which is expected to fill
    ``self.dataset`` with one item per path. The paths are shuffled once
    at construction time, before ``process_data`` runs.
    """

    def __init__(self, paths: List[Path]) -> None:
        # work on a copy: random.shuffle reorders in place, and the list
        # handed in by the caller should not change as a side effect
        self.paths = list(paths)
        random.shuffle(self.paths)
        self.dataset = []
        self.process_data()

    def process_data(self) -> None:
        """Fill ``self.dataset``; must be provided by a subclass.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

    def __len__(self) -> int:
        """Number of items currently stored in ``self.dataset``."""
        return len(self.dataset)

    def __getitem__(self, idx: int) -> Tuple[Tensor, int]:
        """Return the item stored at position ``idx``."""
        return self.dataset[idx]

# the TSDataset, that inherits all the boilerplate code
# and implements a process_data method for this dataset
class TSDataset(BaseDataset):
    """Time-series dataset that reads one sample per text file.

    Everything except ``process_data`` is inherited from ``BaseDataset``.
    """

    def process_data(self) -> None:
        """Parse every path into an ``(x, y)`` pair and store it.

        ``x`` holds the columns from index 3 onwards of the parsed file as
        float32; ``y`` is the name of the file's parent directory read as
        an integer and shifted down by one.
        """
        for path in tqdm(self.paths):
            table = np.genfromtxt(path)
            # keep only the columns from index 3 onwards
            # NOTE(review): the dropped columns are presumably the
            # timestamps -- confirm against the dataset description
            features = torch.tensor(table[:, 3:]).type(torch.float32)
            # presumably the folder names start at 1, so subtracting one
            # yields a zero-based label -- TODO confirm
            label = torch.tensor(int(path.parent.name) - 1)
            self.dataset.append((features, label))

In [4]:
class BaseDataIterator:
    """Iterate over a dataset in shuffled, fixed-size batches.

    Every call to ``__iter__`` draws a fresh random order. Only full batches
    are produced: items left over after the last full batch are skipped for
    that pass.

    Args:
        dataset: object supporting ``len()`` and integer indexing that
            returns an ``(x, y)`` pair.
        batchsize: number of items per batch; must be at least 1.

    Raises:
        ValueError: if ``batchsize`` is smaller than 1.
    """

    def __init__(self, dataset: BaseDataset, batchsize: int):
        # a batchsize of 0 would divide by zero in __len__ and would make
        # __next__ hand out empty batches forever
        if batchsize < 1:
            raise ValueError(f"batchsize must be at least 1, got {batchsize}")
        self.dataset = dataset
        self.batchsize = batchsize

    def __len__(self) -> int:
        # the length is the number of full batches
        return len(self.dataset) // self.batchsize

    def __iter__(self) -> BaseDataIterator:
        # restart at the first position of a freshly shuffled index list
        self.index = 0
        self.index_list = torch.randperm(len(self.dataset))
        return self

    def batchloop(self) -> Tuple[List, List]:
        """Collect the next ``batchsize`` items as two parallel lists."""
        X = []  # noqa N806
        Y = []  # noqa N806
        # fill the batch
        for _ in range(self.batchsize):
            # the design trick with the index_list and index allows us to
            # shuffle the index_list every time we call __iter__
            # without shuffling the actual data
            x, y = self.dataset[int(self.index_list[self.index])]
            X.append(x)
            Y.append(y)
            # the index will signal the end of the dataset in __next__
            self.index += 1
        return X, Y

    def __next__(self) -> Tuple[List, List]:
        # only continue while a complete batch is still available
        if self.index <= (len(self.dataset) - self.batchsize):
            X, Y = self.batchloop()
            return X, Y
        else:
            raise StopIteration


class PaddedDatagenerator(BaseDataIterator):
    """Batch iterator that pads the sequences of a batch to equal length.

    Construction, ``__len__``, ``__iter__`` and ``batchloop`` come from
    ``BaseDataIterator``; only ``__next__`` differs.
    """

    def __init__(self, dataset: BaseDataset, batchsize: int) -> None:
        # nothing extra to set up, the parent class stores both arguments
        super().__init__(dataset, batchsize)

    def __next__(self) -> Tuple[Tensor, Tensor]:
        """Return the next batch as ``(padded_x, y)`` tensors.

        Raises:
            StopIteration: when fewer than ``batchsize`` items are left.
        """
        last_start = len(self.dataset) - self.batchsize
        if self.index > last_start:
            raise StopIteration
        sequences, labels = self.batchloop()
        # shorter sequences are filled up with zeros; batch is the first axis
        padded = pad_sequence(sequences, batch_first=True, padding_value=0)
        return padded, torch.tensor(labels)

The main difference with the pattern we used before, is that this pattern will stop after the last item is spit out.
The generator from the first lesson will go on forever, due to the `while True` loop.

## 1.3 Train-test

In [8]:
data_dir = Path("../../data/external/gestures-dataset/").resolve()

# get all paths with the .txt extension
formats = [".txt"]
paths = [path for path in data_tools.walk_dir(data_dir) if path.suffix in formats]
# make a train-test split: the first 80% of the paths are used for training,
# the remainder for testing
# NOTE(review): the paths are split in the order walk_dir yields them, without
# shuffling first -- confirm that this ordering gives the intended split
split = 0.8
idx = int(len(paths) * split)
trainpaths = paths[:idx]
testpaths = paths[idx:]

651

In [9]:
# constructing a TSDataset runs process_data, so all files are parsed here
traindataset = TSDataset(trainpaths)
testdataset = TSDataset(testpaths)

100%|██████████| 2600/2600 [00:00<00:00, 6929.47it/s]
100%|██████████| 651/651 [00:00<00:00, 6967.55it/s]


In [10]:
batchsize = 32
# wrap both datasets in iterators that yield padded batches of `batchsize` items
trainloader = PaddedDatagenerator(traindataset, batchsize=batchsize)
testloader = PaddedDatagenerator(testdataset, batchsize=batchsize)

In [11]:
x, y = traindataset[1]
x.shape, y

(torch.Size([15, 3]), tensor(13))

In [10]:
len(traindataset), len(trainloader), len(testdataset), len(testloader)

(2600, 81, 651, 20)

What does the length mean?

In [11]:
x, y = next(iter(trainloader))
x.shape, y.shape

(torch.Size([32, 34, 3]), torch.Size([32]))

Can you make sense of the shape?
What does it mean that the shapes are sometimes (32, 27, 3), but a second time might look like (32, 30, 3)? In other words, the second (or first, if you insist on starting at 0) dimension changes. Why is that? How does the model handle this? Do you think this is already padded, or still has to be padded?


# 2 Exercises
Let's test a base model, and try to improve upon that.

Fill the gestures.gin file with relevant settings for `input_size`, `hidden_size`, `num_layers` and `horizon` (which, in our case, will be the number of classes...)

As a rule of thumb: start lower than you expect to need!

In [12]:
# load the settings from the gin file before creating the model
# NOTE(review): presumably BaseRNN is gin-configurable and takes its
# arguments from this file -- confirm in rnn_models
gin.parse_config_file("gestures.gin")
model = rnn_models.BaseRNN()

In [13]:
gin.get_bindings("BaseRNN")

{'input_size': 3, 'hidden_size': 128, 'num_layers': 3, 'horizon': 20}

In [14]:
gin.get_bindings("trainloop")

{'epochs': 10,
 'learning_rate': 0.001,
 'optimizer': torch.optim.adam.Adam,
 'loss_fn': CrossEntropyLoss()}

Test the model. What is the output shape you need? Remember, we are doing classification!

In [15]:
yhat = model(x)
yhat.shape

torch.Size([32, 20])

Test the accuracy

In [16]:
from src.models import metrics

accuracy = metrics.Accuracy()
accuracy(y, yhat)

tensor(0.1250)

What do you think of the accuracy? What would you expect from blind guessing?

Check shape of `y` and `yhat`

In [17]:
yhat.shape, y.shape

(torch.Size([32, 20]), torch.Size([32]))

And look at the output of yhat

In [17]:
yhat[0]

tensor([ 0.0620,  0.0036, -0.0671,  0.0461, -0.0350, -0.0161,  0.0289,  0.1370,
        -0.0835,  0.1176, -0.0385,  0.0189, -0.0081, -0.0256,  0.0376, -0.1007,
         0.0590, -0.0623,  0.0075, -0.0462], grad_fn=<SelectBackward0>)

Does this make sense to you? If you are unclear, go back to the classification problem with the MNIST, where we had 10 classes.

We have a classification problem, so we need Cross Entropy Loss.
Remember, [this has a softmax built in](https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html) 

In [19]:
loss_fn = torch.nn.CrossEntropyLoss()
# the loss is computed from the raw model output and the class indices
loss = loss_fn(yhat, y)
# metric object that is handed to the trainloop in the next code cell
accuracy = metrics.Accuracy()
loss

tensor(2.9899, grad_fn=<NllLossBackward0>)

In [26]:
import mlflow
from datetime import datetime
from importlib import reload

# store the runs of the "gestures" experiment in a local sqlite database
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("gestures")
modeldir = Path("../../models/gestures/").resolve()
# torch.save does not create missing folders, so make sure the target exists
modeldir.mkdir(parents=True, exist_ok=True)
# re-import rnn_models so that edits to the module are picked up
reload(rnn_models)
# gin.parse_config_file("gestures.gin")
# gin.parse_config_file("gestures_gru.gin")
# gin.enter_interactive_mode()
gin.parse_config_file("gestures_gru.gin")

with mlflow.start_run():
    # NOTE(review): the tag says "convnet" while the model created below is
    # GRUmodel_J -- confirm which label is intended
    mlflow.set_tag("model", "convnet")
    mlflow.set_tag("dev", "raoul")
    # NOTE(review): these are the bindings of BaseRNN, not of GRUmodel_J --
    # confirm that the logged parameters describe the trained model
    mlflow.log_params(gin.get_bindings("BaseRNN"))
    mlflow.log_params(gin.get_bindings("trainloop"))
    mlflow.log_param("datadir", f"{data_dir}")
    mlflow.log_param("batchsize", f"{batchsize}")

    # model = rnn_models.BaseRNN()
    model = rnn_models.GRUmodel_J()

    # one pass over each loader per epoch: the step counts equal the number
    # of batches the loaders produce
    model, testloss = train_model.trainloop(
        model=model,
        metrics=[accuracy],
        train_dataloader=trainloader,
        test_dataloader=testloader,
        log_dir="modellog",
        train_steps=len(trainloader),
        eval_steps=len(testloader),
        patience=5,
        factor=0.5,
        early_stopping_patience=10,
        early_stopping_save=True,
        tunewriter=["tensorboard", "mlflow", "gin"]
    )
    mlflow.pytorch.log_model(model, "gru")

    # additionally keep a timestamped copy of the model on disk
    tag = datetime.now().strftime("%Y%m%d-%H%M")
    modelpath = modeldir / (tag + "model.pt")
    torch.save(model, modelpath)

2023-02-10 15:03:58.624 | INFO     | src.data.data_tools:dir_add_timestamp:129 - Logging to modellog/20230210-1503
100%|[38;2;30;71;6m██████████[0m| 81/81 [00:01<00:00, 52.98it/s]
2023-02-10 15:04:00.354 | INFO     | src.models.train_model:trainloop:180 - Epoch 0 train 2.5944 test 2.4841 metric ['0.1344']
100%|[38;2;30;71;6m██████████[0m| 81/81 [00:01<00:00, 52.77it/s]
2023-02-10 15:04:02.079 | INFO     | src.models.train_model:trainloop:180 - Epoch 1 train 2.0236 test 2.3734 metric ['0.1953']
100%|[38;2;30;71;6m██████████[0m| 81/81 [00:01<00:00, 52.26it/s]
2023-02-10 15:04:03.826 | INFO     | src.models.train_model:trainloop:180 - Epoch 2 train 1.7360 test 1.9384 metric ['0.2422']
100%|[38;2;30;71;6m██████████[0m| 81/81 [00:01<00:00, 51.27it/s]
2023-02-10 15:04:05.596 | INFO     | src.models.train_model:trainloop:180 - Epoch 3 train 1.4520 test 1.5081 metric ['0.3703']
100%|[38;2;30;71;6m██████████[0m| 81/81 [00:01<00:00, 51.53it/s]
2023-02-10 15:04:07.367 | INFO     | src.m

In [None]:
# NOTE(review): this cell runs after the `with mlflow.start_run()` block of the
# training cell has closed; mlflow presumably starts a new run here, so the
# artifact may not end up in the training run -- confirm
mlflow.log_artifact(local_path=modelpath, artifact_path="pytorch_models")

Exercises:

- improve the RNN model
- test different things. What works? What does not?
- experiment with either GRU or LSTM layers, create your own models + ginfiles. 
- experiment with adding Conv1D layers.

You should be able to get above 90% accuracy with the dataset.

In [24]:
# tag = datetime.now().strftime("%Y%m%d-%H%M")
# modelpath = modeldir / (tag + "model.pt")
# torch.save(model, modelpath)
# mlflow.log_artifact(local_path=modelpath, artifact_path="pytorch_models")
# modeldir = Path("../../models/gestures/").resolve()

In [25]:
modeldir

PosixPath('/workspaces/ML22/models/gestures')