https://www.kaggle.com/code/purplejester/pytorch-deep-time-series-classification

In [1]:
import pandas as pd
import numpy as np
import torch
import json
import os
import random
import torch.nn as nn
import time

from glob import glob
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
from torch.nn import Module
from torch.nn.functional import sigmoid, one_hot
from torch.optim import Optimizer
from tqdm import tqdm

from measurement_utils.measure_db import MeasureDB

In [2]:
def calculate_diff(x_y_z, meas_type_acc) -> np.ndarray:
    if meas_type_acc:
        x_diff, y_diff, z_diff = [np.diff(m) for m in x_y_z]
    else:
        x_diff, y_diff, z_diff = x_y_z

    result = np.abs(x_diff) + np.abs(y_diff) + np.abs(z_diff)
    assert len(result) > 0
    return result

def get_diff(x_y_z, meas_type_acc, length=None, start_idx=None):
    cut_x_y_z = list()
    for array in x_y_z:
      if length is not None:
          assert length < len(array)

          start_idx = start_idx if start_idx is not None else random.randint(0, len(array) - (length + 1))
          if start_idx > len(array) - (length + 1):
              raise ValueError("start_idx is too large")

          cut_x_y_z.append(array[start_idx:start_idx + length])
      else:
          cut_x_y_z = x_y_z

    result = calculate_diff(cut_x_y_z, meas_type_acc)

    assert len(result) > 0
    return result

def get_limb_diff_mean(left_x_y_z, right_x_y_z, meas_type_acc, length=None, start_idx=None):
    left_diff = get_diff(left_x_y_z, meas_type_acc, length, start_idx)
    right_diff = get_diff(right_x_y_z, meas_type_acc, length, start_idx)
    result = np.abs(left_diff.mean() - right_diff.mean())
    return result

def get_limb_ratio_mean(left_x_y_z, right_x_y_z, meas_type_acc,
                        class_value_left, class_value_right,
                        length=None, start_idx=None, mean_first=True):
    left_diff = get_diff(left_x_y_z, meas_type_acc, length, start_idx)
    right_diff = get_diff(right_x_y_z, meas_type_acc, length, start_idx)

    if mean_first:
        if class_value_left > class_value_right:
            result = left_diff.sum() / right_diff.sum()
        else:
            result = right_diff.sum() / left_diff.sum()
    else:
        left_diff = left_diff + 0.1
        right_diff = right_diff + 0.1
        if class_value_left > class_value_right:
            result = np.mean(left_diff / right_diff)
        else:
            result = np.mean(right_diff / left_diff)
    return result

def get_input_from_df(meas_df: pd.DataFrame, length: int, class_value_dict: dict) -> np.ndarray:
    keys_in_order = (("arm", "acc"),
                     ("leg", "acc"),
                     ("arm", "gyr"),
                     ("leg", "gyr"))

    result = list()
    for key in keys_in_order:
        class_value_left = class_value_dict[("left", key[0])]
        class_value_right = class_value_dict[("right", key[0])]

        left_x_y_z = [meas_df[str((("left", key[0], key[1], "x")))].values,
                      meas_df[str((("left", key[0], key[1], "y")))].values,
                      meas_df[str((("left", key[0], key[1], "z")))].values]

        right_x_y_z = [meas_df[str((("right", key[0], key[1], "x")))].values,
                       meas_df[str((("right", key[0], key[1], "y")))].values,
                       meas_df[str((("right", key[0], key[1], "z")))].values]
        meas_type_acc = key[1] == "acc"
        diff_mean = get_limb_diff_mean(left_x_y_z, right_x_y_z, meas_type_acc, length, start_idx=None)
        ratio_mean_first = get_limb_ratio_mean(left_x_y_z, right_x_y_z, meas_type_acc, class_value_left, class_value_right, length, mean_first=True, start_idx=None)
        ratio_mean = get_limb_ratio_mean(left_x_y_z, right_x_y_z, meas_type_acc, class_value_left, class_value_right, length, mean_first=False, start_idx=None)
        result.extend([diff_mean, ratio_mean, ratio_mean_first])

    return np.expand_dims(np.array(result), axis=0)

In [9]:
class ClearMeasurements(object):
    def __init__(self, folder_path: str, clear_json_path: str, cache_size: int = 1) -> None:
        assert cache_size > 0, "cache_size must be positive integer"
        self.cache_size = cache_size
        self.id_path_dict = dict()
        self.cache_dict = dict()

        self.read_csv_path(folder_path)
        self.read_clear_json(clear_json_path)

    def get_meas_id_list(self, data_type: str) -> list:
        return sorted(self.clear_ids_dict[data_type])

    def read_clear_json(self, clear_json_path: str) -> None:
        with open(clear_json_path, "r") as read_file:
            self.clear_ids_dict = json.load(read_file)

        all_meas_ids = set(self.id_path_dict.keys())
        for meas_id in self.clear_ids_dict["train"]:
            assert meas_id in all_meas_ids

        for meas_id in self.clear_ids_dict["test"]:
            assert meas_id in all_meas_ids

    def read_csv_path(self, folder_path: str) -> None:
        print("read_csv_path")
        for csv_path in sorted(glob(os.path.join(folder_path, "*.csv"))):
            file_name = os.path.basename(csv_path)
            meas_id = file_name.split("-")[0]
            self.id_path_dict[int(meas_id)] = csv_path
        print("done")

    def drop_random_from_cache_dict(self):
        self.cache_dict.pop(random.choice(list(self.cache_dict.keys())))

    def get_measurement(self, meas_id: int) -> pd.DataFrame:
        if meas_id in self.cache_dict:
            #print("use cache")
            df = self.cache_dict[meas_id]
        else:
            print("read new")
            if len(self.cache_dict) == self.cache_size:
                print("drop from cache")
                self.drop_random_from_cache_dict()

            csv_path = self.id_path_dict[meas_id]
            df = pd.read_csv(csv_path)
            self.cache_dict[meas_id] = df
            assert len(self.cache_dict) <= self.cache_size, (len(self.cache_dict), self.cache_size)
        return df

In [10]:
class ClearDataset(Dataset):
    def __init__(self,
                 data_type: str, # train or test
                 clear_measurements: ClearMeasurements,
                 measDB: MeasureDB,
                 length: int,
                 sample_per_meas: int) -> None:
        self.meas_id_list = clear_measurements.get_meas_id_list(data_type)
        self.clear_measurements = clear_measurements
        self.measDB = measDB
        self.sample_per_meas = sample_per_meas
        self.length = length

        self.meas_idx = 0
        self.to_tensor = ToTensor()

    def __len__(self):
        return len(self.meas_id_list) * self.sample_per_meas

    def __getitem__(self, idx):
        #print(idx)
        meas_idx = idx // self.sample_per_meas
        meas_id = self.meas_id_list[meas_idx]
        #start = time.time()
        #print(111)
        meas_df = self.clear_measurements.get_measurement(meas_id)
        #print("get meas {} took {:.2}s".format(meas_id, time.time() - start))

        class_value_dict = self.measDB.get_class_value_dict(meas_id=meas_id)
        #start = time.time()
        input_array = get_input_from_df(meas_df, self.length, class_value_dict)
        #print("get input_array for {} took {:.2}s".format(meas_id, time.time() - start))
        input_tensor = torch.from_numpy(input_array).float()

        label = min(class_value_dict.values())
        return input_tensor, label

In [15]:
def validation_step(model: Module,
                    criterion: Module,
                    valid_loader: DataLoader):

    print("validation_step")
    with torch.no_grad():
        model.eval()
        correct = 0
        loss = list()

        total = 0
        tqdm_dict = dict()

        for batch_idx, (x, y) in enumerate(valid_loader):
            total += y.size(0)

            model_out = torch.squeeze(model(x.float()))
            print(model_out.shape, y.shape)
            loss.append(criterion(model_out, y).item())

            _, predicted = model_out.max(1)
            correct += predicted.eq(y).sum().item()
            acc = correct / total

        print("loss: {:.2f}, acc: {:.1f}%".format(sum(loss) / len(loss), acc * 100))

class TrainLoop(object):
    def __init__(self,
                 model: Module,
                 optimizer: Optimizer,
                 criterion: Module,
                 train_loader: DataLoader,
                 valid_loader: DataLoader,
                 num_epoch: int):
        self.model = model
        self.optimizer = optimizer
        self.criterion = criterion
        self.train_loader = train_loader
        self.valid_loader = valid_loader
        self.num_epoch = num_epoch

    def run_loop(self):
        for epoch in range(self.num_epoch):
            print("\nepoch: {}".format(epoch))
            self.model.train()

            correct = 0
            total = 0

            #tq = tqdm(total=(len(self.train_loader)))
            #tq.set_description('ep {}'.format(epoch))
            for batch_idx, (x, y) in enumerate(self.train_loader):
                self.optimizer.zero_grad()

                model_out = torch.squeeze(self.model(x.float()))

                print(model_out.shape, y.shape)
                loss = self.criterion(model_out, y)  # index of the max log-probability
                loss.backward()

                self.optimizer.step()

                total += y.size(0)

                _, predicted = model_out.max(1)
                correct += predicted.eq(y).sum().item()

                acc = correct / total

                #tq.update(1)
                #tq.set_postfix(loss='{:.2f}'.format(loss.item()),
                #               acc='{:.1f}%'.format(acc * 100))
                print("{:.2f}%, loss: {:.2f}, acc: {:.1f}%".format(batch_idx/len(self.train_loader), loss.item(), acc * 100))

            validation_step(self.model, self.criterion, self.valid_loader)


In [16]:
db_path = "./data/WUS-v4measure202307311.accdb"
ucanaccess_path = "./ucanaccess/"
folder_path = "./data/clear_data/"
clear_json_path = "./data/clear_train_test_ids.json"

length = int(1.5 * 60 * 60 * 25)  # 1.5 hours, 25 Hz
sample_per_meas = 1

measDB = MeasureDB(db_path, ucanaccess_path)
clear_measurements = ClearMeasurements(folder_path, clear_json_path)

train_dataset = ClearDataset("train", clear_measurements, measDB, length, sample_per_meas)
test_dataset = ClearDataset("test", clear_measurements, measDB, length, sample_per_meas)

train_dataloader = DataLoader(train_dataset, batch_size=3, shuffle=False, num_workers=1)
test_dataloader = DataLoader(test_dataset, batch_size=3, shuffle=False, num_workers=1)

read_csv_path
done


In [17]:
inpit_size = 12
layer_sizes = [512, 128]
output_size = 6

lr = 0.001
wd = 0

num_epoch = 10

model = nn.Sequential(
        nn.Linear(12, layer_sizes[0]),
        nn.ReLU(),
        nn.Linear(layer_sizes[0], layer_sizes[1]),
        nn.ReLU(),
        nn.Linear(layer_sizes[1], output_size)
)

criterion = torch.nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(model.parameters(),
                             lr=lr,
                             weight_decay=wd,
                             amsgrad=True)
model = model.float()
train_loop = TrainLoop(model,
                       optimizer,
                       criterion,
                       train_dataloader,
                       test_dataloader,
                       num_epoch)

In [18]:
train_loop.run_loop()


epoch: 0
read new
read new
drop from cache
read new
drop from cache
read new
drop from cache
torch.Size([3, 6]) torch.Size([3])
0.00%, loss: 1.86, acc: 0.0%
read new
drop from cache
read new
drop from cache
read new
drop from cache
torch.Size([3, 6]) torch.Size([3])
0.08%, loss: 1.39, acc: 33.3%
read new
drop from cache
read new
drop from cache
read new
drop from cache
torch.Size([3, 6]) torch.Size([3])
0.17%, loss: 3.51, acc: 22.2%
read new
drop from cache
read new
drop from cache
read new
drop from cache
torch.Size([3, 6]) torch.Size([3])
0.25%, loss: 3.00, acc: 25.0%
read new
drop from cache
read new
drop from cache
read new
drop from cache
torch.Size([3, 6]) torch.Size([3])
0.33%, loss: 3.03, acc: 20.0%
read new
drop from cache
read new
drop from cache
read new
drop from cache
torch.Size([3, 6]) torch.Size([3])
0.42%, loss: 2.87, acc: 22.2%
read new
drop from cache
read new
drop from cache
read new
drop from cache
torch.Size([3, 6]) torch.Size([3])
0.50%, loss: 2.45, acc: 19.0%
re