In [34]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [35]:
from pathlib import Path
import pandas as pd

In [36]:
DATASET = 'hinges_12'

In [37]:
DATASETS_PATH = Path.cwd().parent.parent / 'datasets' / 'prints'
dataset_path = DATASETS_PATH / DATASET

In [38]:
extruder_power_path = dataset_path / 'extruder_power.csv.gz'
extruder_target_path = dataset_path / 'extruder_target.csv.gz'
extruder_temperature = dataset_path / 'extruder_temperature.csv.gz'
extruder_velocity = dataset_path / 'extruder_velocity_sensor.csv.gz'
gas = dataset_path / 'gas.csv.gz'
heater_power = dataset_path / 'heater_power.csv.gz'
heater_target = dataset_path / 'heater_target.csv.gz'
heater_temperature = dataset_path / 'heater_temperature.csv.gz'
imu = dataset_path / 'imu.csv.gz'
mcu = dataset_path / 'mcu.csv.gz'
position = dataset_path / 'position_sensor.csv.gz'
position_target = dataset_path / 'position_target.csv.gz'
velocity = dataset_path / 'velocity_sensor.csv.gz'
velocity_target = dataset_path / 'velocity_target.csv.gz'

In [39]:
from typing import Dict

from printer_anomaly_detection.dataset.domain import Sensor


sensor_filenames: Dict[Sensor, str] = {
    Sensor.EXTRUDER_POWER: 'extruder_power.csv.gz',
    Sensor.EXTRUDER_TARGET: 'extruder_target.csv.gz',
    Sensor.EXTRUDER_TEMPERATURE: 'extruder_temperature.csv.gz',
    Sensor.EXTRUDER_VELOCITY: 'extruder_velocity_sensor.csv.gz',
    Sensor.GAS: 'gas.csv.gz',
    Sensor.HEATER_POWER: 'heater_power.csv.gz',
    Sensor.HEATER_TARGET: 'heater_target.csv.gz',
    Sensor.HEATER_TEMPERATURE: 'heater_temperature.csv.gz',
    Sensor.ACCELEROMETER: 'accelerometer.csv.gz',
    Sensor.MAGNETOMETER: 'magnetometer.csv.gz',
    Sensor.GYROSCOPE: 'gyroscope.csv.gz',
    Sensor.MCU: 'mcu.csv.gz',
    Sensor.POSITION: 'position_sensor.csv.gz',
    Sensor.POSITION_TARGET: 'position_target.csv.gz',
    Sensor.VELOCITY: 'velocity_sensor.csv.gz',
    Sensor.VELOCITY_TARGET: 'velocity_target.csv.gz',
}

In [40]:
from typing import Type
from printer_anomaly_detection.dataset.domain.printer import *

from printer_anomaly_detection.dataset.domain.sensors import *

sensor_datatypes: Dict[Sensor, Type[SensorData]] = {
    Sensor.EXTRUDER_POWER: PowerSensorData,
    Sensor.EXTRUDER_TARGET: TargetData,
    Sensor.EXTRUDER_TEMPERATURE: TemperatureSensorData,
    Sensor.EXTRUDER_VELOCITY: VelocitySensorData,
    Sensor.GAS: GasSensorData,
    Sensor.HEATER_POWER: PowerSensorData,
    Sensor.HEATER_TARGET: TargetData,
    Sensor.HEATER_TEMPERATURE: TemperatureSensorData,
    Sensor.ACCELEROMETER: IMUSensorData,
    Sensor.MAGNETOMETER: IMUSensorData,
    Sensor.GYROSCOPE: IMUSensorData,
    Sensor.MCU: MCUData,
    Sensor.POSITION: PositionSensorData,
    Sensor.POSITION_TARGET: PositionTargetData,
    Sensor.VELOCITY: VelocitySensorData,
    Sensor.VELOCITY_TARGET: VelocityTargetData,
}

In [41]:
pd.read_csv(extruder_power_path)


Unnamed: 0,datetime,sensor_type,power
0,2023-06-01T12:44:59.136277,E,1.000000
1,2023-06-01T12:46:14.643423,E,0.930071
2,2023-06-01T12:46:15.144622,E,0.869064
3,2023-06-01T12:46:15.646563,E,0.765104
4,2023-06-01T12:46:16.650460,E,0.674389
...,...,...,...
506,2023-06-01T13:13:29.127487,E,0.486161
507,2023-06-01T13:13:30.381189,E,0.404879
508,2023-06-01T13:13:33.639039,E,0.341342
509,2023-06-01T13:13:37.900606,E,0.311557


In [110]:
from typing import Type
from dataclass_csv import DataclassReader
import gzip

from printer_anomaly_detection.dataset.domain.printer import PowerSensorData

def read_csv(path: Path, dataclass_type: Type):
    with gzip.open(path, 'rt', encoding='utf-8') as f:
        return list(DataclassReader(f, dataclass_type))


In [43]:
pd.DataFrame(read_csv(extruder_power_path, PowerSensorData))

Unnamed: 0,datetime,sensor_type,power
0,2023-06-01T12:44:59.136277,E,1.000000
1,2023-06-01T12:46:14.643423,E,0.930071
2,2023-06-01T12:46:15.144622,E,0.869064
3,2023-06-01T12:46:15.646563,E,0.765104
4,2023-06-01T12:46:16.650460,E,0.674389
...,...,...,...
506,2023-06-01T13:13:29.127487,E,0.486161
507,2023-06-01T13:13:30.381189,E,0.404879
508,2023-06-01T13:13:33.639039,E,0.341342
509,2023-06-01T13:13:37.900606,E,0.311557


In [44]:
read_csv(extruder_power_path, PowerSensorData)

[PowerSensorData(datetime=2023-06-01T12:44:59.136277, sensor_type=<SensorType.EXTRUDER: 'E'>, power=1.0),
 PowerSensorData(datetime=2023-06-01T12:46:14.643423, sensor_type=<SensorType.EXTRUDER: 'E'>, power=0.9300709308913256),
 PowerSensorData(datetime=2023-06-01T12:46:15.144622, sensor_type=<SensorType.EXTRUDER: 'E'>, power=0.8690644675100611),
 PowerSensorData(datetime=2023-06-01T12:46:15.646563, sensor_type=<SensorType.EXTRUDER: 'E'>, power=0.7651043138269606),
 PowerSensorData(datetime=2023-06-01T12:46:16.650460, sensor_type=<SensorType.EXTRUDER: 'E'>, power=0.6743891049443522),
 PowerSensorData(datetime=2023-06-01T12:46:17.151408, sensor_type=<SensorType.EXTRUDER: 'E'>, power=0.6114876047237948),
 PowerSensorData(datetime=2023-06-01T12:46:17.904145, sensor_type=<SensorType.EXTRUDER: 'E'>, power=0.5222718698023764),
 PowerSensorData(datetime=2023-06-01T12:46:18.656823, sensor_type=<SensorType.EXTRUDER: 'E'>, power=0.45628944884787515),
 PowerSensorData(datetime=2023-06-01T12:46:19.

In [45]:
from typing import List, Tuple


def calculate_availability_time(data: Dict[Sensor, List[SensorData]]) -> Tuple[float, float]:
    """Calculate the time in seconds where all sensors are available. Target sensors are ignored because they're artificially set and stay 'forever'."""
    availability_from = max(
        sensor_data[0].datetime for sensor, sensor_data in data.items() if sensor.has_frequent_updates
    )

    availability_to = min(
        sensor_data[-1].datetime for sensor, sensor_data in data.items() if sensor.has_frequent_updates
    )
    return availability_from.ceil(), availability_to.floor()

In [46]:
from typing import Generator, Iterator, TypeVar

T = TypeVar('T', bound=SensorData)
class Interpolator:
    def __init__(self, data: List[T], data_type: Type[T], rate: int) -> 'Interpolator':
        self.data = data
        self.rate = rate
        self.period = 1. / rate
        self.data_type = data_type

    def interpolate(self, start_time: Datetime, end_time: Datetime) -> Iterator[T]:
        raise NotImplementedError()

from typing import Callable, Generator, Iterator, Optional, TypeVar

T = TypeVar('T', bound=SensorData)
class LinearInterpolator(Interpolator):
    """Interpolate data by using the last known value"""
    def __init__(self, data: List[T], rate: int, data_type: Type[T], to_floats: Optional[Callable[[T], List[float]]], from_floats: Optional[Callable[[List[float]], T]]) -> 'Interpolator':
        super().__init__(data, rate, data_type)
    
    def interpolate(self, start_time: Datetime, end_time: Datetime) -> Iterator[T]:
        raise NotImplementedError()
        time = start_time
        while self.data[1].datetime < start_time:
            # Remove all data points that are before the start time so that the first data point is the one that is closest to the start time
            self.data.pop(0)
        while True:
            value = self.data.pop(0)
            while value < time:
                value = self.data.pop(0)
            yield value
            time += self.period

from typing import Iterator

class ConstantInterpolator(Interpolator):
    def __init__(self, data: List[T], rate: int, data_type: Type[T]) -> 'Interpolator':
        super().__init__(data, data_type, rate)
 
    def interpolate(self, start_time: Datetime, end_time: Datetime) -> Iterator[T]:
        time = start_time

        value = None

        while True:
            if time >= end_time:
                break

            if len(self.data) == 0:
                assert value is not None
                yield self.data_type(**{**value.__dict__, 'datetime': time})
                time += self.period
                continue

            value = value or self.data[0]
            # if no more data: keep the last data point
            while len(self.data) > 0 and self.data[0].datetime < time:
                # Remove all data points that are before the current time so that the first data point is the one that is closest before the current time
                value = self.data.pop(0)

            yield self.data_type(**{**value.__dict__, 'datetime': time})

            time += self.period



In [47]:
from printer_anomaly_detection.dataset.domain import InterpolationType

interpolators: Dict[InterpolationType, Interpolator] = {
    InterpolationType.LINEAR: LinearInterpolator,
    InterpolationType.CONSTANT: ConstantInterpolator,
}

In [105]:
from typing import List
from printer_anomaly_detection.dataset.domain import InterpolationType, Sensor


class DatasetBuilder:
    def __init__(self, dataset_name: str, rate: int = 1) -> 'DatasetBuilder':
        self.dataset_name = dataset_name
        self.sensors: List[Sensor] = []
        self.interpolations: Dict[Sensor, InterpolationType] = {}
        self.rate: int = rate

    def with_sensors(self, sensors: List[Sensor], interpolation: InterpolationType = InterpolationType.CONSTANT) -> 'DatasetBuilder':
        for sensor in sensors:
            self.with_sensor(sensor, interpolation)
        return self

    def with_sensor(self, sensor: Sensor, interpolation: InterpolationType = InterpolationType.CONSTANT) -> 'DatasetBuilder':
        self.sensors.append(sensor)
        self.interpolations[sensor] = interpolation
        return self

    def build(self) -> pd.DataFrame:
        # read data
        data: Dict[Sensor, List[SensorData]] = {
            sensor: read_csv(dataset_path / sensor_filenames[sensor], sensor_datatypes[sensor])
            for sensor in self.sensors
        }
        
        # calculate availability time
        start, end = calculate_availability_time(data)

        # interpolate data
        interpolated_data = {
            sensor: interpolators[self.interpolations[sensor]](data[sensor], rate=self.rate, data_type=type(sensor_data[0])).interpolate(start, end) for sensor, sensor_data in data.items()
        }

        # merge together into one dataframe
        dataframes = []
        
        for sensor, sensor_data in interpolated_data.items():
            df = pd.DataFrame(
                sensor_data,
            ).drop(columns=['sensor_type'])
            # convert map object at column datetime
            df['datetime'] = df['datetime'].map(lambda x: x.timestamp)
            
            # add prefix to all columns except datetime
            df = df.rename(columns={c: f'{sensor.name.lower()}_{c}' for c in df.columns if c not in ['datetime']})
            dataframes.append(df)
        
        assert len(dataframes) > 0

        df = dataframes.pop(0)
        for other_df in dataframes:
            df = df.merge(other_df, on='datetime')

        return df

In [116]:
df = DatasetBuilder(DATASET, rate = 4) \
    .with_sensors([
        *Sensor
    ]) \
    .build()

/Users/yannick.habecker/projects/personal/printer-anomaly-detection/datasets/hinges_12/extruder_power.csv.gz
/Users/yannick.habecker/projects/personal/printer-anomaly-detection/datasets/hinges_12/extruder_target.csv.gz
/Users/yannick.habecker/projects/personal/printer-anomaly-detection/datasets/hinges_12/extruder_temperature.csv.gz
/Users/yannick.habecker/projects/personal/printer-anomaly-detection/datasets/hinges_12/extruder_velocity_sensor.csv.gz
/Users/yannick.habecker/projects/personal/printer-anomaly-detection/datasets/hinges_12/gas.csv.gz
/Users/yannick.habecker/projects/personal/printer-anomaly-detection/datasets/hinges_12/heater_power.csv.gz
/Users/yannick.habecker/projects/personal/printer-anomaly-detection/datasets/hinges_12/heater_target.csv.gz
/Users/yannick.habecker/projects/personal/printer-anomaly-detection/datasets/hinges_12/heater_temperature.csv.gz
/Users/yannick.habecker/projects/personal/printer-anomaly-detection/datasets/hinges_12/accelerometer.csv.gz
/Users/yannic

In [93]:
df.min()

datetime       2023-06-01T12:46:59.037435
sensor_type                             M
velocity                            300.0
dtype: object

In [112]:
df = pd.read_csv(dataset_path / sensor_filenames[Sensor.GAS])
df.min()

datetime       2023-06-01T12:42:49.057539+00:00
sensor_type                                   C
value                                      83.0
dtype: object

In [76]:
df = DatasetBuilder(DATASET, rate = 1) \
    .with_sensor(Sensor.VELOCITY) \
    .build()
df

ValueError: max() arg is an empty sequence

In [50]:
data: Dict[Sensor, List[SensorData]] = {
    sensor: read_csv(dataset_path / sensor_filenames[sensor], sensor_datatypes[sensor])
    for sensor in builder.sensors
}

availability_from, availability_to = calculate_availability_time(data)
print(availability_from, availability_to)

NameError: name 'builder' is not defined

In [None]:
interpolation_data = list(data[Sensor.EXTRUDER_TARGET])
print(len(interpolation_data))
interpolated = ConstantInterpolator(interpolation_data, 0.005, TargetData)\
    .interpolate(availability_from, availability_to)

for d in list(interpolated):
    print(d)

In [None]:
# todo: read all data
# find time when all are available
# up/downsample
