# Import libraries and dependencies

In [19]:
# requirements.txt
!pip install pyarrow



In [20]:
import pandas as pd
import numpy as np
import os
import calendar
from datetime import datetime
import math

In [21]:
%ls data/

machina.tinyflux  sample.parquet


In [22]:
df = pd.read_parquet('data/sample.parquet', engine="pyarrow")

# 2.1 Preprocess and Clean

This is the EDA (aka exploratory data analysis) step...

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1546591 entries, 0 to 1546590
Data columns (total 6 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   time         1546591 non-null  object 
 1   value        1546591 non-null  float64
 2   field        1546591 non-null  object 
 3   robot_id     1546591 non-null  int64  
 4   run_uuid     1546591 non-null  float64
 5   sensor_type  1546591 non-null  object 
dtypes: float64(2), int64(1), object(3)
memory usage: 70.8+ MB


### notes: 

This is a preprocessing step below for python to display the full id rather than shortened form (ie. 3.380000e6 vs 3380000) by assigning it as an integer type and then as a string type.

In [24]:
# preprocessing
df['run_uuid'] = df['run_uuid'].astype(int).astype(str)
df['robot_id'] = df['robot_id'].astype(int).astype(str)
df['time'] = pd.to_datetime(df['time'])

In [25]:
df.sensor_type.unique()

array(['encoder', 'load_cell'], dtype=object)

In [26]:
print("min: ", df.time.min(), '\nmax: ', df.time.max())

min:  2022-11-23 20:40:00.001000+00:00 
max:  2022-11-23 20:49:59.999000+00:00


In [27]:
df.groupby('field').size()

field
fx    247764
fy    240000
fz    240000
x     300009
y     268810
z     250008
dtype: int64

In [28]:
df['value'].describe()

count    1.546591e+06
mean     3.531151e+02
std      9.832975e+02
min     -1.848419e+03
25%     -2.222900e+02
50%      1.723072e+02
75%      8.685362e+02
max      3.298350e+03
Name: value, dtype: float64

# 2.2 Convert timeseries to features by robot_id

preparing the data...

## expected headers
| time | fx_1 | fx_2 | fy_1 | fy_2 | fz_1 | fz_2 | x_1 | x_2 | y_1 | y_2 | z_1 | z_2 |

## decision point:
I decided not to pivot the field 'robot_id' to make it easier down the line as for processing and calculation efforts.

I see this as a benefit later downstream when we can consider 'robot_id' as a sharding attribute.  Pivoting 'robot_id' into column will actually make it use more resources (ie cpu and memory) as we have to write empty columns for each record between robot 1 and robot 2.  

## new fields
| time | run_uuid | sensor_type | robot_id | fx | fy | fz | x | y | z |

In [37]:
df2 = pd.pivot_table(df, values='value', index=['time', 'run_uuid', 'sensor_type'], columns=['field', 'robot_id'])

In [38]:
df2.columns = [ '_'.join([str(c) for c in c_list]) for c_list in df2.columns.values ]

In [39]:
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,fx_1,fx_2,fy_1,fy_2,fz_1,fz_2,x_1,x_2,y_1,y_2,z_1,z_2
time,run_uuid,sensor_type,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2022-11-23 20:40:00.001000+00:00,7582293080991469568,load_cell,-1192.046953,,716.528276,,-1547.340972,,,,,,,
2022-11-23 20:40:00.003000+00:00,-9223372036854775808,load_cell,-88.747061,,,,,,,,,,,
2022-11-23 20:40:00.003000+00:00,7582293080991469568,load_cell,,-546.669903,,489.207227,,84.484822,,,,,,
2022-11-23 20:40:00.005000+00:00,8910095844186656768,encoder,,,,,,,821.7808,,326.5256,,-1.8051,
2022-11-23 20:40:00.007000+00:00,6176976534744076288,load_cell,176.096381,,174.268623,,-258.179417,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-11-23 20:49:59.996000+00:00,6176976534744076288,load_cell,177.361327,,172.324577,,-259.459629,,,,,,,
2022-11-23 20:49:59.996000+00:00,7582293080991469568,encoder,,,,,,,,3050.7730,,1000.7690,,-771.6320
2022-11-23 20:49:59.998000+00:00,7582293080991469568,load_cell,-1375.790361,-546.645184,-208.161848,489.154858,-1388.172515,83.245564,,,,,,
2022-11-23 20:49:59.999000+00:00,6176976534744076288,encoder,,,,,,,1440.7900,,936.9250,,-222.2900,


In [40]:
df2 = df2.fillna(0)

In [41]:
df2 = df2.sort_index(0, ascending=True)

  df2 = df2.sort_index(0, ascending=True)


In [42]:
df3 = df2.reset_index()

In [43]:
df3

Unnamed: 0,time,run_uuid,sensor_type,fx_1,fx_2,fy_1,fy_2,fz_1,fz_2,x_1,x_2,y_1,y_2,z_1,z_2
0,2022-11-23 20:40:00.001000+00:00,7582293080991469568,load_cell,-1192.046953,0.000000,716.528276,0.000000,-1547.340972,0.000000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
1,2022-11-23 20:40:00.003000+00:00,-9223372036854775808,load_cell,-88.747061,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
2,2022-11-23 20:40:00.003000+00:00,7582293080991469568,load_cell,0.000000,-546.669903,0.000000,489.207227,0.000000,84.484822,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
3,2022-11-23 20:40:00.005000+00:00,8910095844186656768,encoder,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,821.7808,0.0000,326.5256,0.0000,-1.8051,0.0000
4,2022-11-23 20:40:00.007000+00:00,6176976534744076288,load_cell,176.096381,0.000000,174.268623,0.000000,-258.179417,0.000000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
528276,2022-11-23 20:49:59.996000+00:00,6176976534744076288,load_cell,177.361327,0.000000,172.324577,0.000000,-259.459629,0.000000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
528277,2022-11-23 20:49:59.996000+00:00,7582293080991469568,encoder,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,3050.7730,0.0000,1000.7690,0.0000,-771.6320
528278,2022-11-23 20:49:59.998000+00:00,7582293080991469568,load_cell,-1375.790361,-546.645184,-208.161848,489.154858,-1388.172515,83.245564,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
528279,2022-11-23 20:49:59.999000+00:00,6176976534744076288,encoder,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1440.7900,0.0000,936.9250,0.0000,-222.2900,0.0000


### notes:

I tried to stack the columns containing [fx, fy, fz, x, y, z] into [x, y, z].  Decided against this to make calculcation easier below.  Recommend to combine this field in production.

In [None]:
out = df3
out = out.rename(columns={'fx': 'x-force', 
                          'fy': 'y-force', 
                          'fz': 'z-force', 
                          'x': 'x-dist', 
                          'y': 'y-dist', 
                          'z': 'z-dist'})
out = pd.wide_to_long(df=out, 
                      stubnames=['x', 'y', 'z'], 
                      i=['time', 'run_uuid', 'sensor_type', 'robot_id'], 
                      j='measure', 
                      sep='-', 
                      suffix=r'\w+')
out

# Feature Generation

Need to calculate velocity, acceleration, total velocity, total acceleration, and total force
- 6 Velocity values (vx_1, vy_1, vz_1, vx_2, vy_2, vz_2)
- 6 Acceleration values (ax_1, ay_1, az_1, ax_2, ay_2, az_2)
- Total Velocity (v1, v2)
- Total Acceleration (a1, a2)
- Total Force (f1, f2)

In [None]:
df2

In [48]:
test = df2[(df2.index.get_level_values(2)=='encoder')][['x_1', 'y_1', 'z_1', 'x_2', 'y_2', 'z_2']]
test[['x_1diff', 'y_1diff', 'z_1diff', 'x_2diff', 'y_2diff', 'z_2diff']] = test[['x_1', 'y_1', 'z_1', 'x_2', 'y_2', 'z_2']].diff()
test = test.reset_index()
test['timediff'] = test['time'].diff().dt.total_seconds()*1e3
test['velocityx_1'] = test['x_1diff'] / test['timediff']
test['velocityy_1'] = test['y_1diff'] / test['timediff']
test['velocityz_1'] = test['z_1diff'] / test['timediff']
test['velocityx_2'] = test['x_2diff'] / test['timediff']
test['velocityy_2'] = test['y_2diff'] / test['timediff']
test['velocityz_2'] = test['z_2diff'] / test['timediff']
test['total_velocity_1'] = calc_total_velocity(test['velocityx_1'], test['velocityy_1'], test['velocityz_1'])
test['total_velocity_2'] = calc_total_velocity(test['velocityx_2'], test['velocityy_2'], test['velocityz_2'])
test

Unnamed: 0,time,run_uuid,sensor_type,x_1,y_1,z_1,x_2,y_2,z_2,x_1diff,...,z_2diff,timediff,velocityx_1,velocityy_1,velocityz_1,velocityx_2,velocityy_2,velocityz_2,total_velocity_1,total_velocity_2
0,2022-11-23 20:40:00.005000+00:00,8910095844186656768,encoder,821.7808,326.5256,-1.8051,0.0000,0.0000,0.0000,,...,,,,,,,,,,
1,2022-11-23 20:40:00.008000+00:00,6176976534744076288,encoder,0.0000,0.0000,0.0000,1438.4120,939.3830,0.0000,-821.7808,...,0.0000,3.0,-273.926933,-108.841867,0.601700,479.470667,313.127667,0.000000,294.759018,572.661380
2,2022-11-23 20:40:00.008000+00:00,7582293080991469568,encoder,2862.3020,995.8600,-48.5650,0.0000,0.0000,0.0000,2862.3020,...,0.0000,0.0,inf,inf,-inf,-inf,-inf,,inf,
3,2022-11-23 20:40:00.008000+00:00,8910095844186656768,encoder,0.0000,0.0000,0.0000,823.2906,320.5103,-2.9101,-2862.3020,...,-2.9101,0.0,-inf,-inf,inf,inf,inf,-inf,inf,inf
4,2022-11-23 20:40:00.009000+00:00,7582293080991469568,encoder,0.0000,0.0000,0.0000,3050.7730,1000.7690,-771.6320,0.0000,...,-768.7219,1.0,0.000000,0.000000,0.000000,2227.482400,680.258700,-768.721900,0.000000,2452.623718
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281549,2022-11-23 20:49:59.993000+00:00,6176976534744076288,encoder,0.0000,0.0000,0.0000,1438.4120,0.0000,0.0000,-808.0329,...,0.0000,4.0,-202.008225,-83.326050,3.193525,359.603000,0.000000,0.000000,218.542335,359.603000
281550,2022-11-23 20:49:59.994000+00:00,7582293080991469568,encoder,3064.1770,1225.8770,-57.5040,0.0000,0.0000,0.0000,3064.1770,...,0.0000,1.0,3064.177000,1225.877000,-57.504000,-1438.412000,0.000000,0.000000,3300.797149,1438.412000
281551,2022-11-23 20:49:59.996000+00:00,7582293080991469568,encoder,0.0000,0.0000,0.0000,3050.7730,1000.7690,-771.6320,-3064.1770,...,-771.6320,2.0,-1532.088500,-612.938500,28.752000,1525.386500,500.384500,-385.816000,1650.398574,1651.073169
281552,2022-11-23 20:49:59.999000+00:00,6176976534744076288,encoder,1440.7900,936.9250,-222.2900,0.0000,0.0000,0.0000,1440.7900,...,771.6320,3.0,480.263333,312.308333,-74.096667,-1016.924333,-333.589667,257.210667,577.650137,1100.715446


In [None]:
test = df2.reset_index()
test[test['sensor_type']=='encoder'].groupby(
    [
        pd.Grouper(key='time', freq='S'), 
         'run_uuid', 
         'robot_id'
    ]
)[['x', 'y', 'z']].diff()

In [None]:
df3 = df2.reset_index()
df3 = df3.groupby([pd.Grouper(key='time', freq='S'), 'run_uuid', 'robot_id']).agg(
    first_time=pd.NamedAgg(column='time', aggfunc='first'), 
    last_time=pd.NamedAgg(column='time', aggfunc='last'), 
    first_fx=pd.NamedAgg(column='fx', aggfunc='first'),
    last_fx=pd.NamedAgg(column='fx', aggfunc='last'),
    first_fy=pd.NamedAgg(column='fy', aggfunc='first'),
    last_fy=pd.NamedAgg(column='fy', aggfunc='last'),
    first_fz=pd.NamedAgg(column='fz', aggfunc='first'),
    last_fz=pd.NamedAgg(column='fz', aggfunc='last'),   
    first_x=pd.NamedAgg(column='x', aggfunc='first'),
    last_x=pd.NamedAgg(column='x', aggfunc='last'),
    first_y=pd.NamedAgg(column='y', aggfunc='first'),
    last_y=pd.NamedAgg(column='y', aggfunc='last'),
    first_z=pd.NamedAgg(column='z', aggfunc='first'),
    last_z=pd.NamedAgg(column='z', aggfunc='last'),
    sum_fx=pd.NamedAgg(column='fx', aggfunc='sum'),
    sum_fy=pd.NamedAgg(column='fy', aggfunc='sum'),
    sum_fz=pd.NamedAgg(column='fz', aggfunc='sum')
)

In [None]:
pd.to_timedelta(pd.DatetimeIndex(df3['last_time'])-pd.DatetimeIndex(df3['first_time'])).total_seconds()

In [None]:
df3

In [None]:
time_interval = pd.to_timedelta(pd.DatetimeIndex(df3['last_time'])-pd.DatetimeIndex(df3['first_time'])).total_seconds()
df3['vx'] = (df3['last_x'] - df3['first_x']) / time_interval
df3['vy'] = (df3['last_y'] - df3['first_y']) / time_interval
df3['vz'] = (df3['last_z'] - df3['first_z']) / time_interval

In [None]:
velo_df = df3.reset_index()

In [46]:
# calculate total velocity
def calc_total_velocity(vx, vy, vz):
    velocity = np.sqrt(vx**2 + vy**2 + vz**2)
    return velocity

In [None]:
velo_df['total_velocity'] = calc_total_velocity(velo_df['vx'], velo_df['vy'], velo_df['vz'])

In [None]:
velo_df

In [None]:
accel_df = velo_df.groupby(['run_uuid', 'robot_id']).agg(
    first_first_time=pd.NamedAgg(column='first_time', aggfunc='first'), 
    last_last_time=pd.NamedAgg(column='last_time', aggfunc='last'), 
    first_vel_x=pd.NamedAgg(column='vx', aggfunc='first'), 
    last_vel_x=pd.NamedAgg(column='vx', aggfunc='last'), 
    first_vel_y=pd.NamedAgg(column='vy', aggfunc='first'), 
    last_vel_y=pd.NamedAgg(column='vy', aggfunc='last'),
    first_vel_z=pd.NamedAgg(column='vz', aggfunc='first'), 
    last_vel_z=pd.NamedAgg(column='vz', aggfunc='last'), 
)

In [None]:
accel_df = accel_df.reset_index()
accel_df

In [None]:
time_interval = pd.to_timedelta(pd.DatetimeIndex(accel_df['last_last_time'])-pd.DatetimeIndex(accel_df['first_first_time'])).total_seconds()
accel_df['ax'] = (accel_df['last_vel_x'] - accel_df['first_vel_x']) / time_interval
accel_df['ay'] = (accel_df['last_vel_y'] - accel_df['first_vel_y']) / time_interval 
accel_df['az'] = (accel_df['last_vel_z'] - accel_df['first_vel_z']) / time_interval

In [None]:
# calculate total acceleration
def calc_total_acceleration(ax, ay, az):
    acceleration = np.sqrt(ax**2 + ay**2 + az**2)
    return acceleration

In [None]:
accel_df['total_acceleration'] = calc_total_acceleration(accel_df['ax'], accel_df['ay'], accel_df['az'])

In [None]:
accel_df