In [27]:
import pandas as pd
import numpy as np
import os
import calendar
from datetime import datetime

In [28]:
%ls data/

machina.tinyflux  sample.parquet


In [29]:
df = pd.read_parquet('data/sample.parquet', engine="pyarrow")

# EDA

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1546591 entries, 0 to 1546590
Data columns (total 6 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   time         1546591 non-null  object 
 1   value        1546591 non-null  float64
 2   field        1546591 non-null  object 
 3   robot_id     1546591 non-null  int64  
 4   run_uuid     1546591 non-null  float64
 5   sensor_type  1546591 non-null  object 
dtypes: float64(2), int64(1), object(3)
memory usage: 70.8+ MB


In [31]:
df['run_uuid'] = df['run_uuid'].astype(int).astype(str)
df['robot_id'] = df['robot_id'].astype(int).astype(str)

In [32]:
df.sensor_type.unique()

array(['encoder', 'load_cell'], dtype=object)

In [33]:
print("min: ", df.time.min(), '\nmax: ', df.time.max())

min:  2022-11-23T20:40:00.001Z 
max:  2022-11-23T20:49:59Z


In [34]:
df.groupby('field').size()

field
fx    247764
fy    240000
fz    240000
x     300009
y     268810
z     250008
dtype: int64

In [35]:
df['value'].describe()

count    1.546591e+06
mean     3.531151e+02
std      9.832975e+02
min     -1.848419e+03
25%     -2.222900e+02
50%      1.723072e+02
75%      8.685362e+02
max      3.298350e+03
Name: value, dtype: float64

# prep the data

# expected headers
| time | fx_1 | fx_2 | fy_1 | fy_2 | fz_1 | fz_2 | x_1 | x_2 | y_1 | y_2 | z_1 | z_2 |

In [36]:
df2 = pd.pivot_table(df, values='value', index=['time', 'run_uuid', 'sensor_type', 'robot_id'], columns=['field', 'robot_id'])

In [37]:
df2.columns = [ '_'.join([str(c) for c in c_list]) for c_list in df2.columns.values ]

In [38]:
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,fx_1,fx_2,fy_1,fy_2,fz_1,fz_2,x_1,x_2,y_1,y_2,z_1,z_2
time,run_uuid,sensor_type,robot_id,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2022-11-23T20:40:00.001Z,7582293080991469568,load_cell,1,-1192.046953,,716.528276,,-1547.340972,,,,,,,
2022-11-23T20:40:00.003Z,-9223372036854775808,load_cell,1,-88.747061,,,,,,,,,,,
2022-11-23T20:40:00.003Z,7582293080991469568,load_cell,2,,-546.669903,,489.207227,,84.484822,,,,,,
2022-11-23T20:40:00.005Z,8910095844186656768,encoder,1,,,,,,,821.7808,,326.5256,,-1.8051,
2022-11-23T20:40:00.007Z,6176976534744076288,load_cell,1,176.096381,,174.268623,,-258.179417,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-11-23T20:49:59.998Z,7582293080991469568,load_cell,2,,-546.645184,,489.154858,,83.245564,,,,,,
2022-11-23T20:49:59.999Z,6176976534744076288,encoder,1,,,,,,,1440.7900,,936.9250,,-222.2900,
2022-11-23T20:49:59.999Z,8910095844186656768,encoder,2,,,,,,,,808.5914,,321.8688,,-8.3883
2022-11-23T20:49:59.9Z,7582293080991469568,encoder,2,,,,,,,,3050.7730,,1000.7690,,-771.6320


In [39]:
df2 = df2.fillna(0)

In [152]:
df2 = df2.sort_index(0, ascending=True)

  df2 = df2.sort_index(0, ascending=True)


In [153]:
df3 = df2.reset_index()

In [154]:
df3

Unnamed: 0,time,run_uuid,sensor_type,robot_id,fx_1,fx_2,fy_1,fy_2,fz_1,fz_2,x_1,x_2,y_1,y_2,z_1,z_2
0,2022-11-23T20:40:00.001Z,7582293080991469568,load_cell,1,-1192.046953,0.000000,716.528276,0.000000,-1547.340972,0.000000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
1,2022-11-23T20:40:00.003Z,-9223372036854775808,load_cell,1,-88.747061,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
2,2022-11-23T20:40:00.003Z,7582293080991469568,load_cell,2,0.000000,-546.669903,0.000000,489.207227,0.000000,84.484822,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
3,2022-11-23T20:40:00.005Z,8910095844186656768,encoder,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,821.7808,0.0000,326.5256,0.0000,-1.8051,0.0000
4,2022-11-23T20:40:00.007Z,6176976534744076288,load_cell,1,176.096381,0.000000,174.268623,0.000000,-258.179417,0.000000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
547768,2022-11-23T20:49:59.998Z,7582293080991469568,load_cell,2,0.000000,-546.645184,0.000000,489.154858,0.000000,83.245564,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
547769,2022-11-23T20:49:59.999Z,6176976534744076288,encoder,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1440.7900,0.0000,936.9250,0.0000,-222.2900,0.0000
547770,2022-11-23T20:49:59.999Z,8910095844186656768,encoder,2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,808.5914,0.0000,321.8688,0.0000,-8.3883
547771,2022-11-23T20:49:59.9Z,7582293080991469568,encoder,2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,3050.7730,0.0000,1000.7690,0.0000,-771.6320


In [139]:
df3[df3['run_uuid'].astype(int) <= 0]['run_uuid'].unique()

array(['-9223372036854775808'], dtype=object)

# Feature Generation

Need to calculate velocity, acceleration, total velocity, total acceleration, and total force

In [140]:
df2.index

MultiIndex([('2022-11-23T20:40:00.001Z',  '7582293080991469568', ...),
            ('2022-11-23T20:40:00.003Z', '-9223372036854775808', ...),
            ('2022-11-23T20:40:00.003Z',  '7582293080991469568', ...),
            ('2022-11-23T20:40:00.005Z',  '8910095844186656768', ...),
            ('2022-11-23T20:40:00.007Z',  '6176976534744076288', ...),
            ('2022-11-23T20:40:00.008Z',  '6176976534744076288', ...),
            ('2022-11-23T20:40:00.008Z',  '7582293080991469568', ...),
            ('2022-11-23T20:40:00.008Z',  '8910095844186656768', ...),
            ('2022-11-23T20:40:00.009Z',  '7582293080991469568', ...),
            ('2022-11-23T20:40:00.011Z',  '6176976534744076288', ...),
            ...
            ('2022-11-23T20:49:59.993Z',  '6176976534744076288', ...),
            ('2022-11-23T20:49:59.994Z',  '7582293080991469568', ...),
            ('2022-11-23T20:49:59.996Z',  '6176976534744076288', ...),
            ('2022-11-23T20:49:59.996Z',  '758229308099146956

In [141]:
df2[df2.index.isin(['1'], level=3)]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,fx_1,fx_2,fy_1,fy_2,fz_1,fz_2,x_1,x_2,y_1,y_2,z_1,z_2
time,run_uuid,sensor_type,robot_id,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2022-11-23T20:40:00.001Z,7582293080991469568,load_cell,1,-1192.046953,0.0,716.528276,0.0,-1547.340972,0.0,0.0000,0.0,0.0000,0.0,0.0000,0.0
2022-11-23T20:40:00.003Z,-9223372036854775808,load_cell,1,-88.747061,0.0,0.000000,0.0,0.000000,0.0,0.0000,0.0,0.0000,0.0,0.0000,0.0
2022-11-23T20:40:00.005Z,8910095844186656768,encoder,1,0.000000,0.0,0.000000,0.0,0.000000,0.0,821.7808,0.0,326.5256,0.0,-1.8051,0.0
2022-11-23T20:40:00.007Z,6176976534744076288,load_cell,1,176.096381,0.0,174.268623,0.0,-258.179417,0.0,0.0000,0.0,0.0000,0.0,0.0000,0.0
2022-11-23T20:40:00.008Z,7582293080991469568,encoder,1,0.000000,0.0,0.000000,0.0,0.000000,0.0,2862.3020,0.0,995.8600,0.0,-48.5650,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-11-23T20:49:59.989Z,8910095844186656768,encoder,1,0.000000,0.0,0.000000,0.0,0.000000,0.0,808.0329,0.0,333.3042,0.0,-12.7741,0.0
2022-11-23T20:49:59.994Z,7582293080991469568,encoder,1,0.000000,0.0,0.000000,0.0,0.000000,0.0,3064.1770,0.0,1225.8770,0.0,-57.5040,0.0
2022-11-23T20:49:59.996Z,6176976534744076288,load_cell,1,177.361327,0.0,172.324577,0.0,-259.459629,0.0,0.0000,0.0,0.0000,0.0,0.0000,0.0
2022-11-23T20:49:59.998Z,7582293080991469568,load_cell,1,-1375.790361,0.0,-208.161848,0.0,-1388.172515,0.0,0.0000,0.0,0.0000,0.0,0.0000,0.0


In [166]:
print(sum(n<0 for n in df2.values.flatten()),sum(df2.value_counts()))

568332 547773


In [142]:
def calc_velocity(df, robot_id, field):
    col = field + '_' + robot_id
    positions = df[col].tolist()
    displacements = []
    velocity = []
    
    for i in range(1, len(positions)):
        displacement = np.subtract(positions[i], positions[i-1])
        displacements.append(displacement)
        
    ts1 = pd.Series(df.index.get_level_values(0))
    ts2 = pd.Series(df.index.get_level_values(0),index=df.index.get_level_values(0)).shift(1)
    ts1 = pd.DatetimeIndex(ts1).round(freq='L')
    ts2 = pd.DatetimeIndex(ts2).round(freq='L')

    p = pd.Series(pd.to_timedelta(ts2-ts1, unit='L'))
    p = [x*1000 for x in p.dt.total_seconds().tolist()[1:]]
        
    return [np.divide(displacement, time_interval) for displacement, time_interval in zip(displacements, p)]

In [156]:
ts1 = pd.Series(df2.index.get_level_values(0))
ts2 = pd.Series(df2.index.get_level_values(0),index=df2.index.get_level_values(0)).shift(1)
ts1 = pd.DatetimeIndex(ts1).round(freq='L')
ts2 = pd.DatetimeIndex(ts2).round(freq='L')

p = pd.Series(pd.to_timedelta(ts2-ts1, unit='L'))
[x*1000 for x in p.dt.total_seconds()]
# for i, j in zip(ts1, ts2):
#     if str(i) == 'NaT' or str(j) == 'NaT':
#         print(0)
#     else:    
#         print(datetime.strptime(str(j), '%Y-%m-%dT%H:%M:%S.%fZ') - datetime.strptime(str(i), '%Y-%m-%dT%H:%M:%S.%fZ'))

[nan,
 -2.0,
 0.0,
 -2.0,
 -2.0,
 -1.0,
 0.0,
 0.0,
 -1.0,
 -2.0,
 0.0,
 -2.0,
 0.0,
 -4.0,
 0.0,
 7.0,
 -11.000000000000002,
 0.0,
 -2.0,
 0.0,
 0.0,
 -4.0,
 -2.0,
 9.000000000000002,
 0.0,
 0.0,
 0.0,
 -11.000000000000002,
 -1.0,
 0.0,
 0.0,
 -1.0,
 0.0,
 0.0,
 -2.0,
 -2.0,
 7.0,
 -11.000000000000002,
 0.0,
 -2.0,
 0.0,
 -1.0,
 0.0,
 0.0,
 -1.0,
 -2.0,
 0.0,
 7.0,
 -11.000000000000002,
 -2.0,
 0.0,
 0.0,
 -3.0,
 0.0,
 0.0,
 -1.0,
 0.0,
 -2.0,
 9.000000000000002,
 -11.000000000000002,
 -2.0,
 0.0,
 -2.0,
 -2.0,
 -1.0,
 0.0,
 0.0,
 -1.0,
 9.000000000000002,
 -11.000000000000002,
 0.0,
 -2.0,
 0.0,
 -4.0,
 0.0,
 7.0,
 -11.000000000000002,
 0.0,
 -2.0,
 0.0,
 0.0,
 -4.0,
 -2.0,
 9.000000000000002,
 0.0,
 0.0,
 0.0,
 -11.000000000000002,
 -1.0,
 0.0,
 0.0,
 -1.0,
 0.0,
 0.0,
 -2.0,
 -2.0,
 7.0,
 -11.000000000000002,
 0.0,
 -2.0,
 0.0,
 -1.0,
 0.0,
 0.0,
 -1.0,
 -2.0,
 0.0,
 -4.0,
 -2.0,
 0.0,
 0.0,
 -3.0,
 0.0,
 0.0,
 -1.0,
 0.0,
 -2.0,
 9.000000000000002,
 -11.000000000000002,
 -2.0,
 0.

In [150]:
x_1 = calc_velocity(df2, '1', 'x')
y_1 = calc_velocity(df2, '1', 'y')
z_1 = calc_velocity(df2, '1', 'z')
x_2 = calc_velocity(df2, '2', 'x')
y_2 = calc_velocity(df2, '2', 'y')
z_2 = calc_velocity(df2, '2', 'z')

  return [np.divide(displacement, time_interval) for displacement, time_interval in zip(displacements, p)]
  return [np.divide(displacement, time_interval) for displacement, time_interval in zip(displacements, p)]
  return [np.divide(displacement, time_interval) for displacement, time_interval in zip(displacements, p)]
  return [np.divide(displacement, time_interval) for displacement, time_interval in zip(displacements, p)]
  return [np.divide(displacement, time_interval) for displacement, time_interval in zip(displacements, p)]
  return [np.divide(displacement, time_interval) for displacement, time_interval in zip(displacements, p)]
  return [np.divide(displacement, time_interval) for displacement, time_interval in zip(displacements, p)]
  return [np.divide(displacement, time_interval) for displacement, time_interval in zip(displacements, p)]
  return [np.divide(displacement, time_interval) for displacement, time_interval in zip(displacements, p)]
  return [np.divide(displacement, tim

In [151]:
x_1

[-0.0,
 nan,
 -410.8904,
 410.8904,
 -0.0,
 inf,
 -inf,
 -0.0,
 -720.395,
 -inf,
 -0.0,
 nan,
 -0.0,
 inf,
 -117.4031,
 -0.0,
 nan,
 -0.0,
 inf,
 -inf,
 -0.0,
 -410.92535,
 -91.31674444444442,
 nan,
 inf,
 -inf,
 -0.0,
 -0.0,
 inf,
 -inf,
 -0.0,
 nan,
 nan,
 -720.395,
 720.395,
 0.0,
 -0.0,
 inf,
 410.9482,
 nan,
 -0.0,
 inf,
 -inf,
 -0.0,
 -720.395,
 -inf,
 0.0,
 -0.0,
 -0.0,
 nan,
 inf,
 273.9857666666667,
 inf,
 -inf,
 -0.0,
 nan,
 -720.395,
 -160.08777777777775,
 -0.0,
 -0.0,
 nan,
 -411.00795,
 411.00795,
 -0.0,
 inf,
 -inf,
 -0.0,
 0.0,
 -130.98090909090908,
 -inf,
 -0.0,
 nan,
 -0.0,
 inf,
 -117.43898571428572,
 -0.0,
 nan,
 -0.0,
 inf,
 -inf,
 -0.0,
 -411.06865,
 -91.34858888888887,
 nan,
 inf,
 -inf,
 -0.0,
 -0.0,
 inf,
 -inf,
 -0.0,
 nan,
 nan,
 -720.395,
 720.395,
 0.0,
 -0.0,
 inf,
 411.0992,
 nan,
 -0.0,
 inf,
 -inf,
 -0.0,
 -720.395,
 -inf,
 -0.0,
 -0.0,
 nan,
 inf,
 274.0882666666667,
 inf,
 -inf,
 -0.0,
 nan,
 -720.395,
 -160.08777777777775,
 -0.0,
 -0.0,
 nan,
 -411.16