In [33]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import re
import seaborn as sns
import os
import math
import traces

In [12]:
df = pd.read_parquet('../data/sample.parquet', engine='pyarrow')

In [13]:
df.to_csv('../data/sample.csv', index=True)

In [14]:
df

Unnamed: 0,time,value,field,robot_id,run_uuid,sensor_type
0,2022-11-23T20:40:00.005Z,821.780800,x,1,8.910096e+18,encoder
1,2022-11-23T20:40:00.017Z,821.821700,x,1,8.910096e+18,encoder
2,2022-11-23T20:40:00.029Z,821.850700,x,1,8.910096e+18,encoder
3,2022-11-23T20:40:00.041Z,821.896400,x,1,8.910096e+18,encoder
4,2022-11-23T20:40:00.053Z,821.957300,x,1,8.910096e+18,encoder
...,...,...,...,...,...,...
1546586,2022-11-23T20:41:17.59Z,-85.692373,fx,1,1.240519e+19,load_cell
1546587,2022-11-23T20:41:17.6Z,-87.231436,fx,1,1.240519e+19,load_cell
1546588,2022-11-23T20:41:17.61Z,-85.649405,fx,1,1.240519e+19,load_cell
1546589,2022-11-23T20:41:17.62Z,-86.430655,fx,1,1.240519e+19,load_cell


In [15]:
def convert_time(x):
    ms_pattern = '\d{4}-[1-9]{2}-[1-9]{2}T\d{2}:\d{2}:\d{2}\.\d*Z'
    sec_pattern = '\d{4}-[1-9]{2}-[1-9]{2}T\d{2}:\d{2}:\d{2}Z'
    if re.match(ms_pattern, x):
        return datetime.datetime.strptime(x, '%Y-%m-%dT%H:%M:%S.%fZ')
    elif re.match(sec_pattern, x):
        return datetime.datetime.strptime(x, '%Y-%m-%dT%H:%M:%SZ')
    else:
        return np.nan

There's no NaN values in this dataset, but we can assume if there is any NaN or misformatted time, we should drop that datapoint. 

In [16]:
df['pdtimestamp'] = pd.to_datetime(df['time']) # .apply(convert_time)
print(df.isna().sum().sum())
df.dropna(inplace=True)

0


In [17]:
print(df.keys())
for key in df.keys():
    print(type(df[key][0]))



Index(['time', 'value', 'field', 'robot_id', 'run_uuid', 'sensor_type',
       'pdtimestamp'],
      dtype='object')
<class 'str'>
<class 'numpy.float64'>
<class 'str'>
<class 'numpy.int64'>
<class 'numpy.float64'>
<class 'str'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>


In [18]:
print(df.loc[489546,'pdtimestamp'])
print(df[df['time'] == '2022-11-23T20:41:17Z']) # datetime.strptime('2022-11-23T20:41:17.0Z', '%Y-%m-%dT%H:%M:%S.%fZ')])

2022-11-23 20:47:54.335000+00:00
                         time        value field  robot_id      run_uuid  \
826527   2022-11-23T20:41:17Z   481.628401    fx         1  7.582293e+18   
886528   2022-11-23T20:41:17Z    43.796801    fy         1  7.582293e+18   
946529   2022-11-23T20:41:17Z -1724.591704    fz         1  7.582293e+18   
1546527  2022-11-23T20:41:17Z   -84.243155    fx         1  1.240519e+19   

        sensor_type               pdtimestamp  
826527    load_cell 2022-11-23 20:41:17+00:00  
886528    load_cell 2022-11-23 20:41:17+00:00  
946529    load_cell 2022-11-23 20:41:17+00:00  
1546527   load_cell 2022-11-23 20:41:17+00:00  


## 2.1 Understanding the data
There are 4 unique parts in the file.

Encoder values:
min: -771.633000  => origin 
max: 3298.350000  => 3.3m

In [19]:
print(type(df['run_uuid'][0]))
# count number of unique run_uuid
unique_run_uuid = df['run_uuid'].unique()
print(f'{unique_run_uuid[1]:20f}')
print('number of unique run_uuid:', len(unique_run_uuid))


# get average of each sensor type
# by_sensor_type = df.groupby('sensor_type')['value']
# print(by_sensor_type.min(), by_sensor_type.max(), by_sensor_type.mean(), by_sensor_type.std())


# get x, y, z for encoder sensor type
encoder = df[df['sensor_type'] == 'encoder']
print('encoder value range:', encoder['value'].min(), encoder['value'].max(), encoder['value'].mean(), encoder['value'].std())
x = encoder[encoder['field']=='x']
print('encoder x range:', x['value'].min(), x['value'].max(), x['value'].mean(), x['value'].std())
y = encoder[encoder['field']=='y']
print('encoder y range:', y['value'].min(), y['value'].max(), y['value'].mean(), y['value'].std())
z = encoder[encoder['field']=='z']
print('encoder z range:', z['value'].min(), z['value'].max(), z['value'].mean(), z['value'].std())


# get x, y, z for load_cell sensor type
load_cell = df[df['sensor_type'] == 'load_cell']
print('load cell value range:', load_cell['value'].min(), load_cell['value'].max(), load_cell['value'].mean(), load_cell['value'].std())
fx = load_cell[load_cell['field']=='fx']
print('load cell fx range:', fx['value'].min(), fx['value'].max(), fx['value'].mean(), fx['value'].std())
fy = load_cell[load_cell['field']=='fy']
print('load cell fy range:', fy['value'].min(), fy['value'].max(), fy['value'].mean(), fy['value'].std())
fz = load_cell[load_cell['field']=='fz']
print('load cell fz range:', fz['value'].min(), fz['value'].max(), fz['value'].mean(), fz['value'].std())



<class 'numpy.float64'>
7582293080991469568.000000
number of unique run_uuid: 4
encoder value range: -771.633 3298.35 832.5900511217878 1017.678046256491
encoder x range: 693.1998 3298.35 1766.8628798119391 945.2264532674409
encoder y range: 316.6191 1233.344 761.1501298024627 291.9608832444885
encoder z range: -771.633 -1.7623 -211.72249812725988 291.06528403332663
load cell value range: -1848.4186083984373 875.1750048828125 -186.3552420897327 582.9076181210974
load cell fx range: -1718.656328125 663.8337231445313 -237.26581814432706 511.97422031636864
load cell fy range: -957.0925732421877 875.1750048828125 82.21266780450689 450.50478231730125
load cell fz range: -1848.4186083984373 90.85765502929686 -402.36561879401137 659.7890037700796


In [20]:
# print unique timestamp values
print('number of unique timestamps:', len(df['time'].unique()))

number of unique timestamps: 386359


## 2.2 Timeseries -> Features

In [21]:
# convert time series to x, y, z measurements
# create a new dataframe with column names: | time | fx_1 | fx_2 | fy_1 | fy_2 | fz_1 | fz_2 | x_1 | x_2 | y_1 | y_2 | z_1 | z_2 |

features = pd.DataFrame({
    'run_uuid': pd.Series(dtype=np.float64),
    'time': pd.Series(dtype=str), # df['time'].unique(),
    'fx_1': pd.Series(dtype=np.float64),
    'fx_2': pd.Series(dtype=np.float64),
    'fy_1': pd.Series(dtype=np.float64),
    'fy_2': pd.Series(dtype=np.float64),
    'fz_1': pd.Series(dtype=np.float64),
    'fz_2': pd.Series(dtype=np.float64),
    'x_1': pd.Series(dtype=np.float64),
    'x_2': pd.Series(dtype=np.float64),
    'y_1': pd.Series(dtype=np.float64),
    'y_2': pd.Series(dtype=np.float64),
    'z_1': pd.Series(dtype=np.float64),
    'z_2': pd.Series(dtype=np.float64)
})
features

Unnamed: 0,run_uuid,time,fx_1,fx_2,fy_1,fy_2,fz_1,fz_2,x_1,x_2,y_1,y_2,z_1,z_2


In [22]:
uuidtime = df.groupby(['run_uuid','pdtimestamp'])
time = df.groupby(['time'])

In [23]:
print(len(uuidtime.groups))
print(len(time.groups))

489547
386359


In [24]:

            # 'fx_1':row[1]['value'] if row[1]['field'] == 'fx' and row[1]['robot_id'] == 1 else np.nan,
            # 'fx_2':row[1]['value'] if row[1]['field'] == 'fx' and row[1]['robot_id'] == 2 else np.nan,
            # 'fy_1':row[1]['value'] if row[1]['field'] == 'fy' and row[1]['robot_id'] == 1 else np.nan,
            # 'fy_2':row[1]['value'] if row[1]['field'] == 'fy' and row[1]['robot_id'] == 2 else np.nan,
            # 'fz_1':row[1]['value'] if row[1]['field'] == 'fz' and row[1]['robot_id'] == 1 else np.nan,
            # 'fz_2':row[1]['value'] if row[1]['field'] == 'fz' and row[1]['robot_id'] == 2 else np.nan,
            # 'x_1':row[1]['value'] if row[1]['field'] == 'x' and row[1]['robot_id'] == 1 else np.nan,
            # 'x_2':row[1]['value'] if row[1]['field'] == 'x' and row[1]['robot_id'] == 2 else np.nan,
            # 'y_1':row[1]['value'] if row[1]['field'] == 'y' and row[1]['robot_id'] == 1 else np.nan,
            # 'y_2':row[1]['value'] if row[1]['field'] == 'y' and row[1]['robot_id'] == 2 else np.nan,
            # 'z_1':row[1]['value'] if row[1]['field'] == 'z' and row[1]['robot_id'] == 1 else np.nan,
            # 'z_2':row[1]['value'] if row[1]['field'] == 'z' and row[1]['robot_id'] == 2 else np.nan

        # if feature[1]['field'] == 'fx' and feature[1]['robot_id'] == 1:
        #     row['fx_1'] = feature[1]['value']
        # elif feature[1]['field'] == 'fx' and feature[1]['robot_id'] == 2:
        #     row['fx_2'] = feature[1]['value']
        # elif feature[1]['field'] == 'fy' and feature[1]['robot_id'] == 1:
        #     row['fy_1'] = feature[1]['value']
        # elif feature[1]['field'] == 'fy' and feature[1]['robot_id'] == 2:
        #     row['fy_2'] = feature[1]['value']
        # elif feature[1]['field'] == 'fz' and feature[1]['robot_id'] == 1:
        #     row['fz_1'] = feature[1]['value']
        # elif feature[1]['field'] == 'fz' and feature[1]['robot_id'] == 2:
        #     row['fz_2'] = feature[1]['value']
        # elif feature[1]['field'] == 'x' and feature[1]['robot_id'] == 1:
        #     row['x_1'] = feature[1]['value']
        # elif feature[1]['field'] == 'x' and feature[1]['robot_id'] == 2:
        #     row['x_2'] = feature[1]['value']
        # elif feature[1]['field'] == 'y' and feature[1]['robot_id'] == 1:
        #     row['y_1'] = feature[1]['value']
        # elif feature[1]['field'] == 'y' and feature[1]['robot_id'] == 2:
        #     row['y_2'] = feature[1]['value']
        # elif feature[1]['field'] == 'z' and feature[1]['robot_id'] == 1:
        #     row['z_1'] = feature[1]['value']
        # elif feature[1]['field'] == 'z' and feature[1]['robot_id'] == 2:
        #     row['z_2'] = feature[1]['value']

In [123]:
import constants as C
print(C.FEATURES)

['fx_1', 'fx_2', 'fy_1', 'fy_2', 'fz_1', 'fz_2', 'x_1', 'x_2', 'y_1', 'y_2', 'z_1', 'z_2']


In [146]:
feat

feature,x_1,x_2,y_1,y_2,z_1,z_2
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-11-23 20:40:00.005000+00:00,821.7808,,326.5256,,-1.8051,
2022-11-23 20:40:00.008000+00:00,,823.2906,,320.5103,,-2.9101
2022-11-23 20:40:00.017000+00:00,821.8217,,326.5430,,-1.8056,
2022-11-23 20:40:00.020000+00:00,,823.3414,,320.5224,,-2.9163
2022-11-23 20:40:00.029000+00:00,821.8507,,326.5583,,-1.8138,
...,...,...,...,...,...,...
2022-11-23 20:49:59.975000+00:00,,810.4467,,322.1543,,-8.3726
2022-11-23 20:49:59.977000+00:00,808.8678,,333.4379,,-12.7861,
2022-11-23 20:49:59.987000+00:00,,809.5239,,322.0148,,-8.3831
2022-11-23 20:49:59.989000+00:00,808.0329,,333.3042,,-12.7741,


In [149]:
# convert time column to pd datetime
df['time'] = pd.to_datetime(df['time'])
df['feature'] = df['field'] + '_' + df['robot_id'].astype(str)
uuids = df['run_uuid'].unique()

# need to separate by uuid to avoid conflict between same features at the same time stamp from different runs 
for uuid in uuids:
    feat = df[df['run_uuid'] == uuid].pivot_table(values='value', index='time', columns='feature', fill_value=np.nan).sort_index(ascending=True)
    feat.to_csv(f'../data/features_{uuid}.csv', index=True)
    for feature in C.FEATURES:
        if feature not in feat.columns:
            feat[feature] = np.nan
        else:
            feat[feature] = feat[feature].interpolate(method='time')
    feat = feat.reset_index()
    feat.to_csv(f'../data/interpolated_features_{uuid}.csv', index=True)



In [156]:
f'{run_uuid:.0f}'


'6176976534744076288'

In [129]:
# TODO error checking: 
# assume no duplicate fields for a given timestamp
# TODO benchmark this function, make it faster
# assume time series is sorted by time

rows = []
# indexes = []
for group_index, group_fields in uuidtime:
    # index = pd.Series({'run_uuid':group_index[0], 
    #         'time':group_index[1]})
    row = pd.Series({
            'run_uuid':group_index[0], 
            'pddatetime':group_index[1],
            # 'fx_1': np.nan,
            # 'fx_2': np.nan,
            # 'fy_1': np.nan,
            # 'fy_2': np.nan,
            # 'fz_1': np.nan,
            # 'fz_2': np.nan,
            # 'x_1':  np.nan,
            # 'x_2':  np.nan,
            # 'y_1':  np.nan,
            # 'y_2':  np.nan,
            # 'z_1':  np.nan,
            # 'z_2':  np.nan
        })
    for field_name in C.FEATURES:
        row[field_name] = np.nan
    print(row)
    # break
    for i, feature in group_fields.iterrows():
        # use field and robot_id to determine which column to populate
        # -> need to check if column name exists
        field_name = feature['field'] + '_' + str(feature['robot_id'])
        # if field_name in row.index:
        row[field_name] = feature['value']
    # indexes.append(index)
    rows.append(row)
    # break
    


run_uuid                 6176976534744076288.0
pddatetime    2022-11-23 20:40:00.007000+00:00
fx_1                                       NaN
fx_2                                       NaN
fy_1                                       NaN
fy_2                                       NaN
fz_1                                       NaN
fz_2                                       NaN
x_1                                        NaN
x_2                                        NaN
y_1                                        NaN
y_2                                        NaN
z_1                                        NaN
z_2                                        NaN
dtype: object
run_uuid                 6176976534744076288.0
pddatetime    2022-11-23 20:40:00.008000+00:00
fx_1                                       NaN
fx_2                                       NaN
fy_1                                       NaN
fy_2                                       NaN
fz_1                                       NaN

KeyboardInterrupt: 

In [119]:
print(row)

run_uuid                 6176976534744076288.0
pddatetime    2022-11-23 20:40:00.007000+00:00
fx_1                                176.096381
fx_2                                       NaN
fy_1                                174.268623
fy_2                                       NaN
fz_1                               -258.179417
fz_2                                       NaN
x_1                                        NaN
x_2                                        NaN
y_1                                        NaN
y_2                                        NaN
z_1                                        NaN
z_2                                        NaN
dtype: object


In [26]:
# indexes = pd.MultiIndex.from_tuples(indexes, names=['run_uuid', 'time'])
# features = pd.DataFrame(rows, index=indexes)
features = pd.DataFrame(rows)
features.sort_values(by='pddatetime', inplace=True)
print(len(features))
print(features.head())

489547
            run_uuid                       pddatetime         fx_1  \
200151  7.582293e+18 2022-11-23 20:40:00.001000+00:00 -1192.046953   
200152  7.582293e+18 2022-11-23 20:40:00.003000+00:00          NaN   
481783  1.240519e+19 2022-11-23 20:40:00.003000+00:00   -88.747061   
392042  8.910096e+18 2022-11-23 20:40:00.005000+00:00          NaN   
0       6.176977e+18 2022-11-23 20:40:00.007000+00:00   176.096381   

              fx_2        fy_1        fy_2         fz_1       fz_2       x_1  \
200151         NaN  716.528276         NaN -1547.340972        NaN       NaN   
200152 -546.669903         NaN  489.207227          NaN  84.484822       NaN   
481783         NaN         NaN         NaN          NaN        NaN       NaN   
392042         NaN         NaN         NaN          NaN        NaN  821.7808   
0              NaN  174.268623         NaN  -258.179417        NaN       NaN   

        x_2       y_1  y_2     z_1  z_2  
200151  NaN       NaN  NaN     NaN  NaN  
200152 

### Interpolate missing values
A few different ways I thought we could fill in the missing values:
- naive: squish groups of readings together, use the average of timestamp of robot 1 and 2's position measurement 
    - position measurement because they are the most critical 

In [70]:
# how to combine the different measurements into a single row?

# sort by time
# while all fields are not filled, keep taking the next measurement and fill in the fields
# if there are multiple measurements for a given field, take the last one
table_by_uuid = {}
feat_groups = features.groupby(['run_uuid'])
for run_id, run_data in feat_groups:
    run_data.sort_values(by='pddatetime', ascending=True, inplace=True)
    run_data.set_index('pddatetime', inplace=True)
    for measurement_name in ['fx_1', 'fx_2', 'fy_1', 'fy_2', 'fz_1', 'fz_2', 'x_1', 'x_2', 'y_1', 'y_2', 'z_1', 'z_2']:
        run_data[measurement_name] = run_data[measurement_name].interpolate(method='time')
    run_data.to_csv(f'../data/interpolated_features/{run_id}.csv', index=True)
    table_by_uuid[run_id] = run_data


In [28]:
# TODO figure out how to resample the time so that it is evenly spaced
# features['pddatetime'] = features.resample('5L', on='pddatetime')
plt.style.use('seaborn')

plt.rc('font', size=20) #controls default text size
plt.rc('axes', titlesize=20) #fontsize of the title
plt.rc('axes', labelsize=20) #fontsize of the x and y labels
plt.rc('xtick', labelsize=20) #fontsize of the x tick labels
plt.rc('ytick', labelsize=20) #fontsize of the y tick labels
plt.rc('legend', fontsize=20) #fontsize of the legend
if not os.path.exists('../data/dt_plot'):
    os.makedirs('../data/dt_plot')
for k in table_by_uuid:
    print(k)
    time_diff = pd.Series(table_by_uuid[k]['pddatetime'].diff().dt.total_seconds(), name='dt in seconds')
    print(time_diff.value_counts())
    sns.set(rc={'figure.figsize':(12,6)})
    ax = sns.histplot(time_diff, kde=False)
    ax.set(yscale="log")
    ax.set(ylabel="Number of observations")
    plt.savefig(f'../data/dt_plot/{k}.png', dpi=300, bbox_inches='tight')
    plt.clf()


6.176976534744076e+18
0.003    45858
0.002    42648
0.001    40420
0.004    34678
0.005    22685
0.006    12462
0.007     1399
Name: dt in seconds, dtype: int64
7.58229308099147e+18
0.001    61274
0.002    30938
0.003    26647
0.004    23900
0.005    17882
0.006    15079
0.007     9803
0.008     4670
0.009     1529
0.010      168
Name: dt in seconds, dtype: int64
8.910095844186657e+18
0.011    20393
0.001    20391
0.010    16367
0.002    16359
0.012    10250
0.009     2992
0.003     2988
Name: dt in seconds, dtype: int64
1.2405186538561671e+19
0.010    7656
0.009      55
0.011      52
Name: dt in seconds, dtype: int64


<Figure size 1200x600 with 0 Axes>

In [68]:

for k,run in table_by_uuid.items():
    start_time = run['pddatetime'].iloc[0]
    end_time = run['pddatetime'].iloc[-1]
    sampling_period = datetime.timedelta(seconds=0.01)

    # create the new index and a new series full of NaNs
    new_index = pd.date_range(
        start=start_time, 
        end=end_time, 
        freq=sampling_period, 
    )
    new_series = pd.Series(np.nan, index=new_index)

    # concat the old and new series and remove duplicates (if any) 
    comb_series = pd.concat([run[['pddatetime', 'x_1']], new_series])
    comb_series = comb_series[~comb_series.index.duplicated(keep='first')]

    # interpolate to fill the NaNs
    comb_series.interpolate(method='time', inplace=True)

    # take only the values at regular time intervals
    print(comb_series[new_index])
    break
    for feature_name in ['x_1', 'x_2', 'y_1', 'y_2', 'z_1', 'z_2']:
        ts = traces.TimeSeries()
        for i, row in run.iterrows():
            ts[row['pddatetime']] = row[[feature_name]]
        
        print(start_time, end_time, datetime.timedelta(seconds=0.01))
        # print(ts)
        sampled = ts.sample(
            sampling_period=sampling_period,
            start=start_time,
            end=end_time,
            interpolate='linear',
        )
        print(sampled)
        break


ValueError: Invalid fill method. Expecting pad (ffill) or backfill (bfill). Got time

In [63]:
print(row['x_1'])
print(row[['x_1']])

1440.79
x_1    1440.79
Name: 200150, dtype: object


In [71]:
for k in table_by_uuid:
    print(k)
    print(table_by_uuid[k].head())
    # table_by_uuid[k].to_csv(f'../data/interpolated_features/{k}.csv', index=True)

6.176976534744076e+18
                                      run_uuid        fx_1       fx_2  \
pddatetime                                                              
2022-11-23 20:40:00.007000+00:00  6.176977e+18  176.096381        NaN   
2022-11-23 20:40:00.008000+00:00  6.176977e+18  176.332060        NaN   
2022-11-23 20:40:00.010000+00:00  6.176977e+18  176.803416  50.373027   
2022-11-23 20:40:00.011000+00:00  6.176977e+18  177.039155  50.411944   
2022-11-23 20:40:00.017000+00:00  6.176977e+18  178.453284  50.645396   

                                        fy_1        fy_2        fz_1  \
pddatetime                                                             
2022-11-23 20:40:00.007000+00:00  174.268623         NaN -258.179417   
2022-11-23 20:40:00.008000+00:00  174.075613         NaN -258.308164   
2022-11-23 20:40:00.010000+00:00  173.689591 -416.060405 -258.565658   
2022-11-23 20:40:00.011000+00:00  173.496531 -415.966651 -258.694438   
2022-11-23 20:40:00.017000+00:00  

In [19]:
features[features['run_uuid'] == 7582293080991469568].head()

Unnamed: 0,run_uuid,pddatetime,fx_1,fx_2,fy_1,fy_2,fz_1,fz_2,x_1,x_2,y_1,y_2,z_1,z_2
200151,7.582293e+18,2022-11-23 20:40:00.001000+00:00,-1192.046953,,716.528276,,-1547.340972,,,,,,,
200152,7.582293e+18,2022-11-23 20:40:00.003000+00:00,,-546.669903,,489.207227,,84.484822,,,,,,
200153,7.582293e+18,2022-11-23 20:40:00.008000+00:00,,,,,,,2862.302,,995.86,,-48.565,
200154,7.582293e+18,2022-11-23 20:40:00.009000+00:00,,,,,,,,3050.773,,1000.769,,-771.632
200155,7.582293e+18,2022-11-23 20:40:00.011000+00:00,-1194.303789,,714.488726,,-1547.280913,,,,,,,


## 2.3 Engineered Features

In [113]:
# run_data.reset_index(inplace=True)
# run_data.set_index('pddatetime', inplace=True)
run_data['index']

pddatetime
2022-11-23 20:40:00.007000+00:00         0
2022-11-23 20:40:00.008000+00:00         1
2022-11-23 20:40:00.010000+00:00         2
2022-11-23 20:40:00.011000+00:00         3
2022-11-23 20:40:00.017000+00:00         4
                                     ...  
2022-11-23 20:49:59.986000+00:00    200146
2022-11-23 20:49:59.987000+00:00    200147
2022-11-23 20:49:59.993000+00:00    200148
2022-11-23 20:49:59.996000+00:00    200149
2022-11-23 20:49:59.999000+00:00    200150
Name: index, Length: 200151, dtype: int64

In [115]:

# assume total values are cumulative
# create a series with column names: vx_1, vy_1, vz_1, vx_2, vy_2, vz_2, ax_1, ay_1, az_1, ax_2, ay_2, az_2, v1, v2, a1, a2, f1, f2
rows = []

# x1 = features[['run_uuid', 'time', 'fx_1']].dropna()
# x1['time_obj'] = x1['time'].apply(convert_time)
# x1['fx_1'].diff()
engineered_features_by_uuid = {}
for run_uuid in table_by_uuid:
    run_data = table_by_uuid[run_uuid].fillna(0) # fill all NaN with 0 for calculations
    run_data.reset_index(inplace=True)
    engr_feat = pd.DataFrame() # columns=['pddatetime', 'vx_1', 'vy_1', 'vz_1', 'vx_2', 'vy_2', 'vz_2', 'ax_1', 'ay_1', 'az_1', 'ax_2', 'ay_2', 'az_2', 'v1', 'v2', 'a1', 'a2', 'f1', 'f2', ''])
    # engr_feat.index = run_data.index
    engr_feat['pddatetime'] = run_data['pddatetime'] # .apply(convert_time)
    engr_feat['dt'] = engr_feat['pddatetime'].diff().dt.total_seconds() * 1000 # convert to milliseconds
    print(run_uuid)
    for robot_id in [1, 2]:
        for axis in ['x', 'y', 'z']:
            # velocity and acceleration
            engr_feat[f'd{axis}_{robot_id}'] = run_data[f'{axis}_{robot_id}'].diff()
            engr_feat[f'v{axis}_{robot_id}'] = engr_feat[f'd{axis}_{robot_id}'] / engr_feat['dt']
            engr_feat[f'a{axis}_{robot_id}'] = engr_feat[f'v{axis}_{robot_id}'].diff() / engr_feat['dt']
        print('run data:', run_data['x_1'])

        # total values
        engr_feat[f'v{robot_id}'] = np.sqrt(engr_feat[f'vx_{robot_id}']**2 + engr_feat[f'vy_{robot_id}']**2 + engr_feat[f'vz_{robot_id}']**2)
        engr_feat[f'a{robot_id}'] = np.sqrt(engr_feat[f'ax_{robot_id}']**2 + engr_feat[f'ay_{robot_id}']**2 + engr_feat[f'az_{robot_id}']**2)
        engr_feat[f'f{robot_id}'] = np.sqrt(run_data[f'fx_{robot_id}']**2 + run_data[f'fy_{robot_id}']**2 + run_data[f'fz_{robot_id}']**2)

    print(engr_feat.head())

    engineered_features_by_uuid[run_uuid] = engr_feat
    engr_feat.to_csv(f'../data/engineered_features/{run_uuid}.csv')
    break
    

6.176976534744076e+18
run data: 0            0.00
1            0.00
2            0.00
3         1440.79
4         1440.79
           ...   
200146    1440.79
200147    1440.79
200148    1440.79
200149    1440.79
200150    1440.79
Name: x_1, Length: 200151, dtype: float64
run data: 0            0.00
1            0.00
2            0.00
3         1440.79
4         1440.79
           ...   
200146    1440.79
200147    1440.79
200148    1440.79
200149    1440.79
200150    1440.79
Name: x_1, Length: 200151, dtype: float64
                        pddatetime   dt     dx_1     vx_1         ax_1  \
0 2022-11-23 20:40:00.007000+00:00  NaN      NaN      NaN          NaN   
1 2022-11-23 20:40:00.008000+00:00  1.0     0.00     0.00          NaN   
2 2022-11-23 20:40:00.010000+00:00  2.0     0.00     0.00     0.000000   
3 2022-11-23 20:40:00.011000+00:00  1.0  1440.79  1440.79  1440.790000   
4 2022-11-23 20:40:00.017000+00:00  6.0     0.00     0.00  -240.131667   

      dy_1     vy_1        ay_1  

In [88]:
2/np.nan

nan

In [116]:
engr_feat.columns

Index(['pddatetime', 'dt', 'dx_1', 'vx_1', 'ax_1', 'dy_1', 'vy_1', 'ay_1',
       'dz_1', 'vz_1', 'az_1', 'v1', 'a1', 'f1', 'dx_2', 'vx_2', 'ax_2',
       'dy_2', 'vy_2', 'ay_2', 'dz_2', 'vz_2', 'az_2', 'v2', 'a2', 'f2'],
      dtype='object')

In [47]:
d0 = datetime.strptime('2022-11-23T20:40:00.007Z', '%Y-%m-%dT%H:%M:%S.%fZ')
d1 = datetime.strptime('2022-11-23T20:30:00.007Z', '%Y-%m-%dT%H:%M:%S.%fZ')

print((d0-d1).total_seconds())
print((d1-d0).total_seconds())

600.0
-600.0


In [166]:
try:
    pd.to_datetime(['20227', '2022-11-23T20:40:00.007Z'])
    print()
except:
    print('error')

pd.to_datetime('2022-11-23T20:40:00.007Z')

error
