# Using a sequential Monte Carlo model to localize sensors based on radio pings: process ping data

## Load the libraries we need

Load the third-party libraries.

In [1]:
import numpy as np
import pandas as pd
import os
import dateutil.parser

Load our `smclocalize` module.

In [2]:
from smclocalize import *

Load our `sensei` client module.

In [3]:
import sensei

## Set the base data paths

In [4]:
# Root directory for all input and output files. Both components keep their
# trailing slash because later cells build file paths from them.
data_path = './data/'
# Subdirectory holding everything that belongs to this particular data run.
data_run_directory = 'coco_180309/'

## Load the ping data

In [5]:
# Authenticate against the Sensei API.
#
# Credentials are read from the environment instead of being written into the
# notebook, so that secrets never end up in saved notebooks or version
# control. Set SENSEI_USERNAME and SENSEI_PASSWORD before starting the kernel;
# a KeyError raised here means one of them is missing.
#
# NOTE: the password that used to be hard-coded in this cell should be
# treated as compromised and rotated.
api = sensei.Api(
    username=os.environ['SENSEI_USERNAME'],
    password=os.environ['SENSEI_PASSWORD'])

In [6]:
# Fetch the radio observations (pings) for the test window. Both bounds are
# written in UTC (trailing 'Z').
# NOTE(review): 735 appears to be the classroom ID -- every returned row
# carries classroom_id 735 -- confirm against the sensei client.
# NOTE(review): json_rep=True presumably returns plain JSON-style records
# rather than client objects; confirm against the sensei client.
obs = api.get_radio_observations(
    735,
    start_time=dateutil.parser.parse('2018-03-09T16:57:00Z'),
    end_time=dateutil.parser.parse('2018-03-09T17:54:00Z'),
    json_rep=True)

In [7]:
# Tabulate the raw observations so they can be cleaned and reshaped with pandas.
all_data = pd.DataFrame(obs)

In [8]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10035 entries, 0 to 10034
Data columns (total 7 columns):
classroom_id    10035 non-null int64
local_id        10035 non-null int64
local_type      10035 non-null object
observed_at     10035 non-null object
remote_id       10035 non-null int64
remote_type     10035 non-null object
rssi            10035 non-null float64
dtypes: float64(1), int64(3), object(3)
memory usage: 548.9+ KB


In [9]:
# Parse the timestamp strings and normalize them to timezone-naive UTC.
#
# Depending on the pandas version, parsing strings that carry a UTC designator
# yields either naive or timezone-aware values. The ground truth timestamps
# built later in this notebook are naive UTC, and naive and aware timestamps
# cannot be compared, so the result is pinned explicitly: interpret/convert
# everything as UTC, then drop the timezone information.
# NOTE(review): this assumes the API reports times in UTC, which is consistent
# with the UTC query window used to fetch the data -- confirm.
all_data['observed_at'] = pd.to_datetime(
    all_data['observed_at'],
    utc=True).dt.tz_localize(None)

In [10]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10035 entries, 0 to 10034
Data columns (total 7 columns):
classroom_id    10035 non-null int64
local_id        10035 non-null int64
local_type      10035 non-null object
observed_at     10035 non-null datetime64[ns]
remote_id       10035 non-null int64
remote_type     10035 non-null object
rssi            10035 non-null float64
dtypes: datetime64[ns](1), float64(1), int64(3), object(2)
memory usage: 548.9+ KB


In [11]:
all_data

Unnamed: 0,classroom_id,local_id,local_type,observed_at,remote_id,remote_type,rssi
0,735,6,area,2018-03-09 16:57:00,35267,child,-85.0
1,735,4,area,2018-03-09 16:57:00,35267,child,-74.0
2,735,4,area,2018-03-09 16:57:00,10642,child,-60.0
3,735,35267,child,2018-03-09 16:57:00,4,area,-74.0
4,735,35267,child,2018-03-09 16:57:00,10642,child,-65.0
5,735,10642,child,2018-03-09 16:57:00,4,area,-59.0
6,735,10642,child,2018-03-09 16:57:00,35267,child,-64.0
7,735,10642,child,2018-03-09 16:57:00,38,area,-78.0
8,735,38,area,2018-03-09 16:57:00,10642,child,-77.0
9,735,41,area,2018-03-09 16:57:00,10642,child,-86.0


## Filter the ping data

Here is where we do any manual cleaning and filtering of the data.

In [12]:
# No filtering is applied for this run, so every ping is kept. Note that this
# binds a second name to the same DataFrame; it does not make a copy.
usable_data = all_data
# Example filter that drops every ping sent to or received from ID 8:
# usable_data = all_data[(all_data['remote_id'] != 8) &
#                        (all_data['local_id'] != 8)].reset_index(drop=True)

## Save the ping data in dataframe format

In [13]:
pickle_directory = 'pickle/'
data_pickle_filename = 'usable_data.pkl'

In [14]:
# Persist the filtered pings so later sessions can skip the API download.
usable_data.to_pickle(
    os.path.join(
        data_path,
        data_run_directory,
        pickle_directory,
        data_pickle_filename))

## (Re)load the ping data in dataframe format

In [15]:
# Restore the filtered pings from the pickle written by the previous section.
usable_data = pd.read_pickle(
    os.path.join(
        data_path,
        data_run_directory,
        pickle_directory,
        data_pickle_filename))

## Extract entity IDs

Extract the list of entity IDs in our data set for each type of sensor. These lists are the basis for our variable structure.

In [16]:
def _entity_ids_of_type(pings, entity_type):
    """Return the sorted, de-duplicated entity IDs of one sensor type.

    An entity is included if it shows up on either side of a ping: as the
    local sensor (`local_type`/`local_id`) or as the remote sensor
    (`remote_type`/`remote_id`).
    """
    seen_as_local = pings.loc[pings.local_type == entity_type, 'local_id']
    seen_as_remote = pings.loc[pings.remote_type == entity_type, 'remote_id']
    # np.union1d returns the sorted union without duplicates.
    return np.union1d(
        pd.unique(seen_as_local),
        pd.unique(seen_as_remote)).tolist()


child_entity_ids = _entity_ids_of_type(usable_data, 'child')
material_entity_ids = _entity_ids_of_type(usable_data, 'material')
teacher_entity_ids = _entity_ids_of_type(usable_data, 'teacher')
area_entity_ids = _entity_ids_of_type(usable_data, 'area')

## Define the variable structure for the model

Using the lists of entity IDs, define an instance of the `SensorVariableStructure` class. This class provides the variable counts and helper functions that the rest of this notebook uses to work with the data.

In [17]:
# Build the variable structure from the four entity ID lists. The arguments
# are positional, so the order child, material, teacher, area matters.
variable_structure = SensorVariableStructure(child_entity_ids,
                                             material_entity_ids,
                                             teacher_entity_ids,
                                             area_entity_ids)

## Restructure the ping data for use in the model

Using the helper functions from the `SensorVariableStructure` class, parse the data into arrays which represent the discrete and continuous components of the $\mathbf{Y}$ variables which we will use in the model.

Below, we parse the data separately for each time step in order to mimic the real-time use case. There is also a helper function called `sensor_data_parse_multiple_timesteps()` for parsing an entire data set containing many time steps (not shown here).

For `y_discrete_t`, note the inverted convention: 0 indicates that a ping was received and 1 indicates that a ping was not received (don't ask). For `y_continuous_t`, we convert the integer RSSI values to floats (since we're treating RSSI as a continuous variable) and we just enter a 0.0 value for RSSI if no ping was received.

In [18]:
# The distinct observation times, in chronological order, define the timesteps.
timestamps = np.sort(usable_data['observed_at'].unique())
num_timesteps = len(timestamps)

# Pre-fill with the "no ping received" encoding: 1 in the discrete array and
# 0.0 in the continuous (RSSI) array.
y_discrete_t = np.ones(
    (num_timesteps, variable_structure.num_y_discrete_vars),
    dtype=int)
y_continuous_t = np.zeros(
    (num_timesteps, variable_structure.num_y_continuous_vars),
    dtype=float)

# Parse one timestep at a time, handing the parser only that timestep's pings.
for t_index, timestamp in enumerate(timestamps):
    pings_at_timestamp = usable_data[usable_data['observed_at'] == timestamp]
    (y_discrete_t[t_index], y_continuous_t[t_index]) = (
        variable_structure.sensor_data_parse_one_timestep(pings_at_timestamp))

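Apply some basic sanity checks. First, check that no time step is missing: build the full 10-second grid between the first and last observation and confirm that every grid point appears among the observed timestamps.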

In [19]:
# Full grid of expected timesteps between the first and the last observation,
# one every 10 seconds. The step is given as a Timedelta rather than the
# '10S' alias string, because the uppercase 'S' alias is deprecated in recent
# pandas releases.
timestamp_range = pd.date_range(
    start=usable_data['observed_at'].min(),
    end=usable_data['observed_at'].max(),
    freq=pd.Timedelta(seconds=10))

In [20]:
# Grid timestamps that never occur in the data. An empty result means that no
# 10-second timestep is missing.
np.setdiff1d(timestamp_range, timestamps)

array([], dtype='datetime64[ns]')

Check to make sure that the post-processed data and the pre-processed data agree on the total number of pings received.

In [21]:
np.sum(y_discrete_t == 0)

10035

In [22]:
np.sum(y_continuous_t != 0.0)

10035

In [23]:
len(usable_data)

10035

## Save the ping data in the format required by the model

In [24]:
numpy_directory = 'numpy/'

In [25]:
data_numpy_filename = 'ping_data'

In [26]:
# Bundle the entity ID lists, the timestep grid and the parsed ping arrays
# into a single archive for the model.
np.savez(
    os.path.join(
        data_path,
        data_run_directory,
        numpy_directory,
        data_numpy_filename),
    child_entity_ids=child_entity_ids,
    material_entity_ids=material_entity_ids,
    teacher_entity_ids=teacher_entity_ids,
    area_entity_ids=area_entity_ids,
    timestamps=timestamps,
    num_timesteps=num_timesteps,
    y_discrete_t=y_discrete_t,
    y_continuous_t=y_continuous_t)

## Calculate and save the same data in matrix format

In [27]:
# Same ping data as a (timestep, sensor, sensor) matrix. As in the flat
# arrays, the fill values encode "no ping received": 1 for the discrete part
# and 0.0 for the continuous part.
all_sensors_shape = (
    num_timesteps,
    variable_structure.num_sensors,
    variable_structure.num_sensors)
y_discrete_all_sensors_t = np.ones(all_sensors_shape, dtype=int)
y_continuous_all_sensors_t = np.zeros(all_sensors_shape, dtype=float)

# Fill one timestep at a time from that timestep's pings.
for t_index, timestamp in enumerate(timestamps):
    pings_at_timestamp = usable_data[usable_data['observed_at'] == timestamp]
    (y_discrete_all_sensors_t[t_index], y_continuous_all_sensors_t[t_index]) = (
        variable_structure.sensor_data_all_sensors_one_timestep(pings_at_timestamp))

In [28]:
data_numpy_filename_all_sensors = 'ping_data_all_sensors'

In [29]:
# Store the matrix-format ping arrays in their own archive.
np.savez(
    os.path.join(
        data_path,
        data_run_directory,
        numpy_directory,
        data_numpy_filename_all_sensors),
    y_discrete_all_sensors_t=y_discrete_all_sensors_t,
    y_continuous_all_sensors_t=y_continuous_all_sensors_t)

## Load the moving sensor ground truth data

In [30]:
csv_directory = 'csv/'

In [31]:
data_csv_filename = 'coco_test_180309_moving_sensors.csv'

In [32]:
# Read the hand-recorded positions of the moving sensors: one row per time
# interval, with an l and a w column for each sensor.
moving_sensors_data_input = pd.read_csv(
    os.path.join(
        data_path,
        data_run_directory,
        csv_directory,
        data_csv_filename))

In [33]:
moving_sensors_data_input.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 10 columns):
start_time    8 non-null object
end_time      8 non-null object
13l           8 non-null float64
13w           8 non-null float64
16l           8 non-null float64
16w           8 non-null float64
22l           8 non-null float64
22w           8 non-null float64
31l           8 non-null float64
31w           8 non-null float64
dtypes: float64(8), object(2)
memory usage: 720.0+ bytes


In [34]:
moving_sensors_data_input

Unnamed: 0,start_time,end_time,13l,13w,16l,16w,22l,22w,31l,31w
0,10:58:00 AM,11:03:00 AM,0.86,0.45,4.08,0.47,1.77,1.36,1.58,3.42
1,11:05:00 AM,11:10:00 AM,2.49,2.11,2.68,1.97,0.54,3.5,3.04,3.38
2,11:11:00 AM,11:16:00 AM,2.29,1.37,1.72,1.49,2.12,1.8,5.36,3.3
3,11:17:00 AM,11:23:00 AM,4.14,2.75,0.6,2.82,3.81,0.33,4.38,0.36
4,11:25:00 AM,11:30:00 AM,2.81,0.21,1.85,3.5,1.63,3.5,2.48,1.64
5,11:31:00 AM,11:37:00 AM,2.09,1.52,3.09,3.45,4.18,1.65,3.39,3.44
6,11:39:00 AM,11:44:00 AM,0.79,0.57,5.33,3.45,1.53,3.66,0.49,0.86
7,11:46:00 AM,11:53:00 AM,0.43,1.94,3.17,2.07,2.82,2.53,2.26,0.5


In [35]:
# The CSV holds local wall-clock times without a date. Prepend the test date,
# interpret the result as America/Chicago time, convert to UTC and finally
# drop the timezone so the values are naive UTC.
moving_sensors_data_input['start_time'] = (
    pd.to_datetime('2018-03-09 ' + moving_sensors_data_input['start_time'])
    .dt.tz_localize('America/Chicago')
    .dt.tz_convert('UTC')
    .dt.tz_localize(None))

In [36]:
# Same conversion for the interval end: local America/Chicago wall-clock time
# on the test date -> naive UTC.
moving_sensors_data_input['end_time'] = (
    pd.to_datetime('2018-03-09 ' + moving_sensors_data_input['end_time'])
    .dt.tz_localize('America/Chicago')
    .dt.tz_convert('UTC')
    .dt.tz_localize(None))

## Expand into a table of observations

In [37]:
# Expand each ground-truth interval (one input row) into one row per
# 10-second timestep, repeating the interval's positions at every timestep.
#
# Column layout of the input: column 0 is the interval start, column 1 the
# interval end, and all remaining columns are sensor positions.
position_columns = moving_sensors_data_input.columns.tolist()[2:]
dataframes = []
for i, interval in moving_sensors_data_input.iterrows():
    # Both interval endpoints are included in the range. The step is given as
    # a Timedelta rather than the '10S' alias string, because the uppercase
    # 'S' alias is deprecated in recent pandas releases.
    timestamps_df = pd.DataFrame({
        'observed_at': pd.date_range(
            start=interval.iloc[0],
            end=interval.iloc[1],
            freq=pd.Timedelta(seconds=10))})
    num_timestamps = len(timestamps_df)
    # A row of a mixed-type frame comes back with object dtype, so the tiled
    # positions are cast back to float explicitly.
    positions_df = pd.DataFrame(
        np.tile(
            interval.iloc[2:].values,
            (num_timestamps, 1)),
        columns=position_columns,
        dtype=np.float64)
    row_expanded = pd.concat([timestamps_df, positions_df], axis=1)
    dataframes.append(row_expanded)
moving_sensors_data_expanded = pd.concat(
    dataframes,
    ignore_index=True)

In [38]:
# Unpivot to long format: one row per (timestamp, position column), with the
# original column name in 'sensor_id_dimension' and the coordinate in 'value'.
moving_sensors_data_long = moving_sensors_data_expanded.melt(
    id_vars=['observed_at'],
    var_name='sensor_id_dimension',
    value_name='value')

In [39]:
# Column names have the form '<sensor_id><dimension>', e.g. '13l' or '13w':
# the last character is the dimension and everything before it is the sensor
# ID. Splitting on the last character (rather than assuming a two-digit ID)
# keeps this correct for sensor IDs with one digit or with three or more.
moving_sensors_data_long['sensor_id'] = moving_sensors_data_long['sensor_id_dimension'].str[:-1].astype('int')
moving_sensors_data_long['dim'] = moving_sensors_data_long['sensor_id_dimension'].str[-1]

In [40]:
moving_sensors_data_long = moving_sensors_data_long[['observed_at','sensor_id','dim','value']]

In [41]:
# Spread the 'dim' values back into columns so that each row holds one
# (timestamp, sensor) pair together with its l and w coordinates.
# NOTE: pivot_table aggregates with the mean by default, so if two input
# intervals ever covered the same timestamp for the same sensor, their
# positions would be averaged silently rather than raising an error.
moving_sensors_data_longish = moving_sensors_data_long.pivot_table(
    index=['observed_at', 'sensor_id'],
    columns=['dim'],
    values='value').reset_index()

In [42]:
# The pivot leaves the column axis labelled 'dim'; clear that label so the
# frame looks like an ordinary flat table.
moving_sensors_data_longish.columns.name=None

## Get the mappings between sensor IDs and entity IDs

In [43]:
# Fetch the sensor-to-entity assignments for the same ID (735) that was used
# to fetch the radio observations, and tabulate them.
sensor_mappings = pd.DataFrame(api.get_sensor_mappings(735, json_rep=True))

In [44]:
sensor_mappings

Unnamed: 0,classroom_id,end_time,entity_id,entity_type,sensor_id,start_time
0,735,,10642,child,13,2018-03-09T16:46:54.794940Z
1,735,,35197,child,16,2018-03-09T16:46:54.814661Z
2,735,,35267,child,22,2018-03-09T16:46:54.830761Z
3,735,,15716,teacher,31,2018-03-09T16:46:54.840076Z
4,735,,4,area,4,2018-03-09T16:46:54.873849Z
5,735,,6,area,6,2018-03-09T16:46:54.881531Z
6,735,,38,area,1,2018-03-09T16:46:54.890295Z
7,735,,40,area,2,2018-03-09T16:46:54.899167Z
8,735,,41,area,3,2018-03-09T16:46:54.911941Z
9,735,,42,area,7,2018-03-09T16:46:54.920953Z


In [45]:
# Keep only the columns needed to translate a sensor ID into an entity.
# NOTE(review): start_time/end_time are dropped here, so this lookup assumes
# each sensor maps to exactly one entity for the whole run -- confirm.
entity_id_lookup = sensor_mappings[['sensor_id', 'entity_type', 'entity_id']]

In [46]:
entity_id_lookup

Unnamed: 0,sensor_id,entity_type,entity_id
0,13,child,10642
1,16,child,35197
2,22,child,35267
3,31,teacher,15716
4,4,area,4
5,6,area,6
6,1,area,38
7,2,area,40
8,3,area,41
9,7,area,42


## Convert moving sensor ground truth data from sensor IDs to entity IDs

In [47]:
# Attach entity type and entity ID to every ground-truth row.
#
# The join key is named explicitly instead of relying on "all shared column
# names", so adding a column to either frame cannot silently change the join.
# validate='many_to_one' makes pandas raise a MergeError if a sensor ID occurs
# more than once in the lookup, instead of silently duplicating ground-truth
# rows. With how='left', sensors missing from the lookup are kept and get NaN
# entity columns.
moving_sensors_data = pd.merge(
    moving_sensors_data_longish,
    entity_id_lookup,
    how='left',
    on='sensor_id',
    validate='many_to_one')

In [48]:
moving_sensors_data = moving_sensors_data[['observed_at', 'entity_type', 'entity_id', 'l', 'w']]

In [49]:
moving_sensors_data = moving_sensors_data.sort_values(by=['observed_at', 'entity_id']).reset_index(drop=True)

In [50]:
moving_sensors_data

Unnamed: 0,observed_at,entity_type,entity_id,l,w
0,2018-03-09 16:58:00,child,10642,0.86,0.45
1,2018-03-09 16:58:00,teacher,15716,1.58,3.42
2,2018-03-09 16:58:00,child,35197,4.08,0.47
3,2018-03-09 16:58:00,child,35267,1.77,1.36
4,2018-03-09 16:58:10,child,10642,0.86,0.45
5,2018-03-09 16:58:10,teacher,15716,1.58,3.42
6,2018-03-09 16:58:10,child,35197,4.08,0.47
7,2018-03-09 16:58:10,child,35267,1.77,1.36
8,2018-03-09 16:58:20,child,10642,0.86,0.45
9,2018-03-09 16:58:20,teacher,15716,1.58,3.42


## Save the moving sensor ground truth data in dataframe format

In [51]:
pickle_directory = 'pickle/'
data_pickle_filename = 'moving_sensors_data.pkl'

In [52]:
# Persist the entity-keyed ground truth positions of the moving sensors.
moving_sensors_data.to_pickle(
    os.path.join(
        data_path,
        data_run_directory,
        pickle_directory,
        data_pickle_filename))

## (Re)load the moving sensor ground truth data in dataframe format

In [53]:
# Restore the moving sensor ground truth from the pickle written above.
moving_sensors_data = pd.read_pickle(
    os.path.join(
        data_path,
        data_run_directory,
        pickle_directory,
        data_pickle_filename))

In [54]:
moving_sensors_data

Unnamed: 0,observed_at,entity_type,entity_id,l,w
0,2018-03-09 16:58:00,child,10642,0.86,0.45
1,2018-03-09 16:58:00,teacher,15716,1.58,3.42
2,2018-03-09 16:58:00,child,35197,4.08,0.47
3,2018-03-09 16:58:00,child,35267,1.77,1.36
4,2018-03-09 16:58:10,child,10642,0.86,0.45
5,2018-03-09 16:58:10,teacher,15716,1.58,3.42
6,2018-03-09 16:58:10,child,35197,4.08,0.47
7,2018-03-09 16:58:10,child,35267,1.77,1.36
8,2018-03-09 16:58:20,child,10642,0.86,0.45
9,2018-03-09 16:58:20,teacher,15716,1.58,3.42


## Restructure the moving sensor ground truth data for use in the model

In [55]:
# Ground truth positions on the same timestep grid as the ping data. The
# array starts out as all-NaN and each row is then overwritten with whatever
# the parser returns for that timestep's ground-truth rows (which may be an
# empty selection for timesteps outside every recorded interval).
x_continuous_t = np.full(
    (num_timesteps, variable_structure.num_x_continuous_vars),
    np.nan,
    dtype=float)
for t_index, timestamp in enumerate(timestamps):
    ground_truth_at_timestamp = moving_sensors_data[
        moving_sensors_data['observed_at'] == timestamp]
    x_continuous_t[t_index] = (
        variable_structure.sensor_x_continuous_data_parse_one_timestep(
            ground_truth_at_timestamp))

## Save the moving sensor ground truth data in the format required by the model

In [56]:
numpy_directory = 'numpy/'

In [57]:
data_numpy_filename = 'moving_sensor_data'

In [58]:
# Store the ground truth position array for the model.
np.savez(
    os.path.join(
        data_path,
        data_run_directory,
        numpy_directory,
        data_numpy_filename),
    x_continuous_t=x_continuous_t)

## Load the fixed sensor position data

In [59]:
csv_directory = 'csv/'

In [60]:
data_csv_filename = 'coco_test_180309_fixed_sensors.csv'

In [61]:
# Read the measured positions of the fixed sensors: one row per sensor ID
# with its l and w coordinates.
fixed_sensors_data_input = pd.read_csv(
    os.path.join(
        data_path,
        data_run_directory,
        csv_directory,
        data_csv_filename))

In [62]:
fixed_sensors_data_input.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
sensor_id    6 non-null int64
l            6 non-null float64
w            6 non-null float64
dtypes: float64(2), int64(1)
memory usage: 224.0 bytes


In [63]:
fixed_sensors_data_input

Unnamed: 0,sensor_id,l,w
0,1,5.17,0.28
1,2,2.87,0.0
2,3,0.4,0.0
3,4,2.63,3.77
4,6,5.56,3.74
5,7,0.15,3.65


## Convert fixed sensor position data from sensor IDs to entity IDs

In [64]:
# Attach entity type and entity ID to every fixed sensor position.
#
# The join key is named explicitly instead of relying on "all shared column
# names", so adding a column to either frame cannot silently change the join.
# validate='many_to_one' makes pandas raise a MergeError if a sensor ID occurs
# more than once in the lookup, instead of silently duplicating position rows.
# With how='left', sensors missing from the lookup are kept and get NaN entity
# columns.
fixed_sensors_data = pd.merge(
    fixed_sensors_data_input,
    entity_id_lookup,
    how='left',
    on='sensor_id',
    validate='many_to_one')

In [65]:
fixed_sensors_data = fixed_sensors_data[['entity_type', 'entity_id', 'l', 'w']]

In [66]:
fixed_sensors_data = fixed_sensors_data.sort_values(by=['entity_id']).reset_index(drop=True)

In [67]:
fixed_sensors_data

Unnamed: 0,entity_type,entity_id,l,w
0,area,4,2.63,3.77
1,area,6,5.56,3.74
2,area,38,5.17,0.28
3,area,40,2.87,0.0
4,area,41,0.4,0.0
5,area,42,0.15,3.65


## Save the fixed sensor position data in dataframe format

In [68]:
pickle_directory = 'pickle/'
data_pickle_filename = 'fixed_sensors_data.pkl'

In [69]:
# Persist the entity-keyed positions of the fixed sensors.
fixed_sensors_data.to_pickle(
    os.path.join(
        data_path,
        data_run_directory,
        pickle_directory,
        data_pickle_filename))

## (Re)load the fixed sensor position data in dataframe format

In [70]:
# Restore the fixed sensor positions from the pickle written above.
fixed_sensors_data = pd.read_pickle(
    os.path.join(
        data_path,
        data_run_directory,
        pickle_directory,
        data_pickle_filename))

In [71]:
fixed_sensors_data

Unnamed: 0,entity_type,entity_id,l,w
0,area,4,2.63,3.77
1,area,6,5.56,3.74
2,area,38,5.17,0.28
3,area,40,2.87,0.0
4,area,41,0.4,0.0
5,area,42,0.15,3.65


## Restructure the fixed sensor position data for use in the model

In [72]:
area_entity_ids

[4, 6, 38, 40, 41, 42]

In [73]:
# Positions of the fixed (area) sensors, one row per area sensor in the order
# of area_entity_ids, with l in column 0 and w in column 1. The array starts
# out as all-NaN so that anything left unfilled is easy to spot.
fixed_sensor_positions = np.full(
    (variable_structure.num_area_sensors, variable_structure.num_dimensions),
    np.nan,
    dtype='float')
# Look all positions up by entity ID in a single label-based selection. This
# avoids assigning a one-element Series to a scalar array element, which
# NumPy has deprecated. If an entity ID occurs more than once in
# fixed_sensors_data, the selection has too many rows and the assignment
# below fails with a shape error instead of storing an ambiguous position.
positions_by_entity_id = fixed_sensors_data.set_index('entity_id')[['l', 'w']]
fixed_sensor_positions[:len(area_entity_ids), :2] = (
    positions_by_entity_id.loc[area_entity_ids].values)

In [74]:
fixed_sensor_positions

array([[2.63, 3.77],
       [5.56, 3.74],
       [5.17, 0.28],
       [2.87, 0.  ],
       [0.4 , 0.  ],
       [0.15, 3.65]])

## Save the fixed sensor position data in the format required by the model

In [75]:
numpy_directory = 'numpy/'

In [76]:
data_numpy_filename = 'fixed_sensor_positions'

In [77]:
# Store the fixed sensor position array for the model.
np.savez(
    os.path.join(
        data_path,
        data_run_directory,
        numpy_directory,
        data_numpy_filename),
    fixed_sensor_positions=fixed_sensor_positions)