# Using a sequential Monte Carlo model to localize sensors based on radio pings: process ping data

## Load the libraries we need

Load the third-party libraries.

In [1]:
import numpy as np
import pandas as pd
import os

Load our `smclocalize` module.

In [2]:
from smclocalize import *

## Load the ping data

The ping data is in a set of JSON files (one for each time window) in the format produced by [tools/get_radio_obs.py](https://github.com/WildflowerSchools/sensei/blob/master/tools/get_radio_obs.py) from the [sensei repository](https://github.com/WildflowerSchools/sensei).

In [3]:
# Root of the data tree, relative to the notebook's working directory
data_path = "./data/"

In [4]:
# Subdirectory of data_path holding the data run processed in this notebook
data_run_directory = "wildflower_171128/"

In [5]:
# Subdirectory of the run directory that contains the JSON ping files
json_directory = "json/"

In [6]:
# List the JSON files for this run. os.listdir() returns entries in arbitrary
# order, so sort them to make the row order of the loaded data reproducible.
# The path is built with os.path.join, like the loading loop that reads the files.
json_input_files = sorted(
    x for x in os.listdir(os.path.join(data_path, data_run_directory, json_directory))
    if x.endswith('.json'))

In [7]:
# Will hold one DataFrame per JSON input file
dataframes = list()

In [8]:
# Rebuild the list from scratch inside this cell so that re-running the cell
# does not append a second copy of every file's pings.
dataframes = []
for json_input_file in json_input_files:
    json_fullpath = os.path.join(data_path, data_run_directory, json_directory, json_input_file)
    # Read each file into its own DataFrame; the frames are concatenated later
    with open(json_fullpath, 'r') as input_file:
        dataframes.append(pd.read_json(input_file))

In [9]:
# Stack the per-file frames into a single frame with a fresh 0..N-1 index
all_data = pd.concat(objs=dataframes, ignore_index=True)

In [10]:
# Preview only the first rows instead of rendering the entire data set
all_data.head(10)

Unnamed: 0,local_id,local_type,observed_at,remote_id,remote_type,rssi
0,8,area,2017-11-28 14:00:00,11061,child,-88
1,8,area,2017-11-28 14:00:00,11060,child,-91
2,12,area,2017-11-28 14:00:00,11060,child,-83
3,13,area,2017-11-28 14:00:00,11067,child,-86
4,9,area,2017-11-28 14:00:00,11067,child,-82
5,9,area,2017-11-28 14:00:00,11060,child,-85
6,9,area,2017-11-28 14:00:00,11061,child,-83
7,11060,child,2017-11-28 14:00:00,11061,child,-71
8,11060,child,2017-11-28 14:00:00,11067,child,-69
9,11062,child,2017-11-28 14:00:00,11067,child,-81


Remove the data from the area sensor in the bathroom (entity ID 8), because it will produce strange results given its vertical offset.

In [11]:
# Entity ID of the bathroom area sensor whose pings are excluded
BATHROOM_AREA_SENSOR_ID = 8

# Drop every ping in which the excluded sensor is either the sender or the
# receiver, then renumber the remaining rows from 0.
usable_data = all_data[(all_data['remote_id'] != BATHROOM_AREA_SENSOR_ID) &
                       (all_data['local_id'] != BATHROOM_AREA_SENSOR_ID)].reset_index(drop=True)

## Extract entity IDs

Extract the list of entity IDs in our data set corresponding to each type of sensor. These lists are the basis for our variable structure.

In [12]:
def _entity_ids_of_type(ping_data, entity_type):
    """Return the sorted list of unique IDs of entities of type `entity_type`.

    An entity is included if it appears with that type on either the local or
    the remote side of any ping in `ping_data`. Returns an empty list when no
    entity of that type is present.
    """
    local_ids = pd.unique(ping_data[ping_data.local_type == entity_type].local_id)
    remote_ids = pd.unique(ping_data[ping_data.remote_type == entity_type].remote_id)
    # np.union1d yields the sorted, de-duplicated union of the two arrays
    return np.union1d(local_ids, remote_ids).tolist()


child_entity_ids = _entity_ids_of_type(usable_data, 'child')
material_entity_ids = _entity_ids_of_type(usable_data, 'material')
teacher_entity_ids = _entity_ids_of_type(usable_data, 'teacher')
area_entity_ids = _entity_ids_of_type(usable_data, 'area')

## Define the variable structure for the model

Using the lists of entity IDs, define an instance of the `SensorVariableStructure` class. This class provides a whole bunch of variables and helper functions for working with the data.

In [13]:
# Build the variable structure from the four per-type entity ID lists.
# Positional order: child, material, teacher, area.
variable_structure = SensorVariableStructure(
    child_entity_ids,
    material_entity_ids,
    teacher_entity_ids,
    area_entity_ids,
)

## Restructure the ping data for use in the model

Using the helper functions from the `SensorVariableStructure` class, parse the data into arrays which represent the discrete and continuous components of the $\mathbf{Y}$ variables which we will use in the model.

Below, we parse the data separately for each time step in order to mimic the real-time use case. There is also a helper function called `sensor_data_parse_multiple_timesteps()` for parsing an entire data set containing many timesteps (not shown here).

For `y_discrete_t`, use 0 to indicate that a ping was received and 1 to indicate that a ping was not received (don't ask). For `y_continuous_t`, we convert the integer RSSI values to floats (since we're treating RSSI as a continuous variable) and we just enter a 0.0 value for RSSI if no ping was received.

In [14]:
# Distinct observation times, in ascending order; one model timestep each
timestamps = np.sort(usable_data['observed_at'].unique())
num_timesteps = len(timestamps)

# Pre-fill with the "no ping received" values (1 for discrete, 0.0 for
# continuous); each row is overwritten by the parser below.
discrete_shape = (num_timesteps, variable_structure.num_y_discrete_vars)
continuous_shape = (num_timesteps, variable_structure.num_y_continuous_vars)
y_discrete_t = np.ones(discrete_shape, dtype='int')
y_continuous_t = np.zeros(continuous_shape, dtype='float')

# Parse one timestep at a time, mimicking the real-time use case
for t_index, timestamp in enumerate(timestamps):
    pings_at_t = usable_data[usable_data['observed_at'] == timestamp]
    (y_discrete_t[t_index], y_continuous_t[t_index]) = (
        variable_structure.sensor_data_parse_one_timestep(pings_at_t))

Apply some basic sanity checks.

In [15]:
# Every timestamp we would expect between the first and last observation,
# at 10-second spacing. The step is given as a Timedelta rather than the
# '10S' offset alias, which recent pandas versions deprecate.
timestamp_range = pd.date_range(
    start=usable_data['observed_at'].min(),
    end=usable_data['observed_at'].max(),
    freq=pd.Timedelta(seconds=10))

In [16]:
# Expected timestamps that never occur in the data
np.setdiff1d(ar1=timestamp_range, ar2=timestamps)

array(['2017-11-28T14:40:10.000000000', '2017-11-28T14:40:20.000000000',
       '2017-11-28T14:40:30.000000000'], dtype='datetime64[ns]')

There were three time windows in which no ping data was received.

Check to make sure that the post-processed data and the pre-processed data agree on the total number of pings received.

In [17]:
# Number of discrete entries flagged as "ping received" (value 0)
(y_discrete_t == 0).sum()

106749

In [18]:
# Number of continuous entries holding a nonzero RSSI value
(y_continuous_t != 0.0).sum()

106749

In [19]:
# Number of ping rows in the pre-processed data, for comparison
usable_data.shape[0]

106749

## Save the ping data

In [20]:
# Subdirectory of the run directory where the NumPy archives are written
numpy_directory = "numpy/"

In [21]:
# Base name of the output archive, passed to np.savez without an extension
data_numpy_filename = "ping_data"

In [22]:
# Build the output location with os.path.join (as the JSON loading code does)
# and create the directory if it is missing, so that np.savez does not fail
# on a fresh checkout.
numpy_output_directory = os.path.join(data_path, data_run_directory, numpy_directory)
if not os.path.isdir(numpy_output_directory):
    os.makedirs(numpy_output_directory)

# Save the entity ID lists, the timestamps and the parsed Y arrays together
# in a single archive.
np.savez(
    os.path.join(numpy_output_directory, data_numpy_filename),
    child_entity_ids = child_entity_ids,
    material_entity_ids = material_entity_ids,
    teacher_entity_ids = teacher_entity_ids,
    area_entity_ids = area_entity_ids,
    timestamps = timestamps,
    num_timesteps = num_timesteps,
    y_discrete_t = y_discrete_t,
    y_continuous_t = y_continuous_t)

## Calculate and save the same data in matrix format

In [23]:
# One (num_sensors x num_sensors) matrix per timestep, pre-filled with the
# "no ping received" values (1 for discrete, 0.0 for continuous).
all_sensors_shape = (
    num_timesteps,
    variable_structure.num_sensors,
    variable_structure.num_sensors,
)
y_discrete_all_sensors_t = np.ones(all_sensors_shape, dtype='int')
y_continuous_all_sensors_t = np.zeros(all_sensors_shape, dtype='float')

# Fill in the matrices one timestep at a time
for t_index, timestamp in enumerate(timestamps):
    pings_at_t = usable_data[usable_data['observed_at'] == timestamp]
    (y_discrete_all_sensors_t[t_index], y_continuous_all_sensors_t[t_index]) = (
        variable_structure.sensor_data_all_sensors_one_timestep(pings_at_t))

In [24]:
# Base name of the archive that holds the matrix-format arrays
data_numpy_filename_all_sensors = "ping_data_all_sensors"

In [25]:
# Build the output location with os.path.join (as the JSON loading code does)
# and create the directory if it is missing, so that np.savez does not fail
# on a fresh checkout.
numpy_output_directory = os.path.join(data_path, data_run_directory, numpy_directory)
if not os.path.isdir(numpy_output_directory):
    os.makedirs(numpy_output_directory)

# Save the matrix-format arrays in their own archive
np.savez(
    os.path.join(numpy_output_directory, data_numpy_filename_all_sensors),
    y_discrete_all_sensors_t = y_discrete_all_sensors_t,
    y_continuous_all_sensors_t = y_continuous_all_sensors_t)