In order to run this notebook with the correct PYTHONPATH, you can use
```
$ ./e-mission-jupyter.bash notebook
```

This notebook makes heavy use of the following python libraries.
- `pandas`: from the python computing stack (http://pandas.pydata.org/pandas-docs/stable/)
- `geojson`: standard JSON representation of geographic data (http://geojson.org/)
- `folium`: (https://github.com/python-visualization/folium) python bridge to leaflet (http://leafletjs.com/)

### Pick a user to work with ###

In [1]:
import emission.core.get_database as edb
import pandas as pd
all_users = pd.DataFrame(list(edb.get_uuid_db().find({}, {"user_email":1, "uuid": 1, "_id": 0})))
all_users

storage not configured, falling back to sample, default configuration
Connecting to database URL localhost


Unnamed: 0,user_email,uuid
0,test_july_22,2b70a272-cdac-464f-859a-9ee5ff24bf41
1,work/2018/e-mission/alvin_2018-09-16.timeline,3708c5f2-47ff-4f8a-9333-b1851f50f4f2
2,test_sep_16,827869cc-6368-4215-8d5e-9b1b224e6d3a
3,alvin_sep_25,aefd0071-fa18-4300-9676-0c04fac7fff7
4,test_otp_insert,90d7d84f-0800-45d3-ac16-5fbc0ab91417
5,test_insert_fake_data,1bf3ad60-5a88-4049-b1c0-f734a0c2e261
6,test_insert_fake_data_2,97071fd7-9baf-4549-a13c-21125348b012
7,test_insert_fake_data_3,373ce1e6-7b4d-4fbb-8cfd-299518a63f76
8,test_insert_fake_data_4,2798bf67-b1df-42d8-b5d4-cc2f7d9f4913
9,test_insert_fake_data_5,5ef32d6d-0482-46fd-b5fb-6a56de0d7fde


In [3]:
from uuid import UUID

In [4]:
test_user_id = all_users.iloc[109].uuid # replace with UUID from above

If you want to work across multiple users, just do the same thing again

In [5]:
test_user_id_2 = all_users.iloc[0].uuid

In [6]:
print(test_user_id)

13f45da5-3daa-4e55-8002-dfa2015927ce


### Preferred access technique

The preferred technique to access wrapper objects from the timeseries is to use the abstract timeseries interface. This makes it easier for us to switch to alternative timeseries implementations later. The timeseries is conceptually a set of streams, one for each of the types, primarily indexed by time. So you can query for all entries of a particular time within a specified time range.

In [7]:
import emission.storage.timeseries.abstract_timeseries as esta
import emission.storage.decorations.analysis_timeseries_queries as esda
import emission.core.wrapper.entry as ecwe
import emission.storage.decorations.trip_queries as esdt
import emission.storage.timeseries.timequery as estt

In [8]:
print(test_user_id, test_user_id_2)
ts = esta.TimeSeries.get_time_series(test_user_id)
ts_2 = esta.TimeSeries.get_time_series(test_user_id_2)

13f45da5-3daa-4e55-8002-dfa2015927ce 2b70a272-cdac-464f-859a-9ee5ff24bf41


#### Accessing entries directly

In [9]:
entry_raw = ts.find_entries(["segmentation/raw_trip"], time_query=None)

## Raw trips

In [39]:
background_filter =ts.find_entries(["background/filtered_location"], time_query=None)

In [40]:
for loc in background_filter:
    print(loc)

{'_id': ObjectId('5bd930fe2e646853780011ad'), 'user_id': UUID('60ee331e-139c-458d-bae6-218e5090d10a'), 'metadata': {'key': 'background/filtered_location', 'platform': 'android', 'write_ts': 1540960504.506523, 'time_zone': 'America/Los_Angeles', 'write_local_dt': {'year': 2018, 'month': 10, 'day': 30, 'hour': 21, 'minute': 35, 'second': 4, 'weekday': 1, 'timezone': 'America/Los_Angeles'}, 'write_fmt_time': '2018-10-30T21:35:04.506523-07:00', 'type': 'sensor-data'}, 'data': {'ts': 1540985700.0, 'latitude': 37.77302, 'longitude': -122.4002, 'sensed_speed': 12.573820529373792, 'accuracy': 0, 'filter': 'distance', 'fmt_time': '2018-10-31T04:35:00-07:00', 'loc': {'type': 'Point', 'coordinates': [-122.4002, 37.77302]}, 'local_dt': {'year': 2018, 'month': 10, 'day': 31, 'hour': 4, 'minute': 35, 'second': 0, 'weekday': 2, 'timezone': 'America/Los_Angeles'}, 'altitude': 0, 'heading': 0}}
{'_id': ObjectId('5bd930fe2e646853780011af'), 'user_id': UUID('60ee331e-139c-458d-bae6-218e5090d10a'), 'metad

{'_id': ObjectId('5bd930ff2e64685378001527'), 'user_id': UUID('60ee331e-139c-458d-bae6-218e5090d10a'), 'metadata': {'key': 'background/filtered_location', 'platform': 'android', 'write_ts': 1540960505.588901, 'time_zone': 'America/Los_Angeles', 'write_local_dt': {'year': 2018, 'month': 10, 'day': 30, 'hour': 21, 'minute': 35, 'second': 5, 'weekday': 1, 'timezone': 'America/Los_Angeles'}, 'write_fmt_time': '2018-10-30T21:35:05.588901-07:00', 'type': 'sensor-data'}, 'data': {'ts': 1541025085.1468, 'latitude': 37.8227, 'longitude': -122.32234, 'sensed_speed': 12.11062582582582, 'accuracy': 0, 'filter': 'distance', 'fmt_time': '2018-10-31T15:31:25.146800-07:00', 'loc': {'type': 'Point', 'coordinates': [-122.32234, 37.8227]}, 'local_dt': {'year': 2018, 'month': 10, 'day': 31, 'hour': 15, 'minute': 31, 'second': 25, 'weekday': 2, 'timezone': 'America/Los_Angeles'}, 'altitude': 0, 'heading': 0}}
{'_id': ObjectId('5bd930ff2e64685378001529'), 'user_id': UUID('60ee331e-139c-458d-bae6-218e5090d10

In [9]:
for raw_trip in entry_raw:
    entry = ecwe.Entry(raw_trip)
    print(entry.data.end_ts)

1509713856.089506
1509754973.090973
1509795743.156265


In [10]:
# Get all cleaned trips for the first user
entry_it = ts.find_entries(["analysis/cleaned_trip"], time_query=None)

All keys and their mapping to data model objects can be found in 
https://github.com/e-mission/e-mission-server/blob/master/emission/core/wrapper/entry.py

In [11]:
for ct in entry_it:
    cte = ecwe.Entry(ct)
    print("=== Trip:", cte.data.start_loc, "->", cte.data.end_loc)
    section_it = esdt.get_sections_for_trip("analysis/cleaned_section", test_user_id, cte.get_id())
    for sec in section_it:
        print("  --- Section:", sec.data.start_loc, "->", sec.data.end_loc, " on ", sec.data.sensed_mode)

=== Trip: {"coordinates": [-122.4002, 37.77302], "type": "Point"} -> {"coordinates": [-122.14091, 37.42872], "type": "Point"}
  --- Section: {"coordinates": [-122.4002, 37.77302], "type": "Point"} -> {"coordinates": [-122.39459, 37.7766], "type": "Point"}  on  MotionTypes.BICYCLING
  --- Section: {"coordinates": [-122.39473, 37.77648], "type": "Point"} -> {"coordinates": [-122.15614, 37.43833], "type": "Point"}  on  MotionTypes.IN_VEHICLE
  --- Section: {"coordinates": [-122.15414, 37.43706], "type": "Point"} -> {"coordinates": [-122.14148, 37.42895], "type": "Point"}  on  MotionTypes.WALKING
  --- Section: {"coordinates": [-122.14148, 37.42895], "type": "Point"} -> {"coordinates": [-122.14136, 37.42906], "type": "Point"}  on  MotionTypes.BICYCLING
  --- Section: {"coordinates": [-122.1414, 37.42903], "type": "Point"} -> {"coordinates": [-122.14091, 37.42872], "type": "Point"}  on  MotionTypes.WALKING
=== Trip: {"coordinates": [-122.14091, 37.42872], "type": "Point"} -> {"coordinates":

In [12]:
# Get all cleaned trips for the second user
entry_it = ts_2.find_entries(["analysis/cleaned_trip"], time_query=None)

In [13]:
for ct in entry_it:
    cte = ecwe.Entry(ct)
    print("=== Trip:", cte.data.start_loc, "->", cte.data.end_loc)
    section_it = esdt.get_sections_for_trip("analysis/cleaned_section", test_user_id, cte.get_id())
    for sec in section_it:
        print("  --- Section:", sec.data.start_loc, "->", sec.data.end_loc, " on ", sec.data.sensed_mode)

=== Trip: {"coordinates": [-122.0876886, 37.3887767], "type": "Point"} -> {"coordinates": [-122.0820411, 37.3920436], "type": "Point"}
=== Trip: {"coordinates": [-122.0820411, 37.3920436], "type": "Point"} -> {"coordinates": [-122.0862974, 37.3908925], "type": "Point"}
=== Trip: {"coordinates": [-122.0862974, 37.3908925], "type": "Point"} -> {"coordinates": [-122.0801222, 37.3921316], "type": "Point"}
=== Trip: {"coordinates": [-122.0801222, 37.3921316], "type": "Point"} -> {"coordinates": [-122.1592342, 37.4438439], "type": "Point"}
=== Trip: {"coordinates": [-122.1592342, 37.4438439], "type": "Point"} -> {"coordinates": [-122.0823243, 37.3793178], "type": "Point"}
=== Trip: {"coordinates": [-122.0823243, 37.3793178], "type": "Point"} -> {"coordinates": [-122.0862601, 37.3909372], "type": "Point"}


In [17]:
# Get cleaned trips for the two users that started on 1st Aug UTC
import arrow

aug_1_tq = estt.TimeQuery("data.start_ts",
                          arrow.get("2017-08-01").timestamp, # start of range
                          arrow.get("2017-08-02").timestamp)  # end of range
entry_it = ts.find_entries(["analysis/cleaned_trip"], time_query=aug_1_tq)
entry_it_2 = ts_2.find_entries(["analysis/cleaned_trip"], time_query=aug_1_tq)
print("From %s -> %s, user %s had %d trips and user %s had %d trips" %
     (aug_1_tq.startTs, aug_1_tq.endTs, test_user_id, len(list(entry_it)), test_user_id_2, len(list(entry_it_2))))

From 1501545600 -> 1501632000, user aefd0071-fa18-4300-9676-0c04fac7fff7 had 0 trips and user 2b70a272-cdac-464f-859a-9ee5ff24bf41 had 0 trips


#### Accessing a dataframe

In [12]:
# Get all cleaned trips for the first user
ct_df = ts.get_data_df("analysis/cleaned_trip", time_query=None)

In [13]:
len(ct_df)

2

In [174]:
ct_df.columns

Index(['_id', 'distance', 'duration', 'end_fmt_time', 'end_loc',
       'end_local_dt_day', 'end_local_dt_hour', 'end_local_dt_minute',
       'end_local_dt_month', 'end_local_dt_second', 'end_local_dt_timezone',
       'end_local_dt_weekday', 'end_local_dt_year', 'end_place', 'end_ts',
       'metadata_write_ts', 'raw_trip', 'source', 'start_fmt_time',
       'start_loc', 'start_local_dt_day', 'start_local_dt_hour',
       'start_local_dt_minute', 'start_local_dt_month',
       'start_local_dt_second', 'start_local_dt_timezone',
       'start_local_dt_weekday', 'start_local_dt_year', 'start_place',
       'start_ts', 'user_id'],
      dtype='object')

In [137]:
ct_df[["start_loc", "end_loc", "start_fmt_time", "end_fmt_time"]]

Unnamed: 0,start_loc,end_loc,start_fmt_time,end_fmt_time
0,"{'type': 'Point', 'coordinates': [-122.4002, 3...","{'type': 'Point', 'coordinates': [-122.14091, ...",2018-10-07T15:04:02+00:00,2018-10-07T16:22:36.089506+00:00


In [None]:
# Get all cleaned trips for the second user
ct_df_2 = ts_2.get_data_df("analysis/cleaned_trip", time_query=None)
ct_df_2[["start_loc", "end_loc", "start_ts", "end_ts"]]

In [None]:
# Get cleaned trips for the two users that started on 1st Aug UTC
import arrow

aug_1_tq = estt.TimeQuery("data.start_ts",
                          arrow.get("2017-08-01").timestamp, # start of range
                          arrow.get("2017-08-02").timestamp)  # end of range
ct_df = ts.get_data_df("analysis/cleaned_trip", time_query=aug_1_tq)
ct_df_2 = ts_2.get_data_df("analysis/cleaned_trip", time_query=aug_1_tq)
print("From %s -> %s, user %s had %d trips and user %s had %d trips" %
     (aug_1_tq.startTs, aug_1_tq.endTs, test_user_id, len(ct_df), test_user_id_2, len(ct_df_2)))

## filtered location 

In [60]:
locations = ts.get_data_df("background/filtered_location", time_query=None)

In [61]:
locations.head()

Unnamed: 0,_id,accuracy,altitude,filter,fmt_time,latitude,loc,local_dt_day,local_dt_hour,local_dt_minute,local_dt_month,local_dt_second,local_dt_timezone,local_dt_weekday,local_dt_year,longitude,metadata_write_ts,sensed_speed,ts,user_id
0,5bcfa83a1945d60ad74aaa85,0,0,distance,2018-10-02 11:52:02+00:00,37.77302,"{'type': 'Point', 'coordinates': [-122.4002, 3...",2,11,52,10,2,UTC,1,2018,-122.4002,1540336000.0,4.569441,1538481000.0,2b70adfa-a14a-4f49-a5e7-36f23f80e6a4
1,5bcfa83a1945d60ad74aaa86,0,0,distance,2018-10-02 11:52:03+00:00,37.77306,"{'type': 'Point', 'coordinates': [-122.40013, ...",2,11,52,10,3,UTC,1,2018,-122.40013,1540336000.0,4.569441,1538481000.0,2b70adfa-a14a-4f49-a5e7-36f23f80e6a4
2,5bcfa83a1945d60ad74aaa87,0,0,distance,2018-10-02 11:52:19+00:00,37.77354,"{'type': 'Point', 'coordinates': [-122.39954, ...",2,11,52,10,19,UTC,1,2018,-122.39954,1540336000.0,4.569441,1538481000.0,2b70adfa-a14a-4f49-a5e7-36f23f80e6a4
3,5bcfa83a1945d60ad74aaa88,0,0,distance,2018-10-02 11:52:22+00:00,37.77362,"{'type': 'Point', 'coordinates': [-122.39944, ...",2,11,52,10,22,UTC,1,2018,-122.39944,1540336000.0,4.569441,1538481000.0,2b70adfa-a14a-4f49-a5e7-36f23f80e6a4
4,5bcfa83a1945d60ad74aaa89,0,0,distance,2018-10-02 11:52:23+00:00,37.77365,"{'type': 'Point', 'coordinates': [-122.3994, 3...",2,11,52,10,23,UTC,1,2018,-122.3994,1540336000.0,4.569441,1538481000.0,2b70adfa-a14a-4f49-a5e7-36f23f80e6a4


In [62]:
error_location_1 = locations[locations['longitude'] == -122.14091]
error_location_1

Unnamed: 0,_id,accuracy,altitude,filter,fmt_time,latitude,loc,local_dt_day,local_dt_hour,local_dt_minute,local_dt_month,local_dt_second,local_dt_timezone,local_dt_weekday,local_dt_year,longitude,metadata_write_ts,sensed_speed,ts,user_id
353,5bcfa83b1945d60ad74aabeb,0,0,distance,2018-10-02 12:56:44+00:00,37.42872,"{'type': 'Point', 'coordinates': [-122.14091, ...",2,12,56,10,44,UTC,1,2018,-122.14091,1540336000.0,0.905557,1538485000.0,2b70adfa-a14a-4f49-a5e7-36f23f80e6a4
354,5bcfa83b1945d60ad74aabec,0,0,distance,2018-10-02 13:13:24+00:00,37.42872,"{'type': 'Point', 'coordinates': [-122.14091, ...",2,13,13,10,24,UTC,1,2018,-122.14091,1540336000.0,0.0,1538486000.0,2b70adfa-a14a-4f49-a5e7-36f23f80e6a4


In [63]:
error_location_2 = locations[locations['longitude'] == -122.14132]
error_location_2

Unnamed: 0,_id,accuracy,altitude,filter,fmt_time,latitude,loc,local_dt_day,local_dt_hour,local_dt_minute,local_dt_month,local_dt_second,local_dt_timezone,local_dt_weekday,local_dt_year,longitude,metadata_write_ts,sensed_speed,ts,user_id
352,5bcfa83b1945d60ad74aabea,0,0,distance,2018-10-02 12:56:44+00:00,37.42898,"{'type': 'Point', 'coordinates': [-122.14132, ...",2,12,56,10,44,UTC,1,2018,-122.14132,1540336000.0,0.905557,1538485000.0,2b70adfa-a14a-4f49-a5e7-36f23f80e6a4


### Get the cleaned secions for the first user

In [13]:
cs_df = ts.get_data_df("analysis/cleaned_section", time_query=None)

## Raw trips

In [87]:
rt_df = ts.get_data_df('segmentation/raw_trip', time_query=None)

In [88]:
rt_df.columns

Index(['_id', 'distance', 'duration', 'end_fmt_time', 'end_loc',
       'end_local_dt_day', 'end_local_dt_hour', 'end_local_dt_minute',
       'end_local_dt_month', 'end_local_dt_second', 'end_local_dt_timezone',
       'end_local_dt_weekday', 'end_local_dt_year', 'end_place', 'end_ts',
       'metadata_write_ts', 'source', 'start_fmt_time', 'start_loc',
       'start_local_dt_day', 'start_local_dt_hour', 'start_local_dt_minute',
       'start_local_dt_month', 'start_local_dt_second',
       'start_local_dt_timezone', 'start_local_dt_weekday',
       'start_local_dt_year', 'start_place', 'start_ts', 'user_id'],
      dtype='object')

In [89]:
raw_trips = rt_df[["start_loc", "end_loc", "start_ts", "end_ts", 'metadata_write_ts']]

In [90]:
import arrow

In [91]:
raw_trips['start_time'] = raw_trips["start_ts"].apply(lambda x : arrow.get(x).format())
raw_trips['end_time'] = raw_trips["end_ts"].apply(lambda x : arrow.get(x).format())
raw_trips['end_coord'] = raw_trips["end_loc"].apply(lambda x : dict(x)['coordinates'])
raw_trips['start_coord'] = raw_trips["start_loc"].apply(lambda x : dict(x)['coordinates'])
raw_trips['meta_write_time'] =raw_trips["metadata_write_ts"].apply(lambda x : arrow.get(x).format())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

In [92]:
raw_trips[['start_time', 'end_time',"start_coord", "end_coord", 'meta_write_time']]

Unnamed: 0,start_time,end_time,start_coord,end_coord,meta_write_time
0,2017-11-03 08:46:00+00:00,2017-11-03 09:12:09+00:00,"[-122.4002, 37.77302]","[-122.27381, 37.87119]",2018-11-03 01:47:05+00:00
1,2017-11-03 19:28:16+00:00,2017-11-03 20:31:23+00:00,"[-122.27451, 37.87142]","[-122.14083, 37.42881]",2018-11-03 01:47:05+00:00
2,2017-11-04 06:48:15+00:00,2017-11-04 07:56:53+00:00,"[-122.14006, 37.42767]","[-122.27381, 37.87119]",2018-11-03 01:47:05+00:00


### Motion Activites

In [93]:
motion_df = ts.get_data_df('background/motion_activity', time_query=None)

In [94]:
motion_df.columns

Index(['_id', 'confidence', 'metadata_write_ts', 'type', 'user_id'], dtype='object')

In [95]:
motion_df['time'] = motion_df["ts"].apply(lambda x : arrow.get(x).format())

KeyError: 'ts'

In [69]:
motion_df[['time', 'type']]

Unnamed: 0,time,type
0,2018-11-03 01:34:16+00:00,0
1,2018-11-03 01:34:17+00:00,0


In [70]:
motion_df.head()

Unnamed: 0,_id,confidence,fmt_time,local_dt_day,local_dt_hour,local_dt_minute,local_dt_month,local_dt_second,local_dt_timezone,local_dt_weekday,local_dt_year,metadata_write_ts,ts,type,user_id,time
0,5bdcfb202e6468537800a88d,100.0,2018-11-02T18:34:16.612876-07:00,2,18,34,11,16,America/Los_Angeles,4,2018,1541209000.0,1541209000.0,0,3d675c9d-9991-44cb-88bf-6cf3f8f5f4e7,2018-11-03 01:34:16+00:00
1,5bdcfb202e6468537800aa73,100.0,2018-11-02T18:34:17.280346-07:00,2,18,34,11,17,America/Los_Angeles,4,2018,1541209000.0,1541209000.0,0,3d675c9d-9991-44cb-88bf-6cf3f8f5f4e7,2018-11-03 01:34:17+00:00


## sections

In [140]:
cs_df.columns

Index(['_id', 'distance', 'distances', 'duration', 'end_fmt_time', 'end_loc',
       'end_local_dt_day', 'end_local_dt_hour', 'end_local_dt_minute',
       'end_local_dt_month', 'end_local_dt_second', 'end_local_dt_timezone',
       'end_local_dt_weekday', 'end_local_dt_year', 'end_stop', 'end_ts',
       'metadata_write_ts', 'sensed_mode', 'source', 'speeds',
       'start_fmt_time', 'start_loc', 'start_local_dt_day',
       'start_local_dt_hour', 'start_local_dt_minute', 'start_local_dt_month',
       'start_local_dt_second', 'start_local_dt_timezone',
       'start_local_dt_weekday', 'start_local_dt_year', 'start_stop',
       'start_ts', 'trip_id', 'user_id'],
      dtype='object')

In [93]:
sections = cs_df[["start_loc", "end_loc", "start_ts", "end_ts", "sensed_mode"]]

In [94]:
import arrow

In [95]:
sections['start_time'] = sections["start_ts"].apply(lambda x : arrow.get(x).format())
sections['end_time'] = sections["end_ts"].apply(lambda x : arrow.get(x).format())
sections['end_coord'] = sections["end_loc"].apply(lambda x : dict(x)['coordinates'])
sections['start_coord'] = sections["start_loc"].apply(lambda x : dict(x)['coordinates'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

In [19]:
sections[['start_time', 'end_time',"start_coord", "end_coord"]]

Unnamed: 0,start_time,end_time,start_coord,end_coord
0,2018-10-07 18:04:02+00:00,2018-10-07 18:07:00+00:00,"[-122.4002, 37.77302]","[-122.39475, 37.77618]"
1,2018-10-07 18:07:00+00:00,2018-10-07 19:20:01+00:00,"[-122.39475, 37.77618]","[-122.14148, 37.42895]"
2,2018-10-07 19:20:01+00:00,2018-10-07 19:20:59+00:00,"[-122.14148, 37.42895]","[-122.14148, 37.42895]"
3,2018-10-07 19:20:59+00:00,2018-10-07 19:21:35+00:00,"[-122.14148, 37.42895]","[-122.1414, 37.42903]"
4,2018-10-07 19:21:35+00:00,2018-10-07 19:21:44+00:00,"[-122.1414, 37.42903]","[-122.14132, 37.42898]"


In [96]:
sections[['start_ts', 'end_ts',"start_coord", "end_coord"]]

Unnamed: 0,start_ts,end_ts,start_coord,end_coord
0,1539097000.0,1539097000.0,"[-122.4002, 37.77302]","[-122.39475, 37.77618]"
1,1539097000.0,1539099000.0,"[-122.39475, 37.77618]","[-122.16473, 37.44352]"
2,1539099000.0,1539099000.0,"[-122.16473, 37.44352]","[-122.16473, 37.44352]"
3,1539099000.0,1539099000.0,"[-122.16473, 37.44352]","[-122.16434, 37.44331]"
4,1539099000.0,1539099000.0,"[-122.16456, 37.44311]","[-122.16456, 37.44311]"
5,1539099000.0,1539100000.0,"[-122.16456, 37.44311]","[-122.14093, 37.42852]"
6,1539100000.0,1539100000.0,"[-122.14093, 37.42852]","[-122.14097, 37.4286]"
7,1539100000.0,1539100000.0,"[-122.14097, 37.4286]","[-122.1414, 37.42903]"
8,1539100000.0,1539100000.0,"[-122.1414, 37.42903]","[-122.14132, 37.42898]"


### Get the raw secions for user 1

In [20]:
raw_section_df = ts.get_data_df("segmentation/raw_section", time_query=None)

In [21]:
len(raw_section_df)

5

In [22]:
raw_section_df.columns

Index(['_id', 'duration', 'end_fmt_time', 'end_loc', 'end_local_dt_day',
       'end_local_dt_hour', 'end_local_dt_minute', 'end_local_dt_month',
       'end_local_dt_second', 'end_local_dt_timezone', 'end_local_dt_weekday',
       'end_local_dt_year', 'end_stop', 'end_ts', 'metadata_write_ts',
       'sensed_mode', 'source', 'start_fmt_time', 'start_loc',
       'start_local_dt_day', 'start_local_dt_hour', 'start_local_dt_minute',
       'start_local_dt_month', 'start_local_dt_second',
       'start_local_dt_timezone', 'start_local_dt_weekday',
       'start_local_dt_year', 'start_stop', 'start_ts', 'trip_id', 'user_id'],
      dtype='object')

### Filtered Location 

In [235]:
filtered_location_df = ts.get_data_df("background/filtered_location", time_query=None)
filtered_location_df.head()

Unnamed: 0,_id,accuracy,altitude,filter,fmt_time,latitude,loc,local_dt_day,local_dt_hour,local_dt_minute,local_dt_month,local_dt_second,local_dt_timezone,local_dt_weekday,local_dt_year,longitude,metadata_write_ts,sensed_speed,ts,user_id
0,5bc64b441945d6b35be53eec,0,0,distance,2018-10-09 13:42:02+00:00,37.77302,"{'type': 'Point', 'coordinates': [-122.4002, 3...",9,13,42,10,2,UTC,1,2018,-122.4002,1539722000.0,4.569441,1539093000.0,15798eec-61b5-492b-8f23-420a53faad6c
1,5bc64b441945d6b35be53eed,0,0,distance,2018-10-09 13:42:03+00:00,37.77306,"{'type': 'Point', 'coordinates': [-122.40013, ...",9,13,42,10,3,UTC,1,2018,-122.40013,1539722000.0,4.569441,1539093000.0,15798eec-61b5-492b-8f23-420a53faad6c
2,5bc64b441945d6b35be53eee,0,0,distance,2018-10-09 13:42:19+00:00,37.77354,"{'type': 'Point', 'coordinates': [-122.39954, ...",9,13,42,10,19,UTC,1,2018,-122.39954,1539722000.0,4.569441,1539093000.0,15798eec-61b5-492b-8f23-420a53faad6c
3,5bc64b441945d6b35be53eef,0,0,distance,2018-10-09 13:42:21+00:00,37.77362,"{'type': 'Point', 'coordinates': [-122.39944, ...",9,13,42,10,21,UTC,1,2018,-122.39944,1539722000.0,4.569441,1539093000.0,15798eec-61b5-492b-8f23-420a53faad6c
4,5bc64b441945d6b35be53ef0,0,0,distance,2018-10-09 13:42:22+00:00,37.77365,"{'type': 'Point', 'coordinates': [-122.3994, 3...",9,13,42,10,22,UTC,1,2018,-122.3994,1539722000.0,4.569441,1539093000.0,15798eec-61b5-492b-8f23-420a53faad6c


In [236]:
set(filtered_location_df['filter'])

{'distance'}

### Direct mongodb queries

You can also use direct mongodb queries during exploratory work. I do ask that you create a storage decorator (`emission/storage/decorations`) when you submit a pull request for ongoing use

In [None]:
import emission.core.get_database as edb

In [None]:
edb.get_timeseries_db().find_one()

In [None]:
edb.get_timeseries_db().distinct("metadata.key")

Note that in this case, you need to know whether to use the `timeseries` or the `analysis_timeseries` collection

In [None]:
edb.get_analysis_timeseries_db().distinct("metadata.key")

In [None]:
edb.get_analysis_timeseries_db().find({"user_id": test_user_id, "metadata.key": "analysis/cleaned_trip"}).count()

In particular, you can use this to access entries that are not in the timeseries

In [None]:
edb.get_uuid_db().distinct("uuid")

### Timeline

The trips and places maintain links to each other - e.g. `start_place`, `end_place`

In [65]:
ct_df[["start_place", "end_place"]]

Unnamed: 0,start_place,end_place
0,5bcb9a2a1945d65e294763b7,5bcb9a2b1945d65e294763b8


These are _primary key links_ to other entries in the database. It would be useful to have a doubly linked list representing this properly. The Timeline helps with that.

In [16]:
import emission.storage.decorations.timeline as esdl

In [17]:
trip_start_end_fuzz = 10 # seconds
ct_df = ts.get_data_df("analysis/cleaned_trip", time_query=None)
tl = esdl.get_cleaned_timeline(test_user_id, ct_df.iloc[0].start_ts - trip_start_end_fuzz, ct_df.iloc[-1].end_ts + trip_start_end_fuzz)

In [18]:
for e in tl:
    if 'enter_ts' in e.data:
        # Must be place-like
        print(e.metadata.key, e.data.enter_fmt_time, "->", e.data.exit_fmt_time)
    else:
        print(e.metadata.key, e.data.start_fmt_time, "->", e.data.end_fmt_time)

analysis/cleaned_trip 2018-10-06T18:04:02+00:00 -> 2018-10-06T19:21:44.927082+00:00
analysis/cleaned_place 2018-10-06 19:21:44+00:00 -> None


In [19]:
# The timeline is an iterator, so after it is consumed, it is empty
for e in tl:
    if 'enter_ts' in e.data:
        # Must be place-like
        print(e.metadata.key, e.data.enter_fmt_time, "->", e.data.exit_fmt_time)
    else:
        print(e.metadata.key, e.data.start_fmt_time, "->", e.data.end_fmt_time)

In [20]:
stl = esdt.get_cleaned_timeline_for_trip(test_user_id, tl.first_trip().get_id())

In [21]:
for e in stl:
    print(e.metadata.key)

analysis/cleaned_section
analysis/cleaned_stop
analysis/cleaned_section
analysis/cleaned_stop
analysis/cleaned_section
analysis/cleaned_stop
analysis/cleaned_section
analysis/cleaned_stop
analysis/cleaned_section


In [244]:
stl = esdt.get_cleaned_timeline_for_trip(test_user_id, tl.last_trip().get_id())

In [245]:
for e in stl:
    print(e.metadata.key)

analysis/cleaned_section
analysis/cleaned_stop
analysis/cleaned_section
analysis/cleaned_stop
analysis/cleaned_section
analysis/cleaned_stop
analysis/cleaned_section
analysis/cleaned_stop
analysis/cleaned_section


## Getting trip and section details ##

Once we have trip and section objects, we can retrieve the sensed data associated with them by querying for data in various streams that falls within the time ranges associated with the trip/section. Here again, our architecture of storing the analysis results as a separate datastream makes it easy to retrieve data at various levels of processing.

### Plot a processed trip or set of trips ###

In [14]:
import emission.analysis.plotting.geojson.geojson_feature_converter as gfc
import emission.analysis.plotting.leaflet_osm.our_plotter as lo

In [15]:
first_trip_for_user = ct_df.iloc[0]
first_trip_start_ts = first_trip_for_user.start_ts
first_trip_end_ts = first_trip_for_user.end_ts
trip_start_end_fuzz = 10 # seconds
trips_geojson_list = gfc.get_geojson_for_ts(test_user_id, first_trip_start_ts-trip_start_end_fuzz, ct_df.iloc[-1].end_ts+trip_start_end_fuzz)

Entry({'_id': ObjectId('5bdd0c821945d6bdff15e239'), 'user_id': UUID('13f45da5-3daa-4e55-8002-dfa2015927ce'), 'metadata': {'key': 'analysis/cleaned_place', 'platform': 'server', 'write_ts': 1541213314.739854, 'time_zone': 'America/Los_Angeles', 'write_local_dt': {'year': 2018, 'month': 11, 'day': 2, 'hour': 19, 'minute': 48, 'second': 34, 'weekday': 4, 'timezone': 'America/Los_Angeles'}, 'write_fmt_time': '2018-11-02T19:48:34.739854-07:00'}, 'data': {'source': 'DwellSegmentationDistFilter', 'location': {'type': 'Point', 'coordinates': [-122.4002, 37.77302]}, 'raw_places': [ObjectId('5bdd0c7e1945d6bdff15e0f3'), ObjectId('5bdd0c7e1945d6bdff15e0f3')], 'display_name': 'Townsend Street, San Francisco', 'starting_trip': ObjectId('5bdd0c801945d6bdff15e11a'), 'exit_ts': 1509709922.0, 'exit_fmt_time': '2017-11-03T04:52:02-07:00', 'exit_local_dt': {'year': 2017, 'month': 11, 'day': 3, 'hour': 4, 'minute': 52, 'second': 2, 'weekday': 4, 'timezone': 'America/Los_Angeles'}}})
section start time 2017

In [16]:
len(trips_geojson_list)

2

In [17]:
map_list = lo.get_maps_for_geojson_trip_list(trips_geojson_list)

start_place 5bdd0c821945d6bdff15e239
end_place 5bdd0c821945d6bdff15e23a
stop 5bdd0c811945d6bdff15e1ab
stop 5bdd0c811945d6bdff15e1ac
stop 5bdd0c811945d6bdff15e1ad
stop 5bdd0c811945d6bdff15e1ae
section 5bdd0c801945d6bdff15e11c
section 5bdd0c811945d6bdff15e124
section 5bdd0c811945d6bdff15e19e
section 5bdd0c811945d6bdff15e1a2
section 5bdd0c811945d6bdff15e1a6
start_place 5bdd0c821945d6bdff15e23a
end_place 5bdd0c831945d6bdff15e23b
stop 5bdd0c821945d6bdff15e236
stop 5bdd0c821945d6bdff15e237
stop 5bdd0c821945d6bdff15e238
section 5bdd0c811945d6bdff15e1b1
section 5bdd0c811945d6bdff15e1ca
section 5bdd0c821945d6bdff15e1cd
section 5bdd0c821945d6bdff15e22d


In [18]:
len(map_list)

2

In [19]:
map_list[0]

In [20]:
map_list[-1]

In [21]:
import branca.element as bre

In [22]:
nrows = 2
ncols = 3
fig = bre.Figure()
for i, m in enumerate(map_list[:6]):
    fig.add_subplot(nrows,ncols,i+1).add_child(m)
fig

In [184]:
nrows = 2
ncols = 3
fig = bre.Figure()
for i, map in enumerate(map_list[-6:]):
    fig.add_subplot(nrows,ncols,i+1).add_child(map)
fig

## Can you do better? ##


### Get locations with no processing, basic filtering and resampling for the first trip ###

In [None]:
all_locs = ts.get_data_df("background/location",
                               time_query = esda.get_time_query_for_trip_like(
                                   "analysis/cleaned_section", ct_df.iloc[0]._id))

filtered_locs = ts.get_data_df("background/filtered_location",
                               time_query = esda.get_time_query_for_trip_like(
                                   "analysis/cleaned_section", ct_df.iloc[0]._id))

resampled_locs = ts.get_data_df("analysis/recreated_location",
                                 time_query = esda.get_time_query_for_trip_like(
                                   "analysis/cleaned_section", ct_df.iloc[0]._id))
print("Locations go from all = %d -> filtered = %d -> resampled = %d" % (len(all_locs),
                                                                         len(filtered_locs),
                                                                         len(resampled_locs)))

In [None]:
all_locs[["_id", "latitude", "longitude", "fmt_time"]]

### Get the raw motion activity, in case you want to do different segmentation ###

In [None]:
all_activity = ts.get_data_df("background/motion_activity",
                               esda.get_time_query_for_trip_like(
                                   "analysis/cleaned_section", ct_df.iloc[0]._id))

In [None]:
all_activity.columns

In [None]:
import emission.core.wrapper.motionactivity as ecwm

print("Found %d walking entries, %d on_foot entries" % (len(all_activity[all_activity.type == ecwm.MotionTypes.WALKING.value]), 
                                    len(all_activity[all_activity.type == ecwm.MotionTypes.ON_FOOT.value])))
print("Found %d motorized entries" % (len(all_activity[all_activity.type == ecwm.MotionTypes.IN_VEHICLE.value])))

### Plot the location points ###

In [None]:
map_list = lo.get_maps_for_geojson_unsectioned([gfc.get_feature_list_from_df(all_locs),
                                                gfc.get_feature_list_from_df(filtered_locs),
                                                gfc.get_feature_list_from_df(resampled_locs)])

In [None]:
fig = bre.Figure()
for i, map in enumerate(map_list):
    fig.add_subplot(1,3,i+1).add_child(map)
fig