In [2]:
#import requests
import geopandas as gpd
import pandas as pd

#response = requests.get('https://raw.githubusercontent.com/door2door-io/gis-code-challenge/master/data/activity_points.geojson')
#import fiona; help(fiona.open)
# Load the activity points and index the frame by `id`, keeping `id` available
# as a regular column as well (drop=False).
data = gpd.read_file('./d2d/static/data/activity_points.geojson').set_index('id', drop=False)

# Parse the two textual time columns into datetime columns prefixed with `dt_`.
for source_column in ('timestamp', 'created_at'):
    data['dt_' + source_column] = pd.to_datetime(data[source_column])

# Quick overview of what was loaded: row count, coordinate reference system, column types.
print('total data points:', len(data))
print('CRS:', data.crs)
print(data.dtypes)


total data points: 380
CRS: {'init': 'epsg:4326'}
accuracy                                          float64
altitude                                          float64
bearing                                             int64
created_at                                         object
current_dominating_activity                        object
current_dominating_activity_confidence              int64
feature                                            object
geometry                                           object
id                                                  int64
previous_dominating_activity                       object
previous_dominating_activity_confidence             int64
route                                              object
speed                                               int64
timestamp                                          object
dt_timestamp                               datetime64[ns]
dt_created_at                              datetime64[ns]
dtype: object


In [2]:
## Looking at date based properties.
timestamps = data['dt_timestamp']
created = data['dt_created_at']
print('Min timestamp:', timestamps.min(), 'Max timestamp:', timestamps.max())
print('Min created_at:', created.min(), 'Max created_at:', created.max())

# created_at looks earlier than timestamp; count the rows where it is not.
# Eleven of the 380 points have created_at later than timestamp. These are
# ignored, and created_at is assumed to be the time of the coordinate.
print('created_at larger than timestamp:', int((created > timestamps).sum()))

# Are points with similar times near each other?
# Every point gets a time window (a pandas Period) built from its created_at
# shifted one minute into the future, with a frequency of 30 minutes ('30T').
# NOTE(review): the original note spoke of a window of up to 1 hour, while the
# code uses 30 minutes -- confirm which one is intended.
shifted_created_at = pd.DatetimeIndex(created).shift(1, freq='min')
data['dt_period'] = shifted_created_at.to_period(freq='30T')
# TODO add the frequency to the string representation.
data['period'] = data['dt_period'].astype(str)

# For every point, count the other points created inside its window whose
# geometry is almost equal to its own. Points without such neighbours keep 0.
near_points_series = pd.Series(0, index=data.index)
for row in data.itertuples():
    window = row.dt_period
    in_window = data[created.between(window.start_time, window.end_time)]
    if in_window.empty:
        continue
    # Geometries are compared to 2 decimal places of the coordinates.
    # NOTE(review): the original note claimed "about 300m"; the tolerance passed
    # is decimal=2 on the raw coordinates -- confirm the intended distance.
    near_points = in_window[in_window.geom_almost_equals(row.geometry, 2)]
    if near_points.empty:
        continue
    near_points_series[row.id] = len(near_points)

data['near_points_count'] = near_points_series
print('Added points near other points in space and time.')

Min timestamp: 2015-11-11 09:03:01 Max timestamp: 2015-12-14 22:57:35
Min created_at: 2015-11-11 06:03:12 Max created_at: 2015-12-14 19:57:38
created_at larger than timestamp: 11
Added points near other points in space and time.
380


In [3]:
# Preview the first rows, including the period and near-point columns added above.
data.head()

Unnamed: 0_level_0,accuracy,altitude,bearing,created_at,current_dominating_activity,current_dominating_activity_confidence,feature,geometry,id,previous_dominating_activity,previous_dominating_activity_confidence,route,speed,timestamp,dt_timestamp,dt_created_at,dt_period,period,near_points_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
4,23.0,0.0,0,2015-11-11T06:03:12,in_vehicle,77,passive_tracking,POINT (39.2796017 -6.8224933),4,still,60,,0,2015-11-11T09:03:01,2015-11-11 09:03:01,2015-11-11 06:03:12,2015-11-11 06:04,2015-11-11 06:04,0
9,23.0,0.0,0,2015-11-11T06:03:13,in_vehicle,77,passive_tracking,POINT (39.2796017 -6.8224933),9,still,60,,0,2015-11-11T09:03:01,2015-11-11 09:03:01,2015-11-11 06:03:13,2015-11-11 06:04,2015-11-11 06:04,0
10,15.2,121.0,261,2015-11-12T03:11:34,in_vehicle,56,passive_tracking,POINT (39.1382333 -6.7957683),10,on_foot,100,,261,2015-11-12T06:02:57,2015-11-12 06:02:57,2015-11-12 03:11:34,2015-11-12 03:12,2015-11-12 03:12,0
11,4.9,140.3,147,2015-11-12T03:11:34,,0,passive_tracking,POINT (39.1392216 -6.7911933),11,in_vehicle,77,,147,2015-11-12T06:04:31,2015-11-12 06:04:31,2015-11-12 03:11:34,2015-11-12 03:12,2015-11-12 03:12,0
13,14.0,137.6,101,2015-11-12T03:11:34,,0,passive_tracking,POINT (39.1462299 -6.7913066),13,in_vehicle,46,,101,2015-11-12T06:08:52,2015-11-12 06:08:52,2015-11-12 03:11:34,2015-11-12 03:12,2015-11-12 03:12,0


In [5]:
# Looking at duplicate data.
# `duplicated()` flags every repeat after the first occurrence, so summing the
# boolean mask gives the number of duplicate rows without building a filtered frame.
print('Duplicate timestamps:', int(data.dt_timestamp.duplicated().sum()))
print('Duplicate created_at:', int(data.dt_created_at.duplicated().sum()))

# Compare locations by the complete text form of each geometry. The previous
# fixed-width 'S32' conversion cut that text off after 32 bytes, so distinct
# geometries sharing a 32-byte prefix were reported as duplicates.
geometry_text = pd.Series([str(geom) for geom in data.geometry], index=data.index)
print('Duplicate locations:', int(geometry_text.duplicated().sum()))

380
Duplicate timestamps: 12
380
Duplicate created_at: 45
380
Duplicate locations: 49
380


In [6]:
#print(data['current_dominating_activity'].value_counts(dropna=False))
#print(data['previous_dominating_activity'].value_counts(dropna=False))
# Comparing previous and current dominating activity.
#data[data.current_dominating_activity.isin(['still']) &
#     ~data.previous_dominating_activity.isin(['still', None])]

In [18]:
# Load the routes and report their schema and count.
route_data = gpd.read_file('./d2d/static/data/routes.geojson')
print(route_data.dtypes)
print('Total routes:', len(route_data))

# Keep, for every activity point, the smallest distance to any route seen so far.
# NOTE(review): distances come out in the units of the layers' CRS; the activity
# points reported EPSG:4326 when loaded, so these are presumably degrees rather
# than metres -- confirm before using them as thresholds.
result = None
for route in route_data.itertuples():
    route_distance = data.distance(route.geometry)
    if result is None:
        # The first route seeds the running minimum.
        result = route_distance
        continue
    # Take the new distance only where it is strictly smaller than the current one.
    result = result.where(~(route_distance < result), route_distance)
print(result.head())
data['distance_to_route'] = result

geometry    object
route_id     int64
dtype: object
Total routes: 320
id
4     0.000691
9     0.000691
10    0.004841
11    0.000339
13    0.000196
dtype: float64


In [19]:
## Save file.
import os

file_name = 'analysed_activity_points.geojson'

# Leave the datetime/period helper columns out of the export; names that are
# not present are skipped silently (errors='ignore').
helper_columns = ['dt_timestamp', 'dt_created_at', 'dt_period']
save_data = data.drop(helper_columns, axis=1, errors='ignore')

# Delete the output of a previous run, if there is one, before writing.
if os.path.lexists(file_name):
    os.remove(file_name)

# Save data.
save_data.to_file(file_name, driver='GeoJSON')