In [45]:
import pandas as pd
import numpy as np
import os, sys
import glob, re, time

from collections import Counter

In [4]:
# Display the kernel's current working directory.
os.getcwd()

'/Volumes/LaCie/Documents/repos/particle_filter'

In [None]:
"""
Applications of particle filters: 
1. Car positioning by map matching, as in http://www.diva-portal.org/smash/get/diva2:316556/FULLTEXT01.pdf
    Essentially same approach is described in Davidson, Collin, and Takala (2011). Application of particle filters to map-matching algorithm
    Idea also similar to Newson and Krumm (2009), though HMM is used there.
    More recent resource: Murphy, Pao, Yuen (2019). Lyft; Map matching when the map is wrong: Efficient on/off road vehicle tracking and map learning

Ideas
Rao-Blackwellization (use Kalman filter for the linear part of the dynamics model)

Initial Approach:
Use Newson and Krumm, but modify it to use particle filters instead of HMM.

So, for a given route, proceed sequentially over obs, maintaining dist of probable road segments

Assumptions (N&K):
-remove obs that are less than 2*sigma (measurement-noise std. dev.) away from the previous obs  (eliminates 39% of data in N&K)
-ignore roads more than 200m from obs
-zero out very unlikely particles
"""

In [None]:
class gps_transition_model:
    """
    Transition model: probability of the next state given the current state,
    and (eventually) simulation of that next state.

    Planned approach, following N&K: use as a proxy an exponential function of
    the difference between the great-circle distance separating consecutive
    observations and the route distance separating the corresponding road
    points. Road points come from map / ground-truth data.

    Planned pruning rules:
    - Ignore roads more than 200 m from an observation.
    - Zero out any candidate route that would require the vehicle to exceed
      50 m/s (112 miles per hour).

    Other ideas: dead reckoning.

    Currently a skeleton: no state is stored and nothing is computed.
    """

    def __init__(self):
        """Create an empty model; there is no configuration yet."""

    def estimate_sigma(self, data):
        """
        Placeholder for estimating sigma with the rescaled median absolute
        deviation (MAD). Ideally this would use ground-truth data (none is
        available here); N&K report 4.06 m.

        NOTE(review): in N&K sigma appears to parameterize the GPS measurement
        noise -- confirm it belongs on the transition model rather than on the
        sensor model.

        `data` is accepted but not used yet. Returns None.
        """
        return None

In [None]:
class gps_sensor_model:
    """
    Sensor model: likelihood of an observation given a particle.

    There is no ground truth here, so maximum likelihood is not easy to apply.
    Planned approach, following N&K: use as a proxy a normal distribution over
    the great-circle distance between the observation and the road, with sigma
    estimated from the data. Road points come from map data.

    Planned pruning rule:
    - Zero out low-probability particles (difference in route distance of
      2000 m or more).

    Other ideas: semi-supervised learning.

    Currently a skeleton: no state is stored and nothing is computed.
    """

    def __init__(self):
        """Create an empty model; there is no configuration yet."""

    def estimate_beta(self, data):
        """
        Placeholder for the median-based estimator of Gather and Schultze.
        Ideally this would use ground-truth data (none is available here).

        NOTE(review): in N&K beta appears to parameterize the transition
        probabilities -- confirm it belongs on the sensor model.

        `data` is accepted but not used yet. Returns None.
        """
        return None

In [2]:
class particle_filter:
    """
    Fit a particle filter to data given a transition and sensor model.

    Parameters
    ----------
    n : int
        Number of particles to maintain at each step in time.
    sensor_model : object, optional
        Model object used to predict the likelihood of an observation.
    transition_model : object, optional
        Model object used to predict the next state.
    max_iter : int
        Maximum iterations used in fitting the particle filter.
    conv_tol : float
        Convergence tolerance that will trigger early termination of fitting.

    Attributes
    ----------
    particles : numpy.ndarray
        Current particle set; starts out empty.
    """
    def __init__(self, n=50, sensor_model=None, transition_model=None, max_iter=100, conv_tol=0.001):
        self.n = n
        self.particles = np.array([])
        self.sensor_model = sensor_model
        self.transition_model = transition_model
        # Stored so the fitting loop can consult them; TODO: neither is used yet.
        self.max_iter = max_iter
        self.conv_tol = conv_tol

    def estimate_weights_per_particle(self, obs):
        """
        Estimate likelihood of particle given evidence, P(evidence|particle).
        Uses sensor model.

        Obs is single observation at time t, numpy array.

        Not implemented yet: returns None.
        """
        pass

    def apply_transition_model(self):
        """
        If some particles are very unlikely, cull them and resample from more likely particles.

        Not implemented yet: returns None.
        """
        pass

    def update_dist(self, obs):
        """
        Re-sample particles from transition model given re-estimated likelihood of existing particles.
        Weighted sample with replacement.

        Note: if find no solutions, need to remove points in a signal break until HMM 'heals.'
        If break > 180 sec, separate into two trips.

        Obs is single observation at time t. Returns None.
        """
        # TODO: both results are currently discarded; the weighted resampling
        # that should combine them into self.particles is not written yet.
        weights = self.estimate_weights_per_particle(obs)
        new_particles = self.apply_transition_model()

    def fit(self, data):
        """
        Run the filter over a sequence of observations, in order.

        data is an iterable of observations; each element is passed to
        update_dist as the observation for one time step.

        Returns self so calls can be chained.

        TODO: max_iter and conv_tol are not consulted yet.
        """
        for obs in data:
            self.update_dist(obs)
        return self

'0.24.2'

In [15]:
## it seems i only have beijing data (10K) or sf taxi data (500)
# Read in taxi data
# Each data of different length; ideal use case for pyspark
# Note, the OSM extract basemap data has POI info as well (https://download.bbbike.org)
# also try uber h3 spatial index

# (trace directory, basemap shapefile directory) for each available dataset.
# TODO: these are absolute paths on one machine; make them configurable.
DATASETS = {
    'ms_taxi': ('/Volumes/LaCie/datasets/ms_taxi/taxi_log_2008_by_id/',     # MS Taxi
                '/Volumes/LaCie/datasets/Beijing-shp/shape/'),
    'cabspotting': ('/Volumes/LaCie/datasets/cabspottingdata/',             # CRAWDAD cabspotting
                    '/Volumes/LaCie/datasets/SanFrancisco-shp/shape/'),
}

# Pick the dataset explicitly instead of relying on a later assignment
# silently overwriting an earlier one.
DATASET = 'cabspotting'
trace_dir, basemap_dir = DATASETS[DATASET]

In [28]:
%%time
# Bare file names (no directory) in trace_dir that match new_*.txt.
# fullmatch rejects names with trailing junk (e.g. "new_x.txt.bak") that a
# start-anchored match would accept; sorted() makes the order reproducible,
# since os.listdir returns entries in arbitrary order.
all_files = sorted(f for f in os.listdir(trace_dir) if re.fullmatch(r'new_.*\.txt', f))  # glob.glob(trace_dir + "new_*.txt")

CPU times: user 2.07 ms, sys: 906 µs, total: 2.97 ms
Wall time: 2.31 ms


In [51]:
%%time
# 20.9s cabspottingdata; can try spark or dask? or multiprocessing, joblib
if not all_files:
    raise ValueError("No trace files found in %r" % (trace_dir,))

# all_files holds bare names from os.listdir, so each one must be joined to
# trace_dir; otherwise read_csv resolves it against the current working
# directory. (os.path.join leaves an already-absolute path unchanged.)
trace_list = [
    pd.read_csv(os.path.join(trace_dir, file_), sep=" ", index_col=None, header=None,
                names=['lat', 'long', 'occupancy', 'time'])
    for file_ in all_files
]

# concatenate all dfs into one
trace_df = pd.concat(trace_list, ignore_index=True)

CPU times: user 8.28 s, sys: 1.87 s, total: 10.2 s
Wall time: 14.3 s


In [None]:
# Convert UNIX epoch seconds to timestamps. Assign the column directly:
# writing through .loc[:, ["time"]] sets values into the existing column and,
# depending on the pandas version, may not switch it to a datetime dtype.
trace_df["time"] = pd.to_datetime(trace_df["time"], origin="unix", unit='s')

In [59]:
"""
cabspotting:

latitude and longitude are in decimal degrees, 
occupancy shows if a cab has a fare (1 = occupied, 0 = free) and 
time is in UNIX epoch format
"""
# Display the dtype of each column of the combined trace frame.
trace_df.dtypes

lat                 float64
long                float64
occupancy             int64
time         datetime64[ns]
dtype: object

In [60]:
# Preview the first rows of the combined trace frame.
trace_df.head()

Unnamed: 0,lat,long,occupancy,time
0,37.75134,-122.39488,0,2008-06-10 07:58:07
1,37.75136,-122.39527,0,2008-06-10 07:57:39
2,37.75199,-122.3946,0,2008-06-10 07:55:40
3,37.7508,-122.39346,0,2008-06-10 07:54:49
4,37.75015,-122.39256,0,2008-06-10 07:50:37


In [None]:
# Load in ...shapefiles? GeoJSON? Which format is best? For parallelization may be one thing...

In [None]:
def preprocess_traces():
    """
    Placeholder for trace clean-up before filtering.

    Planned step: drop observations that are less than 2 * sigma (measurement
    noise) away from the previous observation; N&K report this eliminates 39%
    of their data.

    Takes no arguments and does nothing yet. Returns None.
    """
    return None