In [1]:
import pandas as pd
import seaborn as sns
import numpy as np

Let's read in an example GPS CSV file. 

In [2]:
# Path of the example INS/GPS log (relative to the notebook's working directory).
file = 'ins_example.csv'

# The first line of the file is the header row. low_memory=False asks pandas to
# parse the file in a single pass rather than in internal chunks.
df = pd.read_csv(file, low_memory=False, header=0)

# Keep the raw row count so a later cell can report how many rows were removed.
nrows = len(df.index)
print("Number of rows:", nrows)
df.head()


Number of rows: 146134


Unnamed: 0,timestamp,latitude,longitude,altitude,northing,easting,down,velocity_north,velocity_east,velocity_down
0,1433499013991745,51.760579,-1.261311,114.112079,5735840.0,619993.34668,-114.112079,1.886407,-0.852159,0.092644
1,1433499014011745,51.76058,-1.261312,114.110195,5735840.0,619993.328696,-114.110195,1.900565,-0.857167,0.095289
2,1433499014031745,51.76058,-1.261312,114.108287,5735840.0,619993.310598,-114.108287,1.913448,-0.862771,0.095473
3,1433499014051745,51.76058,-1.261312,114.106377,5735840.0,619993.292381,-114.106377,1.927292,-0.868113,0.095935
4,1433499014071746,51.760581,-1.261312,114.104466,5735840.0,619993.274054,-114.104466,1.940624,-0.873413,0.094671


The timestamp is in microseconds (consecutive samples are about 20,000 apart, i.e. roughly 20 ms). 
Latitude and longitude should fall within the range of the vehicle's journey: 
51.79 > Latitude > 51.72 
-1.24 > Longitude > -1.27
Let's remove the rows whose latitude or longitude falls outside these ranges.

In [3]:
# Plausible coordinate bounds for the vehicle's journey (exclusive on both ends).
LAT_MIN, LAT_MAX = 51.72, 51.79
LON_MIN, LON_MAX = -1.27, -1.24

# Vectorised range checks. NaN compares False, so missing coordinates fail too.
lat_ok = (df['latitude'] > LAT_MIN) & (df['latitude'] < LAT_MAX)
lon_ok = (df['longitude'] > LON_MIN) & (df['longitude'] < LON_MAX)

# Blank out-of-range coordinates, then drop every row that has a missing value
# in any column (so rows that were already incomplete are removed as well).
df['latitude'] = df['latitude'].where(lat_ok)
df['longitude'] = df['longitude'].where(lon_ok)
df.dropna(inplace=True)
print ("Number of rows", df.shape[0])

Number of rows 42581


Timestamp should be unique and monotonic. Let's check that. 

In [4]:
# Select the timestamp column by name so the check does not depend on column order.
timestamp = df['timestamp']
print ("Timestamp is unique:", timestamp.is_unique)
# `Series.is_monotonic` was deprecated and later removed from pandas;
# `is_monotonic_increasing` is the equivalent (non-strict) check.
print("Timestamp is monotonic:", timestamp.is_monotonic_increasing)

Timestamp is unique: True
Timestamp is monotonic: True


Great, so we can now use the timestamp to calculate the time derivative of our features.
If there is a large change in our values this will show up as a large change in the time derivative
and will be viewed as an anomalous GPS value to be removed. 

The diff function gives us the difference between rows of a column:

In [5]:
# Elapsed time between each sample and the previous one. The first row has no
# predecessor, so its dt is NaN.
df['dt'] = df['timestamp'].diff(periods=1)
df.loc[:, ['timestamp', 'dt']].head()

Unnamed: 0,timestamp,dt
0,1433499013991745,
1,1433499014011745,20000.0
2,1433499014031745,20000.0
3,1433499014051745,20000.0
4,1433499014071746,20001.0


In [6]:
# (new column, source column) pairs. The order matters: it fixes the position
# of each derivative column in the frame.
rate_columns = [
    ('dlat', 'latitude'),
    ('dlon', 'longitude'),
    ('dalt', 'altitude'),
    ('dnor', 'northing'),
    ('deas', 'easting'),
    ('ddow', 'down'),
    ('dvelN', 'velocity_north'),
    ('dvelE', 'velocity_east'),
    ('dvelD', 'velocity_down'),
]
for rate_name, source_name in rate_columns:
    # Finite-difference rate of change: row-to-row change divided by elapsed time.
    df[rate_name] = df[source_name].diff() / df['dt']

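Now that we have the time derivatives, let's remove rows that contain extreme values. 
How do we define extreme values? 
99.7% of normally distributed data should fall within three standard deviations of the mean, and about 95% within two. The function below uses the stricter two-standard-deviation band: any derivative further than that from its column mean is treated as extreme. 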

In [7]:
# Preview the first five rows, now including dt and the derivative columns.
df.head(n=5)

Unnamed: 0,timestamp,latitude,longitude,altitude,northing,easting,down,velocity_north,velocity_east,velocity_down,dt,dlat,dlon,dalt,dnor,deas,ddow,dvelN,dvelE,dvelD
0,1433499013991745,51.760579,-1.261311,114.112079,5735840.0,619993.34668,-114.112079,1.886407,-0.852159,0.092644,,,,,,,,,,
1,1433499014011745,51.76058,-1.261312,114.110195,5735840.0,619993.328696,-114.110195,1.900565,-0.857167,0.095289,20000.0,5e-11,-5e-11,-9.42e-08,2e-06,-8.992e-07,9.42e-08,7.079e-07,-2.504e-07,1.3225e-07
2,1433499014031745,51.76058,-1.261312,114.108287,5735840.0,619993.310598,-114.108287,1.913448,-0.862771,0.095473,20000.0,0.0,0.0,-9.54e-08,2e-06,-9.049e-07,9.54e-08,6.4415e-07,-2.802e-07,9.2e-09
3,1433499014051745,51.76058,-1.261312,114.106377,5735840.0,619993.292381,-114.106377,1.927292,-0.868113,0.095935,20000.0,0.0,0.0,-9.55e-08,2e-06,-9.1085e-07,9.55e-08,6.922e-07,-2.671e-07,2.31e-08
4,1433499014071746,51.760581,-1.261312,114.104466,5735840.0,619993.274054,-114.104466,1.940624,-0.873413,0.094671,20001.0,4.99975e-11,0.0,-9.554522e-08,2e-06,-9.163042e-07,9.554522e-08,6.665667e-07,-2.649868e-07,-6.319684e-08


In [8]:
def get_extremes(df, columns=None, n_sigma=2.0):
    """Blank out values that lie too far from their column mean.

    For each selected column, every value that is not strictly within
    ``n_sigma`` sample standard deviations of that column's mean is replaced
    by NaN. Values that are already NaN stay NaN. No rows are removed here;
    call ``dropna`` on the result to discard the affected rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is modified in place.
    columns : iterable of int, optional
        Positional indices of the columns to filter. Defaults to
        ``[11, 12, 13, 14, 16, 17, 18, 19]``.
    n_sigma : float, optional
        Half-width of the accepted band, in standard deviations (default 2.0).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    if columns is None:
        # NOTE(review): position 15 is skipped. With the column order shown in
        # this notebook's output that appears to be `deas` - confirm whether
        # leaving it unfiltered is intentional.
        columns = [11, 12, 13, 14, 16, 17, 18, 19]
    for i in columns:
        col = df.iloc[:, i]
        mu = col.mean()
        s = col.std()
        lower = mu - s * n_sigma
        upper = mu + s * n_sigma
        # Strict comparisons on both sides; NaN compares False and so stays NaN.
        df.iloc[:, i] = col.where((col > lower) & (col < upper))
    return df
# Report the size before filtering so it can be compared with the final count.
print("Number of rows before removing extremes", len(df.index))
# get_extremes turns outlying derivative values into NaN ...
df = get_extremes(df)
# ... and dropping NaN rows then discards those samples (along with the first
# row, whose derivatives are undefined).
df.dropna(inplace=True)

Number of rows before removing extremes 42581


In [9]:
# Rows that survived both the coordinate filter and the derivative filter.
print("Number of final rows:", len(df.index))


Number of final rows: 42406


In [10]:
# Total rows discarded, relative to the row count recorded when the file was loaded.
print("Number of rows removed:", nrows - len(df.index))

Number of rows removed: 103728


Most of the rows were removed because their coordinates did not fall within the physical range that the vehicle was in.
The derivative filter then picked up more subtle points where the GPS made mistakes. 