Imports

In [None]:
import trackintel as ti
import glob
import pandas as pd
import geopandas as gpd
import os
import random

from trackintel.preprocessing import generate_staypoints

User parameters

In [None]:
# Timezone used to localize the raw `tracked_at` timestamps
timezone = 'Asia/Shanghai'

# Directory with one raw `<user_id>.txt` trajectory file per user.
# Keep the trailing slash: file names are appended to this string directly.
data_path = 'release/taxi_log_2008_by_id/'

# Set dropout threshold (should match min staypoint duration)
dropout_threshold = pd.Timedelta(minutes=10)
# Set time difference threshold between dropout/staypoint start here
diff_threshold = pd.Timedelta(minutes=1)

# The dropout simulation draws from the `random` module; seed it here so that a
# Restart & Run All reproduces the same dropouts. Change the value for another draw.
random_seed = 42
random.seed(random_seed)

Generate initial staypoints

In [None]:
# Collect the raw trajectory files. Sorting makes the processing order (and thus
# the sequence of random draws used later) independent of the filesystem order.
files = sorted(glob.glob(os.path.join(data_path, '*.txt')))
print(len(files))
if not files:
    # Fail early with a clear message instead of an IndexError on files[0]
    raise FileNotFoundError(f"No .txt trajectory files found in {data_path!r}")
print(files[0])

In [None]:
def get_ti_sps(path, timezone='Asia/Shanghai'):
    """Generate staypoints for one raw trajectory file and save them as CSV.

    Parameters
    ----------
    path : str
        Headerless CSV file whose columns are read as
        user_id, tracked_at, longitude, latitude.
    timezone : str, default 'Asia/Shanghai'
        Timezone the parsed ``tracked_at`` values are localized to.

    Returns
    -------
    None
        When at least one staypoint is found, the staypoints are written to
        ``t-drive_sps/user_<user_id>.csv``. Nothing is written for an empty
        input file or when no staypoint is generated.
    """
    df = pd.read_csv(path, names=['user_id', 'tracked_at', 'longitude', 'latitude'])
    if len(df) == 0:
        return

    df = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['longitude'], df['latitude'], crs="EPSG:4326"))
    df['tracked_at'] = pd.to_datetime(df['tracked_at']).dt.tz_localize(timezone)

    # Currently using simple parameters
    _pfs, sps = generate_staypoints(
        df,
        dist_threshold=100, # Min dist between staypoints, in meters
        time_threshold=pd.Timedelta(minutes=10), # Min duration to create a staypoint
        gap_threshold=pd.Timedelta(minutes=25), # Max gap time to still mark something as a staypoint
        include_last=True, # Makes sure we include the last one if the user ends there
    )

    if len(sps) > 0:
        user_id = sps['user_id'].iloc[0]
        # to_csv does not create missing directories, so make sure the target exists
        os.makedirs('t-drive_sps', exist_ok=True)
        sps.to_csv('t-drive_sps/user_'+str(user_id)+'.csv')

In [None]:
# Pass the configured timezone explicitly; without it get_ti_sps falls back to
# its own default and ignores the value set in the user-parameters cell.
for f in files:
    get_ti_sps(f, timezone=timezone)

Generate trajectories with simulated dropouts, then regenerate staypoints from these degraded trajectories

In [None]:
# I want to create on average 8 dropouts. Thus, each point starts a dropout with
# probability avg_num_dropouts / len(df). This number is variable
def get_dirty_ti_sps(path, avg_num_dropouts=8, dropout_length=pd.Timedelta(minutes=15)):
    """Simulate signal dropouts in one raw trajectory and regenerate its staypoints.

    The points are visited in file order. At each visited point a dropout starts
    with probability ``avg_num_dropouts / len(df)``; it removes that point and all
    following points recorded less than ``dropout_length`` after it. Timestamps
    are localized with the module-level ``timezone`` setting, and the draws come
    from the global ``random`` generator.

    Parameters
    ----------
    path : str
        Headerless CSV file whose columns are read as
        user_id, tracked_at, longitude, latitude.
    avg_num_dropouts : int or float, default 8
        Numerator of the per-point dropout probability.
    dropout_length : pandas.Timedelta, default 15 minutes
        Time span removed by each dropout. Must be positive.

    Returns
    -------
    None
        Side effects (nothing is written for an empty input file):
        ``t-drive_dropout_indices/user_<id>.txt`` gets one removed row index per line,
        ``t-drive_dropouts/user_<id>.txt`` gets the timestamp of the first removed
        point of each dropout, and ``t-drive_noisy_sps/user_<id>.csv`` gets the
        staypoints of the degraded trajectory when at least one is found.

    Raises
    ------
    ValueError
        If ``dropout_length`` is not positive (the scan below could not advance).
    """
    if dropout_length <= pd.Timedelta(0):
        raise ValueError("dropout_length must be a positive duration")

    df = pd.read_csv(path, names=['user_id', 'tracked_at', 'longitude', 'latitude'])
    if len(df) == 0:
        return

    df = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['longitude'], df['latitude'], crs="EPSG:4326"))
    df['tracked_at'] = pd.to_datetime(df['tracked_at']).dt.tz_localize(timezone)

    # Loop invariants, looked up once instead of on every iteration
    tracked_at = df['tracked_at']
    num_points = len(df)
    dropout_probability = avg_num_dropouts / num_points
    user_id = df['user_id'].iloc[0]

    # Each entry is a half-open [start, stop) range of row positions to remove
    dropouts = []
    i = 0
    while i < num_points:
        if random.random() > dropout_probability:
            i += 1
            continue

        dropout_start = tracked_at.iloc[i]
        j = i
        while j < num_points and (tracked_at.iloc[j] - dropout_start) < dropout_length:
            j += 1

        dropouts.append([i, j])

        # Resume after the dropout so that dropouts never overlap
        i = j

    all_dropouts = [idx for start, stop in dropouts for idx in range(start, stop)]

    # Write indices of dropouts to a text file
    os.makedirs('t-drive_dropout_indices', exist_ok=True)
    with open('t-drive_dropout_indices/user_'+str(user_id)+'.txt', 'w') as f:
        f.writelines(f"{idx}\n" for idx in all_dropouts)

    # Create a list of start times we dropped for that agent
    dropout_starts = [tracked_at.iloc[start] for start, _stop in dropouts]
    os.makedirs('t-drive_dropouts', exist_ok=True)
    with open('t-drive_dropouts/user_'+str(user_id)+'.txt', 'w') as f:
        f.writelines(f"{started}\n" for started in dropout_starts)

    df = df.drop(all_dropouts).reset_index(drop=True)

    # We should make this check again after adding dropouts
    if len(df) == 0:
        return

    # Rerun trackintel on the trajectory with dropouts
    _pfs, sps = generate_staypoints(
        df,
        dist_threshold=100, # Min dist between staypoints, in meters
        time_threshold=pd.Timedelta(minutes=10), # Min duration to create a staypoint
        gap_threshold=pd.Timedelta(minutes=25), # Max gap time to still mark something as a staypoint
        include_last=True, # Makes sure we include the last one if the user ends there
    )

    # Write to file
    if len(sps) > 0:
        user_id = sps['user_id'].iloc[0]
        os.makedirs('t-drive_noisy_sps', exist_ok=True)
        sps.to_csv('t-drive_noisy_sps/user_'+str(user_id)+'.csv')

In [None]:
# Simulate dropouts and regenerate staypoints for every raw trajectory file
for f in files:
    get_dirty_ti_sps(path=f)

In [None]:
# Add a flag for whether the staypoint is spurious
def add_is_spurious(path):
    """Add a boolean ``is_spurious`` column to a noisy staypoint CSV, in place.

    A staypoint is flagged when its ``started_at`` lies within ``diff_threshold``
    of any dropout start time listed in ``t-drive_dropouts/user_<user_id>.txt``.

    Parameters
    ----------
    path : str
        Staypoint CSV to read and overwrite. The user id is taken from its
        first row, so the file must contain at least one staypoint.

    Returns
    -------
    None
    """
    noisy_sps = ti.read_staypoints_csv(path, index_col=None, geom_col='geometry')
    user_id = noisy_sps['user_id'].iloc[0]

    with open("t-drive_dropouts/user_"+str(user_id)+".txt", "r") as file:
        dropout_lines = [line.strip() for line in file if line.strip()]
    # Explicit dtype keeps an empty dropout list well-defined
    dropout_starts = pd.to_datetime(pd.Series(dropout_lines, dtype='object'))

    is_spurious = pd.Series(False, index=noisy_sps.index)
    for dropout_start in dropout_starts:
        time_gap = (noisy_sps['started_at'] - dropout_start).abs()
        is_spurious = is_spurious | (time_gap <= diff_threshold)
    noisy_sps['is_spurious'] = is_spurious

    # index=False: the file is read back with a default RangeIndex, so writing the
    # index would add another 'Unnamed: 0' column every time this function runs.
    noisy_sps.to_csv(path, index=False)

In [None]:
# Flag the spurious staypoints in every noisy staypoint file
noisy_sps_pattern = 't-drive_noisy_sps/*.csv'
noised_files = glob.glob(noisy_sps_pattern)
for nf in noised_files:
    add_is_spurious(path=nf)

Now apply the dropout filter to remove spurious staypoints

In [None]:
def get_user(agent_id):
    """Load the raw trajectory of one user as a GeoDataFrame.

    Reads ``<data_path><agent_id>.txt``, builds EPSG:4326 point geometries from
    the longitude/latitude columns, and localizes ``tracked_at`` to the
    module-level ``timezone``.
    """
    column_names = ['user_id', 'tracked_at', 'longitude', 'latitude']
    raw = pd.read_csv(data_path + str(agent_id) + '.txt', names=column_names)

    points = gpd.points_from_xy(raw['longitude'], raw['latitude'], crs="EPSG:4326")
    traj = gpd.GeoDataFrame(raw, geometry=points)

    parsed_times = pd.to_datetime(traj['tracked_at'])
    traj['tracked_at'] = parsed_times.dt.tz_localize(timezone)

    return traj

def get_noisy_user(agent_id):
    """Load one user's raw trajectory with the simulated dropout rows removed.

    The rows to remove are read from ``t-drive_dropout_indices/user_<agent_id>.txt``
    (one row index per line). The result has a fresh 0..n-1 index.
    """
    # Reuse the raw loader instead of duplicating the read/localize logic
    df = get_user(agent_id)

    with open('t-drive_dropout_indices/user_'+str(agent_id)+'.txt') as file:
        # Skip blank lines (e.g. a trailing newline) instead of failing in int('')
        all_dropouts = [int(line) for line in file if line.strip()]

    return df.drop(all_dropouts).reset_index(drop=True)

In [None]:
def filter_dropouts(agent_id):
    """Remove staypoints that start at a detected dropout and summarize the effect.

    A dropout is detected wherever the time to the next point of the noisy
    trajectory is at least ``dropout_threshold``. A noisy staypoint is filtered
    out when such a point is recorded at, or up to ``diff_threshold`` after, the
    staypoint's ``started_at``.

    Parameters
    ----------
    agent_id : int or str
        User id used to locate the raw trajectory, the dropout index file and
        the staypoint CSV files.

    Returns
    -------
    list
        ``[agent_id,
        number of original staypoints (0 if no original file exists),
        number of noised staypoints,
        number of noised staypoints flagged spurious,
        number of staypoints left after filtering,
        number of remaining staypoints flagged spurious]``
    """
    # Identify dropouts over dropout threshold
    traj = get_noisy_user(agent_id)
    # Time elapsed from each point to the next one (NaT for the last point)
    traj['t_delta'] = traj['tracked_at'].diff(1).shift(-1)

    dropouts = traj[traj['t_delta'] >= dropout_threshold].reset_index(drop=True)

    noisy_sps = ti.read_staypoints_csv('t-drive_noisy_sps/user_'+str(agent_id)+'.csv', index_col=None, geom_col='geometry')

    # merge_asof needs both key columns to share one dtype, so express the dropout
    # times in whatever timezone the staypoints were read with (instead of a
    # hard-coded offset that only fits one dataset).
    sps_tz = noisy_sps['started_at'].dt.tz
    dropouts['tracked_at'] = dropouts['tracked_at'].dt.tz_convert(sps_tz)

    # This function call identifies the spurious staypoints
    merged = pd.merge_asof(
        noisy_sps,
        dropouts,
        left_on="started_at",
        right_on="tracked_at",
        tolerance=diff_threshold,
        direction="forward"
    )

    # And here we remove these staypoints. The merge keeps one row per staypoint
    # in the original order, so the mask is applied by position.
    matched_dropout = merged["tracked_at"].notna().to_numpy()
    non_dropout_sps = noisy_sps[~matched_dropout].reset_index(drop=True)

    file_path = 't-drive_sps/user_'+str(agent_id)+'.csv'
    if os.path.exists(file_path):
        sps = ti.read_staypoints_csv(file_path, index_col=None, geom_col='geometry')
    else:
        sps = pd.DataFrame() # No original staypoints for this user: counts as 0

    num_spurious = int(noisy_sps['is_spurious'].eq(True).sum())
    num_spurious_filtered = int(non_dropout_sps['is_spurious'].eq(True).sum())

    return [agent_id,
            len(sps),
            len(noisy_sps),
            num_spurious,
            len(non_dropout_sps),
            num_spurious_filtered]

Check results

In [None]:
result_columns = ['user_id', 'num_sps', 'num_noised_sps', 'num_spurious_sps', 'num_filtered_sps', 'num_spurious_filtered_sps']

# File names look like `user_<id>.csv`. Parse the id from the base name only, so
# a dot or the word "user" elsewhere in the path cannot break the slicing.
# Sorting makes the row order of the results independent of the filesystem.
user_files = glob.glob('t-drive_noisy_sps/*.csv')
users = sorted(
    int(os.path.splitext(os.path.basename(user_file))[0][len('user_'):])
    for user_file in user_files
)

# Collect all rows first and build the frame once instead of growing it row by row
result_rows = []
for user in users:
    result_rows.append(filter_dropouts(user))

results = pd.DataFrame(result_rows, columns=result_columns)

In [None]:
# Save the per-user summary table; edit the path below to choose another output location
results_csv_path = 't-drive_results.csv'
results.to_csv(results_csv_path)