# 1. Introduction
I'm working on the deep representation learning for trajectory data. It will be much convincing if I can visualize the result of my representations. So I decided to use the original GPS coordinates for clustering, and compare them with the ground-truth labels for this assignment. The pipeline can be transfered to my representation by replacing GPS coordinates with the representation vectors

Next I will introduce my dataset, how I preprocessed it, which libraries were required and the outputs of visualization.

In [1]:
import glob
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from datetime import datetime
from pytz import timezone
import warnings
warnings.filterwarnings('ignore')

# 2. GeoLife Dataset Preprocessing

# 2.1 About GeoLife
[GeoLife](https://www.microsoft.com/en-us/research/publication/geolife-gps-trajectory-dataset-user-guide/) is a public trajectory dataset collected by Micorsoft Research Asia. It records the outdoor movements of 182 users from April 2007 to August 2012. The related trajectories could be from various vehicles like Bike, Bus, Car and etc. It could be interesting if we could learn a representation that can reflect similarities and differences among all vehicles.

# 2.2 Dataset Preprocessing
The downloaded dataset contains 182 folders for each user. Each user folder has a Trajectory folder about the user's GPS records and a labeled.txt if we have the ground-truth labels. Each Trajectory folder contains multiple .plt files where each line represents one point. We have 18,670 .plt files which can be considered as 18,670 trajectories. But such trajectories are composed of multiple vehicles, which cannot be distinguishable. Thus, the target of dataset preprocessing is to generate trajectories from these plt.files according to the labeled.txt and save them into a pandas.DataFrame where each line represents one trajectory.

In [2]:
# Time convert
tz_utc = timezone("UTC")


def str2ts(timestring, format, tz=tz_utc):
    return datetime.timestamp(datetime.strptime(timestring, format).replace(tzinfo=tz))


def rle_segment(seq):
    #return the index list for each sub trajectories based on seq (i.e., label)
    index = [[0]]
    for i in range(1, len(seq)):
        if seq[i] == seq[i-1]:
            index[-1].append(i)
        else:
            index.append([i])
    return index


def geolife_labeled_unify(path="Data"):
    mode_names = ['walk', 'bike', 'bus', 'car', 'subway', 'train', 'airplane', 'boat', 'run', 'motorcycle', 'taxi']
    # 0 is reserved for unknown mode
    mode_ids = {s: i + 1 for i, s in enumerate(mode_names)}
    
    subfolders = os.listdir(path)
    # remove the useless file generated by Mac System by default.
    subfolders.remove(".DS_Store")
    
    df_list = []
    traj_ID_list = []
    obj_ID_list = []
    timestamp_list = []
    location_list = []
    type_list = []
    missing_data_list = []

    for i, sf in enumerate(subfolders):
        print(f'[{i+1}/{len(subfolders)}], Processing user {sf}')
        user_folder = os.path.join(path, sf)
        plt_files = glob.glob(os.path.join(user_folder, 'Trajectory', '*.plt'))
        
        # Load labeled.txt
        # Space-delimited file where each line includes five Fields about one period that are
        # 0. start date, 1. start time, 2. end date, 3. end time, 4. transportation mode.
        # skiprows=1: Skip the header
        # parse_dates=[[0, 1], [2, 3]]: We combine Field 0&1 for start time and Field 2&3 for end time.
        labels_file = os.path.join(user_folder, 'labels.txt')
        if os.path.exists(labels_file):
            labels = pd.read_csv(labels_file, skiprows=1, header=None, parse_dates=[[0, 1], [2, 3]],
                                 infer_datetime_format=True, delim_whitespace=True)
            # for clarity rename columns
            labels.columns = ['start_time', 'end_time', 'label']
            # replace 'label' column with integer encoding
            labels['label'] = [mode_ids[i] for i in labels['label']]
        else:
            labels = None
            
        # Load .plt file.
        # Commma-delimited file where each line includes seven Fields about one point that are 
        # 0. Latitude, 1. Longitude, 2. unknown, 3. Altitude,
        # 4. Date-number of days since 12/30/1899, 5. Date as a string, 6. Time as a string
        # skiprows=6: According to the User Guide of GeoLife, Line 1...6 are useless, and can be ignored
        # parse_dates=[[5, 6]]: We combine Field 5 and 6 for timestamps.
        for f in plt_files:
            # each file from each user can be mapped to multiple trajectories based on the corresponding vehicle
            # label df_points
            df_points = pd.read_csv(f, skiprows=6, header=None, parse_dates=[[5, 6]], infer_datetime_format=True)
            # Rename the useful columns and remove the unused ones.
            df_points.rename(inplace=True, columns={'5_6': 'time', 0: 'lat', 1: 'lon', 3: 'alt'})
            df_points.drop(inplace=True, columns=[2, 4])
            if labels is not None:
                # We can try to insert the current point to each period in labels.txt
                # if insert, then the label of point will be the label of corresponding period
                indices = labels['start_time'].searchsorted(df_points['time'], side='right') - 1
                # else, no label for this point.
                no_label = (indices < 0) | (df_points['time'].values > labels['end_time'].iloc[indices].values)
                df_points['label'] = labels['label'].iloc[indices].values
                df_points['label'][no_label] = 0
            else:
                df_points['label'] = 0
            # df_traj
            index_subtraj = rle_segment(df_points['label'])

            for id, index in enumerate(index_subtraj):
                df_subtraj = df_points.loc[index, :].reset_index(drop=True)
                df_subtraj["timestamps"] = df_subtraj.apply(lambda x: str2ts(f"{x.time}", "%Y-%m-%d %H:%M:%S"), axis=1)
                obj_ID = f.split('/')[-3]
                ts_list = df_subtraj['timestamps'].tolist()
                point_list = [[df_subtraj.loc[i, "lon"], df_subtraj.loc[i, "lat"]] for i in range(df_subtraj.shape[0])]
                traj_ID_list.append(f"{obj_ID}_{id}")
                obj_ID_list.append(obj_ID)
                timestamp_list.append(ts_list)
                location_list.append(point_list)
                missing_data_list.append("False")
                if df_subtraj.loc[0, 'label'] == 0:
                    type_list.append('Unknown')
                else:
                    type_list.append(mode_names[df_subtraj.loc[0, 'label']-1])

    dataset_dict = {"Traj_ID": traj_ID_list,
                    "Obj_ID": obj_ID_list,
                    "Timestamps": timestamp_list,
                    "Locations": location_list,
                    "Type": type_list,
                    "Missing_Data": missing_data_list}
    df = pd.DataFrame.from_dict(dataset_dict)
    df.to_csv(f"geolife.csv", index=False)
    df_labeled = df[df["Type"] != "Unknown"]
    df_labeled.to_csv(f"geolife-labeled.csv", index=False)

In [3]:
geolife_labeled_unify(path="Data")

[1/182], Processing user 135
[2/182], Processing user 132
[3/182], Processing user 104
[4/182], Processing user 103
[5/182], Processing user 168
[6/182], Processing user 157
[7/182], Processing user 150
[8/182], Processing user 159
[9/182], Processing user 166
[10/182], Processing user 161
[11/182], Processing user 102
[12/182], Processing user 105
[13/182], Processing user 133
[14/182], Processing user 134
[15/182], Processing user 160
[16/182], Processing user 158
[17/182], Processing user 167
[18/182], Processing user 151
[19/182], Processing user 169
[20/182], Processing user 156
[21/182], Processing user 024
[22/182], Processing user 023
[23/182], Processing user 015
[24/182], Processing user 012
[25/182], Processing user 079
[26/182], Processing user 046
[27/182], Processing user 041
[28/182], Processing user 048
[29/182], Processing user 077
[30/182], Processing user 083
[31/182], Processing user 084
[32/182], Processing user 070
[33/182], Processing user 013
[34/182], Processin

In [4]:
df = pd.read_csv("geolife.csv")
df_labeled = pd.read_csv("geolife-labeled.csv")
print(f"We generate {df.shape[0]} trajectories from GeoLife, {df_labeled.shape[0]} are labeled")

We generate 26672 trajectories from GeoLife, 9420 are labeled


# 3. Visualization of Dataset
We first visualize the original dataset by using GeoDataFrame.explore.
1. Some trajectories are shorter than 2 points, we need to filer them out
2. To avoid the crash of visualizing too much samples, for each type we only pick up 5 samples.

In [5]:
import geopandas as gpd
from shapely.geometry import Point, LineString

In [6]:
# To visualize the trajectories, we need to first filter out those with length <= 2, otherwise we can't create a LineString Object.
df["Locations"] = [json.loads(trajectory) for trajectory in df.Locations if not isinstance(trajectory, list)]
df["Length"] = [len(trajectory) for trajectory in df.Locations]
df_filter = df[df["Length"]>2]
print(df_filter.shape)

(25983, 7)


In [7]:
# For each type, we select 5 samples for visualization
df_v = []
for id, df_group in df_filter.groupby(by='Type'):
    df_v.append(df_group.iloc[:5, :])
df_v = pd.concat(df_v, axis=0)

In [8]:
lines = [LineString([Point(lon, lat) for lon, lat in trajectory]) for trajectory in df_v.Locations]
gdf = gpd.GeoDataFrame(geometry=lines, crs='epsg:4326')
gdf["Type"] = df_v["Type"].to_list()

In [9]:
gdf.explore(categorical=True, column=gdf.Type, style_kwds={'fillOpacity':1}, location=(39.785, 116.14))

We can see from the above map that trajectories of airplane and train are crossing multiple cities, it's better to filter them out.

In [10]:
gdf_1 = gdf[(gdf['Type']!='airplane') & (gdf['Type']!='train') ]
gdf_1.explore(categorical=True, column=gdf_1.Type, style_kwds={'fillOpacity':1}, location=(39.785, 116.14))

# 4. Visualization of GPS-based clustering
I use DBSCAN for clustering and DTW distance for distance calculation.

# 4.1 Calculate DTW distance metrix 

In [11]:
from dtw import dtw

Importing the dtw module. When using in academic works please cite:
  T. Giorgino. Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package.
  J. Stat. Soft., doi:10.18637/jss.v031.i07.



In [12]:
def dtw_distance_matrix(trajectories):
    n = len(trajectories)
    dist_m = np.zeros((n, n))
    for i in range(n - 1):
        p = trajectories[i]
        for j in range(i + 1, n):
            q = trajectories[j]
            dist_m[i, j] = dtw(p, q).distance
            dist_m[j, i] = dist_m[i, j]
    return dist_m

In [13]:
df_temp = df_v[(df_v['Type']!='airplane') & (df_v['Type']!='train') ]
data = [np.array(locations) for locations in df_temp.Locations.to_list()]

In [14]:
distance_matrix = dtw_distance_matrix(data)

In [15]:
print(distance_matrix.shape)

(46, 46)


## 4.2 DBSCAN Clustering

In [16]:
from sklearn.cluster import DBSCAN
from dtaidistance import dtw
import numpy as np

cl = DBSCAN(min_samples=2, metric='precomputed')
cl.fit(distance_matrix)

DBSCAN(metric='precomputed', min_samples=2)

## 4.3 visualization of clustering methods

In [17]:
gdf_1["NewType"] = cl.labels_
gdf_1.explore(categorical=True, column=gdf_1.NewType, style_kwds={'fillOpacity':1}, location=(39.785, 116.14))