# Transform GPS points into trajectories and save the trajectory number as a column.

## Beijing dataset

In [20]:
import pandas as pd
import numpy as np
import cv2
import pickle
import os
import sys
from PIL import Image
from tqdm import tqdm
sys.path.append('..')


def GPS_sort_by_time(df, id_col='ID', time_col='time', show_progress=True):
    """Split GPS records by vehicle and sort each vehicle's records by time.

    Args:
        df: DataFrame of GPS points containing ``id_col`` and ``time_col``.
        id_col: Name of the vehicle-identifier column.
        time_col: Name of the timestamp column used for ordering.
        show_progress: When True, report progress through a tqdm bar.

    Returns:
        A list with one DataFrame per distinct vehicle id, in the order in
        which each id first appears in ``df``; every frame is sorted by
        ``time_col``. Rows whose id is missing (NaN/None) are left out.
    """
    # One groupby pass replaces re-scanning the whole frame with a boolean
    # mask for every vehicle id; sort=False keeps first-appearance order.
    grouped = df.groupby(id_col, sort=False)
    progress = tqdm(grouped, total=grouped.ngroups) if show_progress else None

    sorted_dataframes = []
    for _, car_df in (grouped if progress is None else progress):
        sorted_dataframes.append(car_df.sort_values(by=time_col))
        if progress is not None:
            progress.set_description_str(f"Sort GPS by time {len(sorted_dataframes)}")

    return sorted_dataframes


def split_by_time(sorted_dfs, max_gap=1800, time_col='time', show_progress=True):
    """Cut each time-sorted frame into segments wherever the time gap is large.

    A new segment starts at every row whose ``time_col`` value exceeds the
    previous row's value by more than ``max_gap``.

    Args:
        sorted_dfs: Iterable of DataFrames, each already sorted by ``time_col``.
        max_gap: Largest allowed difference between consecutive rows of one
            segment, in the units of ``time_col`` (default 1800).
        time_col: Name of the timestamp column.
        show_progress: When True, report progress through a tqdm bar.

    Returns:
        A flat list of segments over all input frames. Each segment is a list
        of row tuples holding the row's values in column order. An empty
        input frame contributes one empty segment.
    """
    split_dataframes = []
    progress = tqdm(sorted_dfs) if show_progress else None

    for sorted_df in (sorted_dfs if progress is None else progress):
        # Materialise all rows at once instead of building one Series per row
        # with iterrows(), which dominates the runtime on large frames.
        rows = [tuple(values) for values in sorted_df.to_numpy().tolist()]
        # diff() is NaN for the first row, so the comparison is False there
        # and the first row never opens a new segment.
        starts_new = (sorted_df[time_col].diff() > max_gap).tolist()

        segment = []
        for point, is_break in zip(rows, starts_new):
            if is_break:
                split_dataframes.append(segment)
                segment = []
            segment.append(point)
        # Append the trailing segment of this frame.
        split_dataframes.append(segment)

        if progress is not None:
            progress.set_description_str(f"Split DataFrame {len(split_dataframes)}")

    return split_dataframes

In [21]:
def gps2traj(patchedGPS):
    """Convert GPS points into trajectories labelled by a trajectory number.

    The points are grouped per vehicle, sorted by time, and cut into
    segments by ``split_by_time``; every segment becomes one trajectory.

    Args:
        patchedGPS: DataFrame of GPS points. Its ``time`` column is cast to
            int IN PLACE, so the caller's frame is modified. Each row must
            hold exactly seven values, because the result is labelled with
            the trajectory number plus seven fixed column names.

    Returns:
        DataFrame with columns ``traj, id, time, lon, lat, angle, speed,
        timeinterval``, where ``traj`` is the 0-based index of the segment
        the point belongs to.
    """
    patchedGPS['time'] = patchedGPS['time'].astype(int)

    sorted_dfs = GPS_sort_by_time(patchedGPS)
    split_dfs = split_by_time(sorted_dfs)

    traj_rows = []
    for traj_index, points in enumerate(tqdm(split_dfs)):
        # extend() appends in place; rebuilding the list with `+` on every
        # iteration copies everything collected so far (quadratic time).
        traj_rows.extend((traj_index, *point) for point in points)

    # Merge all labelled points into a single DataFrame.
    traj_df = pd.DataFrame(
        traj_rows,
        columns=['traj', 'id', 'time', 'lon', 'lat', 'angle', 'speed', 'timeinterval'],
    )
    return traj_df


In [55]:
# Assign every GPS point of each Beijing patch file to a trajectory and
# pickle the result under the same file name in the output directory.
path = "../../datasets/dataset_bj_time/GPS/patch_geo_coor/"
file_list = os.listdir(path)
iterater = tqdm(file_list)
for file_name in iterater:
    iterater.set_description_str(f"Processing {file_name}...")
    with open(os.path.join(path, file_name), 'rb') as f:
        patchedGPS = pickle.load(f)
    print(patchedGPS.shape)
    # A trajectory can only be built when both columns are present.
    if "time" not in patchedGPS.columns or "ID" not in patchedGPS.columns:
        continue

    traj_df = gps2traj(patchedGPS)
    save_path = "../../datasets/dataset_bj_time/GPS/patch_traj/"
    # Create the output directory on first use; a no-op once it exists.
    os.makedirs(save_path, exist_ok=True)

    with open(os.path.join(save_path, file_name), 'wb') as f:
        pickle.dump(traj_df, f)

print("Done!")


Processing 33_36_gps.pkl...:   0%|          | 0/348 [00:00<?, ?it/s]

(22495, 7)




Sort GPS by time 5075: 100%|██████████| 5075/5075 [00:30<00:00, 164.07it/s]


(603, ID              0.000000e+00
time            1.228528e+09
lon             1.164196e+02
lat             3.988267e+01
dir             0.000000e+00
speed           0.000000e+00
timeinterval    3.055000e+02
Name: 603, dtype: float64)
ID              0.000000e+00
time            1.228528e+09
lon             1.164196e+02
lat             3.988267e+01
dir             0.000000e+00
speed           0.000000e+00
timeinterval    3.055000e+02
Name: 603, dtype: float64
(603, ID              0.000000e+00
time            1.228528e+09
lon             1.164196e+02
lat             3.988267e+01
dir             0.000000e+00
speed           0.000000e+00
timeinterval    3.055000e+02
Name: 603, dtype: float64)
ID              0.000000e+00
time            1.228528e+09
lon             1.164196e+02
lat             3.988267e+01
dir             0.000000e+00
speed           0.000000e+00
timeinterval    3.055000e+02
Name: 603, dtype: float64
ID              0.000000e+00
time            1.228528e+09
lon         


Split DataFrame 3653:  49%|████▊     | 2467/5075 [54:39<57:46,  1.33s/it] 
Processing 33_36_gps.pkl...:   0%|          | 0/348 [55:10<?, ?it/s]


KeyboardInterrupt: 

In [48]:
# Load one saved Beijing trajectory patch and display it for inspection.
with open('../../datasets/dataset_bj_time/GPS/patch_traj/1_55_gps.pkl', 'rb') as f:
    data = pickle.load(f)
data

Unnamed: 0,index,id,time,lon,lat,dir,speed,timeinterval
0,0,1.0,1.228381e+09,116.52729,40.02008,63.0,2417.0,192.5
1,1,1.0,1.228385e+09,116.52548,40.01956,243.0,2623.0,185.0
2,2,2.0,1.228276e+09,116.52598,40.01973,243.0,2057.0,303.5
3,3,2.0,1.228280e+09,116.52459,40.01929,243.0,2675.0,310.5
4,4,5.0,1.228637e+09,116.52383,40.01883,63.0,2623.0,7.0
...,...,...,...,...,...,...,...,...
10842,9892,28013.0,1.228442e+09,116.52733,40.02008,63.0,2520.0,311.5
10843,9893,28013.0,1.228544e+09,116.52375,40.01883,63.0,2366.0,60.0
10844,9894,28013.0,1.228551e+09,116.52620,40.01987,243.0,2315.0,60.0
10845,9895,28015.0,1.228212e+09,116.52642,40.01997,243.0,205.0,299.5


## Shenzhen dataset

In [2]:
import pickle
# Load one raw Shenzhen taxi GPS patch and display it for inspection.
with open('../../datasets/dataset_sz_4096size/GPS/taxi/0_2_gps.pkl', 'rb') as f:
    data = pickle.load(f)
data

Unnamed: 0,id,lon,lat,time,speed,direction
520,粤B060WY,1.267216e+07,2.588672e+06,2018-10-01T08:59:36.000Z,44,167
521,粤B060WY,1.267273e+07,2.588041e+06,2018-10-01T08:58:10.000Z,51,163
1554,粤B0C2M7,1.267214e+07,2.588703e+06,2018-10-01T03:51:43.000Z,0,163
1555,粤B0C2M7,1.267214e+07,2.588703e+06,2018-10-01T04:09:46.000Z,0,163
2658,粤B0C5Q2,1.267285e+07,2.588445e+06,2018-10-01T16:35:14.000Z,89,72
...,...,...,...,...,...,...
131891856,粤BA980学,1.267271e+07,2.589043e+06,2019-10-01T14:32:10.000Z,0,187
131891857,粤BA980学,1.267271e+07,2.589034e+06,2019-10-01T15:28:23.000Z,0,227
131891858,粤BA980学,1.267271e+07,2.589035e+06,2019-10-01T14:24:09.000Z,0,214
131891859,粤BA980学,1.267271e+07,2.589036e+06,2019-10-01T14:10:54.000Z,0,182


In [3]:
import pandas as pd
import numpy as np
import cv2
import pickle
import os
import sys
from PIL import Image
from tqdm import tqdm
sys.path.append('..')


def GPS_sort_by_time(df, id_col='id', time_col='time', show_progress=True):
    """Split GPS records by vehicle and sort each vehicle's records by time.

    Args:
        df: DataFrame of GPS points containing ``id_col`` and ``time_col``.
        id_col: Name of the vehicle-identifier column.
        time_col: Name of the timestamp column used for ordering.
        show_progress: When True, report progress through a tqdm bar.

    Returns:
        A list with one DataFrame per distinct vehicle id, in the order in
        which each id first appears in ``df``; every frame is sorted by
        ``time_col``. Rows whose id is missing (NaN/None) are left out.
    """
    # One groupby pass replaces re-scanning the whole frame with a boolean
    # mask for every vehicle id; sort=False keeps first-appearance order.
    grouped = df.groupby(id_col, sort=False)
    progress = tqdm(grouped, total=grouped.ngroups) if show_progress else None

    sorted_dataframes = []
    for _, car_df in (grouped if progress is None else progress):
        sorted_dataframes.append(car_df.sort_values(by=time_col))
        if progress is not None:
            progress.set_description_str(f"Sort GPS by time {len(sorted_dataframes)}")

    return sorted_dataframes


def split_by_time(sorted_dfs, max_gap=pd.Timedelta(minutes=30), time_col='time',
                  show_progress=True):
    """Cut each time-sorted frame into segments wherever the time gap is large.

    A new segment starts at every row whose ``time_col`` value exceeds the
    previous row's value by more than ``max_gap``.

    Args:
        sorted_dfs: Iterable of DataFrames, each already sorted by ``time_col``.
        max_gap: Largest allowed difference between consecutive rows of one
            segment (default: a 30-minute ``pd.Timedelta``).
        time_col: Name of the timestamp column.
        show_progress: When True, report progress through a tqdm bar.

    Returns:
        A flat list of segments over all input frames. Each segment is a list
        of row tuples holding the row's values in column order. An empty
        input frame contributes one empty segment.
    """
    split_dataframes = []
    progress = tqdm(sorted_dfs) if show_progress else None

    for sorted_df in (sorted_dfs if progress is None else progress):
        # Materialise all rows at once instead of building one Series per row
        # with iterrows(), which dominates the runtime on large frames.
        rows = [tuple(values) for values in sorted_df.to_numpy().tolist()]
        # diff() is NaT for the first row, so the comparison is False there
        # and the first row never opens a new segment.
        starts_new = (sorted_df[time_col].diff() > max_gap).tolist()

        segment = []
        for point, is_break in zip(rows, starts_new):
            if is_break:
                split_dataframes.append(segment)
                segment = []
            segment.append(point)
        # Append the trailing segment of this frame.
        split_dataframes.append(segment)

        if progress is not None:
            progress.set_description_str(f"Split DataFrame {len(split_dataframes)}")

    return split_dataframes

In [4]:
def gps2traj(patchedGPS):
    """Convert GPS points into trajectories labelled by a trajectory number.

    The points are grouped per vehicle, sorted by time, and cut into
    segments by ``split_by_time``; every segment becomes one trajectory.

    Args:
        patchedGPS: DataFrame of GPS points. Its ``time`` column is parsed
            with ``pd.to_datetime`` IN PLACE, so the caller's frame is
            modified. Each row must hold exactly six values, because the
            result is labelled with the trajectory number plus six fixed
            column names.

    Returns:
        DataFrame with columns ``traj, id, lon, lat, time, speed, angle``,
        where ``traj`` is the 0-based index of the segment the point
        belongs to.
    """
    patchedGPS['time'] = pd.to_datetime(patchedGPS['time'])

    sorted_dfs = GPS_sort_by_time(patchedGPS)
    split_dfs = split_by_time(sorted_dfs)

    traj_rows = []
    for traj_index, points in enumerate(tqdm(split_dfs)):
        # extend() appends in place; rebuilding the list with `+` on every
        # iteration copies everything collected so far (quadratic time).
        traj_rows.extend((traj_index, *point) for point in points)

    # Merge all labelled points into a single DataFrame.
    traj_df = pd.DataFrame(
        traj_rows,
        columns=['traj', 'id', 'lon', 'lat', 'time', 'speed', 'angle'],
    )
    return traj_df


In [5]:
# Assign every GPS point of each Shenzhen taxi patch file to a trajectory and
# pickle the result under the same file name in the output directory.
path = "../../datasets/dataset_sz_4096size/GPS/taxi"
file_list = os.listdir(path)
iterater = tqdm(file_list)
for file_name in iterater:
    iterater.set_description_str(f"Processing {file_name}...")
    with open(os.path.join(path, file_name), 'rb') as f:
        patchedGPS = pickle.load(f)
    # A trajectory can only be built when both columns are present.
    if "time" not in patchedGPS.columns or "id" not in patchedGPS.columns:
        continue

    traj_df = gps2traj(patchedGPS)
    save_path = "../../datasets/dataset_sz_4096size/GPS/taxi_traj"
    # Create the output directory on first use; a no-op once it exists.
    os.makedirs(save_path, exist_ok=True)
    with open(os.path.join(save_path, file_name), 'wb') as f:
        pickle.dump(traj_df, f)
print("Done!")

Sort GPS by time 11677: 100%|██████████| 11677/11677 [03:43<00:00, 52.34it/s]
Split DataFrame 18176: 100%|██████████| 11677/11677 [00:27<00:00, 431.96it/s]
18176it [00:21, 847.53it/s]
Sort GPS by time 1: 100%|██████████| 1/1 [00:00<00:00, 302.36it/s]72.54s/it]
Split DataFrame 1: 100%|██████████| 1/1 [00:00<00:00, 462.18it/s]
1it [00:00, 25420.02it/s]
Sort GPS by time 14213: 100%|██████████| 14213/14213 [09:52<00:00, 23.98it/s]
Split DataFrame 35147: 100%|██████████| 14213/14213 [00:49<00:00, 289.33it/s]
35147it [02:09, 271.49it/s]
Sort GPS by time 3550: 100%|██████████| 3550/3550 [00:34<00:00, 102.40it/s]]
Split DataFrame 4789: 100%|██████████| 3550/3550 [00:08<00:00, 399.17it/s]
4789it [00:01, 2437.04it/s]
Sort GPS by time 9386: 100%|██████████| 9386/9386 [03:23<00:00, 46.20it/s]t]
Split DataFrame 15267: 100%|██████████| 9386/9386 [00:25<00:00, 372.99it/s]
15267it [00:19, 776.11it/s]
Sort GPS by time 3905: 100%|██████████| 3905/3905 [00:17<00:00, 224.00it/s]]
Split DataFrame 5060: 100