# Transform GPS points to trajectory and save trajectory number as a column.

In [59]:
import pickle
fname = "../../datasets/dataset_bj_time/GPS/patch_geo_coor/0_8_gps.pkl"

with open(fname, 'rb') as f:
    data = pickle.load(f)
print(data)

             ID        time        lon       lat  dir  speed  timeinterval
53056        39  1228723854  116.26516  40.02380  216      0          13.0
53057        39  1228723863  116.26517  40.02378  222      0           5.0
53058        39  1228723913  116.26518  40.02374  333      0          53.0
222000      145  1228060924  116.26941  40.02038    0      0           NaN
222001      145  1228082771  116.26977  40.02047   45    360         194.0
...         ...         ...        ...       ...  ...    ...           ...
49425226  27757  1228442312  116.26998  40.02054   36     51          59.5
49511824  27797  1228393967  116.26852  40.02036  237      0           NaN
49636852  27876  1228375526  116.26714  40.02396  210    205          73.5
49636853  27876  1228375528  116.26714  40.02395  201    102          11.0
49636854  27876  1228375548  116.26713  40.02392    0      0          16.5

[363 rows x 7 columns]


In [65]:
data.dtypes

ID                int64
time             object
lon             float64
lat             float64
dir               int64
speed             int64
timeinterval    float64
dtype: object

In [78]:
for index, row in data.iterrows():
    a = row
    break
print(a)
a.dtypes

ID                      39
time            1228723854
lon              116.26516
lat                40.0238
dir                    216
speed                    0
timeinterval          13.0
Name: 53056, dtype: object


dtype('O')

In [20]:
#bj dataset

import pandas as pd
import numpy as np
import cv2
import pickle
import os
import sys
from PIL import Image
from tqdm import tqdm
sys.path.append('..')


def GPS_sort_by_time(df):
    """按车牌号筛选，按时间信息排序"""

    # Get the unique carid values
    unique_carids = df['ID'].unique()

    # Initialize an empty DataFrame to store the sorted results
    sorted_dataframes = []

    iterater = tqdm(unique_carids)
    # Loop through each unique carid
    for carid in iterater:
    # for carid in unique_carids:
        # Filter the DataFrame by the current carid
        filtered_df = df[df['ID'] == carid]

        # Sort the filtered DataFrame by a specific column (e.g., 'brand')
        sorted_df = filtered_df.sort_values(by='time')

        # Append the sorted DataFrame to the list
        sorted_dataframes.append(sorted_df)
        iterater.set_description_str(f"Sort GPS by time {len(sorted_dataframes)}")

    # print(sorted_dataframes[0].shape)

    return sorted_dataframes


def split_by_time(sorted_dfs):
    """按照30minates的时间间隔对排序后的数据进行切分"""
    # Initialize a list to hold the split DataFrames
    split_dataframes = []
    # Initialize variables to keep track of the time interval
    # time_interval = pd.Timedelta(minutes=30)

    # Iterate through the sorted DataFrame

    iterer = tqdm(sorted_dfs)
    for sorted_df in iterer:
    # for sorted_df in sorted_dfs:
        # current_df = pd.DataFrame()
        current_df = []
        previous_time = None
        for index, row in sorted_df.iterrows():
            time = row['time']
            if previous_time is None:
                previous_time = time
            if time - previous_time > 1800:
                split_dataframes.append(current_df)
                current_df = []
            current_df.append(tuple(row))
            previous_time = time
        # Append the last DataFrame
        split_dataframes.append(current_df)
        iterer.set_description_str(f"Split DataFrame {len(split_dataframes)}")

    # Display the split DataFrames
    # for i, split_df in enumerate(split_dataframes):
    #     print(f"Split DataFrame {i}:\n{split_df}")
    return split_dataframes

In [21]:
def gps2traj(patchedGPS):
    """Convert GPS data to trajectory data."""
    patchedGPS['time'] = patchedGPS['time'].astype(int)

    sorted_dfs = GPS_sort_by_time(patchedGPS)
    split_dfs = split_by_time(sorted_dfs)
    traj_dfs = []
    for index, split_df in tqdm(enumerate(split_dfs)):
        #add a column to the DataFrame to store the trajectory ID
        indexed_data = [(index, *value) for value in split_df]
        #add the traj list to the list
        traj_dfs = traj_dfs + indexed_data
    #merge each split_df into a single DataFrame
    traj_df = pd.DataFrame(traj_dfs, columns=['index', 'id', 'time', 'lon', 'lat', 'dir', 'speed', 'timeinterval'])
    #reset the index of the DataFrame
    # traj_df = traj_df.reset_index(drop=True)
    return traj_df


In [55]:
#classify GPS point into different trajectories
path = "../../datasets/dataset_bj_time/GPS/patch_geo_coor/"
file_list = os.listdir(path)
iterater = tqdm(file_list)
for file_name in iterater:
    iterater.set_description_str(f"Processing {file_name}...")
    with open(os.path.join(path, file_name), 'rb') as f:
        patchedGPS = pickle.load(f)
    print(patchedGPS.shape)
    #Need these two columns to construct trajectory
    if not all(col in patchedGPS.columns for col in ["time", "ID"]):
        continue

    traj_df = gps2traj(patchedGPS)
    save_path = "../../datasets/dataset_bj_time/GPS/patch_traj/"
    if not os.path.isdir(save_path):
        os.makedirs(save_path)

    with open(os.path.join(save_path, file_name), 'wb') as f:
        pickle.dump(traj_df, f)

print("Done!")


Processing 33_36_gps.pkl...:   0%|          | 0/348 [00:00<?, ?it/s]

(22495, 7)




Sort GPS by time 5075: 100%|██████████| 5075/5075 [00:30<00:00, 164.07it/s]


(603, ID              0.000000e+00
time            1.228528e+09
lon             1.164196e+02
lat             3.988267e+01
dir             0.000000e+00
speed           0.000000e+00
timeinterval    3.055000e+02
Name: 603, dtype: float64)
ID              0.000000e+00
time            1.228528e+09
lon             1.164196e+02
lat             3.988267e+01
dir             0.000000e+00
speed           0.000000e+00
timeinterval    3.055000e+02
Name: 603, dtype: float64
(603, ID              0.000000e+00
time            1.228528e+09
lon             1.164196e+02
lat             3.988267e+01
dir             0.000000e+00
speed           0.000000e+00
timeinterval    3.055000e+02
Name: 603, dtype: float64)
ID              0.000000e+00
time            1.228528e+09
lon             1.164196e+02
lat             3.988267e+01
dir             0.000000e+00
speed           0.000000e+00
timeinterval    3.055000e+02
Name: 603, dtype: float64
ID              0.000000e+00
time            1.228528e+09
lon         


Split DataFrame 3653:  49%|████▊     | 2467/5075 [54:39<57:46,  1.33s/it] 
Processing 33_36_gps.pkl...:   0%|          | 0/348 [55:10<?, ?it/s]


KeyboardInterrupt: 

In [48]:
with open('../../datasets/dataset_bj_time/GPS/patch_traj/1_55_gps.pkl','rb')as f:
    data = pickle.load(f)
data

Unnamed: 0,index,id,time,lon,lat,dir,speed,timeinterval
0,0,1.0,1.228381e+09,116.52729,40.02008,63.0,2417.0,192.5
1,1,1.0,1.228385e+09,116.52548,40.01956,243.0,2623.0,185.0
2,2,2.0,1.228276e+09,116.52598,40.01973,243.0,2057.0,303.5
3,3,2.0,1.228280e+09,116.52459,40.01929,243.0,2675.0,310.5
4,4,5.0,1.228637e+09,116.52383,40.01883,63.0,2623.0,7.0
...,...,...,...,...,...,...,...,...
10842,9892,28013.0,1.228442e+09,116.52733,40.02008,63.0,2520.0,311.5
10843,9893,28013.0,1.228544e+09,116.52375,40.01883,63.0,2366.0,60.0
10844,9894,28013.0,1.228551e+09,116.52620,40.01987,243.0,2315.0,60.0
10845,9895,28015.0,1.228212e+09,116.52642,40.01997,243.0,205.0,299.5


In [49]:
for row in data.iterrows():
    print(row)
    break

(0, index           0.000000e+00
id              1.000000e+00
time            1.228381e+09
lon             1.165273e+02
lat             4.002008e+01
dir             6.300000e+01
speed           2.417000e+03
timeinterval    1.925000e+02
Name: 0, dtype: float64)
