In [34]:
from utils.sample_data_utils import *
import pandas as pd


采样一部分匹配异常的订单数据用于观察，主要依据原始路径与匹配路径的长度差值（`path_length_difference_m`）进行筛选。

In [35]:
# Load the three input tables and print each header right after loading,
# so the available columns can be checked before any filtering is done.

# Map-matched points (carries matched_longitude / matched_latitude per order).
matched_orders = pd.read_csv("matched_points_for_qgis.csv")
print(matched_orders.columns)

# Map-matching error metrics keyed by order_id.
error_df = pd.read_csv('map_matching_error_analysis.csv')
print(error_df.columns)

# Raw GPS points of the orders (longitude / latitude / gps_time).
original_df = pd.read_csv('filtered_orders.csv')
print(original_df.columns)

Index(['order_id', 'driver_id', 'matched_longitude', 'matched_latitude',
       'point_sequence', 'total_order_distance_m', 'total_order_duration_s',
       'order_avg_confidence'],
      dtype='object')
Index(['order_id', 'num_points', 'original_path_length_m',
       'matched_path_length_m', 'path_length_difference_m',
       'mean_pointwise_error_m', 'median_pointwise_error_m',
       'max_pointwise_error_m', 'frechet_distance'],
      dtype='object')
Index(['driver_id', 'order_id', 'gps_time', 'longitude', 'latitude',
       'time_diff'],
      dtype='object')


In [36]:
def filter_trajectory_data(filtered_orders_path, matched_points_path, error_analysis_path,
                           max_length_diff_m=70):
    """
    Select taxi-order trajectories by their path-length difference.

    An order is kept when the absolute value of its ``path_length_difference_m``
    in the error-analysis table is strictly smaller than ``max_length_diff_m``.
    Orders whose difference is NaN never satisfy the comparison and are dropped.

    Args:
        filtered_orders_path (str): Path of the CSV with the original order
            GPS trajectories. Must contain an ``order_id`` column.
        matched_points_path (str): Path of the CSV with the map-matched GPS
            trajectories. Must contain an ``order_id`` column.
        error_analysis_path (str): Path of the CSV with the map-matching error
            analysis. Must contain ``order_id`` and
            ``path_length_difference_m`` columns.
        max_length_diff_m (float): Exclusive upper bound, in meters, on the
            absolute path-length difference. Defaults to 70, the value that
            was previously hard-coded.

    Returns:
        tuple: ``(filtered_original_df, filtered_matched_df)``
            - filtered_original_df: rows of the original trajectories that
              belong to the selected orders.
            - filtered_matched_df: rows of the matched trajectories that
              belong to the selected orders.
            Both frames are independent copies, so callers can modify them
            without touching (or getting warnings about) the loaded data.
            ``(None, None)`` is returned when any input file does not exist.
    """
    # Load the three CSV files; a missing file is reported and signalled to
    # the caller with (None, None) instead of raising.
    try:
        original_orders_df = pd.read_csv(filtered_orders_path)
        matched_points_df = pd.read_csv(matched_points_path)
        error_analysis_df = pd.read_csv(error_analysis_path)
    except FileNotFoundError as e:
        print(f"错误: {e}")
        return None, None

    # Orders whose |path_length_difference_m| is below the threshold.
    within_limit = error_analysis_df['path_length_difference_m'].abs() < max_length_diff_m

    # De-duplicated IDs of the qualifying orders.
    valid_order_ids = error_analysis_df.loc[within_limit, 'order_id'].unique()

    # Keep only the trajectory rows of the qualifying orders. `.copy()` detaches
    # the results from the source frames (avoids SettingWithCopyWarning later).
    filtered_original_df = original_orders_df[
        original_orders_df['order_id'].isin(valid_order_ids)
    ].copy()
    filtered_matched_df = matched_points_df[
        matched_points_df['order_id'].isin(valid_order_ids)
    ].copy()

    return filtered_original_df, filtered_matched_df


In [37]:
# Locations of the three input CSV files.
filtered_orders_file = 'filtered_orders.csv'
matched_points_file = 'matched_points_for_qgis.csv'
error_analysis_file = 'map_matching_error_analysis.csv'

# Run the filter on the three inputs.
filtered_original_traces, filtered_matched_traces = filter_trajectory_data(
    filtered_orders_path=filtered_orders_file,
    matched_points_path=matched_points_file,
    error_analysis_path=error_analysis_file,
)

# Optional: uncomment the lines below to keep only 10 randomly chosen orders,
# which makes visual inspection easier.
# NOTE(review): this needs `np` (numpy) in scope — confirm it is imported.
# unique_orders = filtered_original_traces['order_id'].unique()
# sampled_order_ids = np.random.choice(unique_orders, size=10, replace=False)
#
# print(sampled_order_ids)
#
# filtered_original_traces = filtered_original_traces[filtered_original_traces['order_id'].isin(sampled_order_ids)].copy()
# filtered_matched_traces = filtered_matched_traces[filtered_matched_traces['order_id'].isin(sampled_order_ids)].copy()

# Write the results out only when both frames were actually produced.
if not (filtered_original_traces is None or filtered_matched_traces is None):
    # Output file names.
    output_original_file = "original_trajectories_under_70m_diff.csv"
    output_matched_file = "matched_trajectories_under_70m_diff.csv"

    # Save without the pandas index column.
    filtered_original_traces.to_csv(output_original_file, index=False)
    filtered_matched_traces.to_csv(output_matched_file, index=False)

    print(f"筛选后的原始轨迹数据已保存至: {output_original_file}")
    print(f"筛选后的匹配轨迹数据已保存至: {output_matched_file}")



筛选后的原始轨迹数据已保存至: original_trajectories_under_70m_diff.csv
筛选后的匹配轨迹数据已保存至: matched_trajectories_under_70m_diff.csv
