In [7]:
import json
from collections import Counter

# How many of the most frequent endpoints to report.
TOP_N = 10

# Load the arc records from arc_data.json. Each record is read below for its
# "source" and "destination" entries. JSON is read explicitly as UTF-8 so the
# result does not depend on the platform's default encoding.
with open("arc_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Pool every source and destination into a single list. Each coordinate is
# converted to a tuple so that it is hashable and can be counted.
points = []
for item in data:
    points.append(tuple(item["source"]))
    points.append(tuple(item["destination"]))

# Count how often each coordinate occurs.
counter = Counter(points)

# The TOP_N most frequent coordinates, most frequent first.
most_common_points = counter.most_common(TOP_N)
# Alias kept for code that still uses the old name; the list has always
# held TOP_N (10) entries, not two.
most_common_two = most_common_points

# The heading is derived from TOP_N so it cannot disagree with the list length.
print(f"出现最多的 {TOP_N} 个坐标：")
for point, count in most_common_points:
    print(point, "出现次数:", count)

出现最多的两个坐标：
(-74.04262013641586, 40.7196218536227) 出现次数: 117
(-73.959191, 40.808779) 出现次数: 110
(-74.0413138888889, 40.721405555555556) 出现次数: 75
(-74.04152777777777, 40.72095277777778) 出现次数: 48
(-74.04150277777778, 40.72100555555556) 出现次数: 24
(-117.23013888888889, 32.77623055555556) 出现次数: 20
(-74.041825, 40.721336111111114) 出现次数: 17
(-117.23271111111112, 32.78006388888889) 出现次数: 16
(-117.27603055555555, 32.848730555555555) 出现次数: 16
(-73.958725, 40.816902777777784) 出现次数: 16


In [15]:
import json
import math

# ===== Configuration =====
IN_FILE  = "arc_data.json"
OUT_FILE = "filtered_columbia_jersey.json"

# Exclusion circles as (center_lat, center_lon, radius_m).
# Centers are written in (lat, lon) order, whereas in_any_circle unpacks the
# data points as (lon, lat).
REMOVE_CIRCLES = [
    (40.807454263793694, -73.96235812656353, 800),    # Columbia University, 800 m radius
    (40.71956242942621,  -74.04266374602265, 2000),   # Jersey City, 2000 m radius
]

# ===== Helper functions =====
def haversine_m(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in meters between two points.

    Applies the haversine formula on a sphere of radius 6,371,000 m.

    Args:
        lat1: Latitude of the first point, in degrees.
        lon1: Longitude of the first point, in degrees.
        lat2: Latitude of the second point, in degrees.
        lon2: Longitude of the second point, in degrees.

    Returns:
        The distance along the sphere's surface, in meters.
    """
    R = 6371000.0
    phi1, phi2 = math.radians(lat1), math.radians(lat2)
    dphi = math.radians(lat2 - lat1)
    dlambda = math.radians(lon2 - lon1)
    a = (
        math.sin(dphi / 2) ** 2
        + math.cos(phi1) * math.cos(phi2) * math.sin(dlambda / 2) ** 2
    )
    # Floating-point rounding can push `a` marginally above 1 for
    # (near-)antipodal points, which would make math.asin raise ValueError.
    # Clamping keeps the result at half the circumference in that case.
    return 2 * R * math.asin(math.sqrt(min(1.0, a)))

def in_any_circle(pt):
    """Tell whether a point lies inside at least one circle of REMOVE_CIRCLES.

    `pt` is unpacked as (lon, lat). A point whose distance to a center equals
    the radius exactly counts as inside.
    """
    lon, lat = pt
    return any(
        haversine_m(lat, lon, center_lat, center_lon) <= radius_m
        for center_lat, center_lon, radius_m in REMOVE_CIRCLES
    )

# ===== Load data =====
# JSON is read and written explicitly as UTF-8 so the result does not depend
# on the platform's default text encoding.
with open(IN_FILE, "r", encoding="utf-8") as f:
    data = json.load(f)

# ===== Filter =====
total = len(data)
filtered = []

for item in data:
    src = tuple(item["source"])
    dst = tuple(item["destination"])

    # Keep a record only when neither endpoint falls inside an exclusion circle.
    if not (in_any_circle(src) or in_any_circle(dst)):
        filtered.append(item)

# Every record is either kept or removed, so the removed count follows directly.
removed = total - len(filtered)

# ===== Save result =====
with open(OUT_FILE, "w", encoding="utf-8") as f:
    json.dump(filtered, f, indent=4)

print(f"原始数量: {total}")
print(f"过滤后数量: {len(filtered)}")
print(f"删除数量: {removed}")

原始数量: 5100
过滤后数量: 3884
删除数量: 1216


In [18]:
import json
import math
from collections import Counter

# ===== Configuration =====
IN_FILE  = "arc_data.json"
# NOTE(review): the file name says "dist20km" but MAX_DIST_KM below is 15.0.
# The path is left unchanged so that anything reading it keeps working.
OUT_FILE = "filtered_columbia_jersey_dist20km.json"

# Exclusion circles as (label, center_lat, center_lon, radius_m).
CIRCLES = [
    ("Columbia",    40.807454263793694, -73.96235812656353, 800),    # Columbia University, 800 m radius
    ("JerseyCity",  40.71956242942621,  -74.04266374602265, 2000),   # Jersey City, 2000 m radius
]

# Records whose source-destination distance exceeds this many kilometers are
# removed as well.
MAX_DIST_KM = 15.0

# ===== Helper functions =====
def haversine_m(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in meters between two points.

    Applies the haversine formula on a sphere of radius 6,371,000 m.

    Args:
        lat1: Latitude of the first point, in degrees.
        lon1: Longitude of the first point, in degrees.
        lat2: Latitude of the second point, in degrees.
        lon2: Longitude of the second point, in degrees.

    Returns:
        The distance along the sphere's surface, in meters.
    """
    R = 6371000.0
    phi1, phi2 = math.radians(lat1), math.radians(lat2)
    dphi = math.radians(lat2 - lat1)
    dlambda = math.radians(lon2 - lon1)
    a = (
        math.sin(dphi / 2) ** 2
        + math.cos(phi1) * math.cos(phi2) * math.sin(dlambda / 2) ** 2
    )
    # Floating-point rounding can push `a` marginally above 1 for
    # (near-)antipodal points, which would make math.asin raise ValueError.
    # Clamping keeps the result at half the circumference in that case.
    return 2 * R * math.asin(math.sqrt(min(1.0, a)))

def dist_km(pt1, pt2):
    """Return the great-circle distance between two points, in kilometers.

    Both points are unpacked as (lon, lat) and passed on to haversine_m in
    the (lat, lon) argument order it expects.
    """
    (lon_a, lat_a), (lon_b, lat_b) = pt1, pt2
    meters = haversine_m(lat_a, lon_a, lat_b, lon_b)
    return meters / 1000.0

def hit_circles(point):
    """Return the set of labels of every circle in CIRCLES containing `point`.

    `point` is unpacked as (lon, lat). A point whose distance to a center
    equals the radius exactly counts as inside. The set is empty when no
    circle contains the point.
    """
    lon, lat = point
    return {
        name
        for name, center_lat, center_lon, radius_m in CIRCLES
        if haversine_m(lat, lon, center_lat, center_lon) <= radius_m
    }

# ===== Load data =====
# JSON is read and written explicitly as UTF-8 so the result does not depend
# on the platform's default text encoding.
with open(IN_FILE, "r", encoding="utf-8") as f:
    data = json.load(f)

# ===== Filter and collect statistics =====
stats = Counter()
filtered = []

for item in data:
    src = tuple(item["source"])        # unpacked as (lon, lat) by the helpers
    dst = tuple(item["destination"])   # unpacked as (lon, lat) by the helpers

    # Labels of the circles hit by either endpoint; may hold "Columbia",
    # "JerseyCity", both, or nothing.
    src_hits = hit_circles(src)
    dst_hits = hit_circles(dst)
    circle_hits = src_hits | dst_hits

    # Source-destination distance above the configured threshold.
    too_far = dist_km(src, dst) > MAX_DIST_KM

    # A record is kept only when no removal condition applies.
    if not (circle_hits or too_far):
        filtered.append(item)
        continue

    stats["total_removed"] += 1
    # Per-reason counts may overlap: one record can hit several circles and
    # also exceed the distance threshold.
    for label in circle_hits:
        stats[f"removed_in_{label}"] += 1
    if too_far:
        # The key is threshold-neutral; the earlier "..._over_20km" name
        # contradicted MAX_DIST_KM.
        stats["removed_dist_over_max"] += 1
        # Overlap: removed for a circle hit and for distance at the same time.
        if circle_hits:
            stats["overlap_circle_and_dist"] += 1

# ===== Save result =====
with open(OUT_FILE, "w", encoding="utf-8") as f:
    json.dump(filtered, f, indent=4)

print(f"原始数量: {len(data)}")
print(f"过滤后数量: {len(filtered)}")
print(f"总共删除: {stats['total_removed']}")
print(f"—— 命中哥大圈删除: {stats['removed_in_Columbia']}")
print(f"—— 命中Jersey City圈删除: {stats['removed_in_JerseyCity']}")
print(f"—— source-destination 距离> {MAX_DIST_KM} km 删除: {stats['removed_dist_over_max']}")
print(f"—— 同时命中圈且距离> {MAX_DIST_KM} km（重叠）: {stats['overlap_circle_and_dist']}")

原始数量: 5100
过滤后数量: 3854
总共删除: 1246
—— 命中哥大圈删除: 643
—— 命中Jersey City圈删除: 580
—— source-destination 距离> 15.0 km 删除: 88
—— 同时命中圈且距离> 15.0 km（重叠）: 58
