Read requests in one folder

In [4]:
from pathlib import Path
import json
import pandas as pd


# Folder containing the JSON request files to load.
# NOTE(review): absolute, machine-specific path; consider deriving it from a
# configurable data directory so the notebook runs on other machines.
path_request = "/home/pascal/Documenten/Syntra/Data Scientist/opdracht_transport/data/requests/0521_301-20220531"
requests = Path(path_request)

# Read the files in name order so the resulting row order is reproducible.
# Sub-directories are skipped because they cannot be opened as JSON files.
files = sorted((p for p in requests.iterdir() if p.is_file()), key=lambda p: p.name)

# Explicit column list: keeps the frame well-formed (and the dtype cast below
# working) even when the folder holds no tasks at all.
columns = ["configuration", "task_id", "latitude", "longitude", "from", "until"]

rows = []
for file in files:
    # Parse inside the context manager, then release the handle before
    # building the rows.
    with file.open("r", encoding="utf-8") as f:
        data = json.load(f)

    # None (not an empty dict) marks a request without a configuration name,
    # so the column only ever holds strings or missing values.
    configuration = data.get("configurationName")
    data_list = data.get("tasks", [])

    # One output row per task, flattening the nested address / time window.
    for task in data_list:
        address = task["address"]
        time_window = task["timeWindow"]
        rows.append({
            "configuration": configuration,
            "task_id": task["id"],
            "latitude": address["latitude"],
            "longitude": address["longitude"],
            "from": time_window["from"],
            "until": time_window["till"],
        })

df = pd.DataFrame(rows, columns=columns)
# NOTE(review): a df.drop_duplicates() call was disabled here in the original;
# presumably repeated tasks across request files are wanted - confirm.
df["task_id"] = df["task_id"].astype(int)
print(df)
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit an extra "None" line).
df.info()

       configuration  task_id  latitude  longitude                 from  \
0     CreateSequence    65547  0.648395   0.307167  2022-05-31T08:00:00   
1     CreateSequence    65483  0.646775   0.308146  2022-05-31T07:30:00   
2     CreateSequence    65476  0.642500   0.310977  2022-05-31T07:30:00   
3     CreateSequence    65477  0.649220   0.300745  2022-05-31T07:30:00   
4     CreateSequence    65478  0.642850   0.304187  2022-05-31T07:30:00   
...              ...      ...       ...        ...                  ...   
1799   AddToSequence      143  0.636873   0.306125  2022-05-31T07:30:00   
1800   AddToSequence      144  0.636873   0.306125  2022-05-31T07:30:00   
1801   AddToSequence      155  0.648065   0.301981  2022-05-31T12:00:00   
1802   AddToSequence      156  0.648065   0.301981  2022-05-31T12:00:00   
1803   AddToSequence      157  0.649099   0.304113  2022-05-31T12:00:00   

                    until  
0     2022-05-31T17:00:00  
1     2022-05-31T23:59:00  
2     2022-05-3

Locaties vergelijken

In [5]:
from geopy.distance import geodesic

# Coordinates of the first two tasks in df, as (latitude, longitude) pairs.
coords_1 = (0.648395, 0.307167)
coords_2 = (0.646775, 0.308146)

# Geodesic distance between the two points, in metres.
# The variable is named after the unit it actually holds (it was previously
# called distance_km although it stored metres).
distance_m = geodesic(coords_1, coords_2).meters

print(f"Distance: {distance_m:.2f} m")


Distance: 209.67 m


Unieke location id's maken

In [6]:

from geopy.point import Point

# Give every row of df a location_id: a row whose coordinates lie within
# threshold_m metres of an already registered location reuses that location's
# id, otherwise it registers a new location. Ids are "LOC_" plus the 1-based
# position of the location in unique_points, zero-padded to three digits.

# 1. Initialize the unique location tracking
unique_points = []  # representative Point of each location, in creation order
location_map = {}   # row index -> location_id
threshold_m = 1.0   # rows closer than this many metres share a location
resolved_coords = {}  # (latitude, longitude) -> location_id already determined

# 2. Walk the rows in order; zip over the columns avoids the per-row Series
#    that iterrows() would build.
for idx, lat, lon in zip(df.index, df["latitude"], df["longitude"]):
    coord_key = (lat, lon)
    location_id = resolved_coords.get(coord_key)

    if location_id is None:
        # Coordinates not seen before: look for the first registered location
        # within the threshold. Exact repeats skip this scan; the outcome is
        # the same because unique_points only ever grows at the end, so the
        # first match for a given coordinate never changes.
        current_pt = Point(lat, lon)
        for i, saved_pt in enumerate(unique_points):
            if geodesic(current_pt, saved_pt).meters < threshold_m:
                location_id = f"LOC_{i + 1:03d}"
                break
        else:
            # No location within the threshold: register a new one.
            unique_points.append(current_pt)
            location_id = f"LOC_{len(unique_points):03d}"
        resolved_coords[coord_key] = location_id

    location_map[idx] = location_id

# Write the column in one assignment instead of cell by cell.
df["location_id"] = df.index.map(location_map)

# 3. Display results (rows with identical coordinates, e.g. 1801/1802, share an id)
print(df[['task_id', 'latitude', 'longitude', 'location_id']].tail(10))


      task_id  latitude  longitude location_id
1794      110  0.641092   0.305176     LOC_098
1795      109  0.640913   0.305728     LOC_104
1796      108  0.640778   0.305955     LOC_117
1797      107  0.640991   0.306125     LOC_023
1798      106  0.640979   0.306348     LOC_078
1799      143  0.636873   0.306125     LOC_095
1800      144  0.636873   0.306125     LOC_095
1801      155  0.648065   0.301981     LOC_062
1802      156  0.648065   0.301981     LOC_062
1803      157  0.649099   0.304113     LOC_057
