In [1]:
import os
import json
import time
import requests
from typing import List
from IPython.display import IFrame, display_html

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import folium
from geopy.distance import geodesic

# Funkcje pomocnicze

In [2]:
def get_data(url: str, folder_path: str, file_id: int, api_key: str = None, headers: dict = None) -> None:
    """Download one API snapshot and store it as a JSON file.

    Sends a POST request to ``url`` and, when the response carries a list of
    records under ``"result"``, writes the whole decoded payload to
    ``<folder_path>/data_<NN>.json`` (``NN`` is ``file_id`` zero-padded to two
    digits).

    The API reports a rejected call inside an otherwise successful response,
    with a string in place of the record list, e.g.::

        {'result': 'Błędna metoda lub parametry wywołania'}

    Such payloads are reported on stdout and not saved, because
    ``jsons_to_parquet`` later normalizes ``data["result"]`` into rows.

    Args:
        url: Complete request URL, query string included.
        folder_path: Directory that receives the snapshot; created if missing.
        file_id: Sequence number used in the file name.
        api_key: Not used here (the key is expected to be embedded in ``url``).
            Kept for backward compatibility and now optional.
        headers: HTTP headers for the request. When omitted,
            ``{"Cache-Control": "no-cache"}`` is sent, matching the
            module-level ``header`` this function used to read implicitly.

    Raises:
        requests.HTTPError: If the server answers with an error status code.
        requests.Timeout: If the server does not answer within 30 seconds.
    """
    # Use the caller's headers; previously the parameter was ignored in favour
    # of a notebook-level global that had to exist at call time.
    request_headers = {"Cache-Control": "no-cache"} if headers is None else headers

    # A timeout keeps a periodic collection loop from hanging on one request.
    response = requests.post(url, headers=request_headers, timeout=30)
    response.raise_for_status()
    data = response.json()

    # Successful payloads hold a list of records; anything else (including the
    # error string shown in the docstring) is an API-level failure.
    result = data.get("result") if isinstance(data, dict) else None
    if not isinstance(result, list):
        print(f"An error occured. Status code: {response.status_code}")
        return

    os.makedirs(folder_path, exist_ok=True)
    save_path = os.path.join(folder_path, f"data_{file_id:02d}.json")
    with open(save_path, "w", encoding="utf-8") as f:
        json.dump(data, f)


def jsons_to_parquet(folder_path: str, json_files: List[str], save_path: str) -> None:
    """Merge JSON snapshots into a single parquet file.

    Every file named in ``json_files`` is read from ``folder_path``, the
    records under its ``"result"`` key are flattened into a DataFrame, and
    the frames are concatenated in the given order with a fresh 0..n-1 index.
    The combined table is written to ``<save_path>.parquet`` with pyarrow.

    Args:
        folder_path: Directory containing the JSON snapshots.
        json_files: File names (relative to ``folder_path``) to merge.
        save_path: Output path without the ``.parquet`` extension.
    """
    def _load_frame(file_name: str) -> pd.DataFrame:
        # One snapshot -> one frame built from its "result" records.
        with open(os.path.join(folder_path, file_name), "r", encoding="utf-8") as handle:
            payload = json.load(handle)
        return pd.json_normalize(payload["result"])

    frames = [_load_frame(file_name) for file_name in json_files]
    combined = pd.concat(frames, ignore_index=True)
    combined.to_parquet(f"{save_path}.parquet", engine="pyarrow")


def calculate_distance(data: pd.DataFrame) -> float:
    """Return the total length, in kilometres, of one vehicle's route.

    The rows of ``data`` are treated as consecutive position fixes in the
    order given; the result is the sum of the geodesic distances between each
    pair of neighbouring rows.

    Args:
        data: Frame with ``Lat`` and ``Lon`` columns, already ordered along
            the route (this function does not sort).

    Returns:
        Accumulated distance in kilometres; ``0.0`` for fewer than two rows.
    """
    # Fewer than two fixes means there is no segment to measure.
    if len(data) < 2:
        return 0.0

    # Read both coordinate columns once instead of building a row Series
    # through ``iloc`` four times per step.
    points = list(zip(data["Lat"], data["Lon"]))

    # Explicit accumulation keeps the left-to-right summation order.
    total_distance = 0.0
    for start, end in zip(points, points[1:]):
        total_distance += geodesic(start, end).km

    return total_distance


def create_distance_df(data: pd.DataFrame) -> pd.DataFrame:
    """"""
    grouped = data.sort_values(by=["VehicleNumber", "Time"]).groupby(["Lines", "VehicleNumber"])
    distances = grouped.apply(calculate_distance, include_groups=False)
    distances = distances.reset_index(name="TotalDistance")
    
    return distances


# Pobranie danych

In [3]:
# The API key is read from the environment so it is never stored in the
# notebook. Set UM_WARSZAWA_API_KEY before collecting data; the empty fallback
# keeps the rest of the notebook runnable from snapshots already on disk.
api_key = os.environ.get("UM_WARSZAWA_API_KEY", "")

# NOTE(review): resource_id restored to the canonical 8-4-4-4-12 UUID form
# (the previous value was missing the hyphen after "f2e5503e") — confirm
# against the API documentation.
# type=2 presumably selects trams (the data folders are named data_tram_*).
url = f"https://api.um.warszawa.pl/api/action/busestrams_get/?resource_id=f2e5503e-927d-4ad3-9500-4ab9e55deb59&apikey={api_key}&type=2"
header = {"Cache-Control": "no-cache"}

afternoon_folder = "data_tram_30_afternoon"
evening_folder = "data_tram_30_evening"

# Data collection is opt-in so that "Restart & Run All" never hits the API or
# overwrites existing snapshots. When enabled, it stores 12 snapshots taken
# 30 seconds apart in the evening folder.
DOWNLOAD_DATA = False
if DOWNLOAD_DATA:
    for i in range(12):
        print(i)
        get_data(url, evening_folder, i, api_key, header)
        time.sleep(30)

# Zapisanie i wczytanie danych w formacie parquet

In [4]:
def _json_files_in(folder: str) -> list:
    """Return the sorted names of the ``.json`` files found in ``folder``."""
    names = [name for name in os.listdir(folder) if name.endswith(".json")]
    names.sort()
    return names


# Merge the snapshots of each collection window into one parquet file.
evening_files = _json_files_in(evening_folder)
jsons_to_parquet(evening_folder, evening_files, "data_tram_evening")

afternoon_files = _json_files_in(afternoon_folder)
jsons_to_parquet(afternoon_folder, afternoon_files, "data_tram_afternoon")

In [5]:
# Load the merged snapshots written by the previous cell.
data_afternoon = pd.read_parquet("data_tram_afternoon.parquet")
data_evening = pd.read_parquet("data_tram_evening.parquet")

# Convert the Time column of both frames to pandas datetimes so that records
# can be ordered chronologically.
# NOTE(review): the sample shown below contains timestamps from several
# different dates — confirm whether such records should be filtered out before
# distances are computed.
for data in (data_afternoon, data_evening):
    data["Time"] = pd.to_datetime(data["Time"])

print(data_afternoon.shape, data_evening.shape)
data_afternoon

(6555, 6) (6556, 6)


Unnamed: 0,Lines,Lon,VehicleNumber,Time,Lat,Brigade
0,36,20.929117,1252,2024-05-23 12:06:19,52.300360,4
1,35,21.006310,1276,2024-06-06 15:17:25,52.213062,014
2,26,20.932413,1282,2024-06-05 14:25:50,52.300140,020
3,4,20.958760,1286,2024-03-28 09:01:54,52.249317,018
4,18,20.928482,1290,2024-06-05 17:21:20,52.300053,015
...,...,...,...,...,...,...
6550,2,20.929567,4281,2024-06-06 15:22:58,52.291920,6
6551,33,20.929977,4282,2024-06-06 15:23:02,52.298847,082
6552,25,20.979330,4283,2024-06-06 15:23:01,52.214085,9
6553,33,20.928213,4284,2024-06-06 15:22:59,52.280228,56


# Obliczenie dystansu przebytego przez pojazd w czasie zbierania danych

In [6]:
# Distance covered by every (line, vehicle) pair in each collection window;
# the afternoon table is displayed as the cell output.
distances_afternoon, distances_evening = map(
    create_distance_df, (data_afternoon, data_evening)
)
distances_afternoon

Unnamed: 0,Lines,VehicleNumber,TotalDistance
0,1,3116,0.687372
1,1,3123,1.706877
2,1,3139,0.009538
3,1,3140,0.705022
4,1,3144,0.013164
...,...,...,...
551,9,3615,2.547396
552,9,3625,2.377327
553,9,3639,0.674793
554,9,3645,1.760361


# Analiza danych

In [7]:
# lines = distances_afternoon.Lines.unique()
# lines_vehicles = {line: len(list(distances_afternoon[distances_afternoon.Lines == line].VehicleNumber.unique())) for line in lines}

In [42]:
# distances.groupby("Lines")["total_distance"].agg(["min", "max", "mean", "sum", "std", "size"]).sort_values(by="sum", ascending=False)

In [40]:
city_center_coords = [52.2297, 21.0122]


def build_vehicle_map(data: pd.DataFrame, save_path: str) -> folium.Map:
    """Build a map with one marker per vehicle and save it as HTML.

    For every ``VehicleNumber`` the first available value of each column is
    used, so each vehicle gets a single marker at its ``Lat``/``Lon`` with the
    line number shown in the popup.

    Args:
        data: Position records with ``VehicleNumber``, ``Lat``, ``Lon`` and
            ``Lines`` columns.
        save_path: Path of the HTML file to write.

    Returns:
        The populated map, centred on ``city_center_coords``.
    """
    vehicle_map = folium.Map(location=city_center_coords, zoom_start=12, width="100%", height="100%")
    for sample in data.groupby("VehicleNumber").first().itertuples():
        folium.Marker([sample.Lat, sample.Lon],
                      popup=f'<b>{sample.Lines}</b>',
                      tooltip="Click to see the line number!").add_to(vehicle_map)
    vehicle_map.save(save_path)
    return vehicle_map


# One map per collection window; the saved files are embedded side by side in
# the next cell.
evening_map = build_vehicle_map(data_evening, "evening.html")
afternoon_map = build_vehicle_map(data_afternoon, "afternoon.html")

In [41]:
# CSS for a two-column flex layout: each "map" panel takes half of the row.
style = """
<style>
.container {
    display: flex;
    justify-content: space-around;
}
.map {
    width: 50%;
}
</style>
"""

# Frames pointing at the HTML files saved by the map-building cell.
iframe1 = IFrame(src="afternoon.html", width="98%", height=850)
iframe2 = IFrame(src="evening.html", width="100%", height=850)

# Wrap each iframe in its own panel inside the shared flex container.
panels = "\n".join(
    f'    <div class="map">{frame._repr_html_()}</div>'
    for frame in (iframe1, iframe2)
)
html_content = f'\n<div class="container">\n{panels}\n</div>\n'

display_html(style + html_content, raw=True)