In [45]:
import pandas as pd
from geopy.distance import geodesic as gd

# Найдем станции с нагрузкой больше средней

In [2]:
divvy_data = pd.read_csv("data/2023/Divvy_Trips_2023_full.csv.gz", compression="gzip")

In [3]:
# Переводим в формат даты
divvy_data["started_at"] = pd.to_datetime(divvy_data["started_at"])
divvy_data["ended_at"] = pd.to_datetime(divvy_data["ended_at"])

In [4]:
# Добавим столбец с часом начала поездки
divvy_data["hour"] = divvy_data["started_at"].dt.hour

In [5]:
# Группируем данные по станции и часу
grouped_data = (
    divvy_data.groupby(["start_station_name", "hour"])["ride_id"].count().reset_index()
)

In [6]:
# Создаем сводную таблицу с количеством поездок по станциям и часам
grouped_data_start = (
    divvy_data.groupby(["start_station_name", "hour"])["ride_id"].count().reset_index()
)
grouped_data_end = (
    divvy_data.groupby(["end_station_name", "hour"])["ride_id"].count().reset_index()
)

In [7]:
# Считаем количество велосипедов которые приехали/уехали для каждой станции
pivot_table_start = pd.pivot_table(
    grouped_data_start, values="ride_id", index=["start_station_name"], columns=["hour"]
).fillna(0)
pivot_table_end = pd.pivot_table(
    grouped_data_end, values="ride_id", index=["end_station_name"], columns=["hour"]
).fillna(0)

In [8]:
# Получаем разницу между приехавшими и уехавшими велосипедами
net_change = pivot_table_end - pivot_table_start

In [9]:
# Считаем среднюю нагрузку на станции как разницу между изменением на станции и средним изменением по всем станциям
load = net_change.apply(lambda x: x - x.mean(), axis=1)

In [10]:
# Получаем среднюю нагрузку на станции
avg_load = load.mean(axis=0)

In [11]:
peak_hour = avg_load.idxmin()

In [12]:
peak_hour

18

In [13]:
low_stations = load.loc[:, peak_hour].nlargest(5).index.tolist()

In [14]:
peak_stations = load.loc[:, peak_hour].nsmallest(5).index.tolist()

In [15]:
avg_load_peak = load.loc[peak_stations + low_stations, peak_hour].mean()

In [16]:
underloaded_stations = load[load.loc[:, peak_hour] > avg_load_peak].index.tolist()

In [17]:
overloaded_stations = load[load.loc[:, peak_hour] < avg_load_peak].index.tolist()

In [18]:
net_change.head()

hour,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
2112 W Peterson Ave,16.0,7.0,1.0,-2.0,0.0,1.0,-1.0,-7.0,1.0,-1.0,...,0.0,2.0,4.0,8.0,-4.0,-3.0,-4.0,6.0,0.0,-4.0
63rd St Beach,0.0,0.0,0.0,0.0,0.0,0.0,6.0,1.0,0.0,0.0,...,-2.0,2.0,-1.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0
900 W Harrison St,1.0,-4.0,3.0,1.0,4.0,0.0,7.0,76.0,76.0,169.0,...,-49.0,-105.0,-123.0,-120.0,-62.0,-27.0,-6.0,-59.0,0.0,-8.0
Aberdeen St & Jackson Blvd,5.0,2.0,3.0,0.0,1.0,-20.0,-25.0,-23.0,-135.0,-28.0,...,-33.0,8.0,-15.0,63.0,9.0,-4.0,9.0,-5.0,-6.0,5.0
Aberdeen St & Monroe St,-2.0,1.0,0.0,0.0,-30.0,-38.0,-122.0,-17.0,-133.0,-6.0,...,15.0,0.0,178.0,121.0,34.0,-1.0,-11.0,1.0,-1.0,-10.0


In [19]:
peak_stations

['University Ave & 57th St',
 'Clinton St & Washington Blvd',
 'Canal St & Adams St',
 'Dearborn St & Monroe St',
 'Ravenswood Ave & Lawrence Ave']

In [20]:
low_stations

['Sedgwick St & North Ave',
 'Clark St & Schiller St',
 'Wells St & Evergreen Ave',
 'Southport Ave & Clybourn Ave',
 'Emerald Ave & 28th St']

In [21]:
len(overloaded_stations)

52

In [22]:
len(underloaded_stations)

1022

# Для каждой нагруженной станции найдем ближайшую ненагруженную

In [23]:
# Создаем датафреймы с нагруженными и ненагруженными станциями
overloaded_df = pd.DataFrame(overloaded_stations, columns=["overloaded_station"])
underloaded_df = pd.DataFrame(underloaded_stations, columns=["underloaded_station"])

In [24]:
overloaded_df.head()

Unnamed: 0,overloaded_station
0,900 W Harrison St
1,Ashland Ave & Division St
2,Broadway & Wilson Ave
3,Calumet Ave & 35th St
4,Canal St & Adams St


In [25]:
# Загружаем датасет с координатами станций
coords = pd.read_csv("data/2023/station_coord.csv")

In [26]:
coords.head()

Unnamed: 0,station_name,lng,lat,end_station_id
0,Lincoln Ave & Fullerton Ave,-87.667968,41.91461,TA1309000058
1,Kimbark Ave & 53rd St,-87.594747,41.799568,TA1309000037
2,Western Ave & Lunt Ave,-87.700825,41.965875,RP-005
3,Lakeview Ave & Fullerton Pkwy,-87.648556,41.949097,TA1309000019
4,Broadway & Waveland Ave,-87.648577,41.949088,13325


In [27]:
# Добавляем координаты станций в датафреймы
overloaded_df = pd.merge(
    overloaded_df, coords, left_on="overloaded_station", right_on="station_name"
)

overloaded_df.drop(columns=["overloaded_station", "end_station_id"], inplace=True)

In [28]:
underloaded_df = pd.merge(
    underloaded_df, coords, left_on="underloaded_station", right_on="station_name"
)

underloaded_df.drop(columns=["underloaded_station", "end_station_id"], inplace=True)

In [29]:
overloaded_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52 entries, 0 to 51
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   station_name  52 non-null     object 
 1   lng           52 non-null     float64
 2   lat           52 non-null     float64
dtypes: float64(2), object(1)
memory usage: 1.3+ KB


In [30]:
# Для каждой загруженной станции приблизительно находим ближайшую ненагруженную
results = []
for i, overloaded_station in overloaded_df.iterrows():
    min_distance = float("inf")
    less_loaded_station = None
    less_loaded_station_coords = None
    for j, less_loaded_station_tmp in underloaded_df.iterrows():
        distance = float(
            gd(
                (overloaded_station["lat"], overloaded_station["lng"]),
                (less_loaded_station_tmp["lat"], less_loaded_station_tmp["lng"]),
            ).km
        )
        if float(distance) < float(min_distance) and distance > 0.05:
            min_distance = distance
            less_loaded_station = less_loaded_station_tmp
            less_loaded_station_coords = (
                less_loaded_station_tmp["lng"],
                less_loaded_station_tmp["lat"],
            )

    results.append(
        {
            "overloaded_station": overloaded_station["station_name"],
            "less_loaded_station": less_loaded_station["station_name"],
            "distance": min_distance,
            "overloaded_station_coords": (
                overloaded_station["lng"],
                overloaded_station["lat"],
            ),
            "less_loaded_station_coords": less_loaded_station_coords,
        }
    )

In [31]:
df_result = pd.DataFrame(results)

In [32]:
# Посчитаем реальное расстояние для каждой пары
import os
from time import sleep
import openrouteservice as ors

API_KEY = os.environ.get("ORS_KEY")
ors_client = ors.Client(key=API_KEY)

In [33]:
def get_route_length(origin, destination):
    # Задержка для того чтобы не превысить лимит запросов
    sleep(1)
    # Отправляем запрос на сервер и получаем ответ в формате JSON
    response = ors_client.directions(
        coordinates=[origin, destination], profile="driving-car", format="json"
    )
    # Извлекаем длину маршрута из ответа если маршрут получен
    if response["routes"][0]["summary"] != {}:
        route_length = response["routes"][0]["summary"]["distance"]
    else:
        route_length = None
    # Возвращаем длину маршрута в метрах
    return route_length

In [None]:
df_result["real_distance"] = df_result.apply(
    lambda x: get_route_length(
        x["overloaded_station_coords"], x["less_loaded_station_coords"]
    ),
    axis=1,
)

In [35]:
def calc_distance(coords1, coords2):
    return gd(coords1, coords2).km

In [36]:
def find_nearest(coordinates, stations, overload_station):
    distances = [
        (calc_distance(coordinates, station[4]), station)
        for station in stations
        if station[0] != overload_station
    ]
    return min(distances, key=lambda x: x[0])[1]

In [37]:
def find_shortest_path(df, station):
    stations = df.values.tolist()
    route = [
        {
            "station_name": station,
            "distance": None,
            "coords": stations[stations[1] == station][3],
        }
    ]
    current_coords = route[0]["coords"]
    for _ in range(len(stations) - 1):
        remaining_stations = [
            s for s in stations if s[0] not in [p["station_name"] for p in route]
        ]
        next_station = find_nearest(
            current_coords, remaining_stations, route[-1]["station_name"]
        )
        route.append(
            {
                "station_name": next_station[0],
                "distance": next_station[5],
                "coords": next_station[3],
            }
        )
        route.append(
            {
                "station_name": next_station[1],
                "distance": None,
                "coords": next_station[4],
            }
        )
        current_coords = next_station[4]
    return route

In [38]:
start_station = "Damen Ave & Charleston St"
path = find_shortest_path(df_result, start_station)

In [39]:
def calculate_distances(track):
    for _ in range(1, len(track)):
        prev_station = track[_ - 1]["coords"]
        current_station = track[_]["coords"]
        dist = get_route_length(prev_station, current_station)
        track[_]["distance"] = dist
    return track

In [None]:
path = calculate_distances(path)

In [41]:
path_df = pd.DataFrame(path)

In [48]:
path_df.sample(2)

Unnamed: 0,station_name,distance,coords
79,Sheffield Ave & Fullerton Ave,493.1,"(-87.64862, 41.94910266666667)"
82,Western Ave & Leland Ave,880.8,"(-87.64641726, 41.94549036)"


In [43]:
# Сколько всего нужно проехать
round(path_df["distance"].sum(), 1)

116422.0

In [47]:
path_df["distance"].median()

510.5