In [1]:
from scipy.spatial import KDTree
from geopy.distance import geodesic
import pandas as pd
from tqdm.auto import tqdm

In [2]:
gh_df = (
    pd.read_csv("geohash_traffic_density_pt_15clusters.csv", usecols=['GEOHASH', 'LATITUDE', 'LONGITUDE'])
)

# Reading Metro data and ensuring to take only unique coordinates
site_dict = (
    pd.read_csv("datasets/08_tr_istanbul_rail_sea_transportation_locations/tr_ist_bus_stops.csv", 
                usecols=['STATION_CODE', 'LATITUDE', 'LONGITUDE'])
    .set_index('STATION_CODE')
    .T.to_dict()
)

In [3]:
# Unique list of coordinates
coordinates = gh_df[['LATITUDE', 'LONGITUDE']].drop_duplicates().values
coordinates

array([[41.080627, 28.811646],
       [40.987244, 29.108276],
       [41.003723, 29.09729 ],
       ...,
       [40.976257, 29.229126],
       [41.009216, 27.98767 ],
       [41.020203, 27.998657]])

In [4]:
def find_nearest_coordinates(
        target_lat: float,
        target_lon: float,
        coordinates: list[list[float, float]]
        ) -> list[tuple[tuple[float, float], float]]:
    """
    Finds the nearest coordinates to a target latitude and longitude.

    Args:
        target_lat (float): The latitude of the target location.
        target_lon (float): The longitude of the target location.
        coordinates (List[Tuple[float, float]]): A list of tuples representing the coordinates to search from.

    Returns:
        List[Tuple[Tuple[float, float], float]]: A list of tuples containing the nearest coordinates and their distances in kilometers.
    """
    target_coords = (target_lat, target_lon)
    
    # Build a KDTree for efficient nearest neighbor search
    tree = KDTree(coordinates)

    # Calculate the search radius in degrees
    radius_deg = 1 / 111.0  # 1 km radius in degrees

    # Query the KDTree for indices of points within the search radius
    indices = tree.query_ball_point([(target_lat, target_lon)], radius_deg)

    # Calculate the distance between target coordinates and nearest coordinates
    nearest_coordinates = [(coordinates[i], geodesic(target_coords, coordinates[i]).km) for i in indices[0]]

    # Sort the nearest coordinates by distance
    nearest_coordinates.sort(key=lambda x: x[1])  # Sort by distance

    return nearest_coordinates

In [5]:
bus_stn_list = []
new_list = []

for site in tqdm(site_dict):

    target_lat = site_dict[site]['LATITUDE']
    target_lon = site_dict[site]['LONGITUDE']

    nearest_coords = find_nearest_coordinates(target_lat, target_lon, coordinates)

    for i, (coords, distance) in enumerate(nearest_coords):

        gh = gh_df.loc[gh_df['LATITUDE'].eq(coords[0]) & gh_df['LONGITUDE'].eq(coords[1]), 'GEOHASH'].values[0]

        # Creating two dataframe which will later be merged rows wise for visualization purpose.
        bus_stn_list.append({
            "type" : "bus_stn",
            "name" : site,
            "latitude" : target_lat,
            "longitude" : target_lon,
            "nearest" : site
        })

        new_list.append({
            "type" : "geohash",
            "name" : gh,
            "latitude" : coords[0],
            "longitude" : coords[1],
            "nearest" : site
        })

  0%|          | 0/14277 [00:00<?, ?it/s]

In [6]:
df = (
    pd.concat([pd.DataFrame(new_list), pd.DataFrame(bus_stn_list)], ignore_index=True)
    .drop_duplicates(ignore_index=True)
)

df

Unnamed: 0,type,name,latitude,longitude,nearest
0,geohash,sxk3zs,41.113586,28.811646,123932
1,geohash,sxk3ze,41.108093,28.811646,123932
2,geohash,sxk3zt,41.119080,28.811646,123932
3,geohash,sxk3zd,41.102600,28.811646,123932
4,geohash,sxkbbz,40.954285,29.218140,214552
...,...,...,...,...,...
45837,bus_stn,289411,41.001141,29.231451,289411
45838,bus_stn,285672,41.124514,29.284577,285672
45839,bus_stn,182611,41.079900,28.910141,182611
45840,bus_stn,288242,41.092573,29.083313,288242


In [7]:
# Some GEOHASH has up to 72 bus stations nearby
(
    df.query("type=='geohash'")
    .groupby('name')['nearest'].nunique()
    .nlargest(10)
)

name
sxk9mq    72
sxk9mc    69
sxk9mf    69
sxk9qp    67
sxk9er    64
sxk9dw    62
sxk9em    62
sxk9eq    62
sxk9mb    62
sxk9qj    61
Name: nearest, dtype: int64

In [8]:
(
    df.query("name=='sxk9mq'")
)

Unnamed: 0,type,name,latitude,longitude,nearest
565,geohash,sxk9mq,41.036682,29.064331,219241
1138,geohash,sxk9mq,41.036682,29.064331,259642
1139,geohash,sxk9mq,41.036682,29.064331,259661
1837,geohash,sxk9mq,41.036682,29.064331,215562
4157,geohash,sxk9mq,41.036682,29.064331,401251
...,...,...,...,...,...
32021,geohash,sxk9mq,41.036682,29.064331,216362
32215,geohash,sxk9mq,41.036682,29.064331,216671
33419,geohash,sxk9mq,41.036682,29.064331,220002
33802,geohash,sxk9mq,41.036682,29.064331,201251


In [9]:
df.to_csv("datasets/13_gh_proximities/gh_nearby_bus_stns.csv", index=False)