In [1]:
from scipy.spatial import KDTree
from geopy.distance import geodesic
import pandas as pd

In [2]:
gh_df = (
    pd.read_csv("geohash_traffic_density_pt_15clusters.csv", usecols=['GEOHASH', 'LATITUDE', 'LONGITUDE'])
)

# Reading taxi stands data
taxi_df = pd.read_csv("datasets/09_ist_taxi_stands/ist_taxi_stands.csv", usecols=['Name', 'LATITUDE', 'LONGITUDE', 'town', 'postcode'])
taxi_df.head()

Unnamed: 0,Name,LATITUDE,LONGITUDE,town,postcode
0,MEYDAN TAKSİ,40.985617,28.724535,Avcılar,34320.0
1,KARADENİZ TAKSİ,40.985362,29.05937,Kadıköy,34732.0
2,STAR TAKSİ,40.977266,29.074184,Kadıköy,34738.0
3,OYAK SİTESİ GÜL TAKSİ,41.108846,28.996626,Sarıyer,34485.0
4,SİTE TAKSİ,41.089348,29.022291,Beşiktaş,34335.0


In [3]:
# Taxis with duplicate names
dbl_taxi_names = (
    taxi_df
    .groupby('Name').size()
    .nlargest(7)
    .index
)

dbl_taxi_names

Index(['ALKENT TAKSİ', 'BİZİM TAKSİ', 'HİZMET TAKSİ', 'OPTİMUM AVM ÖNÜ',
       'SEKMEN TAKSİ DURAĞI', 'SİTE TAKSİ', 'ÇİÇEK TAKSİ'],
      dtype='object', name='Name')

In [4]:
# Adding suffix to name to make them unique
taxi_df.loc[taxi_df['Name'].isin(dbl_taxi_names), 'Name'] = \
taxi_df.loc[taxi_df['Name'].isin(dbl_taxi_names), 'Name'] + "_" + \
(
    taxi_df.loc[taxi_df['Name'].isin(dbl_taxi_names), 'Name']
    .groupby(taxi_df['Name'])
    .transform("cumcount")
    .astype("string")
)

In [5]:
# All are unique now
(
    taxi_df
    .groupby('Name').size()
    .nlargest(7)
)

Name
1. ULUS TAKSİ DURAĞI                            1
2.ULUS MERKEZ TAKSİ                             1
2.ULUS TURİZM TAKSİ DURAĞI(BEŞİKTAŞ TURİZM))    1
212 TAKSİ                                       1
217.SOKAK NO:1 T.ÖZAL BULVARI                   1
3.LEVENT SEVGİ TAKSİ DURAĞI                     1
ACIBADEM FATİH CAD.TAKSİ DURAĞI                 1
dtype: int64

In [6]:
# converting to dictionary
# Each key is a taxi stand and the values are its coordinates

site_dict = (
    taxi_df
    .set_index('Name')
    .T
    .to_dict()
)

In [7]:
# Unique list of coordinates
coordinates = gh_df[['LATITUDE', 'LONGITUDE']].drop_duplicates().values
coordinates

array([[41.080627, 28.811646],
       [40.987244, 29.108276],
       [41.003723, 29.09729 ],
       ...,
       [40.976257, 29.229126],
       [41.009216, 27.98767 ],
       [41.020203, 27.998657]])

In [8]:
def find_nearest_coordinates(
        target_lat: float,
        target_lon: float,
        coordinates: list[list[float, float]]
        ) -> list[tuple[tuple[float, float], float]]:
    """
    Finds the nearest coordinates to a target latitude and longitude.

    Args:
        target_lat (float): The latitude of the target location.
        target_lon (float): The longitude of the target location.
        coordinates (List[Tuple[float, float]]): A list of tuples representing the coordinates to search from.

    Returns:
        List[Tuple[Tuple[float, float], float]]: A list of tuples containing the nearest coordinates and their distances in kilometers.
    """
    target_coords = (target_lat, target_lon)
    
    # Build a KDTree for efficient nearest neighbor search
    tree = KDTree(coordinates)

    # Calculate the search radius in degrees
    radius_deg = 1 / 111.0  # 1 km radius in degrees

    # Query the KDTree for indices of points within the search radius
    indices = tree.query_ball_point([(target_lat, target_lon)], radius_deg)

    # Calculate the distance between target coordinates and nearest coordinates
    nearest_coordinates = [(coordinates[i], geodesic(target_coords, coordinates[i]).km) for i in indices[0]]

    # Sort the nearest coordinates by distance
    nearest_coordinates.sort(key=lambda x: x[1])  # Sort by distance

    return nearest_coordinates

In [9]:
taxi_list = []
new_list = []

for site in site_dict:

    target_lat = site_dict[site]['LATITUDE']
    target_lon = site_dict[site]['LONGITUDE']

    nearest_coords = find_nearest_coordinates(target_lat, target_lon, coordinates)

    for i, (coords, distance) in enumerate(nearest_coords):

        gh = gh_df.loc[gh_df['LATITUDE'].eq(coords[0]) & gh_df['LONGITUDE'].eq(coords[1]), 'GEOHASH'].values[0]

        # Creating two dataframe which will later be merged rows wise for visualization purpose.
        taxi_list.append({
            "type" : "taxi_stand",
            "name" : site,
            "latitude" : target_lat,
            "longitude" : target_lon,
            "nearest" : site
        })

        new_list.append({
            "type" : "geohash",
            "name" : gh,
            "latitude" : coords[0],
            "longitude" : coords[1],
            "nearest" : site
        })

In [10]:
df = (
    pd.concat([pd.DataFrame(new_list), pd.DataFrame(taxi_list)], ignore_index=True)
    .drop_duplicates(ignore_index=True)
)

df

Unnamed: 0,type,name,latitude,longitude,nearest
0,geohash,sxk3jt,40.987244,28.723755,MEYDAN TAKSİ
1,geohash,sxk3js,40.981750,28.723755,MEYDAN TAKSİ
2,geohash,sxk3jw,40.992737,28.723755,MEYDAN TAKSİ
3,geohash,sxk9jm,40.987244,29.064331,KARADENİZ TAKSİ
4,geohash,sxk9jj,40.987244,29.053345,KARADENİZ TAKSİ
...,...,...,...,...,...
2402,taxi_stand,İSPARK HORHOR - FATİH,41.014178,28.951819,İSPARK HORHOR - FATİH
2403,taxi_stand,Konak Taksi,41.089200,29.013600,Konak Taksi
2404,taxi_stand,KOCAYOL TAKSİ,40.964800,29.102700,KOCAYOL TAKSİ
2405,taxi_stand,ATAŞEHİR GREEN PARK OTELİ,40.966868,29.109637,ATAŞEHİR GREEN PARK OTELİ


In [11]:
# Soem GEOHASH as up to 15 taxi stand within its 1km radius
(
    df.query("type=='geohash'")
    .groupby('name')['nearest'].nunique()
    .nlargest(20)
)

name
sxk97w    15
sxk97t    14
sxk9sp    14
sxk9sn    13
sxk976    12
sxk9hw    12
sxk9eg    11
sxk9e7    10
sxk9s5    10
sxk9sq    10
sxk96c     9
sxk9eu     9
sxk9hx     9
sxk9n0     9
sxk9s4     9
sxk9sj     9
sxk9sr     9
sxk91p     8
sxk977     8
sxk97m     8
Name: nearest, dtype: int64

In [12]:
df.to_csv("datasets/13_gh_proximities/gh_nearby_taxi_stands.csv", index=False)