In [38]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

# Load the station metadata from the tab-separated file into the shared
# DataFrame used by the clustering and map cells.
df = pd.read_csv('../podatki/bicikelj_metadata.csv', sep='\t')

# Rich display of the loaded table
display(df)

Unnamed: 0,postaja,geo-visina,geo-sirina,total_space
0,PREŠERNOV TRG-PETKOVŠKOVO NABREŽJE,46.051367,14.506542,20
1,POGAČARJEV TRG-TRŽNICA,46.051093,14.507186,18
2,KONGRESNI TRG-ŠUBIČEVA ULICA,46.050388,14.504623,20
3,CANKARJEVA UL.-NAMA,46.052431,14.503257,26
4,BREG,46.046498,14.505148,20
...,...,...,...,...
78,DOLENJSKA C. - STRELIŠČE,46.038866,14.517605,20
79,ROŠKA - STRELIŠKA,46.045000,14.518460,20
80,LEK - VEROVŠKOVA,46.076856,14.500222,20
81,VOKA - SLOVENČEVA,46.075207,14.504734,20


In [39]:
# Select the number of clusters
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from scipy.spatial.distance import cdist

# Number of clusters requested from the clustering step further down in this cell.
k = 35

def k_means_clustering(k):
    """Cluster the stations of the global ``df`` into ``k`` groups with k-means.

    Only the two coordinate columns ('geo-visina', 'geo-sirina') are used as
    features. The model is seeded (``random_state=42``), so repeated calls on
    the same data give the same labels.

    Parameters
    ----------
    k : int
        Number of clusters to form.

    Returns
    -------
    numpy.ndarray
        One cluster label per row of ``df``, in row order.
    """
    station_coords = df[['geo-visina', 'geo-sirina']]
    model = KMeans(n_clusters=k, n_init=10, random_state=42, max_iter=300)

    # fit() returns the fitted estimator, whose labels_ holds the assignment
    return model.fit(station_coords).labels_


def hiearchical_clustering(k):
    """Cluster the stations of the global ``df`` into ``k`` groups.

    Uses average-linkage agglomerative clustering on a precomputed pairwise
    distance matrix built from the 'geo-visina' and 'geo-sirina' columns.

    Parameters
    ----------
    k : int
        Number of clusters to form.

    Returns
    -------
    numpy.ndarray
        One cluster label per row of ``df``, in row order.
    """
    coordinates = df[['geo-visina', 'geo-sirina']].values

    # Pairwise distance matrix. NOTE: this is not a Haversine distance -
    # 'minkowski' with cdist's default p=2 is the plain Euclidean distance on
    # the raw coordinate values (degrees), so the result is not in kilometres.
    distances = cdist(coordinates, coordinates, metric='minkowski')

    options = dict(n_clusters=k, linkage='average')
    try:
        # scikit-learn >= 1.2 calls this parameter `metric`; the old `affinity`
        # keyword was deprecated in 1.2 and removed in 1.4.
        clustering = AgglomerativeClustering(metric='precomputed', **options)
    except TypeError:
        # Older scikit-learn releases only know the `affinity` keyword.
        clustering = AgglomerativeClustering(affinity='precomputed', **options)

    return clustering.fit_predict(distances)

# Attach a cluster label to every station. Hierarchical clustering is the
# active choice; the k-means alternative is kept below for comparison.
#df['cluster'] = k_means_clustering(20)
station_clusters = hiearchical_clustering(k)
df['cluster'] = station_clusters


# For each station, store the summed 'total_space' of all stations that share
# its cluster (transform broadcasts the per-group sum back onto every row).
space_per_cluster = df.groupby('cluster')['total_space']
df['total_space_cluster'] = space_per_cluster.transform('sum')






In [40]:
import folium
import random

def generate_folium_colors(n, seed=None):
    """Return ``n`` marker color names taken from ``folium.Icon.color_options``.

    The palette is shuffled once and then cycled, so a color is only repeated
    after every available color has been handed out. (Sampling with
    replacement would give several clusters the same color even when unused
    colors remain.)

    Parameters
    ----------
    n : int
        Number of colors wanted; values <= 0 give an empty list.
    seed : int, optional
        Seed for a private random generator, making the result reproducible.
        When omitted, the global ``random`` module state is used.

    Returns
    -------
    list of str
        ``n`` color names accepted by ``folium.Icon(color=...)``.
    """
    # Sort first so that a given seed always produces the same order,
    # independent of the iteration order of the underlying collection.
    palette = sorted(folium.Icon.color_options)
    if n <= 0 or not palette:
        return []

    rng = random if seed is None else random.Random(seed)
    rng.shuffle(palette)

    # Wrap around only when more colors are requested than the palette offers
    return [palette[i % len(palette)] for i in range(n)]

# Cluster ids in order of first appearance in df
clusters = df['cluster'].unique()

# Create a map centered around Ljubljana
ljubljana_map = folium.Map(location=[46.0569, 14.5058], zoom_start=12)

# One color per cluster that is actually present. Sizing the list from the
# labels (instead of the global k) keeps this cell working when the labels
# were produced with a different cluster count.
colors = generate_folium_colors(len(clusters))

# One toggleable layer per cluster
for color, cluster in zip(colors, clusters):
    cluster_data = df[df['cluster'] == cluster]

    # Feature group so the cluster can be switched on/off in the layer control
    cluster_group = folium.FeatureGroup(name=f'Cluster {cluster}')

    # Markers take [latitude, longitude]; in this dataset 'geo-visina' holds
    # the ~46.05 values and 'geo-sirina' the ~14.5 values.
    for lat, lon in zip(cluster_data['geo-visina'], cluster_data['geo-sirina']):
        folium.Marker([lat, lon], icon=folium.Icon(color=color)).add_to(cluster_group)

    # Add the cluster group to the map
    cluster_group.add_to(ljubljana_map)

# Add layer control to toggle the display of clusters
folium.LayerControl().add_to(ljubljana_map)

# Display the map
ljubljana_map

In [41]:
def create_dict(df):
    """Map each cluster label to the list of station names assigned to it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'cluster' column (label) and a 'postaja' column
        (station name).

    Returns
    -------
    dict
        ``{cluster_label: [station_name, ...]}``. Keys appear in order of the
        first row carrying that label; names keep the row order of ``df``.
    """
    stations_by_cluster = {}

    # Walk the two relevant columns in parallel, row by row
    for label, station in zip(df['cluster'], df['postaja']):
        # setdefault creates the list the first time a label is seen
        stations_by_cluster.setdefault(label, []).append(station)

    return stations_by_cluster

# Build the cluster -> station-names mapping from the clustered df and show it
cluster_dict = create_dict(df)
display(cluster_dict)

{10: ['PREŠERNOV TRG-PETKOVŠKOVO NABREŽJE',
  'POGAČARJEV TRG-TRŽNICA',
  'KONGRESNI TRG-ŠUBIČEVA ULICA',
  'CANKARJEVA UL.-NAMA',
  'BREG',
  'GRUDNOVO NABREŽJE-KARLOVŠKA C.',
  'MIKLOŠIČEV PARK',
  'BAVARSKI DVOR',
  'PARKIRIŠČE NUK 2-FF'],
 6: ['TRG OF-KOLODVORSKA UL.',
  'MASARYKOVA DDC',
  'VILHARJEVA CESTA',
  'PARK NAVJE-ŽELEZNA CESTA',
  'DUNAJSKA C.-PS PETROL',
  'LIDL BEŽIGRAD'],
 3: ['TRG MDB',
  'BARJANSKA C.-CENTER STAREJŠIH TRNOVO',
  'TRNOVO',
  'KOPALIŠČE KOLEZIJA'],
 4: ['AMBROŽEV TRG',
  'GH ŠENTPETER-NJEGOŠEVA C.',
  'ILIRSKA ULICA',
  'POLJANSKA-POTOČNIKOVA',
  'ROŠKA - STRELIŠKA'],
 13: ['TRŽAŠKA C.-ILIRIJA', 'GERBIČEVA - ŠPORTNI PARK SVOBODA'],
 11: ['TIVOLI', 'STARA CERKEV'],
 17: ['KINO ŠIŠKA', 'MERCATOR MARKET - CELOVŠKA C. 163'],
 25: ['ŠPICA', 'DOLENJSKA C. - STRELIŠČE'],
 22: ['ZALOŠKA C.-GRABLOVIČEVA C.', 'TRŽNICA MOSTE', 'POVŠETOVA-GRABLOVIČEVA'],
 5: ['ROŽNA DOLINA-ŠKRABČEVA UL.', 'CESTA NA ROŽNIK'],
 14: ['PLEČNIKOV STADION', 'DUNAJSKA C.-PS MERCATOR'],


In [46]:
import pandas as pd
import numpy as np

# Great-circle distance between two latitude/longitude points (Haversine formula)
def haversine_distance(lat1, lon1, lat2, lon2):
    """
    Return the great-circle distance in kilometres between two points.

    All four arguments are given in degrees: (lat1, lon1) is the first point,
    (lat2, lon2) the second. The Earth is treated as a sphere of radius
    6371 km. Only numpy functions are used, so numpy arrays are handled
    element-wise as well as plain numbers.
    """
    earth_radius_km = 6371.0

    # Degrees -> radians for every coordinate
    phi1, lam1, phi2, lam2 = (np.radians(value) for value in (lat1, lon1, lat2, lon2))

    # Half of the latitude / longitude differences, as used by the formula
    half_dphi = (phi2 - phi1) / 2
    half_dlam = (lam2 - lam1) / 2

    # Haversine of the central angle between the two points
    a = np.sin(half_dphi)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlam)**2

    # Central angle via atan2, scaled by the Earth radius
    return earth_radius_km * (2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a)))

# Load the station metadata into a DataFrame
data = pd.read_csv('../podatki/bicikelj_metadata.csv', delimiter='\t')

if len(data) > 1:
    # NOTE: despite the column names, 'geo-visina' holds the latitude (~46.05)
    # and 'geo-sirina' the longitude (~14.5) - the same [lat, lon] order the
    # map cell passes to folium. Feeding them to the Haversine formula the
    # other way round distorts the distances.
    latitudes = data['geo-visina'].to_numpy(dtype=float)
    longitudes = data['geo-sirina'].to_numpy(dtype=float)

    # Full pairwise distance matrix in kilometres via broadcasting:
    # entry [i, j] is the distance from station i to station j.
    pairwise_km = haversine_distance(
        latitudes[:, None], longitudes[:, None],
        latitudes[None, :], longitudes[None, :],
    )

    # A station must never be reported as its own nearest neighbour
    np.fill_diagonal(pairwise_km, np.inf)

    # argmin returns the first minimum, i.e. the lowest row position on ties
    nearest_idx = pairwise_km.argmin(axis=1)

    # Store the value of the first column (the station name in the displayed
    # table) of the nearest station, not its row index.
    data['closest_station'] = data.iloc[nearest_idx, 0].to_numpy()
else:
    # Fewer than two stations: there is no other station to point at
    data['closest_station'] = None

# Show the DataFrame with the closest station information
display(data)


Unnamed: 0,postaja,geo-visina,geo-sirina,total_space,closest_station
0,PREŠERNOV TRG-PETKOVŠKOVO NABREŽJE,46.051367,14.506542,20,POGAČARJEV TRG-TRŽNICA
1,POGAČARJEV TRG-TRŽNICA,46.051093,14.507186,18,PREŠERNOV TRG-PETKOVŠKOVO NABREŽJE
2,KONGRESNI TRG-ŠUBIČEVA ULICA,46.050388,14.504623,20,PREŠERNOV TRG-PETKOVŠKOVO NABREŽJE
3,CANKARJEVA UL.-NAMA,46.052431,14.503257,26,KONGRESNI TRG-ŠUBIČEVA ULICA
4,BREG,46.046498,14.505148,20,GRUDNOVO NABREŽJE-KARLOVŠKA C.
...,...,...,...,...,...
78,DOLENJSKA C. - STRELIŠČE,46.038866,14.517605,20,ŠPICA
79,ROŠKA - STRELIŠKA,46.045000,14.518460,20,AMBROŽEV TRG
80,LEK - VEROVŠKOVA,46.076856,14.500222,20,VOKA - SLOVENČEVA
81,VOKA - SLOVENČEVA,46.075207,14.504734,20,LEK - VEROVŠKOVA
