## Baseline GCN testing
Notebook to create and evaluate a GCN against edge betweenness centrality (EBC) for predicting the number of passing bicyclists in *Copenhagen*.
- Preprocess EBC for graph DONE
- Assign Metrics from data
- Create Torch Graph
- Evaluate against SOTA

In [None]:
import torch
from torch_geometric.data import Data
import torch_geometric as tg
import osmnx as ox
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
import folium
from folium import plugins
from folium.plugins import HeatMap
from shapely.geometry import Point, LineString, Polygon
import shapely
import momepy as mp 
import esda
import seaborn as sns
from shapely.strtree import STRtree
import pickle
from tqdm import tqdm

import os, glob

# Apply seaborn's default plotting theme.
sns.set_theme()


In [None]:
# Centre point (latitude, longitude) and query radius passed to the OSMnx calls in this notebook.
lat, lon = 55.6867243, 12.5700724
dist = 10000  # presumably metres (OSMnx `dist` convention) — TODO confirm

# Fetch every OSM feature around the centre point that carries a `shop` tag.
# NOTE(review): `amenities` is reassigned by the `get_city_graph(...)` call further down
# before it is read anywhere in this file, so this query looks redundant — confirm and drop.
amenities = ox.features.features_from_point((lat, lon), tags={'shop' : True}, dist=dist)

# OSM tag keys to download around the centre point (passed to `get_city_graph` as `features`).
# Commented-out entries are candidate keys that are currently disabled.
features = [
    'aerialway',
    # 'aeroway',
    # 'amenity',
    # 'barrier',
    # 'boundary',
    # 'building',
    # 'craft',
    # 'emergency',
    # 'geological',
    # 'healthcare',
    # 'highway',
    # 'historic',
    # 'landuse',
    # 'leisure',
    # 'man_made',
    # 'military',
    # 'natural',
    # 'office',
    # 'place',
    # 'power',
    # 'public_transport',
    # 'railway',
    # 'route',
    'shop',
    # 'telecom',
    # 'tourism',
    # 'water',
    # 'waterway',
]

# Tag keys whose own OSM value (rather than the key name) is used as the amenity label
# inside `get_city_graph`. Commented-out entries are currently disabled.
expand_features = [
    'shop',
    # 'route', 
    # 'highway', 
    # 'waterway', 
    # 'width', 
    # 'length', 
    # 'aerialway', 
    # 'power', 
    # 'healthcare'
]


In [None]:
def get_city_graph(lat, lon, dist, features, expand_features):
    """
    Downloads the bike network and the tagged OSM features around a point.

    Parameters:
    - lat, lon: centre point of both queries.
    - dist: distance around the centre point, passed straight to OSMnx.
    - features: OSM tag keys to download.
    - expand_features: tag keys whose own value is used as the amenity label;
      every other key in `features` is labelled with the key name itself.

    Returns:
    - g: graph returned by `ox.graph_from_point` (bike network, simplified).
    - gdf: GeoDataFrame of the graph's line geometries in EPSG:3857, rows without geometry dropped.
    - amenities: downloaded features that ended up with a non-null `amenity` label
      (the helper column `new_col` is kept as well).
    """
    g = ox.graph_from_point((lat, lon), dist=dist, network_type='bike', simplify=True, retain_all=False)
    feat_dict = {feat: True for feat in features}
    amenities = ox.features.features_from_point((lat, lon), tags=feat_dict, dist=dist)
    # Copy so the column assignments below act on an independent frame instead of a filtered slice.
    amenities = amenities[amenities.geometry.notnull()].copy()

    # Seed the label column with None (object dtype): it will hold strings, not floats.
    amenities['new_col'] = None

    for feat in features:
        # A requested tag that matched nothing has no column in the download; skip it.
        if feat in expand_features or feat not in amenities.columns:
            continue
        amenities.loc[amenities[feat].notnull(), 'new_col'] = feat

    amenities['amenity'] = amenities['new_col']

    for feat in expand_features:
        if feat not in amenities.columns:
            continue
        # Reassign instead of `fillna(..., inplace=True)` on the column: the chained in-place
        # form modifies a temporary under pandas Copy-on-Write and leaves the frame untouched.
        amenities['amenity'] = amenities['amenity'].fillna(amenities[feat])
    amenities = amenities[amenities['amenity'].notnull()]

    gdf = mp.nx_to_gdf(g, points=False, lines=True, spatial_weights=True).to_crs(epsg=3857)
    gdf = gdf[gdf.geometry.notnull()].reset_index(drop=True)
    return g, gdf, amenities

# Build the street graph, its line GeoDataFrame and the labelled amenities for the configured area.
g, gdf, amenities = get_city_graph(lat,
                                    lon,
                                    dist,
                                    features = features, 
                                    expand_features = expand_features)


In [None]:
### carry weights over to line graph

def create_linegraph(g):
    """
    Builds the line graph of `g`, where every edge of `g` becomes a node.

    `g` is first collapsed to a plain `nx.Graph`. Each line-graph node receives the
    attributes of the edge it represents, and each line-graph edge gets a `weight`
    equal to the summed `length` of the two original edges it joins.

    Parameters:
    - g: networkx graph whose edges carry a `length` attribute.

    Returns:
    - the line graph with node attributes and edge weights filled in.
    """
    simple = nx.Graph(g)
    line_g = nx.line_graph(simple)

    # Copy each original edge's attributes onto the matching line-graph node.
    for edge in line_g:
        line_g.nodes[edge].update(simple.edges[edge])

    for first, second in line_g.edges:
        combined = simple.edges[first]['length'] + simple.edges[second]['length']
        line_g.edges[first, second]['weight'] = combined
    return line_g

# Line graph of the street network: every node of H is an edge (u, v) of g.
H = create_linegraph(g)


### EBC Calculation

In [None]:
def calc_bc(graph, weight='weight', cutoff=1000):
    """
    Counts, for every node, how many distance-limited shortest paths visit it.

    Dijkstra shortest paths are computed from every node, limited to `cutoff` in
    units of the `weight` edge attribute. Every node on a path (endpoints included)
    is counted once per path, and the counts are divided by the squared node count.

    Parameters:
    - graph: networkx graph with a numeric `weight` attribute on its edges.
    - weight: edge attribute used as path length (default 'weight').
    - cutoff: maximum path length considered (default 1000).

    Returns:
    - dict mapping each node to its normalised visit count.
    """
    counts = dict.fromkeys(graph.nodes, 0)

    # Consume the generator one source at a time: wrapping it in dict(...) would keep
    # the paths of every source in memory at once before any counting starts.
    paths_by_source = nx.all_pairs_dijkstra_path(graph, weight=weight, cutoff=cutoff)
    for _, paths in tqdm(paths_by_source, total=graph.number_of_nodes()):
        for path in paths.values():
            for node_visited in set(path):
                counts[node_visited] += 1

    total_nodes = graph.number_of_nodes() ** 2
    return {node: count / total_nodes for node, count in counts.items()}

# Compute the path-count centrality on the line graph and store it as node attribute 'bc'.
bc = calc_bc(H)
nx.set_node_attributes(H, bc, 'bc')


In [None]:
def load_aadt(filepath, g):
    """
    Loads bicycle traffic counts and clips them to the extent of graph `g`.

    Parameters:
    - filepath: path of the traffic-count file readable by GeoPandas.
    - g: networkx street graph; its line geometries define the clipping box.

    Returns:
    - GeoDataFrame in EPSG:4326 with columns `id`, `vejnavn`, `geometry` and
      `aadt_cykler`, restricted to rows with a non-null `aadt_cykler` that fall
      inside the bounding box of the graph.
    """
    counts = gpd.GeoDataFrame.from_file(filepath)
    # The file is declared as EPSG:4326 and projected to EPSG:3857 for the clipping step.
    counts = counts.set_crs(epsg=4326).to_crs(epsg=3857)

    ### export only relevant columns
    counts = counts[['id', 'vejnavn', 'geometry', 'aadt_cykler']]
    ### remove null values on aadt_cykler
    counts = counts[counts['aadt_cykler'].notnull()]

    # Take the bounding box from the graph that was passed in rather than from a
    # notebook-level variable, so the function only depends on its arguments.
    edges = mp.nx_to_gdf(g, points=False, lines=True).to_crs(epsg=3857)
    xmin, ymin, xmax, ymax = edges.total_bounds
    counts = counts.cx[xmin:xmax, ymin:ymax]

    # Return a new frame instead of re-projecting the clipped slice in place.
    return counts.to_crs(epsg=4326)

# Traffic counts (column 'aadt_cykler') clipped to the study area.
gdf_new = load_aadt('../data/raw/trafiktaelling.json', g)


In [None]:
def assign_aadt_to_graph_edges(g, gdf_new, H, aadt_col='aadt_cykler'):
    """
    Assigns AADT values from gdf_new to the nearest edge in graph H based on proximity.

    Each row of gdf_new is matched to the closest edge geometry of g. The matching
    node of H (the edge as a (u, v) tuple, in either orientation) receives the row's
    AADT value in `aadt` and the match distance in `aadt_distance`. When several rows
    hit the same node, the closest one wins.

    Parameters:
    - g: networkx graph with edge geometries.
    - gdf_new: GeoDataFrame containing geometries and AADT values.
    - H: networkx graph where AADT attributes will be assigned (modified in place).
    - aadt_col: column name in gdf_new containing the AADT values.

    Returns:
    - H, the same graph object that was passed in.
    """
    # Keep only edges that carry a geometry. The three lists stay index-aligned, so
    # an index returned by the tree can be mapped back to the edge's end nodes.
    linestrings, from_node, to_node = [], [], []
    for u, v, attr in g.edges(data=True):
        geom = attr.get('geometry')
        if geom is None:
            continue
        linestrings.append(geom)
        from_node.append(u)
        to_node.append(v)

    if not linestrings:
        # Nothing to match against.
        return H

    tree = STRtree(linestrings)

    # Iterate the two needed columns directly instead of building a Series per row.
    rows = zip(gdf_new['geometry'], gdf_new[aadt_col])
    for point, aadt in tqdm(rows, total=len(gdf_new)):
        if point is None or point.is_empty:
            continue

        # `nearest` is used as a list index here, i.e. it must return an integer position.
        nearest_edge_idx = tree.nearest(point)
        nearest_edge_distance = linestrings[nearest_edge_idx].distance(point)

        start_node = from_node[nearest_edge_idx]
        end_node = to_node[nearest_edge_idx]

        # H may store the edge in either orientation.
        if (start_node, end_node) in H.nodes:
            node_pair = (start_node, end_node)
        elif (end_node, start_node) in H.nodes:
            node_pair = (end_node, start_node)
        else:
            continue

        attrs = H.nodes[node_pair]
        # A node with 'aadt' but no recorded distance (e.g. a default filled in
        # elsewhere) is treated as infinitely far away, so a real match replaces it.
        previous_distance = attrs.get('aadt_distance', float('inf'))
        if 'aadt' not in attrs or previous_distance > nearest_edge_distance:
            attrs['aadt'] = aadt
            attrs['aadt_distance'] = nearest_edge_distance
    return H

# Attach the traffic counts to the line-graph nodes (H is updated in place and returned).
H = assign_aadt_to_graph_edges(g, gdf_new, H, aadt_col='aadt_cykler')


In [None]:
# linestrings = [i[2]['geometry'] if 'geometry' in i[2] else None for i in list(g.edges(data=True))]
# from_node = [i[0] for i in list(g.edges(data=True))]
# to_node = [i[1] for i in list(g.edges(data=True))]

# tree = STRtree(linestrings)
# for i, row in tqdm(gdf_new.iterrows(), total=len(gdf_new)):
#     point = row['geometry']
#     if point is None:
#         continue
#     nearest_edge_idx = tree.nearest(point)
#     nearest_edge = linestrings[nearest_edge_idx]
#     nearest_edge_distance = nearest_edge.distance(point)
#     start_node = from_node[linestrings.index(nearest_edge)]
#     end_node = to_node[linestrings.index(nearest_edge)]
    
#     # Ensure the edge exists in the graph
#     if (start_node, end_node) not in H.nodes():
#         if (end_node, start_node) not in H.nodes:
#             continue
#         else:
#             start_node, end_node = end_node, start_node

#     if 'aadt' not in H.nodes()[(start_node, end_node)]:
#         H.nodes()[(start_node, end_node)]['aadt'] = row['aadt_cykler']
#         H.nodes()[(start_node, end_node)]['aadt_distance'] = nearest_edge_distance
#     if 'aadt_distance' not in H.nodes()[(start_node, end_node)] or H.nodes()[(start_node, end_node)]['aadt_distance'] > nearest_edge_distance:
#         H.nodes()[(start_node, end_node)]['aadt'] = row['aadt_cykler']
#         H.nodes()[(start_node, end_node)]['aadt_distance'] = nearest_edge_distance


In [None]:
# Report how many nodes of H currently carry an 'aadt' attribute.
print(f"Number of nodes in H with 'aadt' attribute: {sum(1 for _, data in H.nodes(data=True) if 'aadt' in data)}")


In [None]:
# Nodes without an 'aadt' attribute get an explicit value of 0.
for node, value in H.nodes(data=True):
    value.setdefault('aadt', 0)


In [None]:
# # amenities = amenities.reset_index()
# nodes = list((node, linestring) for node, linestring in H.nodes(data='geometry'))
# nodes = [node for node in nodes if node[1] is not None]
# linestrings = [linestring for node, linestring in nodes]
# nodes = [node for node, linestring in nodes]
# assert len(nodes) == len(linestrings)
# amenities['geometry'] = amenities['geometry'].apply(lambda x: x.centroid if x.geom_type == 'Polygon' else x)
# tree = STRtree(linestrings)
# for geom, amenity in zip(amenities['geometry'], amenities['amenity']):
#     nearest = tree.nearest(geom)
#     nearest = nodes[nearest]
#     if 'amenity' not in H.nodes[nearest]:
#         H.nodes[nearest]['amenity'] = [amenity]
#     else:
#         H.nodes[nearest]['amenity'].append(amenity)

# from collections import Counter
# for i in H.nodes(data=True):
#     if 'amenity' in i[1]:
#         amenity_counts = Counter(i[1]['amenity'])
#         for key in amenity_counts:
#             H.nodes[i[0]][key] = amenity_counts[key]
#         ## drop the amenity key
#         H.nodes[i[0]].pop('amenity', None)


In [None]:
def clean_and_standardize_node_features(H, remove_fields=None):
    """
    Cleans and standardizes node attributes in a graph (in place).

    Removes the specified fields, converts every value that is not already an
    int or float to float where possible (dropping it otherwise), and fills
    features missing on a node with 0. Values that are already int or float
    are left unchanged.

    Parameters:
    - H: networkx graph.
    - remove_fields: list of fields to remove (default common OSM fields).

    Returns:
    - all_feats: list of all feature names present after cleaning, in order of
      first appearance while iterating the nodes.
    """
    if remove_fields is None:
        remove_fields = ['geometry', 'name', 'highway', 'ref', 'aadt_dist', 'aadt_distance']

    # Clean node attributes and convert to floats where possible
    for _, data in H.nodes(data=True):
        for field in remove_fields:
            data.pop(field, None)

        # Iterate over a snapshot of the keys because entries may be removed.
        for key in list(data):
            value = data[key]
            if isinstance(value, (int, float)):
                continue
            try:
                data[key] = float(value)
            except (ValueError, TypeError):
                data.pop(key, None)

    # Collect the feature names in first-seen order. A set would make the order
    # depend on string hashing, so feature columns built from this list could
    # change between interpreter runs.
    ordered = {}
    for _, data in H.nodes(data=True):
        ordered.update(dict.fromkeys(data))
    all_feats = list(ordered)

    # Fill missing features with 0
    for _, data in H.nodes(data=True):
        for feat in all_feats:
            data.setdefault(feat, 0)

    return all_feats

# Clean the node attributes of H in place and keep the resulting feature names.
all_feats = clean_and_standardize_node_features(H, remove_fields=None)


In [None]:
# for node in H.nodes(data=True):
#     node[1].pop('geometry', None)
#     node[1].pop('name', None)
#     node[1].pop('highway', None)
#     node[1].pop('ref', None)
#     node[1].pop('aadt_dist', None)
#     node[1].pop('aadt_distance', None)

#     for key in list(node[1].keys()):
#         if type(node[1][key]) not in (int, float):
#             try:
#                 node[1][key] = float(node[1][key])
#             except:
#                 node[1].pop(key, None)

# all_feats = []
# for node in H.nodes(data=True):
#     for key in node[1].keys():
#         if key not in all_feats:
#             all_feats.append(key)


# for node in H.nodes(data=True):
#     for feat in all_feats:
#         if feat not in node[1].keys():
#             node[1][feat] = 0


In [None]:
### collect the node attribute dicts into a dataframe, without the 'aadt' and 'osmid' columns
node_features = pd.DataFrame(
    [attrs for _, attrs in H.nodes(data=True)]
).drop(columns=['aadt', 'osmid'])


In [None]:
# node_list, x, y = [], [], []
# osmid_list = []
# for node, feats in list(H.nodes(data=True)):
#     node_list.append(node)
#     x.append([feats[feat] for feat in all_feats if feat not in ['aadt', 'osmid']])
#     y.append(feats['aadt'])
#     osmid_list.append(feats['osmid'])

# node_idx = {node : idx for idx, node in enumerate(node_list)}
# edge_index = []
# for s, t, data in H.edges(data=True):
#     edge_index.append([node_idx[s], node_idx[t]])

# # for s, t in list(H.edges):
# #     edge_index.append([node_idx[s], node_idx[t]])


# edge_index = torch.tensor(edge_index, dtype=torch.long).t()
# x = torch.tensor(x, dtype=torch.float)
# y = torch.tensor(y, dtype=torch.float)

# linegraph = Data()
# linegraph.num_nodes = len(node_list)
# linegraph.x = x
# linegraph.y = y
# linegraph.osmid = torch.tensor(osmid_list, dtype=torch.long)
# linegraph.edge_index = edge_index
# linegraph.H = H
import torch
from torch_geometric.data import Data

def graph_to_linegraph_data(H, all_feats, target_feat='aadt', osmid_feat='osmid'):
    """
    Converts a networkx graph H with node attributes into a PyTorch Geometric Data object.

    Parameters:
    - H: networkx graph with node features.
    - all_feats: list of feature names to extract from nodes; `target_feat` and
      `osmid_feat` are excluded from the feature matrix.
    - target_feat: feature to use as the target variable (default 'aadt').
    - osmid_feat: feature to use as osmid identifier (default 'osmid').

    Returns:
    - PyTorch Geometric Data object with `x` (node features, columns in the order of
      `all_feats`), `y` (targets), `osmid`, `edge_index` of shape (2, num_edges),
      `num_nodes`, and the original graph attached as `H`.
    """
    # Resolve the feature columns once instead of filtering all_feats for every node.
    excluded = (target_feat, osmid_feat)
    feature_cols = [feat for feat in all_feats if feat not in excluded]

    node_list, x, y, osmid_list = [], [], [], []
    for node, feats in H.nodes(data=True):
        node_list.append(node)
        x.append([feats.get(feat, 0.0) for feat in feature_cols])
        y.append(feats[target_feat])
        osmid_list.append(feats[osmid_feat])

    node_idx = {node: idx for idx, node in enumerate(node_list)}

    edge_pairs = [(node_idx[s], node_idx[t]) for s, t in H.edges()]
    if not H.is_directed():
        # An undirected networkx graph lists each edge once, whereas PyG represents an
        # undirected edge as two directed entries. Add the reverse direction so that
        # message passing runs both ways (self-loops are not duplicated).
        edge_pairs += [(t, s) for s, t in edge_pairs if s != t]

    data = Data()
    data.num_nodes = len(node_list)
    data.x = torch.tensor(x, dtype=torch.float)
    data.y = torch.tensor(y, dtype=torch.float)
    data.osmid = torch.tensor(osmid_list, dtype=torch.long)
    # reshape(-1, 2) keeps the result 2-D, i.e. shape (2, 0), when there are no edges.
    data.edge_index = torch.tensor(edge_pairs, dtype=torch.long).reshape(-1, 2).t().contiguous()
    data.H = H  # Optional: Attach original H graph if needed

    return data

# PyG representation of the line graph, with 'aadt' as target and 'osmid' kept as identifier.
linegraph = graph_to_linegraph_data(H, all_feats, target_feat='aadt', osmid_feat='osmid')


In [None]:
# Shape sanity checks on the assembled PyG graph.
assert linegraph.edge_index.shape[0] == 2
if 'edge_attr' in linegraph:
    # Only checked when edge attributes are present: one attribute row per edge.
    assert linegraph.edge_index.shape[1] == linegraph.edge_attr.shape[0]
assert linegraph.x.shape[0] == linegraph.num_nodes


In [None]:
# import os
# import glob
# import pickle

# # Find existing config files
# config_folder = glob.glob('../data/graphs/configs/*.txt')

# # Helper function to check if config matches
# def config_matches(file_path, features, expand_features, dist):
#     with open(file_path, 'r') as f:
#         config = f.readlines()
#     config_dict = {}
#     for line in config:
#         key, value = line.strip().split(':', 1)
#         config_dict[key.strip()] = set(value.strip().split()) if key != 'distance' else int(value.strip())

#     return (
#         config_dict.get('features', set()) == set(features) and
#         config_dict.get('expand_features', set()) == set(expand_features) and
#         config_dict.get('distance', None) == dist
#     )

# # Create initial folder if no config exists
# if not config_folder:
#     print('Creating new folder structure...')
#     os.makedirs('../data/graphs/1/models', exist_ok=True)
#     with open('../data/graphs/configs/1.txt', 'w') as f:
#         f.write(f"features: {' '.join(features)}\n")
#         f.write(f"expand_features: {' '.join(expand_features)}\n")
#         f.write(f"distance: {dist}\n")
#     num_folder = '1'
# else:
#     # Check if a matching config already exists
#     num_folder = None
#     for file in config_folder:
#         if config_matches(file, features, expand_features, dist):
#             num_folder = os.path.splitext(os.path.basename(file))[0]
#             break

#     # If no matching config, create new one
#     if not num_folder:
#         num_folder = str(len(config_folder) + 1)
#         os.makedirs(f'../data/graphs/{num_folder}/models', exist_ok=True)
#         with open(f'../data/graphs/configs/{num_folder}.txt', 'w') as f:
#             f.write(f"features: {' '.join(features)}\n")
#             f.write(f"expand_features: {' '.join(expand_features)}\n")
#             f.write(f"distance: {dist}\n")

# # Save graphs and node features
# os.makedirs(f'../data/graphs/{num_folder}', exist_ok=True)

# # Save torch geometric graph
# with open(f'../data/graphs/{num_folder}/linegraph_tg.pkl', 'wb') as f:
#     pickle.dump(linegraph, f)

# # Save corresponding networkx graph
# with open(f'../data/graphs/{num_folder}/linegraph_nx.pkl', 'wb') as f:
#     pickle.dump(H, f)

# # Save node features
# node_features.to_csv(f'../data/graphs/{num_folder}/node_features.csv', index=False)


In [None]:
import os
import glob
import pickle

def save_graph_with_config(
    linegraph,
    H,
    node_features,
    features,
    expand_features,
    dist,
    base_path='../data/graphs'
):
    """
    Save graph data, networkx graph, and node features into a structured folder.
    Uses config files to check for existing setups, creates new folders if needed.

    A config file `<base_path>/configs/<n>.txt` describes the data stored in
    `<base_path>/<n>/`. If an existing config has the same features, expand
    features and distance, that folder is reused (and overwritten); otherwise the
    next free number is used.

    Parameters:
    - linegraph: PyTorch Geometric Data object (pickled as-is).
    - H: NetworkX graph object (pickled as-is).
    - node_features: DataFrame of node features.
    - features: list of features.
    - expand_features: list of expanded features.
    - dist: distance parameter.
    - base_path: base path to store the graphs and configs.

    Returns:
    - num_folder (str): The assigned folder number where data was saved.
    """
    # The configs directory has to exist before any config file can be written;
    # on a fresh base_path nothing else creates it.
    os.makedirs(f'{base_path}/configs', exist_ok=True)
    config_folder = sorted(glob.glob(f'{base_path}/configs/*.txt'))

    def parse_distance(text):
        # Stored distances are numbers; fall back to the raw text if parsing fails
        # so that an odd file simply does not match instead of raising.
        for cast in (int, float):
            try:
                return cast(text)
            except ValueError:
                continue
        return text

    def config_matches(file_path):
        config_dict = {}
        with open(file_path, 'r') as f:
            for line in f:
                key, sep, value = line.partition(':')
                if not sep:
                    # Blank or malformed line: ignore it.
                    continue
                key, value = key.strip(), value.strip()
                if key == 'distance':
                    config_dict[key] = parse_distance(value)
                else:
                    config_dict[key] = set(value.split())

        return (
            config_dict.get('features', set()) == set(features) and
            config_dict.get('expand_features', set()) == set(expand_features) and
            config_dict.get('distance', None) == dist
        )

    # Determine folder number
    num_folder = None
    for file in config_folder:
        if config_matches(file):
            num_folder = os.path.splitext(os.path.basename(file))[0]
            break

    if num_folder is None:
        if not config_folder:
            print('Creating initial folder structure...')
        # Continue after the highest existing number. Counting the files instead
        # would reuse (and overwrite) a folder whenever the numbering has gaps.
        stems = (os.path.splitext(os.path.basename(file))[0] for file in config_folder)
        used = [int(stem) for stem in stems if stem.isdigit()]
        num_folder = str(max(used, default=0) + 1)

    # Create necessary folders and save configs
    os.makedirs(f'{base_path}/{num_folder}/models', exist_ok=True)
    with open(f'{base_path}/configs/{num_folder}.txt', 'w') as f:
        f.write(f"features: {' '.join(features)}\n")
        f.write(f"expand_features: {' '.join(expand_features)}\n")
        f.write(f"distance: {dist}\n")

    # Save data
    with open(f'{base_path}/{num_folder}/linegraph_tg.pkl', 'wb') as f:
        pickle.dump(linegraph, f)

    with open(f'{base_path}/{num_folder}/linegraph_nx.pkl', 'wb') as f:
        pickle.dump(H, f)

    node_features.to_csv(f'{base_path}/{num_folder}/node_features.csv', index=False)

    print(f"Graph and data saved in folder {num_folder}")
    return num_folder

# Persist the PyG graph, the networkx line graph and the feature table under the default base path.
save_graph_with_config(
    linegraph, 
    H, 
    node_features, 
    features, 
    expand_features, 
    dist
)
