# Processing and Analyzing Road Networks

This notebook processes road data by matching road names between two CSVs using fuzzy matching, then creates a LineString road segment from the start and end GPS coordinates of each corridor. It is a workaround for when you have to reconstruct corridor geometries because you do not have access to the original corridors geospatial dataset.

It is recommended to create a virtual environment in this directory (`.venv`) and install the libraries imported below into it before running the notebook.

### Importing necessary libraries 

In [24]:
import ast
from pathlib import Path

import geopandas as gpd
import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from shapely.geometry import Point, LineString

### Defining preprocessing and fuzzy matching functions

In [25]:
def preprocess_road_name(road_name):
    """Reduce a road name to its lower-cased first word.

    Parameters
    ----------
    road_name : object
        The raw road name. Anything that is not a ``str`` (for example a
        missing value read by pandas) is treated as "no name".

    Returns
    -------
    str or None
        The first whitespace-delimited word in lower case, or ``None`` when
        ``road_name`` is not a string or contains no word at all.
    """
    if not isinstance(road_name, str):
        return None
    words = road_name.split()
    # An empty or whitespace-only name has no first word; indexing [0] on it
    # would raise IndexError, so report it as missing instead.
    if not words:
        return None
    return words[0].lower()

def fuzzy_match(row, choices, threshold):
    """Return the entry of ``choices`` that best matches ``row``, if good enough.

    Parameters
    ----------
    row : object
        The string to look up. Non-string values yield ``None``.
    choices : collection of str
        Candidate strings handed to ``process.extractOne``.
    threshold : int or float
        The best score must be strictly greater than this value for the match
        to be accepted.

    Returns
    -------
    str or None
        The best-scoring choice, or ``None`` when ``row`` is not a string,
        there are no choices, or the best score does not exceed ``threshold``.
    """
    if not isinstance(row, str):
        return None

    # With nothing to compare against there can be no match; asking the
    # library anyway yields None, which cannot be unpacked into (match, score).
    if choices is None:
        return None
    try:
        if len(choices) == 0:
            return None
    except TypeError:
        # Unsized iterables (e.g. generators) are passed through unchanged.
        pass

    best = process.extractOne(row, choices)
    if best is None:
        return None

    # Index instead of unpacking: for dict-like choices the library returns a
    # 3-tuple (match, score, key).
    best_match, score = best[0], best[1]
    if score > threshold:
        return best_match
    return None

### Define the function to pre-process the CSV data

In [26]:
def _match_processed_names(processed_names, choices, threshold):
    """Fuzzy-match each distinct non-null name once and map the result back onto the Series."""
    if not choices:
        # Nothing to match against: every row is unmatched.
        return pd.Series([None] * len(processed_names), index=processed_names.index, dtype=object)
    lookup = {
        name: fuzzy_match(name, choices, threshold)
        for name in processed_names.dropna().unique()
    }
    # Keep object dtype so the column can be merged against string keys even
    # when no name matched at all.
    return processed_names.map(lookup).astype(object)


def join_corridors(input_file, output_file, linz_file='nz-roads-addressing.csv',
                   threshold=90, not_joined_file="not_joined.csv"):
    """Build a join table between the LINZ road table and a corridor CSV.

    Road names on both sides are reduced with ``preprocess_road_name`` and the
    LINZ names ('full_road_name' and 'road_name_label') are fuzzy-matched
    against the corridor names. LINZ rows whose full name or label matched are
    merged with the corridor rows carrying the matched name.

    Because matching works on the reduced names only, unrelated roads can be
    joined; the resulting table is intended to be reviewed by hand.

    Parameters
    ----------
    input_file : str
        Corridor CSV. Must contain 'Road Name', 'Corridor ID',
        'Start GPS Co-ordinates' and 'End GPS Co-ordinates'.
    output_file : str
        Where the join table is written.
    linz_file : str, optional
        LINZ road CSV. Must contain 'road_id', 'full_road_name' and
        'road_name_label'. Defaults to the path used previously.
    threshold : int or float, optional
        Score a fuzzy match has to exceed to be accepted (default 90).
    not_joined_file : str, optional
        Where corridor rows whose 'Corridor ID' is absent from the join table
        are written (default "not_joined.csv").

    Returns
    -------
    pandas.DataFrame
        Columns 'road_id', 'full_road_name', 'Road Name', 'Corridor ID',
        'Processed_Full_Road_Name' and 'lat_lon' (True when both GPS columns
        are populated), with exact duplicate rows removed.
    """
    linz = pd.read_csv(linz_file)
    shu = pd.read_csv(input_file)

    # Stringify real names only. A blanket astype(str) would turn missing
    # names into the literal text 'nan', which would then take part in the
    # fuzzy matching like a genuine road name.
    road_names = shu['Road Name']
    shu['Road Name'] = road_names.where(road_names.isna(), road_names.astype(str))
    shu['Processed Road Name'] = shu['Road Name'].apply(preprocess_road_name)

    # Distinct, non-null candidates in first-seen order.
    shu_roads_list = shu['Processed Road Name'].dropna().unique().tolist()

    linz['Processed_Full_Road_Name'] = linz['full_road_name'].apply(preprocess_road_name)
    linz['Processed_Road_Name_Label'] = linz['road_name_label'].apply(preprocess_road_name)

    linz['Matched_Road_Full'] = _match_processed_names(
        linz['Processed_Full_Road_Name'], shu_roads_list, threshold)
    linz['Matched_Road_Label'] = _match_processed_names(
        linz['Processed_Road_Name_Label'], shu_roads_list, threshold)

    matched_roads = linz.dropna(subset=['Matched_Road_Full', 'Matched_Road_Label'], how='all')

    # pandas joins null keys to null keys, so strip them from both sides to
    # keep unmatched LINZ rows from pairing with unnamed corridor rows.
    shu_named = shu.dropna(subset=['Processed Road Name'])
    result_full = matched_roads.dropna(subset=['Matched_Road_Full']).merge(
        shu_named, left_on='Matched_Road_Full', right_on='Processed Road Name', how='inner')
    result_label = matched_roads.dropna(subset=['Matched_Road_Label']).merge(
        shu_named, left_on='Matched_Road_Label', right_on='Processed Road Name', how='inner')

    result = pd.concat([result_full, result_label], ignore_index=True)

    # Flag whether both GPS coordinates exist for the corridor row.
    result['lat_lon'] = (result['Start GPS Co-ordinates'].notna()
                         & result['End GPS Co-ordinates'].notna())

    # Keep only the columns needed for the join table. A LINZ row whose full
    # name and label both matched the same corridor appears in both merges,
    # so drop the exact repeats.
    result = result[['road_id', 'full_road_name', 'Road Name', 'Corridor ID',
                     'Processed_Full_Road_Name', 'lat_lon']]
    result = result.drop_duplicates().reset_index(drop=True)

    result.to_csv(output_file, index=False)

    # Corridor rows that did not make it into the join table.
    not_joined = shu[~shu['Corridor ID'].isin(result['Corridor ID'])]
    not_joined.to_csv(not_joined_file, index=False)

    return result

### Execute and do the pre-processing

In [27]:
# Build the join table from the raw corridor export. Besides "join_table.csv",
# join_corridors also writes "not_joined.csv" with the corridor rows whose
# 'Corridor ID' did not end up in the join table.
result_df = join_corridors("shu_raw/local2.csv", "join_table.csv")

You now have "join_table.csv". It should be manually checked and edited to confirm that roads have joined correctly; deleting duplicates manually is recommended before continuing. Note that we created "rosetta_stone.csv" by manually filtering join_table.csv to remove incorrect joins/duplicates and by manually adding the not_joined rows.

### Define the processing function

In [None]:
def process_joined_data(df):
    """Split joined corridor rows by GPS availability and write the usable ones.

    Rows missing either GPS coordinate are written to "without_gps.csv". The
    remaining rows are de-duplicated on 'Start GPS Co-ordinates' (first
    occurrence kept), reduced to the attribute columns listed below and
    written to "combined.csv".

    Parameters
    ----------
    df : pandas.DataFrame
        Joined table containing every column named in ``columns_to_keep``.

    Returns
    -------
    pandas.DataFrame
        The rows written to "combined.csv".

    Raises
    ------
    KeyError
        If ``df`` lacks any required column. Raised before any file is
        written.
    """
    columns_to_keep = ['road_id', 'full_road_name', 'road_name_label', 'Road Name', 'Corridor ID', 'Land Use',
                       'Traffic Volume', 'Street Category', 'Collective Risk Band', 'Personal Risk Band',
                       'Posted Speed Limit', 'Free Flow Speed', 'IRR Band', 'Safe and Appropriate Speed',
                       'Difference between posted speed limit and SaAS', 'Difference between operating and SaAS',
                       'Proposed Permanent Speed Limit', 'Proposed Variable Speed Limit', 'Start GPS Co-ordinates',
                       'End GPS Co-ordinates']

    # Fail early with a message that names what is missing, instead of a bare
    # KeyError part-way through after some output has already been produced.
    missing = [column for column in columns_to_keep if column not in df.columns]
    if missing:
        raise KeyError(f"process_joined_data: input is missing required columns: {missing}")

    has_gps = df['Start GPS Co-ordinates'].notna() & df['End GPS Co-ordinates'].notna()

    # Rows with a missing GPS coordinate go to their own CSV for follow-up.
    df[~has_gps].to_csv("without_gps.csv", index=False)

    # Only rows with both coordinates are processed further; duplicates on the
    # start coordinate are discarded, keeping the first occurrence.
    combined = df[has_gps].drop_duplicates(subset=['Start GPS Co-ordinates'], keep='first')

    # The previous "not joined" export compared 'Corridor ID' with itself, so
    # it was always empty and replaced any existing "not_joined.csv" with a
    # header-only file. It has been removed so that file is left untouched.

    combined = combined[columns_to_keep]
    combined.to_csv("combined.csv", index=False)

    return combined


### Execute

In [None]:
# NOTE(review): join_corridors returns only the six columns it selects for the
# join table, while process_joined_data reads 'Start GPS Co-ordinates',
# 'End GPS Co-ordinates', 'road_name_label' and further attribute columns.
# As written this call looks like it raises KeyError; presumably the input
# should be the manually curated table merged back onto the full corridor and
# LINZ attributes -- confirm before running.
processed_df = process_joined_data(result_df)

### Define functions for geo-processing

In [11]:
def is_linestring(geom):
    """Tell whether ``geom`` is a single-part LineString.

    Parameters
    ----------
    geom : object
        A geometry exposing ``geom_type``, or a missing value (``None``/NaN)
        such as a left merge leaves behind for rows without a matching road.

    Returns
    -------
    bool
        ``True`` only when ``geom.geom_type`` equals ``'LineString'``; missing
        values and any other geometry type give ``False``.
    """
    # getattr keeps missing values (which have no geom_type) from raising
    # AttributeError when this is used as a row filter.
    return getattr(geom, 'geom_type', None) == 'LineString'

def safe_convert(coord):
    """Parse a coordinate literal into a shapely ``Point``.

    ``coord`` is evaluated with ``ast.literal_eval`` and must yield something
    indexable with at least two number-like items. Item 1 becomes the point's
    x and item 0 its y (presumably the text is "(lat, lon)" -- confirm against
    the data).

    Parameters
    ----------
    coord : object
        Usually a string such as ``"(-36.85, 174.76)"``.

    Returns
    -------
    shapely.geometry.Point or float
        The parsed point, or ``np.nan`` when ``coord`` cannot be parsed into
        two numbers.
    """
    try:
        coord_list = ast.literal_eval(coord)
        # Extract both numbers first: a literal that parses but is not a pair
        # (e.g. "5" or "(1,)") fails here with TypeError/IndexError/KeyError,
        # which is treated like any other unparseable value.
        x = float(coord_list[1])
        y = float(coord_list[0])
        return Point(x, y)
    except (ValueError, SyntaxError, TypeError, IndexError, KeyError):
        return np.nan
    
def closest_point(point, linestring):
    """Snap ``point`` onto ``linestring``.

    The point is first projected onto the line with ``linestring.project`` and
    the resulting position is then turned back into a location on the line
    with ``linestring.interpolate``.

    Parameters
    ----------
    point : geometry
        The location to snap.
    linestring : geometry
        The line to snap onto; must provide ``project`` and ``interpolate``.

    Returns
    -------
    geometry
        Whatever ``linestring.interpolate`` returns for the projected position.
    """
    position_on_line = linestring.project(point)
    return linestring.interpolate(position_on_line)

def cut_linestring_at_points(linestring, point1, point2):
    """Extract the part of ``linestring`` lying between two points.

    Both points are projected onto the line. The result starts at the lower
    projected position, runs through every vertex of the line whose own
    projected position lies between the two (inclusive), and ends at the
    higher projected position. The order in which the points are given does
    not matter.

    Parameters
    ----------
    linestring : shapely.geometry.LineString
        The line to cut.
    point1, point2 : shapely.geometry.Point
        The two locations delimiting the segment.

    Returns
    -------
    shapely.geometry.LineString
        The segment between the two projected positions.
    """
    position_a = linestring.project(point1)
    position_b = linestring.project(point2)

    # Order the two positions so the segment always runs low -> high.
    if position_a > position_b:
        lower, upper = position_b, position_a
    else:
        lower, upper = position_a, position_b

    # Collect the line's own vertices that fall inside the [lower, upper] span.
    inner_vertices = []
    for vertex in linestring.coords:
        if lower <= linestring.project(Point(vertex)) <= upper:
            inner_vertices.append(vertex)

    # Bracket the collected vertices with the exact cut locations.
    segment_points = [linestring.interpolate(lower)]
    segment_points.extend(inner_vertices)
    segment_points.append(linestring.interpolate(upper))

    return LineString(segment_points)

### Execute Geoprocessing Steps

In [12]:
# Load the road lines and the corridor table written by process_joined_data.
road_network = gpd.read_file('explode-dissolve/roads.shp')
# Remember the CRS now: it is no longer reachable through the frame once the
# geometry column has been renamed below.
road_crs = road_network.crs

df = pd.read_csv('combined.csv')

# Parse the coordinate strings into Points (NaN where parsing fails).
df['StartPoint'] = df['Start GPS Co-ordinates'].apply(safe_convert)
df['EndPoint'] = df['End GPS Co-ordinates'].apply(safe_convert)

# A corridor without both end points cannot be snapped to a road.
df = df.dropna(subset=['StartPoint', 'EndPoint'])

points_gdf = gpd.GeoDataFrame(df, geometry='StartPoint')

# The join key needs the same dtype on both sides.
points_gdf['road_id'] = points_gdf['road_id'].astype('int64')
road_network['road_id'] = road_network['road_id'].astype('int64')

# Rename the road geometry so it cannot clash with the point geometry.
road_network = road_network.rename(columns={'geometry': 'geometry_road'})
merged_gdf = points_gdf.merge(road_network, how='left', on='road_id')

# Drop rows with no matching road, then keep single-part LineStrings only
# (MultiLineStrings cannot be cut by cut_linestring_at_points).
merged_gdf = merged_gdf[merged_gdf['geometry_road'].notna()]
merged_gdf = merged_gdf[merged_gdf['geometry_road'].apply(is_linestring)].copy()

# NOTE(review): the points are built from the GPS columns and projected onto
# the road geometry as-is, which assumes roads.shp uses the same lon/lat
# coordinate system -- confirm, or reproject the points first.
merged_gdf['Start_Closest_Point'] = merged_gdf.apply(lambda row: closest_point(row['StartPoint'], row['geometry_road']), axis=1)
merged_gdf['End_Closest_Point'] = merged_gdf.apply(lambda row: closest_point(row['EndPoint'], row['geometry_road']), axis=1)

merged_gdf['Road_Segment'] = merged_gdf.apply(lambda row: cut_linestring_at_points(row['geometry_road'], row['Start_Closest_Point'], row['End_Closest_Point']), axis=1)

# Make the cut segment the active geometry before removing the helper
# geometry columns; the segments are pieces of the road lines, so they carry
# the road network's CRS.
merged_gdf = merged_gdf.set_geometry('Road_Segment', crs=road_crs)
merged_gdf = merged_gdf.drop(columns=['StartPoint', 'Start_Closest_Point', 'geometry_road', 'End_Closest_Point', 'EndPoint'])

# Make sure the target directory exists before writing.
Path("output").mkdir(parents=True, exist_ok=True)
merged_gdf.to_file("output/output.shp")
