In [1]:
!pip install pandas matplotlib requests networkx turfpy geopandas pygeos




[notice] A new release of pip available: 22.3.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip




This notebook is used for preparing shipping paths for the Welland Lock animation.

Required files
1. '1854 1875 1882 WCR.csv'-> Historical Shipping CSV derived from spreadsheet file
2. 'water_paths.json' -> Shipping paths converted to geojson with QGIS. Derived from this: https://www.arcgis.com/home/item.html?id=a4940deebec84fb9b6afa65afcbf891d#overview
3. 'locations.csv' -> File holding final QA edits for the locations used in the animation. Four columns: 
- Location: The name of the location
- Monk: Dr. Monk's chosen location
- Final: The place name that the mapbox geocoding api returns when Monk's location is fed into it. We need to feed the location into the mapbox geocoding api to get a coordinate.
- Coord: If there is a mismatch between the final and the monk columns, I've gone in and manually put in the coordinate we want, and in this case the mapbox geocoding api is not used.

Output
1. manifest.json -> Paths that the animation will use to illustrate ship voyages
2. debug.json -> For debugging purposes

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import json
import requests
import networkx as nx
import urllib.parse
from datetime import date, datetime, timedelta
from math import radians, sin, cos, atan2, sqrt, isnan, floor
from pprint import pprint
from pathlib import Path

Read in CSV data, clean it up, and prepare the first section of the json data that will be exported.
This first section is the csv converted to json format. 

In [3]:
# Load the historical shipping records (one row per recorded vessel passage).
# The path is relative to the notebook's working directory.
df = pd.read_csv('1854 1875 1882 WCR.csv')

def not_empty(item):
    '''Return True when a CSV cell holds a usable value.

    A cell counts as empty when it is None, a float NaN, or a string that
    contains only whitespace. Anything else is treated as non-empty.
    '''
    if item is None:
        return False
    # isinstance (instead of an exact type comparison) also recognises float
    # subclasses such as numpy.float64, which pandas can hand back for NaN cells.
    if isinstance(item, float) and isnan(item):
        return False
    if isinstance(item, str) and item.strip() == '':
        return False
    return True

# Collapse the four cargo columns into one list-valued 'Cargo' column that
# keeps only the non-empty entries, with surrounding whitespace removed.
cargo_columns = ['Cargo 1', 'Cargo 2', 'Cargo 3', 'Cargo 4']
cargo = [
    [str.strip(item) for item in row if not_empty(item)]
    for row in df[cargo_columns].itertuples(index=False, name=None)
]
df['Cargo'] = cargo
df = df.drop(columns=cargo_columns)

# Normalise every remaining column to a stripped string
# (missing values therefore become the string 'nan').
strip = lambda x: str(x).strip()
for col in df:
    if col != 'Cargo':
        df[col] = df[col].apply(strip)

#Check values are clean
assert (df['Year'].apply(lambda x: x in ['1854','1875','1882']).all())
assert df['Day'].apply(lambda x: int(x)>=1 and int(x)<=31).all()

# Month names as they may appear in the CSV. The misspelling "Febuary" is kept
# on purpose next to the correct spelling so that either form is accepted.
month_to_int = {"January":1, "February":2, "Febuary":2, "March": 3, "April": 4, "May": 5,
 "June":6, "July":7, "August":8, "September":9,"October":10,"November":11, "December":12}

# Combine Year/Month/Day into a single ISO formatted 'Date' string (YYYY-MM-DD).
df['Date'] = df[['Year','Month','Day']].apply(lambda x: str(date(int(x.Year), month_to_int[x.Month], int(x.Day))), axis=1)
df = df.drop(columns=['Year','Month','Day'])

#Drop voyages for these locations. We don't know where they are.
unresolved_locations = [
    'Albro',
    'Black Bear',
    'Boynat Inlet',
    'Can Vill',
    'Cana',
    'Cape Ronence',
    'Cida Dal',
    'Clairmont',
    'Cork Ireland',
    'Creek',
    'Dinestown',
    'Elevator',
    'Forrester',
    'Government',
    'Mevey',
    'Monsoon',
    'Pass',
    'Point Plag',
    'Port Davis',
    'Port Flamber',
    'St. Duram',
    'Survey',
    'Venice',
    'White Stall'
]
# A voyage is dropped when either its origin or its destination is unresolved.
df_unresolved = df['Where From'].isin(unresolved_locations) | df['Where Bound'].isin(unresolved_locations)
df = df[~df_unresolved]

Show some information about the dataset

In [4]:
# Distinct values found in the categorical columns
for summary_column in ('Nationality', 'Vessel Type'):
    print(set(df[summary_column]))

#Total number of days
print(df['Date'].nunique(dropna=False))

{'American', 'British'}
{'Scow', 'Steamer', 'Propeller', 'Brigantine', 'Boat', 'Tug', 'Barge', 'Sailboat', 'Yacht', 'Dredge', 'Barkentine', 'Raft', 'Steam Barge', 'Steam Yacht', 'Schooner'}
578


Read in the shipping path data that was converted to json using QGIS

In [5]:
# GeoJSON is UTF-8 encoded; decode it explicitly so the result does not depend
# on the platform's default text encoding.
with open(Path('data')/'water_paths.json', encoding='utf-8') as wp_geojson:
    gj = json.load(wp_geojson)
    

Define functions for transforming the path information into a graph and working with that graph

In [6]:
def feature_filter(gj, gid_list=None, exclude=True):
    '''Select features from a geojson feature collection by their 'gid' property.

    With no (or an empty) gid_list every feature is returned. Otherwise the
    features whose gid is in gid_list are dropped when exclude is truthy, or
    are the only ones kept when exclude is falsy.
    '''
    all_features = gj['features']
    if not gid_list:
        return list(all_features)

    drop_listed = bool(exclude)
    return [
        feature for feature in all_features
        if (feature['properties']['gid'] in gid_list) != drop_listed
    ]

def distance(p1, p2):
    '''Great-circle (haversine) distance in KM between two (lon, lat) points given in degrees.'''
    #https://stackoverflow.com/questions/4913349/haversine-formula-in-python-bearing-and-distance-between-two-gps-points
    lat_a, lat_b = radians(p1[1]), radians(p2[1])
    lon_a, lon_b = radians(p1[0]), radians(p2[0])

    delta_lon = lon_b - lon_a
    delta_lat = lat_b - lat_a
    haversine = sin(delta_lat/2)**2 + cos(lat_a) * cos(lat_b) * sin(delta_lon/2)**2
    central_angle = 2 * atan2(sqrt(haversine), sqrt(1-haversine))

    # 6371 is the earth radius in KM used to scale the central angle
    return 6371 * central_angle

def draw_graph(G):
    '''Draw G with a planar layout, labelling nodes by name and edges by their 'weight'.'''
    layout = nx.planar_layout(G)
    nx.draw(G, layout, with_labels=True, font_weight='bold')
    nx.draw_networkx_edge_labels(
        G,
        layout,
        edge_labels=nx.get_edge_attributes(G, 'weight'),
    )

def path_distance(g, path):
    '''Return the summed 'weight' of the edges joining consecutive nodes of path.

    g must expose edge data as g.edges[(u, v)]['weight'].
    A path with fewer than two nodes has no edges and therefore a distance of 0.
    Raises KeyError when two consecutive nodes are not joined by an edge.
    '''
    # Pair every node with its successor; an empty or single-node path yields no pairs.
    return sum(g.edges[(u, v)]['weight'] for u, v in zip(path, path[1:]))

def geojson_to_graph(features):
    '''Build an undirected graph from geojson line features.

    Each feature must hold exactly one line. Its first and last points become
    the nodes named by the feature's 'anode' and 'bnode' properties; every
    interior point becomes a node named "<OBJECTID>_<n>" (n counting from 1).
    Consecutive points are joined by an edge weighted with their distance().
    Every node stores its point as a 'coordinates' tuple.
    '''
    graph = nx.Graph()
    for feature in features:
        lines = feature['geometry']['coordinates']
        assert len(lines) == 1
        line = lines[0]

        properties = feature['properties']
        first_node = str(properties['anode'])
        last_node = str(properties['bnode'])
        object_id = properties['OBJECTID']

        graph.add_node(first_node, coordinates=tuple(line[0]))
        graph.add_node(last_node, coordinates=tuple(line[-1]))

        # Walk the interior points, chaining each one to the node before it
        tail_node, tail_point = first_node, line[0]
        for sequence, interior_point in enumerate(line[1:-1], start=1):
            interior_node = f"{object_id}_{sequence}"
            graph.add_node(interior_node, coordinates=tuple(interior_point))
            graph.add_edge(tail_node, interior_node, weight=distance(tail_point, interior_point))
            tail_node, tail_point = interior_node, interior_point

        # Close the chain on the feature's end node
        graph.add_edge(tail_node, last_node, weight=distance(tail_point, line[-1]))
    return graph

def coordinates_set(g):
    '''Return the set of distinct 'coordinates' values stored on the nodes of g.'''
    unique_coordinates = set()
    for node_name in g.nodes:
        unique_coordinates.add(g.nodes[node_name]['coordinates'])
    return unique_coordinates

def closest_coordinate(all_cords, candidate_points):
    '''Find the candidate point that lies closest to any coordinate in all_cords.

    For every candidate the nearest coordinate (by distance()) is determined;
    the candidate with the smallest such distance wins.

    Returns a tuple (closest coordinate, candidate point, distance between them).
    Raises ValueError when all_cords or candidate_points is empty.
    '''
    best = None
    for point in candidate_points:
        #find the closest node coordinate in the path graph to this point
        #(min scans once instead of sorting the whole coordinate collection)
        nearest = min(all_cords, key=lambda coord: distance(coord, point))
        gap = distance(nearest, point)
        #keep the first pair with the smallest distance seen so far
        if best is None or gap < best[2]:
            best = (nearest, point, gap)

    if best is None:
        raise ValueError("candidate_points must contain at least one point")
    return best

def get_features_coords(features):
    '''Return the 'center' entry of every feature, in the order given.'''
    centers = []
    for feature in features:
        centers.append(feature['center'])
    return centers

def location_name_to_coord(g, location_csv_pdf, access_token=None):
    '''Map every location name to a coordinate on the path graph.

    For each row of location_csv_pdf (columns Location, Monk, Final, Coord):
    - when Coord holds a "lat,lon" string it is used directly and the
      geocoding api is not called;
    - otherwise the Monk value is sent to the Mapbox geocoding api (limited to
      a fixed bounding box) and the first, most relevant result is used.
    In both cases the coordinate is snapped to the closest node coordinate of g.

    Parameters:
        g: path graph whose nodes carry a 'coordinates' attribute.
        location_csv_pdf: DataFrame with the columns described above.
        access_token: optional Mapbox token. Defaults to the
            MAPBOX_ACCESS_TOKEN environment variable when it is set.

    Returns:
        dict of Location -> {'graphcoord', 'geocoded_coord', 'geocode_name'}.
        Locations for which the api returns no result are reported with
        print() and left out of the mapping.

    Raises:
        ValueError: a row has neither a Coord nor a Monk value.
        requests.HTTPError: the geocoding api answered with an error status.
    '''
    import os

    #only return results within the specific bounding box (already URL-encoded)
    bounding_box = '-95.13478080553124%2C39.119757307389875%2C-70.72830423014456%2C52.52603829818719'
    if access_token is None:
        # NOTE(review): the literal fallback keeps existing runs working, but the
        # token belongs in the environment rather than in the notebook source.
        access_token = os.environ.get(
            'MAPBOX_ACCESS_TOKEN',
            'pk.eyJ1IjoiZmpvaG5zODg4IiwiYSI6ImNsaGh6eXo1dDAzMDMzbW1td3BqOXFoaDMifQ.RBC_0mpQ25-GRAZDA0E0oA')
    mapping = {}
    all_coords = coordinates_set(g)

    for _, row_data in location_csv_pdf.iterrows():
        lat_lon = row_data.Coord
        place_name = row_data.Location
        location = row_data.Monk
        resolved_location = row_data.Final

        if isinstance(lat_lon, str) and lat_lon.strip():
            # Manually curated coordinate: stored as "lat,lon" in the CSV
            lat, lon = (float(part) for part in lat_lon.split(','))
            cc = closest_coordinate(all_coords, [[lon,lat]]) #notice lon,lat order is reversed. this is what we want.
            mapping[place_name] = {'graphcoord':cc[0], 'geocoded_coord':cc[1], 'geocode_name': resolved_location}
            continue

        if not isinstance(location, str) or not location.strip():
            # An empty cell arrives as NaN; fail with a message that names the row.
            raise ValueError(f"Location {place_name!r} has neither a Coord nor a Monk value to geocode")

        query = urllib.parse.quote(location)
        req_url = f'https://api.mapbox.com/geocoding/v5/mapbox.places/{query}.json?bbox={bounding_box}&access_token={access_token}'
        # The timeout keeps a stalled connection from hanging the notebook;
        # raise_for_status surfaces api errors instead of a later KeyError.
        response = requests.get(req_url, timeout=30)
        response.raise_for_status()
        results = response.json().get('features', [])
        if not results:
            # Report the place rather than the request URL, which embeds the access token.
            print(f"No geocoding result for {place_name!r} (query: {location!r})")
            continue

        #To compare all returned search results against the graph coordinates instead
        #(using the pair with the minimum distance), pass
        #get_features_coords(results) as the candidate points below.

        geocode = results[0] #get the first and most relevant result
        cc = closest_coordinate(all_coords, [geocode['center']])
        mapping[place_name] = {'graphcoord':cc[0], 'geocoded_coord':cc[1], 'geocode_name': geocode['place_name']}

    return mapping

def coord_to_node(coordinate, graph=None):
    '''Return the name of the first node whose 'coordinates' equal coordinate.

    Parameters:
        coordinate: coordinate to look up; compared with == against each
            node's 'coordinates' attribute.
        graph: graph to search. When omitted, the notebook-level graph `g`
            is used so existing single-argument calls keep working.

    Raises:
        ValueError: no node carries the given coordinate.
    '''
    if graph is None:
        graph = g
    for node in graph.nodes:
        if graph.nodes[node]['coordinates'] == coordinate:
            return node
    raise ValueError("Could not find node matching given coordinate")

Create the graph representation of the geojson data. Create mappings using the Mapbox geocoding api that translate place names to lon/lat.

In [7]:
#Exclude the lone multiline path (gid 837) in the geojson data.
#This is okay since it is a series of disconnected strings anyway.
features = feature_filter(gj, [837], exclude=True)
g = geojson_to_graph(features)
location_csv = pd.read_csv('locations.csv')
location_csv.Location = location_csv.Location.map(lambda s: s.strip())
loc_cord_map = location_name_to_coord(g, location_csv)

#location_name_to_coord leaves out locations it could not geocode, so build the
#node lookup from what was actually resolved instead of from every CSV row.
loc_node_map = {loc: coord_to_node(info['graphcoord']) for loc, info in loc_cord_map.items()}
locations_without_coord = sorted(set(location_csv.Location) - set(loc_cord_map))
if locations_without_coord:
    print(f"Locations without a resolved coordinate: {locations_without_coord}")

Some functions that output feature collections (geojson) of the locations that were previously just strings. Basically just used to verify locations that were looked up using the geocoding api make sense.

In [8]:
def mapping_to_geojson(mapping):
    '''Serialise the graph coordinate of every location as a geojson FeatureCollection of Points.

    Returns the collection as a JSON string; each feature uses the location
    name both as its id and as its "name" property.
    '''
    point_features = [
        {"type": "Feature",
         "id": name,
         "geometry": {"type": "Point", "coordinates": entry['graphcoord']},
         "properties": {"name": name}}
        for name, entry in mapping.items()
    ]
    return json.dumps({"type": "FeatureCollection", "features": point_features})

def mapping_to_geojson_debug(mapping):
    '''Dump looked-up lon/lat locations together with their chosen graph locations.

    Returns a JSON string holding three geojson FeatureCollections:
    - 'lines': a LineString per location from the graph coordinate to the
      looked-up coordinate, so that the discrepancy can be observed;
    - 'estimation': a Point per location at the graph coordinate;
    - 'real': a Point per location at the looked-up coordinate, named after
      the geocoded place name.
    '''
    def as_feature(feature_id, geometry_type, coordinates, label):
        return {"type": "Feature",
                "id": feature_id,
                "geometry": {"type": geometry_type, "coordinates": coordinates},
                "properties": {"name": label}}

    def as_collection(items):
        return {"type": "FeatureCollection", "features": items}

    line_items, estimate_items, real_items = [], [], []
    for name, entry in mapping.items():
        on_graph = entry['graphcoord']
        looked_up = entry['geocoded_coord']
        line_items.append(as_feature(name, "LineString", [on_graph, looked_up], name))
        estimate_items.append(as_feature(name, "Point", on_graph, name))
        real_items.append(as_feature(name, "Point", looked_up, entry['geocode_name']))

    return json.dumps({
        'lines': as_collection(line_items),
        'estimation': as_collection(estimate_items),
        'real': as_collection(real_items),
    })


In [9]:
# Write the geocoding sanity-check collections next to the other data files
debug = mapping_to_geojson_debug(loc_cord_map)
(Path('data')/'debug.json').write_text(debug)

Code for navigating the created graph representation and deriving the shortest paths for the 'Where From' and 'Where Bound' columns of the CSV

In [10]:
def path_to_coordinates(g, path):
    '''Translate a sequence of node names into the nodes' 'coordinates', keeping the order.'''
    coordinates = []
    for node_name in path:
        coordinates.append(g.nodes[node_name]['coordinates'])
    return coordinates

def coordinates_to_geojson(coordinates):
    '''Wrap a sequence of coordinates in a geojson LineString Feature with empty properties.

    Every coordinate is converted to a list.
    '''
    return {
        "type": "Feature",
        "properties": {},
        "geometry": {
            "type": "LineString",
            "coordinates": [list(coordinate) for coordinate in coordinates],
        },
    }

def shortest_path_to_geojson(g, start, end):
    '''Return the weighted shortest path between two nodes as a geojson LineString Feature.

    Integer node identifiers are converted to strings before the lookup.
    '''
    source = str(start) if isinstance(start, int) else start
    target = str(end) if isinstance(end, int) else end

    node_path = nx.shortest_path(g, source, target, weight='weight')
    path_coordinates = path_to_coordinates(g, node_path)
    return coordinates_to_geojson(path_coordinates)

def shortest_paths_to_geojson(g, start_end_pairs):
    '''Given a list of start/end pairs, find each shortest path and return them
    as a geojson FeatureCollection of LineStrings.'''
    line_features = []
    for origin, destination in start_end_pairs:
        line_features.append(shortest_path_to_geojson(g, origin, destination))
    return {"type": "FeatureCollection", "features": line_features}

def gen_all_paths(g, df, loc_node_map):
    '''Compute the shortest path for every distinct origin/destination pair in df.

    Parameters:
        g: path graph to route on.
        df: DataFrame with 'Where From' and 'Where Bound' columns.
        loc_node_map: dict of location name -> graph node name.

    Returns:
        (feature_collection_json, bad_paths) where feature_collection_json is
        a JSON string of LineString features, each tagged with a 'path'
        property "<Where From>+<Where Bound>", and bad_paths is the set of
        path names that could not be routed. A path is bad when either
        location has no node in loc_node_map, a node is missing from the
        graph, or no route joins the two nodes.
    '''
    paths = []
    bad_paths = set()
    for _,locs in df[['Where From','Where Bound']].drop_duplicates().iterrows():
        start_name = locs['Where From']
        dst_name = locs['Where Bound']
        path_name = f"{start_name}+{dst_name}"

        # Collect unknown locations with the other failures so the caller
        # sees all of them at once instead of a KeyError for the first one.
        if start_name not in loc_node_map or dst_name not in loc_node_map:
            bad_paths.add(path_name)
            continue

        try:
            feature = shortest_path_to_geojson(g, loc_node_map[start_name], loc_node_map[dst_name])
        except (nx.NetworkXNoPath, nx.NodeNotFound):
            bad_paths.add(path_name)
            continue

        feature['properties']['path'] = path_name
        paths.append(feature)

    feature_collection = {"type":"FeatureCollection", "features":paths}
    return json.dumps(feature_collection), bad_paths

# good: JSON string with one route per distinct origin/destination pair
# bad:  set of "<Where From>+<Where Bound>" names for which no route exists
good,bad = gen_all_paths(g, df, loc_node_map)
# Stop here when any voyage could not be routed; the message lists the pairs.
assert not bad, bad
    
# NOTE: the export below is left disabled; manifest.json is written further
# down, after the dates have been corrected.
# # Save cleaned dataframe as JSON    
# manifest = df.to_json(orient="records")

# with open(Path('data')/'manifest.json','w') as manifest_file:
#     manifest_file.write(f'{{"manifest":{manifest},\n "routes":{good}}}')
    

In [11]:
#Turns out I misinterpreted the date in the data; it's not when a vessel leaves, but rather when a vessel arrives at the 
#canal. Fix up the start times here by determining the distance from a vessel's origin to the canal and then determining 
#how long it would take the vessel to travel this distance. That time is then subtracted from the date in the data to get 
#the starting date.

from geojson import  Polygon, Feature, LineString
from turfpy.measurement import length
from turfpy.transformation import intersect
from turfpy.misc import line_intersect

# Two short line segments given as [lon, lat] pairs. distance_to_canal tests
# every route segment against them to find where a route reaches the canal.
# NOTE(review): presumably drawn across the north and south mouths of the canal — confirm.
north = LineString([[-79.219,43.221],[-79.1601,43.215]])
south = LineString([[-79.2597,42.8823],[-79.2260,42.8762]])
# Module-level placeholder; distance_to_canal assigns its own local `entrance`
# and returns it rather than updating this name.
entrance = None

def distance_to_canal(coordinates):
    '''Measure how far a route travels before it first crosses the north or south line.

    The route is walked segment by segment; each segment is tested against the
    north line first and the south line only when the north line is not hit.

    Returns (distance in km up to the crossing, "north" or "south"), or
    (None, None) when no segment of the route crosses either line.
    '''
    travelled = 0
    segment_start = coordinates[0]
    for segment_end in coordinates[1:]:
        segment = LineString([segment_start, segment_end])

        for gate_name, gate_line in (("north", north), ("south", south)):
            crossings = line_intersect(segment, gate_line)['features']
            if crossings:
                # Only count the part of this segment that lies before the crossing
                crossing_point = crossings[0]['geometry']['coordinates']
                travelled += length(LineString([segment_start, crossing_point]), units='km')
                return travelled, gate_name

        travelled += length(segment, units='km')
        segment_start = segment_end

    return None, None


In [26]:
data = df.copy()
routes = json.loads(good)
route_map = {feat['properties']['path']:feat['geometry']['coordinates'] for feat in routes['features']}
cache = {}          # route name -> (distance to canal in km, entrance)
updated_dates = {}  # row index -> estimated departure datetime
skipped = 0         # voyages whose date is left untouched

#distance in km from south/north mouth of canal to lock3
south_lock3_dist = 31.0
north_lock3_dist = 7.6
canal_speed = 19.0/24 #canal speed is 19KM per day, expressed here in km per hour
south_entrance_offset = south_lock3_dist/canal_speed #offset in hours
north_entrance_offset = north_lock3_dist/canal_speed

#Assumed cruising speeds in mph per vessel type; 'Other' covers every type not listed.
vessel_speeds_mph = {
    'Schooner': 5, 'Brigantine': 4, 'Barkentine': 4,
    'Propeller': 8, 'Steamer': 8, 'Tug': 10,
    'Scow': 4, 'Other': 3}
vessel_speeds = {vessel: mph * 1.609 for vessel, mph in vessel_speeds_mph.items()} #convert mph to kmh

for i, row in data.iterrows():
    vtype = row['Vessel Type']
    route = f"{row['Where From']}+{row['Where Bound']}"
    if route not in cache:
        cache[route] = distance_to_canal(route_map[route])
    # Deliberately not named `distance`: that would overwrite the distance()
    # function used by the graph helpers and break re-running earlier cells.
    route_distance, entrance = cache[route]

    if route_distance is None:
        #This means that the path of this vessel is contained within the canal itself
        #and the date does not need to be changed
        skipped += 1
        continue

    vessel_speed = vessel_speeds.get(vtype, vessel_speeds['Other'])
    arrival = datetime.strptime(row['Date'],'%Y-%m-%d')
    canal_offset = south_entrance_offset if entrance=="south" else north_entrance_offset
    hours_back = route_distance/vessel_speed + canal_offset #hrs back
    updated_dates[i] = arrival - timedelta(hours=hours_back)

#Truncate the estimated departure to the hour for the animation
rounded_dates = {i:departure.strftime('%Y-%m-%d %H:00:00') for i, departure in updated_dates.items()}

manifest_df = data.copy()
series_ud = pd.Series(data=list(rounded_dates.values()), index=list(rounded_dates.keys()), name='Date')
manifest_df.update(series_ud) #overwrites 'Date' only for the rows that were adjusted
manifest = manifest_df.sort_values(by=['Date']).to_json(orient="records")

with open(Path('data')/'manifest.json','w') as manifest_file:
    manifest_file.write(f'{{"manifest":{manifest},\n "routes":{good}}}')

In [24]:
# Export the voyages with their estimated departure time ('Leaving') to the second
dates_precise = {
    row_index: departure.strftime('%Y-%m-%d %H:%M:%S')
    for row_index, departure in updated_dates.items()
}
series_ud = pd.Series(data=dates_precise.values(), index=dates_precise.keys(), name='Leaving')
df2 = data.drop(columns=['LAC Image #','Cargo','Direction'])
df2.join(series_ud).sort_values(by=['Date']).to_csv('dataout.csv',index=False)

