# Data Preparation

This file contains the code used to prepare the data. 
Since the preprocessing (especially the blockage identification using the haversine formula) is computationally heavy, the outputs of this process are included in the Data/Input folder.
The original data that was used as input here is included alongside them.

The original input was:
- ports.csv (Containing a reference, name & position of ports)
- routes.csv (Containing the AIS data for routes with position, used to identify which chokepoint (if any) a route passes)
- distances.csv (Containing data on previous & next port as well as distance used for network construction)

In [3]:
import networkx as nx
import pandas as pd
import numpy as np
import geopandas
import math

# Folder that holds the input files; every read below is built from this prefix,
# so point it at wherever the data files live.
data_path = 'Data/Input/'

# Raw inputs: port reference table, AIS route positions and port-to-port distances.
ports_df = pd.read_csv(data_path + 'ports.csv')
routes_df = pd.read_csv(data_path + 'routes.csv')
distances_df = pd.read_csv(data_path + 'distances.csv')

In [5]:
class Haversine:
    '''
    Great-circle distance between two lon/lat coordinate pairs,
    computed with the haversine formula on a sphere of radius 6,371,000 m.

    Each coordinate is a two-item sequence ``[lon, lat]`` in decimal degrees.
    The distance is computed once in the constructor and exposed through the
    attributes ``meters``, ``km``, ``miles`` and ``feet``.

    example usage: Haversine([lon1,lat1],[lon2,lat2]).feet
    '''
    def __init__(self, coord1, coord2):
        lon1, lat1 = coord1
        lon2, lat2 = coord2

        R = 6371000                             # radius of Earth in meters
        phi_1 = math.radians(lat1)
        phi_2 = math.radians(lat2)

        delta_phi = math.radians(lat2 - lat1)
        delta_lambda = math.radians(lon2 - lon1)

        # Haversine of the central angle between the two points.
        a = (math.sin(delta_phi / 2.0) ** 2
             + math.cos(phi_1) * math.cos(phi_2)
             * math.sin(delta_lambda / 2.0) ** 2)
        # Floating-point rounding can push `a` a hair above 1 for (near-)antipodal
        # points, which would make sqrt(1 - a) raise ValueError. Clamp the upper
        # bound only; a plain comparison leaves NaN inputs as NaN.
        if a > 1.0:
            a = 1.0
        c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

        self.meters = R * c                       # output distance in meters
        self.km = self.meters / 1000.0            # output distance in kilometers
        self.miles = self.meters * 0.000621371    # output distance in miles
        self.feet = self.miles * 5280             # output distance in feet

In [None]:
def route_detect(routes, chokepoints):
    """
    Flag every route position that comes close to a chokepoint.

    For each row of ``chokepoints`` (indexed by chokepoint name, with columns
    ``Longitude``, ``Latitude`` and ``dist_threshold`` in km) two columns are
    added to ``routes``:

    - ``dist_to_<name>``: haversine distance in km from the position given by
      the ``lon``/``lat`` columns of ``routes`` to the chokepoint.
    - ``affected_by_<name>``: True where that distance is <= the chokepoint's
      ``dist_threshold``.

    ``routes`` is modified in place and also returned.
    """
    earth_radius_m = 6371000  # same Earth radius as the Haversine class

    # The route coordinates are the same for every chokepoint, so convert
    # them once instead of once per row and chokepoint.
    lon_deg = routes["lon"].to_numpy(dtype=float)
    lat_deg = routes["lat"].to_numpy(dtype=float)
    cos_lat = np.cos(np.radians(lat_deg))

    for chokepoint in chokepoints.index:
        point = chokepoints.loc[chokepoint]
        cp_lon = float(point["Longitude"])
        cp_lat = float(point["Latitude"])
        threshold_km = float(point["dist_threshold"])

        # Vectorised haversine formula over all route positions at once;
        # a row-wise DataFrame.apply is orders of magnitude slower.
        delta_phi = np.radians(cp_lat - lat_deg)
        delta_lambda = np.radians(cp_lon - lon_deg)
        a = (np.sin(delta_phi / 2.0) ** 2
             + cos_lat * np.cos(np.radians(cp_lat))
             * np.sin(delta_lambda / 2.0) ** 2)
        # Guard against rounding pushing `a` slightly above 1 (antipodal
        # points); np.minimum keeps NaN for rows with missing coordinates.
        a = np.minimum(a, 1.0)
        c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
        dist_km = earth_radius_m * c / 1000.0

        routes["dist_to_{}".format(chokepoint)] = dist_km
        # NaN distances compare False, i.e. "not affected".
        routes["affected_by_{}".format(chokepoint)] = dist_km <= threshold_km

    return routes

In [None]:
# One row per chokepoint: lat/lon of a selected point inside the chokepoint and
# the distance threshold (in km) within which a route counts as passing it.
chokepoints = pd.DataFrame(
    {
        'Latitude': [29.957175, 31.25366, 35.982003, 3.142137,
                     51.027659, 8.932536, 9.332827, 26.669853],
        'Longitude': [32.583561, 32.336550, -5.452054, 100.548439,
                      1.483395, -79.556954, -79.929265, 56.509156],
        'dist_threshold': [15, 15, 15, 45, 40, 15, 15, 45],
    },
    index=pd.Index(
        ["suez", "said", "gibraltar", "malacca", "dover", "balboa", "colon", "hormuz"],
        name="Chokepoint",
    ),
)

In [None]:
"""
Code is commented out since run time is ~12hrs. Output file is included in Data
"""
#  r = route_detect(routes_df, chokepoints)
# df = r.groupby(["prev_port", "next_port"], as_index=False).agg({'affected_by_suez':'max','affected_by_said':'max','affected_by_gibraltar':'max','affected_by_malacca':'max','affected_by_dover':'max', 'affected_by_balboa':'max','affected_by_colon':'max','affected_by_hormuz':'max'})
# df.to_csv((data_path + 'Route Blockages/'+ "route_blockage.csv"))

In [None]:
# Load the precomputed per-route blockage flags instead of recomputing them.
route_blockages = pd.read_csv(data_path + 'Route Blockages/' + 'route_blockage.csv')


In [None]:
# # Since we are building a simple Graph, not a MultiGraph, we want to ensure we keep only the first (i.e. shortest) distance for each (prev_port, next_port) pair
# clean_distances = distances.sort_values(by=['prev_port', 'next_port', 'distance'])
# clean_distances.drop_duplicates(subset=['prev_port', 'next_port'], keep='first', inplace=True, ignore_index=True)
# clean_distances.to_csv((data_path + 'clean_distances.csv'))


## Separation by Blockage
We split the affected routes by chokepoint and save one output file per chokepoint, so that they can be used to cut the graph in our agent-based model (ABM).

In [None]:
# Load the de-duplicated port-to-port distances prepared earlier.
clean_distances = pd.read_csv(data_path + 'clean_distances.csv')

In [None]:
# For every chokepoint keep only the (prev_port, next_port) pairs of the routes
# flagged as affected by it; the row filter and column selection are done in one step.
route_blockages_gib = route_blockages.loc[
    route_blockages['affected_by_gibraltar'] == True, ["prev_port", "next_port"]]
route_blockages_mal = route_blockages.loc[
    route_blockages['affected_by_malacca'] == True, ["prev_port", "next_port"]]
route_blockages_dov = route_blockages.loc[
    route_blockages['affected_by_dover'] == True, ["prev_port", "next_port"]]
route_blockages_suez = route_blockages.loc[
    route_blockages['affected_by_suez'] == True, ["prev_port", "next_port"]]
route_blockages_horm = route_blockages.loc[
    route_blockages['affected_by_hormuz'] == True, ["prev_port", "next_port"]]
# NOTE(review): relies on an 'affected_by_panama' column being present in
# route_blockage.csv - confirm it exists in the shipped file.
route_blockages_pan = route_blockages.loc[
    route_blockages['affected_by_panama'] == True, ["prev_port", "next_port"]]

In [None]:
# Union of all per-chokepoint port pairs; a pair affected by several
# chokepoints is kept only once.
route_blockages_total = pd.concat(
    [
        route_blockages_horm,
        route_blockages_gib,
        route_blockages_mal,
        route_blockages_dov,
        route_blockages_suez,
        route_blockages_pan,
    ]
).drop_duplicates()

In [None]:
"""
Commented out for your convenience, files are included
"""
# route_blockages_horm.to_csv((data_path + 'Route Blockages/' + 'route_blockages_horm.csv'))
# route_blockages_mal.to_csv((data_path + 'Route Blockages/' + 'route_blockages_mal.csv'))
# route_blockages_dov.to_csv((data_path + 'Route Blockages/' + 'route_blockages_dov.csv'))
# route_blockages_suez.to_csv((data_path + 'Route Blockages/' + 'route_blockages_suez.csv'))
# route_blockages_gib.to_csv((data_path + 'Route Blockages/' + 'route_blockages_gib.csv'))
# route_blockages_pan.to_csv((data_path + 'Route Blockages/' + 'route_blockages_pan.csv'))
# route_blockages_total.to_csv((data_path + 'Route Blockages/' + 'route_blockages_total.csv'))