In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

In [2]:
BASE_DIR = 'data_raw'

# Port Distances Dataset

In [3]:
df = pd.read_csv(f'{BASE_DIR}/Port Distances.csv')
df.head(2)

Unnamed: 0,PORT_NAME_FROM,PORT_NAME_TO,DISTANCE
0,AAHEIM,HALDIA,8652.67
1,AAHEIM,PARANAGUA,6304.48


In [4]:
print(f"shape: {df.shape}")
print(f"dtypes:\n{df.dtypes}")

shape: (15533, 3)
dtypes:
PORT_NAME_FROM        str
PORT_NAME_TO          str
DISTANCE          float64
dtype: object


In [5]:
# check for null values
print(f"null values:\n{df.isna().sum()}")

# summary statistics for `DISTANCE`
print(df["DISTANCE"].describe())
print(f"non-positive distances: {(df["DISTANCE"] <= 0).sum()}")

null values:
PORT_NAME_FROM    0
PORT_NAME_TO      0
DISTANCE          0
dtype: int64
count    15533.000000
mean      5067.643422
std       3287.136255
min          6.860000
25%       2370.900000
50%       4545.420000
75%       7293.000000
max      13558.480000
Name: DISTANCE, dtype: float64
non-positive distances: 0


In [6]:
from_ports = set(df["PORT_NAME_FROM"].unique())
to_ports = set(df["PORT_NAME_TO"].unique())
all_ports = from_ports | to_ports

print(f"unique FROM_PORTs: {len(from_ports)}")
print(f"unique TO_PORTs: {len(to_ports)}")
print(f"total unique ports: {len(all_ports)}")

unique FROM_PORTs: 1279
unique TO_PORTs: 1432
total unique ports: 1950


In [7]:
# if port appears only as TO or only as FROM, it indicates incomplete data
only_to_ports = to_ports - from_ports
only_from_ports = from_ports - to_ports


print(f"ports only appearing as TO (no outgoing): {len(sorted(list(only_to_ports)))}")
print(f"ports only appearing as FROM (no incoming): {len(sorted(list(only_from_ports)))}")

ports only appearing as TO (no outgoing): 671
ports only appearing as FROM (no incoming): 518


In [8]:
import plotly.express as px

coverage = pd.DataFrame({
    "category": ["ports_total", "only_as_to (no outgoing)", "only_as_from (no incoming)"],
    "count": [len(all_ports), len(only_to_ports), len(only_from_ports)]
})
px.bar(coverage, x="category", y="count", title="Port coverage gaps")


In [9]:
# Check for directedness of distances
pairs = set(zip(df["PORT_NAME_FROM"], df["PORT_NAME_TO"]))
rev_pairs = set((b,a) for (a,b) in pairs)

reverse_coverage = len(pairs & rev_pairs) / len(pairs)
print("Reverse-pair availability:", reverse_coverage)

# quantify differences in distances for reverse pairs
dist_map = {(a,b): d for a,b,d in df[["PORT_NAME_FROM","PORT_NAME_TO","DISTANCE"]].itertuples(index=False)}
sym = []
for (a,b), d in dist_map.items():
    if (b,a) in dist_map:
        sym.append((a,b,d,dist_map[(b,a)], abs(d-dist_map[(b,a)])))

sym_df = pd.DataFrame(sym, columns=["from","to","d_ab","d_ba","abs_diff"])
sym_df["pct_diff_vs_mean"] = sym_df["abs_diff"] / ((sym_df["d_ab"]+sym_df["d_ba"])/2)

print(f"Pairs with reverse present: {len(sym_df)}")
print(f"Share mismatched (abs_diff>0): {(sym_df["abs_diff"]>0).mean()}")
sym_df[["abs_diff","pct_diff_vs_mean"]].describe()

Reverse-pair availability: 0.019313719178523144
Pairs with reverse present: 300
Share mismatched (abs_diff>0): 0.9666666666666667


Unnamed: 0,abs_diff,pct_diff_vs_mean
count,300.0,300.0
mean,188.4748,0.038549
std,195.444543,0.025624
min,0.0,0.0
25%,32.19,0.019787
50%,116.93,0.034283
75%,311.88,0.058647
max,1017.1,0.146341


In [10]:
directed_summary = pd.DataFrame({
    "category": ["Pairs with reverse present", "Pairs without reverse"],
    "count": [len(pairs & rev_pairs), len(pairs) - len(pairs & rev_pairs)]
})
px.bar(directed_summary, x="category", y="count", title="Directedness check: reverse-pair availability")

px.histogram(sym_df, x="pct_diff_vs_mean", nbins=40,
             title="If reverse exists: abs % distance difference vs mean (directed routing effect)")


In [11]:
px.histogram(df, x="DISTANCE", nbins=60, title="DISTANCE distribution (NM)")

dist_log = df.copy()
dist_log["LOG10_DISTANCE"] = np.log10(dist_log["DISTANCE"])
px.histogram(dist_log, x="LOG10_DISTANCE", nbins=60, title="DISTANCE distribution (log10 NM)")

In [13]:
df.sort_values("DISTANCE", ascending=False).head(15)

Unnamed: 0,PORT_NAME_FROM,PORT_NAME_TO,DISTANCE
4360,HAY POINT,LULEA,13558.48
3967,GLADSTONE,GDANSK,13347.6
7520,MURMANSK,TIANJIN,13140.4
13231,ST PETERSBURG (RUSSIA),BAYUQUAN,13111.35
2727,DALRYMPLE BAY,GDYNIA,13107.15
7477,MURMANSK,HUANGHUA,12998.6
6529,LULEA,TIANJIN,12993.45
14501,UST-LUGA,GEELONG,12992.42
7465,MURMANSK,DALIAN,12968.8
4366,HAY POINT,OXELOSUND,12955.34


In [14]:
out_deg = df.groupby("PORT_NAME_FROM")["PORT_NAME_TO"].nunique().rename("out_degree").reset_index()
in_deg  = df.groupby("PORT_NAME_TO")["PORT_NAME_FROM"].nunique().rename("in_degree").reset_index()

ports_all = pd.Index(sorted(set(df["PORT_NAME_FROM"]) | set(df["PORT_NAME_TO"])))
deg = pd.DataFrame({"PORT": ports_all})
deg = deg.merge(out_deg, left_on="PORT", right_on="PORT_NAME_FROM", how="left").drop(columns=["PORT_NAME_FROM"])
deg = deg.merge(in_deg,  left_on="PORT", right_on="PORT_NAME_TO", how="left").drop(columns=["PORT_NAME_TO"])
deg = deg.fillna(0)
deg["total_degree"] = deg["out_degree"] + deg["in_degree"]

top_hubs = deg.sort_values("total_degree", ascending=False).head(20)
top_hubs


Unnamed: 0,PORT,out_degree,in_degree,total_degree
1120,NEW ORLEANS,204.0,121.0,325.0
1508,SAN LORENZO-SAN MARTIN,259.0,29.0,288.0
536,GHENT,126.0,149.0,275.0
1468,ROTTERDAM,84.0,171.0,255.0
1528,SANTOS,186.0,57.0,243.0
646,HOUSTON,130.0,84.0,214.0
1213,PARANAGUA,141.0,70.0,211.0
50,AMSTERDAM,93.0,103.0,196.0
1461,ROSARIO,186.0,6.0,192.0
1832,VANCOUVER (CANADA),180.0,10.0,190.0


In [15]:
px.histogram(deg, x="out_degree", nbins=80, title="Out-degree (# destinations per FROM port)")
px.histogram(deg, x="in_degree", nbins=80, title="In-degree (# origins per TO port)")

px.bar(top_hubs.sort_values("total_degree"),
       x="total_degree", y="PORT", orientation="h",
       title="Top 20 hub ports by total_degree (in + out)")

px.scatter(deg, x="out_degree", y="in_degree", size="total_degree", hover_name="PORT",
           title="Port connectivity: out_degree vs in_degree (bubble size = total_degree)")
