# Import necessary libraries

---

In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import math
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut, GeocoderUnavailable
import warnings
warnings.filterwarnings('ignore')

---
Connect to Google Drive

In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset paths used by
# the data-loading cell all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


---
# Reading Data with Numeric Format Handling

In [None]:
print("\n📂Reading data from 4 CSV files...")

# All four CSV exports sit in one Drive folder and share a file-name prefix;
# defining both once keeps the individual paths short and makes relocating the
# dataset a single-line change.
DATA_DIR = '/content/drive/MyDrive/0027. CUỘC THI/01. LOGAGE 2025/02. VÒNG 2/PROJECT/01. DATASET'
FILE_PREFIX = 'LOGage2025_Round 2_Data.xlsx_'

# File_path
file_path_outbound = f'{DATA_DIR}/{FILE_PREFIX}Outbound.csv'

file_path_branch = f'{DATA_DIR}/{FILE_PREFIX}BranchMaster.csv'

file_path_cost_curve = f'{DATA_DIR}/{FILE_PREFIX}CostCurve.csv'

file_path_current_scenario = f'{DATA_DIR}/{FILE_PREFIX}CurrentScenario.csv'

try:
    # Read data with decimal=',' to handle comma as decimal separator
    df_outbound = pd.read_csv(file_path_outbound, decimal=',')
    df_branch_master = pd.read_csv(file_path_branch, decimal=',')
    df_cost_curve = pd.read_csv(file_path_cost_curve, decimal=',')
    df_current_scenario = pd.read_csv(file_path_current_scenario, decimal=',')
except Exception as e:
    print(f"❌ Error reading data: {e}")
    print("💡 Make sure Google Drive is mounted and all 4 CSV files exist in the dataset folder")
    # Re-raise instead of swallowing: every later cell needs these DataFrames,
    # so continuing would only resurface as a confusing NameError further down.
    raise
else:
    print("✅ Data loaded successfully!")
    print(f"   - Outbound: {len(df_outbound):,} rows")
    print(f"   - BranchMaster: {len(df_branch_master):,} rows")
    print(f"   - CostCurve: {len(df_cost_curve):,} rows")
    print(f"   - CurrentScenario: {len(df_current_scenario):,} rows")


📂Reading data from 4 CSV files...
✅ Data loaded successfully!
   - Outbound: 145,490 rows
   - BranchMaster: 635 rows
   - CostCurve: 8 rows
   - CurrentScenario: 3 rows


---
## Viewing data

---
### df_outbound

In [None]:
# Preview the first rows of the outbound delivery lines.
display(df_outbound.head())

Unnamed: 0,DestinationRef,DestinationType,ShipToID,GroupID,Longitude,Latitude,BranchDeliveryVolumeInKG,BranchDeliveryVolumeInCBM,Section
0,3001BR,Branch,3001BR,1060,151.001259,-33.797556,4.08,0.00752,REFRIGERATION COMPONENTRY
1,3001BR,Branch,3001BR,1120,151.001259,-33.797556,1.7,0.00816,TAPWARE
2,3001BR,Branch,3001BR,1121,151.001259,-33.797556,21.16,0.11362,TAPWARE
3,3001BR,Branch,3001BR,1122,151.001259,-33.797556,291.15,0.51636,TAPWARE
4,3001BR,Branch,3001BR,1124,151.001259,-33.797556,48.45,0.12914,TAPWARE


---
Basic information about `df_outbound`: shape, column dtypes, and non-null counts

In [None]:
# Display basic information
print("BASIC INFORMATION ABOUT THE DATA")
print("-" * 50)
print(f"Number of rows: {df_outbound.shape[0]}")
print(f"Number of columns: {df_outbound.shape[1]}")
print("\nColumn information:")
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() would append a stray "None" line to the cell output.
df_outbound.info()

BASIC INFORMATION ABOUT THE DATA
--------------------------------------------------
Number of rows: 145490
Number of columns: 9

Column information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145490 entries, 0 to 145489
Data columns (total 9 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   DestinationRef             145490 non-null  object 
 1   DestinationType            145490 non-null  object 
 2   ShipToID                   145490 non-null  object 
 3   GroupID                    145490 non-null  int64  
 4   Longitude                  145490 non-null  float64
 5   Latitude                   145490 non-null  float64
 6   BranchDeliveryVolumeInKG   145490 non-null  float64
 7   BranchDeliveryVolumeInCBM  145490 non-null  float64
 8   Section                    145490 non-null  object 
dtypes: float64(4), int64(1), object(4)
memory usage: 10.0+ MB
None


---
Descriptive statistics for `df_outbound`

In [None]:
# Summary statistics for the numeric columns of df_outbound, transposed so that
# each variable is one row.
print("\nDescriptive statistics for numerical variables:")
display(df_outbound.describe().T)


Descriptive statistics for numerical variables:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
GroupID,145490.0,1621.40875,925.111159,45.0,619.0,1866.0,2598.0,2737.0
Longitude,145490.0,144.929833,10.075823,114.6274,144.7319,147.13313,151.197711,153.5789
Latitude,145490.0,-32.889357,5.474522,-42.973031,-37.656412,-33.890076,-30.5164,-12.401724
BranchDeliveryVolumeInKG,145490.0,73.797769,703.774225,0.001,1.02,4.313,22.05,115720.0
BranchDeliveryVolumeInCBM,145490.0,0.513734,5.026896,1e-05,0.003691,0.0156,0.091354,697.777754


In [None]:
# Summary (count, unique, top, freq) for the object-dtype columns of
# df_outbound, transposed so that each variable is one row.
print("\nDescriptive statistics for categorical (object) variables:")
display(df_outbound.describe(include='object').T)


Descriptive statistics for categorical (object) variables:


Unnamed: 0,count,unique,top,freq
DestinationRef,145490,635,7005BR,450
DestinationType,145490,1,Branch,145490
ShipToID,145490,486,3016BR,859
Section,145490,10,TAPWARE,52379


---
### df_branch_master

In [None]:
# Preview the first rows of the branch master (location attributes per DestinationRef).
display(df_branch_master.head())

Unnamed: 0,DestinationRef,DestinationType,ShipToID,Longitude,Latitude,Suburb,City,SA4,State,Country
0,3001BR,Branch,3001BR,151.001259,-33.797556,North Parramatta,Parramatta,Sydney - Parramatta,New South Wales,Australia
1,3002BR,Branch,3002BR,149.2063,-35.347,Queanbeyan,Queanbeyan,Capital Region,New South Wales,Australia
2,3003BR,Branch,3003BR,147.343011,-35.122016,Wagga Wagga,Wagga Wagga,Riverina,New South Wales,Australia
3,3004BR,Branch,3004BR,150.85002,-33.760995,Glendenning,Blacktown,Sydney - Blacktown,New South Wales,Australia
4,3005BR,Branch,3005BR,153.129468,-30.287126,Coffs Harbour,Coffs Harbour,Coffs Harbour - Grafton,New South Wales,Australia


---
Basic information about `df_branch_master`: shape, column dtypes, and non-null counts

In [None]:
# Display basic information
print("BASIC INFORMATION ABOUT THE DATA")
print("-" * 50)
print(f"Number of rows: {df_branch_master.shape[0]}")
print(f"Number of columns: {df_branch_master.shape[1]}")
print("\nColumn information:")
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() would append a stray "None" line to the cell output.
df_branch_master.info()

BASIC INFORMATION ABOUT THE DATA
--------------------------------------------------
Number of rows: 635
Number of columns: 10

Column information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 635 entries, 0 to 634
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   DestinationRef   635 non-null    object 
 1   DestinationType  635 non-null    object 
 2   ShipToID         635 non-null    object 
 3   Longitude        635 non-null    float64
 4   Latitude         635 non-null    float64
 5   Suburb           635 non-null    object 
 6   City             635 non-null    object 
 7   SA4              633 non-null    object 
 8   State            635 non-null    object 
 9   Country          635 non-null    object 
dtypes: float64(2), object(8)
memory usage: 49.7+ KB
None


---
Descriptive statistics for `df_branch_master`

In [None]:
# Summary statistics for the numeric columns of df_branch_master, transposed so
# that each variable is one row.
print("\nDescriptive statistics for numerical variables:")
display(df_branch_master.describe().T)


Descriptive statistics for numerical variables:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Longitude,635.0,144.817137,9.997109,114.6274,144.725994,146.777418,151.17164,153.5789
Latitude,635.0,-32.954832,5.590544,-42.973031,-37.68383,-33.918927,-31.451779,-12.401724


In [None]:
# Summary (count, unique, top, freq) for the object-dtype columns of
# df_branch_master, transposed so that each variable is one row.
print("\nDescriptive statistics for categorical (object) variables:")
display(df_branch_master.describe(include='object').T)


Descriptive statistics for categorical (object) variables:


Unnamed: 0,count,unique,top,freq
DestinationRef,635,635,9014BR,1
DestinationType,635,1,Branch,635
ShipToID,635,486,5710BR,5
Suburb,635,422,Osborne Park,7
City,635,218,Brisbane,29
SA4,633,87,Melbourne - Inner,22
State,635,8,Victoria,188
Country,635,1,Australia,635


---
### df_cost_curve

In [None]:
# Preview the first rows of the cost curve table (intercept/slope per State and Mode).
display(df_cost_curve.head())

Unnamed: 0,State,Mode,Intercept (AUD),Slope (AUD/KM),Min Distance (KM),Max Distance (KM)
0,New South Wales,FTL,225.2,0.9,0,55
1,New South Wales,LTL,360.4,1.1,55,999999
2,Queensland,FTL,192.6,0.2,0,100
3,Queensland,LTL,226.7,0.3,100,999999
4,Victoria,FTL,198.4,1.0,0,70


---
Basic information about `df_cost_curve`: shape, column dtypes, and non-null counts

In [None]:
# Display basic information
print("BASIC INFORMATION ABOUT THE DATA")
print("-" * 50)
print(f"Number of rows: {df_cost_curve.shape[0]}")
print(f"Number of columns: {df_cost_curve.shape[1]}")
print("\nColumn information:")
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() would append a stray "None" line to the cell output.
df_cost_curve.info()

BASIC INFORMATION ABOUT THE DATA
--------------------------------------------------
Number of rows: 8
Number of columns: 6

Column information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   State              8 non-null      object 
 1   Mode               8 non-null      object 
 2   Intercept (AUD)    8 non-null      float64
 3   Slope (AUD/KM)     8 non-null      float64
 4   Min Distance (KM)  8 non-null      int64  
 5   Max Distance (KM)  8 non-null      int64  
dtypes: float64(2), int64(2), object(2)
memory usage: 516.0+ bytes
None


---
### df_current_scenario

In [None]:
# Preview the current-scenario sheet.
# NOTE(review): in the displayed output only the first row is a facility record
# (NSW-DC01); the remaining rows are a blank line and a "PalletSize" note, so
# filter them out before using this frame numerically.
display(df_current_scenario.head())

Unnamed: 0,Facility,CapacityInPallets,Longitude,Latitude,AverageInventoryInCBM,StorageCostPerCBM (AUD),HandlingCostPerCBM (AUD)
0,NSW-DC01,16000,151.1251,-29.791652,14898.0,30.0,10.0
1,,,,,,,
2,PalletSize,120 cm × 80 cm × 150 cm high,,,,,


---
# Aggregate Demand Data by ShipTo

In [None]:
print("\n📈 Aggregating demand data by ShipToID...")

# One row per ShipToID: summed CBM and KG volumes, plus the number of outbound
# rows recorded for that ShipToID (exposed as Total_Shipments). Named
# aggregation sets the output column names directly, in the same order as before.
df_demand_summary = (
    df_outbound
    .groupby('ShipToID')
    .agg(
        Annual_CBM_Demand_ShipTo=('BranchDeliveryVolumeInCBM', 'sum'),
        Annual_KG_Demand_ShipTo=('BranchDeliveryVolumeInKG', 'sum'),
        Total_Shipments=('DestinationRef', 'count'),
    )
    .reset_index()
)

# Report how many ShipToIDs were produced and the network-wide totals.
print(f"✅ Aggregation completed for {len(df_demand_summary):,} unique ShipToIDs")
print(f"   - Total CBM demand: {df_demand_summary['Annual_CBM_Demand_ShipTo'].sum():,.2f}")
print(f"   - Total KG demand: {df_demand_summary['Annual_KG_Demand_ShipTo'].sum():,.2f}")


📈 Aggregating demand data by ShipToID...
✅ Aggregation completed for 486 unique ShipToIDs
   - Total CBM demand: 74,743.17
   - Total KG demand: 10,736,837.41


In [None]:
# Preview the first rows of the per-ShipToID demand summary.
display(df_demand_summary.head())

Unnamed: 0,ShipToID,Annual_CBM_Demand_ShipTo,Annual_KG_Demand_ShipTo,Total_Shipments
0,3001BR,91.182518,11801.936,313
1,3002BR,100.414742,12320.369,314
2,3003BR,882.881842,99185.9602,694
3,3004BR,30.670602,5160.574,224
4,3005BR,149.023835,18935.35,418


In [None]:
# Spot check: unformatted total annual KG demand across all ShipToIDs (the same
# figure the aggregation cell prints with two decimals).
df_demand_summary['Annual_KG_Demand_ShipTo'].sum()

10736837.409777973

---
# Create main delivery location dataframe

In [None]:
print("\n🗺️ Creating main delivery location dataframe...")

# BranchMaster can list a ShipToID more than once; keep the first location
# record encountered for each ShipToID.
df_shipto_location = (
    df_branch_master
    .loc[:, ['ShipToID', 'Longitude', 'Latitude', 'State', 'City', 'Suburb']]
    .drop_duplicates(subset=['ShipToID'], keep='first')
)

# Inner join: only ShipToIDs present in both the location table and the demand
# summary end up in the master table.
df_shipto_master = pd.merge(
    df_shipto_location,
    df_demand_summary,
    on='ShipToID',
    how='inner',
)

print(f"✅ df_shipto_master created successfully:")
print(f"   - {len(df_shipto_master):,} unique delivery locations")
print(f"   - Columns: {list(df_shipto_master.columns)}")


🗺️ Creating main delivery location dataframe...
✅ df_shipto_master created successfully:
   - 486 unique delivery locations
   - Columns: ['ShipToID', 'Longitude', 'Latitude', 'State', 'City', 'Suburb', 'Annual_CBM_Demand_ShipTo', 'Annual_KG_Demand_ShipTo', 'Total_Shipments']


---
## Viewing data

---
### df_shipto_master

In [None]:
# Preview the first rows of the merged location + demand table.
display(df_shipto_master.head())

Unnamed: 0,ShipToID,Longitude,Latitude,State,City,Suburb,Annual_CBM_Demand_ShipTo,Annual_KG_Demand_ShipTo,Total_Shipments
0,3001BR,151.001259,-33.797556,New South Wales,Parramatta,North Parramatta,91.182518,11801.936,313
1,3002BR,149.2063,-35.347,New South Wales,Queanbeyan,Queanbeyan,100.414742,12320.369,314
2,3003BR,147.343011,-35.122016,New South Wales,Wagga Wagga,Wagga Wagga,882.881842,99185.9602,694
3,3004BR,150.85002,-33.760995,New South Wales,Blacktown,Glendenning,30.670602,5160.574,224
4,3005BR,153.129468,-30.287126,New South Wales,Coffs Harbour,Coffs Harbour,149.023835,18935.35,418


In [None]:
# Persist the merged table as df_shipto_master.csv (relative path, so it lands
# in the kernel's working directory); the index is not written.
df_shipto_master.to_csv('df_shipto_master.csv', index=False)

---
Basic information about `df_shipto_master`: shape, column dtypes, and non-null counts

In [None]:
# Display basic information
print("BASIC INFORMATION ABOUT THE DATA")
print("-" * 50)
print(f"Number of rows: {df_shipto_master.shape[0]}")
print(f"Number of columns: {df_shipto_master.shape[1]}")
print("\nColumn information:")
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() would append a stray "None" line to the cell output.
df_shipto_master.info()

BASIC INFORMATION ABOUT THE DATA
--------------------------------------------------
Number of rows: 486
Number of columns: 9

Column information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 486 entries, 0 to 485
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ShipToID                  486 non-null    object 
 1   Longitude                 486 non-null    float64
 2   Latitude                  486 non-null    float64
 3   State                     486 non-null    object 
 4   City                      486 non-null    object 
 5   Suburb                    486 non-null    object 
 6   Annual_CBM_Demand_ShipTo  486 non-null    float64
 7   Annual_KG_Demand_ShipTo   486 non-null    float64
 8   Total_Shipments           486 non-null    int64  
dtypes: float64(4), int64(1), object(4)
memory usage: 34.3+ KB
None


---
Descriptive statistics for `df_shipto_master`

In [None]:
# Summary statistics for the numeric columns of df_shipto_master, transposed so
# that each variable is one row.
print("\nDescriptive statistics for numerical variables:")
display(df_shipto_master.describe().T)


Descriptive statistics for numerical variables:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Longitude,486.0,144.880209,10.0957,114.6274,144.678407,147.100411,151.190672,153.5789
Latitude,486.0,-32.794578,5.599513,-42.973031,-37.573892,-33.856511,-30.575645,-12.401724
Annual_CBM_Demand_ShipTo,486.0,153.792539,213.285387,0.05616,61.048371,94.128174,167.482787,2656.56157
Annual_KG_Demand_ShipTo,486.0,22092.258045,33371.585574,8.6,9029.497,13982.3725,25721.42725,581703.582738
Total_Shipments,486.0,299.36214,114.60293,1.0,248.0,299.5,352.0,859.0


---
## CHECK DATA QUALITY

In [None]:
print("\n🔍Checking data quality...")

# --- Missing values: one status line per column ---------------------------
print("📋 Checking for missing data:")
missing_data = df_shipto_master.isnull().sum()
for col, missing_count in missing_data.items():
    if missing_count == 0:
        print(f"   ✅ {col}: No missing data")
        continue
    print(f"   ⚠️ {col}: {missing_count} missing values ({missing_count / len(df_shipto_master) * 100:.2f}%)")

# --- Coordinates: flag rows outside a bounding box around Australia -------
print("\n📍 Checking coordinates:")
aus_lat_range = (-45, -10)
aus_lon_range = (110, 155)

# A row is invalid when its latitude or longitude falls strictly outside the box.
latitude_outside = (
    df_shipto_master['Latitude'].lt(aus_lat_range[0])
    | df_shipto_master['Latitude'].gt(aus_lat_range[1])
)
longitude_outside = (
    df_shipto_master['Longitude'].lt(aus_lon_range[0])
    | df_shipto_master['Longitude'].gt(aus_lon_range[1])
)
invalid_coords = df_shipto_master[latitude_outside | longitude_outside]

if len(invalid_coords) == 0:
    print("   ✅ All coordinates are valid (within Australia)")
else:
    print(f"   ⚠️ {len(invalid_coords)} locations have coordinates outside Australia")

# --- Headline figures -----------------------------------------------------
print(f"\n📊 SUMMARY STATISTICS:")
print(f"   • Total number of ShipToIDs: {len(df_shipto_master):,}")
print(f"   • Number of unique States: {df_shipto_master['State'].nunique()}")
print(f"   • Total annual KG demand: {df_shipto_master['Annual_KG_Demand_ShipTo'].sum():,.0f}")
print(f"   • Average annual KG demand per ShipTo: {df_shipto_master['Annual_KG_Demand_ShipTo'].mean():,.2f}")


🔍Checking data quality...
📋 Checking for missing data:
   ✅ ShipToID: No missing data
   ✅ Longitude: No missing data
   ✅ Latitude: No missing data
   ✅ State: No missing data
   ✅ City: No missing data
   ✅ Suburb: No missing data
   ✅ Annual_CBM_Demand_ShipTo: No missing data
   ✅ Annual_KG_Demand_ShipTo: No missing data
   ✅ Total_Shipments: No missing data

📍 Checking coordinates:
   ✅ All coordinates are valid (within Australia)

📊 SUMMARY STATISTICS:
   • Total number of ShipToIDs: 486
   • Number of unique States: 8
   • Total annual KG demand: 10,736,837
   • Average annual KG demand per ShipTo: 22,092.26


---
# Visualize exploratory data analysis with Plotly

---
## Overview Summary Table

In [None]:
# Each entry pairs a metric label with its pre-formatted value; the two table
# columns are derived from these pairs below.
summary_rows = [
    ('Total ShipToID', f"{len(df_shipto_master):,}"),
    ('Number of States', f"{df_shipto_master['State'].nunique()}"),
    ('Total Annual KG Demand', f"{df_shipto_master['Annual_KG_Demand_ShipTo'].sum():,.0f}"),
    ('Average Annual KG Demand per ShipTo', f"{df_shipto_master['Annual_KG_Demand_ShipTo'].mean():,.0f}"),
    ('Median Annual KG Demand', f"{df_shipto_master['Annual_KG_Demand_ShipTo'].median():,.0f}"),
    ('Latitude Range',
     f"({df_shipto_master['Latitude'].min():.2f}, {df_shipto_master['Latitude'].max():.2f})"),
    ('Longitude Range',
     f"({df_shipto_master['Longitude'].min():.2f}, {df_shipto_master['Longitude'].max():.2f})"),
]
summary_stats = {
    'Metric': [label for label, _ in summary_rows],
    'Value': [value for _, value in summary_rows],
}

# Two-column plotly table: metric name on the left, formatted value on the right.
summary_table = go.Table(
    header=dict(
        values=['<b>Metric</b>', '<b>Value</b>'],
        fill_color='lightblue',
        align='left',
        font=dict(size=14, color='white'),
        height=40,
    ),
    cells=dict(
        values=[summary_stats['Metric'], summary_stats['Value']],
        fill_color='lavender',
        align='left',
        font=dict(size=12),
        height=35,
    ),
)

fig_summary = go.Figure(data=[summary_table])
fig_summary.update_layout(
    title="<b>LOGage 2025 Data Summary - Phase 0</b>",
    title_x=0.5,
    height=400,
    margin=dict(l=50, r=50, t=80, b=50),
)
fig_summary.show()

---
## Analysis of KG Demand Distribution

In [None]:
# 2x2 overview of the annual KG demand per ShipToID:
#   (1,1) histogram on the original scale   (1,2) histogram of log1p(demand)
#   (2,1) boxplots for the top 5 states     (2,2) top 15 ShipToIDs by demand
fig_demand = make_subplots(
    rows=2, cols=2,
    subplot_titles=[
        'Histogram - Annual KG Demand (Original Scale)',
        'Histogram - Log(Annual KG Demand)',
        'Boxplot - KG Demand by State',
        'Top 15 ShipToID with Highest Demand'
    ],
    specs=[[{"type": "xy"}, {"type": "xy"}],
           [{"type": "xy"}, {"type": "xy"}]]
)

# Histogram of KG demand (original scale)
fig_demand.add_trace(
    go.Histogram(
        x=df_shipto_master['Annual_KG_Demand_ShipTo'],
        nbinsx=50,
        name='KG Demand',
        marker_color='skyblue',
        opacity=0.7
    ),
    row=1, col=1
)

# Histogram of log-transformed demand (log1p keeps zero demand finite)
log_demand = np.log1p(df_shipto_master['Annual_KG_Demand_ShipTo'])
fig_demand.add_trace(
    go.Histogram(
        x=log_demand,
        nbinsx=50,
        name='Log KG Demand',
        marker_color='orange',
        opacity=0.7
    ),
    row=1, col=2
)

# Boxplot by State, limited to the 5 states with the highest total KG demand.
# Slicing unique() would only take the first 5 states in order of appearance,
# which is not a ranking.
states = (
    df_shipto_master
    .groupby('State')['Annual_KG_Demand_ShipTo']
    .sum()
    .nlargest(5)
    .index
    .tolist()
)
colors = px.colors.qualitative.Set1
for i, state in enumerate(states):
    state_data = df_shipto_master.loc[df_shipto_master['State'] == state, 'Annual_KG_Demand_ShipTo']
    fig_demand.add_trace(
        go.Box(
            y=state_data,
            name=state[:15],  # Truncate state name if too long
            marker_color=colors[i % len(colors)],
            boxpoints='outliers'
        ),
        row=2, col=1
    )

# Top 15 ShipToID by demand
top_15 = df_shipto_master.nlargest(15, 'Annual_KG_Demand_ShipTo')
fig_demand.add_trace(
    go.Bar(
        x=top_15['ShipToID'],
        y=top_15['Annual_KG_Demand_ShipTo'],
        name='Top 15',
        marker_color='red',
        opacity=0.8
    ),
    row=2, col=2
)

fig_demand.update_layout(
    height=800,
    title_text="<b>Analysis of KG Demand Distribution - LOGage 2025</b>",
    title_x=0.5,
    showlegend=False
)

# Rotate x-axis labels for top 15 chart
fig_demand.update_xaxes(tickangle=45, row=2, col=2)
fig_demand.show()

---
## Geographical Distribution Map of ShipToID

In [None]:
# Longitude/latitude scatter of every ShipToID; bubble size and colour both
# encode the annual KG demand.
fig_map = go.Figure()

# Calculate marker sizes based on KG demand: linear scaling of demand / max
# demand onto the pixel range starting at min_size and ending at max_size.
# NOTE: this adds a 'marker_size' column to df_shipto_master itself.
max_demand = df_shipto_master['Annual_KG_Demand_ShipTo'].max()
min_size, max_size = 5, 25
df_shipto_master['marker_size'] = (
    df_shipto_master['Annual_KG_Demand_ShipTo'] / max_demand * (max_size - min_size) + min_size
)

# Hover text built from the needed columns directly, which avoids creating a
# Series per row as iterrows() does.
hover_text = [
    f"<b>{ship_to_id}</b><br>"
    f"State: {state_name}<br>"
    f"KG/year: {kg_demand:,.0f}<br>"
    f"Lat: {latitude:.3f}<br>"
    f"Lon: {longitude:.3f}"
    for ship_to_id, state_name, kg_demand, latitude, longitude in zip(
        df_shipto_master['ShipToID'],
        df_shipto_master['State'],
        df_shipto_master['Annual_KG_Demand_ShipTo'],
        df_shipto_master['Latitude'],
        df_shipto_master['Longitude'],
    )
]

fig_map.add_trace(go.Scatter(
    x=df_shipto_master['Longitude'],
    y=df_shipto_master['Latitude'],
    mode='markers',
    marker=dict(
        size=df_shipto_master['marker_size'],
        color=df_shipto_master['Annual_KG_Demand_ShipTo'],
        colorscale='Viridis',
        # Nested title dict instead of the deprecated `titleside` attribute,
        # which newer plotly releases reject; the nested form works on both.
        colorbar=dict(title=dict(text="Annual KG Demand", side="right")),
        opacity=0.7,
        line=dict(width=1, color='white')
    ),
    text=hover_text,
    hovertemplate='%{text}<extra></extra>',
    name='ShipTo Locations'
))

fig_map.update_layout(
    title="<b>Geographical Distribution Map of ShipToID<br><sub>Bubble size = KG Demand, Color = Demand intensity</sub></b>",
    title_x=0.5,
    xaxis_title="Longitude (°E)",
    yaxis_title="Latitude (°S)",
    height=600,
    hovermode='closest'
)

# Add annotation about Australian states (paper coordinates: top-left corner)
fig_map.add_annotation(
    x=0.02, y=0.98,
    xref="paper", yref="paper",
    text="<b>Australia Distribution Network</b><br>" +
         f"Total ShipToID: {len(df_shipto_master):,}<br>" +
         f"States: {df_shipto_master['State'].nunique()}",
    showarrow=False,
    font=dict(size=10),
    bgcolor="rgba(255,255,255,0.8)",
    bordercolor="black",
    borderwidth=1
)

fig_map.show()

---

In [None]:
# Create main map from df_shipto_master: one bubble per ShipTo location, sized
# by annual CBM demand and coloured by state.
fig1 = px.scatter_mapbox(
    df_shipto_master,
    lat='Latitude',
    lon='Longitude',
    size='Annual_CBM_Demand_ShipTo',
    color='State',
    hover_data=['ShipToID', 'City', 'Annual_CBM_Demand_ShipTo', 'Total_Shipments'],
    mapbox_style='open-street-map',
    title='Geographic Distribution of Delivery Points (ShipTo Locations)',
    height=600,
    size_max=15
)

# Coordinates of the distribution centre (NSW-DC01), defined once and reused
# for every overlay trace and for the map centre.
DC_LAT = -29.7916515243493
DC_LON = 151.125099595745
# Offset (degrees) applied to the description label so it does not sit on top
# of the "DC" marker.
DC_LABEL_OFFSET_DEG = 0.1

# Concentric DC markers, drawn back to front as (size, colour). The white halo
# must be LARGER than the red disc drawn over it, otherwise it is fully covered
# and no outer border is visible.
DC_MARKER_LAYERS = [
    (42, 'white'),   # outer border
    (35, 'red'),     # main marker
    (20, 'yellow'),  # inner contrast disc
]
for layer_size, layer_color in DC_MARKER_LAYERS:
    fig1.add_trace(
        go.Scattermapbox(
            lat=[DC_LAT],
            lon=[DC_LON],
            mode='markers',
            marker=dict(
                size=layer_size,
                color=layer_color,
                symbol='circle',
                opacity=1.0
            ),
            showlegend=False,
            hoverinfo='skip'
        )
    )

# Add central text label (e.g., "DC")
fig1.add_trace(
    go.Scattermapbox(
        lat=[DC_LAT],
        lon=[DC_LON],
        mode='text',
        text=['DC'],
        textfont=dict(
            size=16,
            color='black',
            family='Arial Black'
        ),
        showlegend=False,
        hoverinfo='skip'
    )
)

# Add description text slightly offset to avoid overlap; this is the only DC
# trace shown in the legend.
fig1.add_trace(
    go.Scattermapbox(
        lat=[DC_LAT + DC_LABEL_OFFSET_DEG],
        lon=[DC_LON + DC_LABEL_OFFSET_DEG],
        mode='text',
        text=['NSW-DC01<br>Capacity: 16,000 pallets'],
        textfont=dict(
            size=14,
            color='darkred',
            family='Arial Black'
        ),
        name='Distribution Center',
        showlegend=True,
        hovertemplate='<b>NSW-DC01</b><br>Capacity: 16,000 pallets<extra></extra>'
    )
)

# Adjust map layout with closer zoom, centred on the distribution centre
fig1.update_layout(
    mapbox=dict(
        center=dict(lat=DC_LAT, lon=DC_LON),
        zoom=8  # Increase zoom to focus on the area
    ),
    margin=dict(l=0, r=0, t=50, b=0),
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=0.01
    )
)

fig1.show()

---
## Analysis by State

In [None]:
# Per-state demand profile: number of ShipToIDs plus sum / mean / median / std
# of the annual KG demand, rounded to whole numbers and ordered by total demand
# (largest first).
state_analysis = (
    df_shipto_master
    .groupby('State')
    .agg(
        Count_ShipTo=('ShipToID', 'count'),
        Total_KG=('Annual_KG_Demand_ShipTo', 'sum'),
        Mean_KG=('Annual_KG_Demand_ShipTo', 'mean'),
        Median_KG=('Annual_KG_Demand_ShipTo', 'median'),
        Std_KG=('Annual_KG_Demand_ShipTo', 'std'),
    )
    .round(0)
    .reset_index()
    .sort_values('Total_KG', ascending=False)
)

# 2x2 grid: three bar charts and one pie chart (bottom-right).
fig_state = make_subplots(
    rows=2, cols=2,
    subplot_titles=[
        'Number of ShipToID by State',
        'Total KG Demand by State',
        'Average KG Demand by State',
        'Demand Distribution by State'
    ],
    specs=[[{"type": "bar"}, {"type": "bar"}],
           [{"type": "bar"}, {"type": "pie"}]]
)

# One entry per bar panel: (row, col, value column, bar colour, trace name, bar labels).
# The count panel labels bars with the raw values; the KG panels use
# thousands-separated strings.
bar_panels = [
    (1, 1, 'Count_ShipTo', 'lightblue', 'Count',
     state_analysis['Count_ShipTo']),
    (1, 2, 'Total_KG', 'lightgreen', 'Total KG',
     [f'{x:,.0f}' for x in state_analysis['Total_KG']]),
    (2, 1, 'Mean_KG', 'orange', 'Mean KG',
     [f'{x:,.0f}' for x in state_analysis['Mean_KG']]),
]
for panel_row, panel_col, value_col, bar_color, trace_name, bar_text in bar_panels:
    fig_state.add_trace(
        go.Bar(
            x=state_analysis['State'],
            y=state_analysis[value_col],
            marker_color=bar_color,
            name=trace_name,
            text=bar_text,
            textposition='auto',
        ),
        row=panel_row, col=panel_col
    )

# Pie chart: each state's share of the total KG demand.
fig_state.add_trace(
    go.Pie(
        labels=state_analysis['State'],
        values=state_analysis['Total_KG'],
        textinfo='label+percent',
        textposition='auto',
        name="Distribution",
    ),
    row=2, col=2
)

# Layout settings
fig_state.update_layout(
    height=800,
    title_text="<b>Demand Analysis by State - Australia</b>",
    title_x=0.5,
    showlegend=False
)

# Tilt the state names on the three bar panels; the pie cell has no x-axis.
for panel_row, panel_col in [(1, 1), (1, 2), (2, 1)]:
    fig_state.update_xaxes(tickangle=45, row=panel_row, col=panel_col)

fig_state.show()

---
## Detailed Descriptive Statistical Analysis

In [None]:
# Descriptive statistics table for the coordinates and the annual KG demand.
numeric_cols = ['Longitude', 'Latitude', 'Annual_KG_Demand_ShipTo']
# Keep full precision here: rounding is done once, by the format specs below.
# Rounding to 2 decimals first and then formatting with 3 would always show a
# meaningless trailing 0 and misreport the third decimal of the coordinates.
desc_stats = df_shipto_master[numeric_cols].describe()

# Convert describe table into an easy-to-read format: one list per statistic,
# starting with the statistic name followed by one formatted value per column.
desc_data = []
for stat in desc_stats.index:
    row = [stat]
    for col in desc_stats.columns:
        if col == 'Annual_KG_Demand_ShipTo':
            # Demand: whole kilograms with thousands separators
            row.append(f"{desc_stats.loc[stat, col]:,.0f}")
        else:
            # Coordinates: three decimal places
            row.append(f"{desc_stats.loc[stat, col]:.3f}")
    desc_data.append(row)

fig_desc = go.Figure(data=[go.Table(
    header=dict(
        values=['<b>Statistic</b>'] + [f'<b>{col}</b>' for col in desc_stats.columns],
        fill_color='paleturquoise',
        align='left',
        font=dict(size=12),
        height=35
    ),
    cells=dict(
        # go.Table expects column-wise values, so transpose the row lists.
        values=list(zip(*desc_data)),
        fill_color='lightcyan',
        align='left',
        font=dict(size=11),
        height=30
    )
)])

fig_desc.update_layout(
    title="<b>Detailed Descriptive Statistics - Numeric Data</b>",
    title_x=0.5,
    height=350
)
fig_desc.show()