### Exploratory Data Analysis (EDA): Non-Numeric Features
Before modeling, I explored the categorical features to understand fleet composition, operator dominance, and structural differences that numeric metrics alone can’t capture.
To do this, I summarized the categorical variables using frequency tables, grouped averages, and cross-tabulations across ship types.

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [3]:
# Load the cruise-ship dataset; the path is relative to the notebook's
# working directory.
DATA_CSV = "../data/cruise_ships_enriched_ml_ready.csv"
df = pd.read_csv(DATA_CSV)

In [4]:
# Ship-owner distribution: number of rows per owner, showing the 20 most
# frequent owners (value_counts orders by descending count).
# NOTE(review): counts are per exact string. The outputs list several
# spellings that look like the same group (e.g. "Viking Cruises" /
# "Viking Cruises Ltd" / "Viking Holdings Ltd (Via Viking Cruises)") --
# confirm whether owner names should be normalized first.
ships_per_owner = df["owner"].value_counts()
ships_per_owner.head(20)

owner
Carnival Corporation & Plc                                          99
Viking Cruises                                                      61
Scylla Ag                                                           47
Rcg-Royal Caribbean Group (Via Rccl-Royal Caribbean Cruises Ltd)    46
Croisieurope                                                        43
American Cruise Lines Inc                                           33
Grimaldi Group                                                      28
Viking Holdings Ltd (Via Viking Cruises)                            27
Stena Ab (Via Stena Roro Ab)                                        26
Mediterranean Shipping Company (Msc Group)                          25
Amawaterways                                                        23
(Nclh) Norwegian Cruise Line Holdings Ltd                           21
Volga Shipping Company (Russia)                                     19
Luftner Cruises Gmbh                                                18


In [5]:
# Total gross tonnage per owner, showing the 10 owners with the largest sums.
tonnage_by_owner = df.groupby("owner")["gross_tonnage"].sum()
tonnage_by_owner.sort_values(ascending=False).head(10)

owner
Carnival Corporation & Plc                                          11030203.0
Rcg-Royal Caribbean Group (Via Rccl-Royal Caribbean Cruises Ltd)     6321107.0
Mediterranean Shipping Company (Msc Group)                           3587982.0
(Nclh) Norwegian Cruise Line Holdings Ltd                            2117750.0
Nclh/Norwegian Cruise Line Holdings Ltd (Via Ncl Bahamas Ltd)        1218997.0
Grimaldi Group                                                       1100953.0
Tui Cruises Gmbh (Tui Group + Royal Caribbean Group)                 1055030.0
Stena Ab (Via Stena Roro Ab)                                          965613.0
Viking Cruises Ltd                                                    756374.0
Tallink Grupp                                                         541196.0
Name: gross_tonnage, dtype: float64

In [6]:
# Ship type distribution: one row per ship type with its number of rows,
# as a single-column frame named "count".
ship_type_counts = df["ship_type"].value_counts().to_frame(name="count")
ship_type_counts

Unnamed: 0_level_0,count
ship_type,Unnamed: 1_level_1
Cruise,1190
Ferry,262
Icebreaker,63
River Cruise,51


In [7]:
# Average passenger capacity by ship type: mean of `passengers_clean` per
# type, rounded to whole numbers and ordered from largest to smallest.
mean_passengers = df.groupby("ship_type")["passengers_clean"].mean().round(0)
avg_passengers_by_type = mean_passengers.sort_values(ascending=False).to_frame(
    name="avg_passengers"
)
avg_passengers_by_type

Unnamed: 0_level_0,avg_passengers
ship_type,Unnamed: 1_level_1
Ferry,1478.0
Cruise,903.0
Icebreaker,76.0
River Cruise,50.0


In [8]:
# Top shipbuilders by fleet size: the 10 builders that appear on the most
# rows of the dataset, as a single-column frame named "ship_count".
builder_counts = df["builder"].value_counts()
top_builders = builder_counts.head(10).to_frame(name="ship_count")
top_builders

Unnamed: 0_level_0,ship_count
builder,Unnamed: 1_level_1
"Neptun Werft Gmbh (Rostock, Germany)",76
"Chantiers De L’Atlantique (Saint-Nazaire, Stx France)",58
"Meyer Werft (Papenburg, Germany)",53
"Fincantieri (Monfalcone, Italy)",50
"Maasara Shipyard (Cairo, Egypt)",38
"Fincantieri (Marghera, Italy)",32
"Scheepswerf Den Breejen Shipyard (Hardinxveld-Giessendam, Holland)",30
"Fincantieri (Ancona, Italy)",28
"Westsea Shipyard (Viana Do Castelo, Portugal)",25
"Chesapeake Shipbuilding (Salisbury, Maryland Usa)",25


In [11]:
# Cross-tabulate cluster labels (rows) against ship types (columns); each
# cell is the number of rows with that cluster/type combination.
# NOTE(review): the displayed table sums to 1336, while the ship-type counts
# earlier in the notebook sum to 1566 -- presumably rows with a missing
# `cluster` value are left out here; confirm that this is intended.
cluster_shiptype = pd.crosstab(index=df["cluster"], columns=df["ship_type"])
cluster_shiptype

ship_type,Cruise,Ferry,Icebreaker,River Cruise
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,692,205,51,35
1.0,199,50,0,0
2.0,104,0,0,0


In [12]:
# Composition of each cluster: share of every ship type within the cluster
# (rows normalized to sum to 1), expressed as a percentage rounded to one
# decimal place.
type_share_by_cluster = pd.crosstab(
    index=df["cluster"], columns=df["ship_type"], normalize="index"
)
cluster_shiptype_pct = (type_share_by_cluster * 100).round(1)
cluster_shiptype_pct

ship_type,Cruise,Ferry,Icebreaker,River Cruise
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,70.4,20.9,5.2,3.6
1.0,79.9,20.1,0.0,0.0
2.0,100.0,0.0,0.0,0.0
