# Data Cleaning – Real-Time Cruise Ship Data
This notebook cleans and standardizes real-time scraped cruise ship data and writes two analysis-ready datasets: one for downstream EDA and dashboards, and one restricted to modelling features.


In [11]:
import pandas as pd
import numpy as np
import re

In [16]:
# Read the raw scraped ship table and print its column / dtype summary.
raw_csv_path = "../data/cruise_ships.csv"
df = pd.read_csv(raw_csv_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1566 entries, 0 to 1565
Data columns (total 27 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   name                    1566 non-null   object 
 1   year_built              1566 non-null   int64  
 2   age                     1458 non-null   float64
 3   last_refurbishment      821 non-null    float64
 4   gross_tonnage           1362 non-null   float64
 5   decks                   1538 non-null   object 
 6   decks_with_cabins       742 non-null    float64
 7   passengers              1555 non-null   object 
 8   crew                    1439 non-null   float64
 9   cabins                  1455 non-null   float64
 10  engines                 454 non-null    object 
 11  sister_ships            1071 non-null   object 
 12  christened_by           744 non-null    object 
 13  operator                1526 non-null   object 
 14  owner                   1547 non-null   

In [1]:
# Remove the columns flagged as unusable. errors="ignore" skips any listed
# column that is absent from the frame instead of raising.
cols_to_drop = ["itinerary_block", "propulsion_power_mw"]
df = df.drop(columns=cols_to_drop, errors="ignore")

def passengers_to_number(value):
    """Convert a raw passenger-capacity value to a float.

    Parameters
    ----------
    value : object
        A raw cell value: a string such as "1,234", "14-28" or "1000–1200",
        a number, or a missing value.

    Returns
    -------
    float
        - NaN when ``value`` is missing or cannot be interpreted.
        - The midpoint when the text starts with a range "A-B" (hyphen or
          en dash, optional spaces around the dash).
        - The numeric value when the text is a plain non-negative number,
          with or without a decimal part (thousands commas are removed first).
    """
    if pd.isna(value):
        return np.nan

    # Thousands separators and surrounding whitespace carry no information.
    s = str(value).replace(",", "").strip()

    # range like 14-28 or 1000–1200 → midpoint
    # re.match only anchors at the start, so trailing text after the range
    # is tolerated.
    m = re.match(r"(\d+)\s*[-–]\s*(\d+)", s)
    if m:
        return (int(m.group(1)) + int(m.group(2))) / 2

    # Plain number. A regex is used instead of str.isdigit() because isdigit()
    # also accepts characters such as "²" that float() rejects (ValueError),
    # and it rejects decimal text such as "2500.0", which would otherwise be
    # lost as NaN.
    if re.fullmatch(r"\d+(?:\.\d+)?", s):
        return float(s)

    return np.nan

# Convert the free-text passenger capacity to a number, then drop the raw column.
df["passengers_clean"] = df["passengers"].apply(passengers_to_number)
df = df.drop(columns=["passengers"])

# clean decks (keep numeric only): take the first run of digits in each value.
# expand=False makes str.extract return a Series rather than a one-column
# DataFrame, so the assignment is an ordinary column assignment.
df["decks_clean"] = (
    df["decks"]
    .astype(str)
    .str.extract(r"(\d+)", expand=False)
    .astype(float)
)
df = df.drop(columns=["decks"])

# Normalise whitespace and casing of the categorical text columns that exist.
categorical_cols = ["operator", "owner", "flag_state", "class", "builder"]
for col in categorical_cols:
    if col in df.columns:
        # astype(str) alone would turn missing values into the literal text
        # "nan" (title-cased to "Nan"), making them indistinguishable from
        # real categories. Keep them missing: only non-null entries receive
        # their cleaned text.
        cleaned = df[col].astype(str).str.strip().str.title()
        df[col] = cleaned.where(df[col].notna())

# Parse scrape timestamps; values that cannot be parsed become NaT rather
# than raising (errors="coerce").
df["scrape_time"] = pd.to_datetime(df["scrape_time"], errors="coerce")


NameError: name 'df' is not defined

In [21]:
# Columns retained in the EDA export, in output order.
eda_cols = [
    "name",
    "year_built",
    "age",
    "gross_tonnage",
    "passengers_clean",
    "crew",
    "cabins",
    "length_m",
    "beam_m",
    "speed_kn",
    "decks_clean",
    "decks_with_cabins",
    "operator",
    "owner",
    "flag_state",
    "class",
    "builder",
    "has_itinerary",
    "scrape_time",
    "building_cost_million",
    "building_cost_currency",
]

# The ML export uses the same columns, in the same order, minus these
# EDA-only fields.
eda_only_cols = {
    "name",
    "decks_with_cabins",
    "scrape_time",
    "building_cost_million",
    "building_cost_currency",
}
ml_cols = [column for column in eda_cols if column not in eda_only_cols]

# EDA dataset: every row, restricted to the EDA columns that are present.
df_eda = df.copy()
df_eda = df_eda[[column for column in eda_cols if column in df_eda.columns]]

# ML dataset: rows with a missing "crew" value are dropped first, then the
# frame is restricted to the ML columns that are present.
df_ml = df.dropna(subset=["crew"])
df_ml = df_ml[[column for column in ml_cols if column in df_ml.columns]]

# save outputs (row index is not written)
df_eda.to_csv("../data/cruise_ships_eda.csv", index=False)
df_ml.to_csv("../data/cruise_ships_ml.csv", index=False)