# Data Cleaning – Real-Time Cruise Ship Data
This notebook cleans and standardizes real-time scraped cruise ship data to create an analysis-ready dataset for downstream EDA and dashboards.


In [4]:
import pandas as pd
import numpy as np
import re


In [13]:
# load raw data
df = pd.read_csv("../data/cruise_ships.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1566 entries, 0 to 1565
Data columns (total 27 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   name                    1566 non-null   object 
 1   year_built              1566 non-null   int64  
 2   age                     1458 non-null   float64
 3   last_refurbishment      821 non-null    float64
 4   gross_tonnage           1362 non-null   float64
 5   decks                   1538 non-null   object 
 6   decks_with_cabins       742 non-null    float64
 7   passengers              1555 non-null   object 
 8   crew                    1439 non-null   float64
 9   cabins                  1455 non-null   float64
 10  engines                 454 non-null    object 
 11  sister_ships            1071 non-null   object 
 12  christened_by           744 non-null    object 
 13  operator                1526 non-null   object 
 14  owner                   1547 non-null   

In [35]:
#drop unusable columns 
cols_to_drop = [
    "itinerary_block",
    "propulsion_power_mw"
]
df = df.drop(columns=[c for c in cols_to_drop if c in df.columns])

def passengers_to_number(value):
    if pd.isna(value):
        return np.nan

    s = str(value).replace(",", "").strip()

    # range like 14-28 or 1000–1200
    # take the midpoint
    m = re.match(r"(\d+)\s*[-–]\s*(\d+)", s)
    if m:
        return (int(m.group(1)) + int(m.group(2))) / 2

    # single number
    if s.isdigit():
        return float(s)

    return np.nan

df["passengers_clean"] = df["passengers"].apply(passengers_to_number)
df["decks_clean"] = (
    df["decks"]
    .astype(str)
    .str.extract(r"(\d+)")
    .astype(float)
)
df = df.drop(columns=["decks"])
decks_clean

KeyError: 'decks'