In [4]:
import pandas as pd
import numpy as np
import joblib
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [5]:
df = pd.read_csv("data/final_dataset.csv")
df.head()

Unnamed: 0,company_name,ship_type,gt,dwt,length,width,age,imo,port_of_registry,fuel_consumption,total_co2_emissions,annual_hrs_at_sea,tech_eff_index,tech_eff_value,median_time_in_port_hours,cii_rating
0,PRELUDE,Offshore Support Vessel,499167,394330,489,74,8,9365623,Port Victoria,2689.0,8474.77,2177.73,EIV,7.29,23.76,B
1,PRELUDE,Offshore Support Vessel,499167,394330,489,74,8,9810654,PANAMA,503.9,1583.55,593.4,EEDI,5.05,36.96,A
2,MSC LORETO,Container Ship,236184,240000,399,60,2,9934735,Monrovia,14308.65,44602.04,2668.0,EEXI,6.21,35.04,D
3,MSC FEBE,Container Ship,232618,228149,400,62,6,9839478,Panama,6075.67,18956.3,1199.23,EEXI,7.29,14.4,A
4,MSC ARINA,Container Ship,228741,228111,400,61,6,9839284,Panama,16857.12,52535.95,3088.9,EEXI,7.08,16.08,E


In [6]:
rng = np.random.default_rng(42)

base = 4                                        # minimum hours
size_term = (df["length"] / 400) * 8            # up to +8 h
beam_term = (df["width"] / 60) * 4              # up to +4 h
age_term  = (df["age"] / 35) * 1.5              # up to +1.5 h
type_term = np.where(df["ship_type"] == "container", 2, 6)  # diff by type
noise     = rng.normal(0, 1.5, len(df))         # ±1.5 h

df["handling_h_fake"] = (base + size_term + beam_term + age_term + type_term + noise).clip(4)

print(df.columns.values)

feat_cols = [
    "gt","dwt","length","width","age",
    "fuel_consumption",
    "ship_type","company_name",
    "median_time_in_port_hours",
    "tech_eff_index",
    "tech_eff_value",
]
cat_cols = ["ship_type", "company_name", "tech_eff_index"]

X = df[feat_cols]
y = df["handling_h_fake"]

X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.2, random_state=1
)
train_pool = Pool(X_tr, y_tr, cat_features=cat_cols)
test_pool  = Pool(X_te, y_te, cat_features=cat_cols)

model = CatBoostRegressor(
    iterations=400,
    depth=8,
    learning_rate=0.08,
    loss_function="MAE",
    random_seed=1,
    verbose=False
).fit(train_pool, eval_set=test_pool)

mae = mean_absolute_error(y_te, model.predict(test_pool))
print(f"MAE vs synthetic label: {mae:.2f} h")

joblib.dump(model, "models/trained.pkl")

['company_name' 'ship_type' 'gt' 'dwt' 'length' 'width' 'age' 'imo'
 'port_of_registry' 'fuel_consumption' 'total_co2_emissions'
 'annual_hrs_at_sea' 'tech_eff_index' 'tech_eff_value'
 'median_time_in_port_hours' 'cii_rating' 'handling_h_fake']
MAE vs synthetic label: 0.98 h


['models/trained.pkl']