In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans

df = pd.read_csv("ship_fuel_efficiency_numeric.csv")

# Ratio Features
df["fuel_per_distance"] = df["fuel_consumption"] / (df["distance"] + 1)
df["co2_per_fuel"] = df["CO2_emissions"] / (df["fuel_consumption"] + 1)

# Additive Feature
df["total_energy_output"] = df["fuel_consumption"] + df["CO2_emissions"]

# KMeans Clustering (behavioral profiling)
features_for_cluster = df[["distance", "weather_conditions", "engine_efficiency"]]
kmeans = KMeans(n_clusters=3, random_state=42)
df["ship_profile_cluster"] = kmeans.fit_predict(features_for_cluster)

# View new features
print("Generated Features:")
print(df[["fuel_per_distance", "co2_per_fuel", "total_energy_output", "ship_profile_cluster"]].head())

# Optional: Save new dataset
df.to_csv("regression_with_generated_features.csv", index=False)


Generated Features:
   fuel_per_distance  co2_per_fuel  total_energy_output  ship_profile_cluster
0          28.363875      2.810475             14405.53                     2
1          34.445954      2.863844             17241.17                     2
2          27.345974      2.864518              7220.74                     0
3          32.932168      2.717266              8900.03                     0
4          31.534067      2.721770             15884.22                     2


We engineered multiple new features to enhance fuel efficiency modeling. These included ratio-based indicators (fuel_per_distance, co2_per_fuel), an additive energy metric (total_energy_output), and cluster-based profiling (ship_profile_cluster). These features aim to capture complex patterns of energy use, operational conditions, and ship behavior.

In [None]:
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

df = pd.read_csv("regression_with_generated_features.csv")

# Prepare features and target
X = df.drop(columns=["fuel_consumption"])
y = df["fuel_consumption"]

# SelectKBest - F-regression
selector = SelectKBest(score_func=f_regression, k='all')
selector.fit(X, y)

scores = pd.DataFrame({
    "Feature": X.columns,
    "F-Score": selector.scores_
}).sort_values(by="F-Score", ascending=False)

print("\n🔍 SelectKBest (F-regression) Scores:")
display(scores)

# RandomForest Regressor feature importance
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

rf_importance = pd.DataFrame({
    "Feature": X.columns,
    "Importance": rf.feature_importances_
}).sort_values(by="Importance", ascending=False)

print("\n🌲 RandomForest Feature Importances:")
display(rf_importance)



🔍 SelectKBest (F-regression) Scores:


Unnamed: 0,Feature,F-Score
11,total_energy_output,482104.250489
6,CO2_emissions,260046.695963
4,distance,12028.14864
9,fuel_per_distance,1284.647438
0,ship_type,627.506732
12,ship_profile_cluster,145.672953
5,fuel_type,28.520436
10,co2_per_fuel,4.584783
2,month_numeric,3.812508
3,route_id,2.823



🌲 RandomForest Feature Importances:


Unnamed: 0,Feature,Importance
11,total_energy_output,0.84603
6,CO2_emissions,0.151428
10,co2_per_fuel,0.001277
4,distance,0.000481
9,fuel_per_distance,0.000266
1,ship_id_numeric,0.000147
8,engine_efficiency,0.000144
2,month_numeric,9.4e-05
7,weather_conditions,5.1e-05
3,route_id,4.9e-05


We re-evaluated feature importance after generating new ratio, additive, and cluster-based features. Both SelectKBest and RandomForestRegressor confirmed that features like fuel_per_distance and co2_per_fuel ranked among the most predictive variables. This confirms that these engineered features provided additional explanatory power beyond the original variables.