In [48]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
# Load the port operations log from the sibling dataset folder and preview
# its first rows.
df = pd.read_csv(filepath_or_buffer="./../dataset/port_operations_log.csv")
df.head(n=5)

Unnamed: 0,Vessel_ID,Arrival_Date,Time_of_Day_Arrival,Day_of_Week_Arrival,Vessel_Type,Vessel_Size_GT,Total_TEU_Planned,Reefer_TEU_Planned,Hazmat_TEU_Planned,Berth_Used,Num_Cranes_Assigned,Gang_Size_per_Crane,Average_Wind_Speed,Precipitation,Waiting_Time_Before_Berth,Actual_Operation_Duration_Hours
0,V_0001,2025-03-15,16,Sat,Container Ship,39755.734094,3135,33,80,Berth_A,4,24.0,15.84496,Yes,0.251216,29.488075
1,V_0002,2025-03-16,5,Sun,Container Ship,125811.127254,8620,362,112,Berth_B,3,19.0,21.812952,Yes,16.692322,115.052352
2,V_0003,2025-06-30,19,Mon,Container Ship,72944.47483,5393,506,385,Berth_D,6,24.0,4.446655,No,29.204165,48.432351
3,V_0004,2025-11-05,6,Wed,Container Ship,116716.557873,5032,16,467,Berth_B,2,24.0,21.455915,No,5.345613,115.792496
4,V_0005,2025-05-26,14,Mon,Container Ship,22477.049049,11263,446,26,Berth_A,3,22.0,16.914155,No,23.147908,182.058103


In [3]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 16 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Vessel_ID                        5000 non-null   object 
 1   Arrival_Date                     5000 non-null   object 
 2   Time_of_Day_Arrival              5000 non-null   int64  
 3   Day_of_Week_Arrival              5000 non-null   object 
 4   Vessel_Type                      5000 non-null   object 
 5   Vessel_Size_GT                   5000 non-null   float64
 6   Total_TEU_Planned                5000 non-null   int64  
 7   Reefer_TEU_Planned               5000 non-null   int64  
 8   Hazmat_TEU_Planned               5000 non-null   int64  
 9   Berth_Used                       5000 non-null   object 
 10  Num_Cranes_Assigned              5000 non-null   int64  
 11  Gang_Size_per_Crane              4761 non-null   float64
 12  Average_Wind_Speed  

## Cleaning & Preprocessing
1. Handle missing data: impute rather than drop rows, since each row is missing at most 2 of its 16 values
2. Change to correct data types
3. Drop unused columns and split into features (X) and target (y)
4. Handle preprocessing of categorical & numerical features separately
5. Combine Transformers using ColumnTransformer
6. Apply the Preprocessing

###### Potential for improvement: more preprocessing and some feature engineering (e.g. deriving month/day features from `Arrival_Date`)

In [10]:
# Cleaning & preprocessing cell.
#
# Reads the loaded frame `df`, imputes the two columns filled below, fixes
# dtypes, splits features from the target, and scales / one-hot encodes the
# features. The cleaned frame is built in a single chained expression and
# rebound to `df` (no `inplace=True`, no column-by-column mutation), so
# downstream cells still find a cleaned `df`.
#
# NOTE(review): the imputation statistics and the scaler are fit on the full
# dataset. If a train/test split happens in a later cell, fit them on the
# training portion only to avoid leakage -- confirm against downstream cells.

# --- 1. Handle missing data: Imputation ---
# Both statistics are computed before anything is filled. The two columns are
# independent, so this matches filling them one after the other.
median_gang_size = df['Gang_Size_per_Crane'].median()
mean_wind_speed = df['Average_Wind_Speed'].mean()

# --- 2. Change to correct data types ---
categorical_cols_to_convert = ['Day_of_Week_Arrival', 'Vessel_Type', 'Berth_Used', 'Precipitation']

df = (
    df
    .fillna({
        # The median can land on .5; round it so the int cast below does not truncate.
        'Gang_Size_per_Crane': round(median_gang_size),
        'Average_Wind_Speed': mean_wind_speed,
    })
    .assign(Arrival_Date=lambda frame: pd.to_datetime(frame['Arrival_Date']))
    .astype({
        **{name: 'category' for name in categorical_cols_to_convert},
        'Gang_Size_per_Crane': int,
    })
)

print("\n --------------------- DataFrame info before encoding and scaling: ---------------------")
df.info()

# --- 3. Drop some columns and split into X & Y features ---

# Vessel_ID is an identifier. Arrival_Date is excluded because it would need
# feature engineering (month, day, ...) before it could be encoded or scaled.
X = df.drop(columns=['Actual_Operation_Duration_Hours', 'Vessel_ID', 'Arrival_Date'])  # 13 columns currently
y = df['Actual_Operation_Duration_Hours']

print("\n --------------------- Features (X) head: ---------------------")
print(X.head())

# --- 4. Handle preprocessing of categorical & numerical features separately ---

# Vessel_Type is always included here: step 2 casts it to 'category', so no
# separate fallback is needed even if it holds a single distinct value.
categorical_features = X.select_dtypes(include=['category', 'object']).columns.tolist()
numerical_features = X.select_dtypes(include=np.number).columns.tolist()

print(f"\nCategorical features identified: {categorical_features}")
print(f"Numerical features identified: {numerical_features}")

# Numerical columns are standardised.
numerical_transformer = StandardScaler()

# Categorical columns are one-hot encoded.
# handle_unknown='ignore' prevents errors if new categories appear at transform time.
# drop=None keeps every category column; drop='first' would avoid
# multicollinearity if desired, especially for linear models.
categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop=None)

# --- 5. Combine Transformers using ColumnTransformer ---
# Each transformer is applied only to the columns listed for it.
# remainder='passthrough' keeps any column that is in neither list unchanged.
# With the dtypes set above, every column of X is expected to fall into one of
# the two lists, so the remainder should be empty. Switch to remainder='drop'
# if only explicitly processed features may reach the model.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='passthrough',
)

# --- 6. Apply the Preprocessing ---

# Fit the transformers on X and transform it in one step.
X_processed = preprocessor.fit_transform(X)

# Column names after scaling and one-hot encoding, used to label the array.
feature_names_out = preprocessor.get_feature_names_out()

# Wrap the processed array in a DataFrame (optional, for inspection).
X_processed_df = pd.DataFrame(X_processed, columns=feature_names_out, index=X.index)

print("\nShape of X before processing:", X.shape)
print("Shape of X after processing:", X_processed_df.shape)
print("\n --------------------- Processed features (X_processed_df) head: ---------------------")
print(X_processed_df.head())

print("\n --------------------- Info of processed DataFrame: ---------------------")
X_processed_df.info()


 --------------------- DataFrame info before encoding and scaling: ---------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 16 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   Vessel_ID                        5000 non-null   object        
 1   Arrival_Date                     5000 non-null   datetime64[ns]
 2   Time_of_Day_Arrival              5000 non-null   int64         
 3   Day_of_Week_Arrival              5000 non-null   category      
 4   Vessel_Type                      5000 non-null   category      
 5   Vessel_Size_GT                   5000 non-null   float64       
 6   Total_TEU_Planned                5000 non-null   int64         
 7   Reefer_TEU_Planned               5000 non-null   int64         
 8   Hazmat_TEU_Planned               5000 non-null   int64         
 9   Berth_Used                       5000 n