In [470]:
import pandas as pd

vessels_df = pd.read_csv('../data/processed_data/vessels.csv')

train_df = pd.read_csv("../data/processed_data/train.csv")

test_df = pd.read_csv("../data/processed_data/test.csv")

In [471]:
vessels_df = pd.merge(vessels_df, train_df[['vesselId']], on='vesselId', how='inner').drop_duplicates()

print(f"Rows of vessels_df that have a vesselId that is also present in train_df: {len(vessels_df)}")

Rows of vessels_df that have a vesselId that is also present in train_df: 688


In [472]:
vessels_df['enginePower'] = vessels_df['enginePower'].fillna(14121.0) # Median engine power value found in eda
vessels_df.loc[vessels_df['enginePower'] == 0, 'enginePower'] = 14121.0 # Median engine power value found in eda

vessels_df['breadth'] = vessels_df['breadth'].fillna(32) # Median breadth valud found in eda

vessels_df['vesselType'] = vessels_df['vesselType'].fillna(83) # Most typical vesselType valud found in eda

vessels_df['DWT'] = vessels_df['DWT'].fillna(18758) # Median DWT valud found in eda

In [473]:
train_set = vessels_df[['vesselId', 'enginePower', 'CEU', 'GT', 'breadth', 'length', 'vesselType', 'DWT', 'maxSpeed']].reset_index(drop=True)

train_set.head()

Unnamed: 0,vesselId,enginePower,CEU,GT,breadth,length,vesselType,DWT,maxSpeed
0,0.0,14121.0,6500,58684,32.0,199.0,83.0,21200.0,18.6
1,0.001389,14220.0,4902,46800,31.0,182.0,83.0,12325.0,
2,0.002778,14220.0,5000,46800,31.0,182.0,83.0,13059.0,
3,0.004167,11060.0,4200,39362,28.0,167.0,83.0,12588.0,
4,0.005556,13140.0,7450,75528,37.2,199.98,83.0,21052.0,


In [474]:
min_values = train_set.min()
max_values = train_set.max()

# Combine the min and max values into a single DataFrame for easier viewing
min_max_df = pd.DataFrame({'Min': min_values, 'Max': max_values})

# Display the result
print(min_max_df)

                Min            Max
vesselId        0.0       0.998611
enginePower  1500.0   36000.000000
CEU             0.0    8500.000000
GT           8659.0  100430.000000
breadth        18.0      42.000000
length         99.9     296.000000
vesselType     14.0      83.000000
DWT          3222.0  108650.000000
maxSpeed       16.7      23.300000


In [475]:
from sklearn.preprocessing import MinMaxScaler

vessel_ids = train_set['vesselId']
vessel_types = train_set['vesselType']
train_set = train_set.drop(columns=['vesselId', 'vesselType'])


# Initialize the MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))

# Fit and transform the data to normalize it
train_set = pd.DataFrame(scaler.fit_transform(train_set), columns=train_set.columns)

vessel_type_encoded = pd.get_dummies(vessel_types, prefix='vesselType')

train_set = pd.concat([vessel_ids.reset_index(drop=True), vessel_type_encoded.reset_index(drop=True), train_set.reset_index(drop=True)], axis=1)

# Display the normalized DataFrame
print(train_set.head())

   vesselId  vesselType_14.0  vesselType_21.0  vesselType_83.0  enginePower  \
0  0.000000            False            False             True     0.365826   
1  0.001389            False            False             True     0.368696   
2  0.002778            False            False             True     0.368696   
3  0.004167            False            False             True     0.277101   
4  0.005556            False            False             True     0.337391   

        CEU        GT   breadth    length       DWT  maxSpeed  
0  0.764706  0.545107  0.583333  0.505354  0.170524  0.287879  
1  0.576706  0.415611  0.541667  0.418664  0.086343       NaN  
2  0.588235  0.415611  0.541667  0.418664  0.093305       NaN  
3  0.494118  0.334561  0.416667  0.342172  0.088838       NaN  
4  0.876471  0.728651  0.800000  0.510352  0.169120       NaN  


In [476]:
nan_counts = train_set.isna().sum()

print(nan_counts)

print(f"Rows: {len(train_set)}")

vesselId             0
vesselType_14.0      0
vesselType_21.0      0
vesselType_83.0      0
enginePower          0
CEU                  0
GT                   0
breadth              0
length               0
DWT                  0
maxSpeed           479
dtype: int64
Rows: 688


In [477]:
print(f"Length of all data: {len(train_set)}")

x_to_pred = train_set[(train_set["maxSpeed"] == 0) | (train_set["maxSpeed"].isna())].drop(columns=["maxSpeed"])
train_set = train_set[(train_set["maxSpeed"] != 0) & (train_set["maxSpeed"].notna())]

print(f"Length of x to pred: {len(x_to_pred)}")
print(f"Length of train_set: {len(train_set)}")


Length of all data: 688
Length of x to pred: 480
Length of train_set: 208


In [478]:
from sklearn.model_selection import train_test_split

# Separate features (X) and the label (y)
X = train_set.drop(columns=["maxSpeed"])  # Features
y = train_set["maxSpeed"]  # Label

# Split the data: 80% training, 20% test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the resulting splits
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (166, 10)
X_test shape: (42, 10)
y_train shape: (166,)
y_test shape: (42,)


In [479]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error

rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(X_train, y_train)
y_pred = rf_regressor.predict(X_test)
rmse = root_mean_squared_error(y_test, y_pred)
print(f"RMSE: {rmse}")


RMSE: 0.0555495208581268


In [480]:
from sklearn.linear_model import LinearRegression
l_regressor = LinearRegression()
l_regressor.fit(X_train, y_train)
y_pred = l_regressor.predict(X_test)

rmse = root_mean_squared_error(y_test, y_pred)
print(f"RMSE: {rmse}")

RMSE: 0.08940415484734063


In [481]:
from sklearn.ensemble import GradientBoostingRegressor
gb_regressor = GradientBoostingRegressor(random_state=42)
gb_regressor.fit(X_train, y_train)
y_pred = gb_regressor.predict(X_test)

rmse = root_mean_squared_error(y_test, y_pred)
print(f"RMSE: {rmse}")

RMSE: 0.04807113866507933


In [482]:
import xgboost as xgb

xg_regressor = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)
xg_regressor.fit(X_train, y_train)
y_pred = xg_regressor.predict(X_test)

rmse = root_mean_squared_error(y_test, y_pred)
print(f"RMSE: {rmse}")

RMSE: 0.055787160817875235


In [483]:
y = gb_regressor.predict(x_to_pred)

x_to_pred["maxSpeed"] = y

# Step 3: Combine x_to_pred with the original train_set
complete_vessel_df = pd.concat([train_set, x_to_pred], axis=0)

# Optionally, reset the index for the full dataset
complete_vessel_df = complete_vessel_df.reset_index(drop=True)

# Display the final dataset
print(f"Length of full dataset: {len(complete_vessel_df)}")
complete_vessel_df.head()


Length of full dataset: 688


Unnamed: 0,vesselId,vesselType_14.0,vesselType_21.0,vesselType_83.0,enginePower,CEU,GT,breadth,length,DWT,maxSpeed
0,0.0,False,False,True,0.365826,0.764706,0.545107,0.583333,0.505354,0.170524,0.287879
1,0.006944,False,False,True,0.33913,0.729412,0.543483,0.594167,0.509944,0.17447,0.80303
2,0.008333,False,False,True,0.318841,0.592471,0.438788,0.591667,0.408414,0.114467,0.893939
3,0.009722,False,False,True,0.34087,0.733765,0.553922,0.594167,0.510301,0.148993,0.787879
4,0.011111,False,False,True,0.34087,0.731176,0.553421,0.594167,0.510301,0.146508,0.818182


In [484]:
train_df = pd.merge(train_df, complete_vessel_df, on='vesselId', how='left')

train_df.head()

Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,...,vesselType_14.0,vesselType_21.0,vesselType_83.0,enginePower,CEU,GT,breadth,length,DWT,maxSpeed
0,0.087133,308.1,17.1,-6,316,0,01-08 06:00,7.50361,77.5834,0.0,...,False,False,True,0.365826,0.764706,0.545107,0.583333,0.505354,0.170524,0.287879
1,0.087255,307.6,17.3,5,313,0,01-14 23:30,7.57302,77.49505,0.0,...,False,False,True,0.365826,0.764706,0.545107,0.583333,0.505354,0.170524,0.287879
2,0.087392,306.8,16.9,5,312,0,01-14 23:30,7.65043,77.39404,0.0,...,False,False,True,0.365826,0.764706,0.545107,0.583333,0.505354,0.170524,0.287879
3,0.087504,307.9,16.9,6,313,0,01-14 23:30,7.71275,77.31394,0.0,...,False,False,True,0.365826,0.764706,0.545107,0.583333,0.505354,0.170524,0.287879
4,0.087614,307.0,16.3,7,313,0,01-14 23:30,7.77191,77.23585,0.0,...,False,False,True,0.365826,0.764706,0.545107,0.583333,0.505354,0.170524,0.287879


In [485]:
test_df = pd.merge(test_df, complete_vessel_df, on='vesselId', how='left')

test_df.head()

Unnamed: 0,ID,vesselId,time,scaling_factor,week_of_the_year,day_of_the_year,vesselType_14.0,vesselType_21.0,vesselType_83.0,enginePower,CEU,GT,breadth,length,DWT,maxSpeed
0,0,0.123611,0.962423,0.3,0.346154,0.350685,False,False,True,0.365826,0.933412,0.714779,0.583333,0.663437,0.264835,0.644914
1,1,0.909722,0.962439,0.3,0.346154,0.350685,True,False,False,0.365826,0.294118,0.014438,0.083333,0.122896,0.095003,0.763768
2,2,0.869444,0.962459,0.3,0.346154,0.350685,False,True,False,0.365826,0.164706,0.188905,0.375,0.439062,0.037258,0.762773
3,3,0.790278,0.962461,0.3,0.346154,0.350685,False,False,True,0.310435,0.589059,0.406446,0.508333,0.423763,0.101766,0.833333
4,4,0.001389,0.962471,0.3,0.346154,0.350685,False,False,True,0.368696,0.576706,0.415611,0.541667,0.418664,0.086343,0.612945


In [486]:
complete_vessel_df.to_csv('../data/processed_data/vessels.csv')
train_df.to_csv('../data/processed_data/train.csv', index=False)
test_df.to_csv("../data/processed_data/test.csv", index=False)

In [None]:
# Verify length of train and test??