In [96]:
import pandas as pd
import numpy as np

train_df = pd.read_csv("../data/processed_data/train.csv")

test_df = pd.read_csv("../data/processed_data/test.csv")

In [97]:
train_df.head()

Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,...,vesselType_14.0,vesselType_21.0,vesselType_83.0,enginePower,CEU,GT,breadth,length,DWT,maxSpeed
0,0.031663,308.1,17.1,-6,316,0,01-08 06:00,7.50361,77.5834,61e9f38eb937134a3c4bfd8b,...,False,False,True,0.365826,0.764706,0.545107,0.583333,0.505354,0.170524,0.287879
1,0.031707,307.6,17.3,5,313,0,01-14 23:30,7.57302,77.49505,61e9f38eb937134a3c4bfd8b,...,False,False,True,0.365826,0.764706,0.545107,0.583333,0.505354,0.170524,0.287879
2,0.031757,306.8,16.9,5,312,0,01-14 23:30,7.65043,77.39404,61e9f38eb937134a3c4bfd8b,...,False,False,True,0.365826,0.764706,0.545107,0.583333,0.505354,0.170524,0.287879
3,0.031798,307.9,16.9,6,313,0,01-14 23:30,7.71275,77.31394,61e9f38eb937134a3c4bfd8b,...,False,False,True,0.365826,0.764706,0.545107,0.583333,0.505354,0.170524,0.287879
4,0.031838,307.0,16.3,7,313,0,01-14 23:30,7.77191,77.23585,61e9f38eb937134a3c4bfd8b,...,False,False,True,0.365826,0.764706,0.545107,0.583333,0.505354,0.170524,0.287879


In [98]:
print(f"Features available for training: {test_df.columns}")

Features available for training: Index(['ID', 'vesselId', 'time', 'scaling_factor', 'week_of_the_year',
       'day_of_the_year', 'avg_lat_change_1_step', 'avg_lat_change_2_steps',
       'avg_lon_change_1_step', 'avg_lon_change_2_steps', 'vesselType_14.0',
       'vesselType_21.0', 'vesselType_83.0', 'enginePower', 'CEU', 'GT',
       'breadth', 'length', 'DWT', 'maxSpeed'],
      dtype='object')


In [99]:
train_df[train_df["cog"] == 360]["cog"].count() # 360 is default value

5858

In [100]:
def replace_360_cog(df):
    for index, row in df[df['cog'] == 360].iterrows():
        if index > 0 and index < len(df) - 1:
            # Get the previous and next cog values
            prev_cog = df.loc[index - 1, 'cog']
            next_cog = df.loc[index + 1, 'cog']
            
            # Calculate the mean of the previous and next cog values if neither are 360
            if prev_cog != 360 and next_cog != 360:
                new_cog = np.mean([prev_cog, next_cog])
                df.at[index, 'cog'] = new_cog
            elif prev_cog != 360:
                df.at[index, 'cog'] = prev_cog
            elif next_cog != 360:
                df.at[index, 'cog'] = next_cog
            else:
                continue
    return df

train_df = replace_360_cog(train_df)

In [101]:
train_df[train_df["cog"] == 360]["cog"].count() # 360 is default value

0

In [102]:
train_df.columns

Index(['time', 'cog', 'sog', 'rot', 'heading', 'navstat', 'etaRaw', 'latitude',
       'longitude', 'vesselId', 'portId', 'latitude_1_step_ago',
       'longitude_1_step_ago', 'time_position_1_step_ago',
       'latitude_2_steps_ago', 'longitude_2_steps_ago',
       'time_position_2_steps_ago', 'week_of_the_year', 'day_of_the_year',
       'lat_change_2_to_1_steps', 'lon_change_2_to_1_steps',
       'avg_lat_change_1_step', 'avg_lat_change_2_steps',
       'avg_lon_change_1_step', 'avg_lon_change_2_steps', 'vesselType_14.0',
       'vesselType_21.0', 'vesselType_83.0', 'enginePower', 'CEU', 'GT',
       'breadth', 'length', 'DWT', 'maxSpeed'],
      dtype='object')

In [103]:
nan_counts = train_df.isna().sum()

print(nan_counts)

time                            0
cog                             0
sog                             0
rot                             0
heading                         0
navstat                         0
etaRaw                          0
latitude                        0
longitude                       0
vesselId                        0
portId                       1615
latitude_1_step_ago           688
longitude_1_step_ago          688
time_position_1_step_ago      688
latitude_2_steps_ago         1375
longitude_2_steps_ago        1375
time_position_2_steps_ago    1375
week_of_the_year                0
day_of_the_year                 0
lat_change_2_to_1_steps      1375
lon_change_2_to_1_steps      1375
avg_lat_change_1_step           1
avg_lat_change_2_steps          1
avg_lon_change_1_step           1
avg_lon_change_2_steps          1
vesselType_14.0                 0
vesselType_21.0                 0
vesselType_83.0                 0
enginePower                     0
CEU           

In [104]:
from sklearn.model_selection import train_test_split

sample_df = sample_df = train_df.sample(n=50_000)

# Separate features (X) and the label (y)
unused_features_df = sample_df[['cog', 'sog', 'rot', 'heading', 'navstat', 'etaRaw', 'latitude',
       'longitude', 'vesselId', 'portId', 'latitude_1_step_ago',
       'longitude_1_step_ago', 'time_position_1_step_ago',
       'latitude_2_steps_ago', 'longitude_2_steps_ago',
       'time_position_2_steps_ago',
       'lat_change_2_to_1_steps', 'lon_change_2_to_1_steps']]

X = sample_df[['time','week_of_the_year',
       'day_of_the_year', 'avg_lat_change_1_step', 'avg_lat_change_2_steps',
       'avg_lon_change_1_step', 'avg_lon_change_2_steps', 'vesselType_14.0',
       'vesselType_21.0', 'vesselType_83.0', 'enginePower', 'CEU', 'GT',
       'breadth', 'length', 'DWT', 'maxSpeed']]  # Features
y = sample_df["cog"]  # Label

# Split the data: 80% training, 20% test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the resulting splits
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X.head()

X_train shape: (40000, 17)
X_test shape: (10000, 17)
y_train shape: (40000,)
y_test shape: (10000,)


Unnamed: 0,time,week_of_the_year,day_of_the_year,avg_lat_change_1_step,avg_lat_change_2_steps,avg_lon_change_1_step,avg_lon_change_2_steps,vesselType_14.0,vesselType_21.0,vesselType_83.0,enginePower,CEU,GT,breadth,length,DWT,maxSpeed
1042319,0.206313,0.192308,0.205479,0.019364,0.038719,0.009715,0.019376,False,True,False,0.370435,0.594118,0.550915,0.583333,0.505354,0.141025,0.664602
443542,0.344032,0.326923,0.342466,0.147981,0.296452,-0.014097,-0.027848,False,True,False,0.365826,0.899412,0.683484,0.583333,0.653238,0.214326,0.647872
1480561,0.133599,0.115385,0.131507,-2.7e-05,-5.5e-05,0.000876,0.001752,False,False,True,0.365826,0.421176,0.294298,0.416667,0.352881,0.083735,0.816951
123061,0.133225,0.115385,0.131507,-0.003022,-0.006048,-0.021505,-0.043036,False,False,True,0.365826,1.0,0.738371,0.916667,0.505354,0.177334,0.35631
800838,0.299207,0.288462,0.29863,0.018327,0.03667,-0.023658,-0.047337,False,False,True,0.291594,0.412353,0.383029,0.5,0.408465,0.111934,0.742424


In [105]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error

rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(X_train, y_train)
y_pred = rf_regressor.predict(X_test)
rmse = root_mean_squared_error(y_test, y_pred)
print(f"RMSE: {rmse}")


RMSE: 101.40391690132093


In [106]:
test_df.head()


Unnamed: 0,ID,vesselId,time,scaling_factor,week_of_the_year,day_of_the_year,avg_lat_change_1_step,avg_lat_change_2_steps,avg_lon_change_1_step,avg_lon_change_2_steps,vesselType_14.0,vesselType_21.0,vesselType_83.0,enginePower,CEU,GT,breadth,length,DWT,maxSpeed
0,4,61e9f38eb937134a3c4bfd8d,0.34975,0.3,0.346154,0.350685,-0.000462,-0.000924,-0.001555,-0.003109,False,False,True,0.368696,0.576706,0.415611,0.541667,0.418664,0.086343,0.837412
1,201,61e9f38eb937134a3c4bfd8d,0.349802,0.3,0.346154,0.350685,-0.000462,-0.000924,-0.001555,-0.003109,False,False,True,0.368696,0.576706,0.415611,0.541667,0.418664,0.086343,0.837412
2,583,61e9f38eb937134a3c4bfd8d,0.349904,0.3,0.346154,0.350685,-0.000462,-0.000924,-0.001555,-0.003109,False,False,True,0.368696,0.576706,0.415611,0.541667,0.418664,0.086343,0.837412
3,701,61e9f38eb937134a3c4bfd8d,0.349938,0.3,0.346154,0.350685,-0.000462,-0.000924,-0.001555,-0.003109,False,False,True,0.368696,0.576706,0.415611,0.541667,0.418664,0.086343,0.837412
4,829,61e9f38eb937134a3c4bfd8d,0.349961,0.3,0.346154,0.350685,-0.000462,-0.000924,-0.001555,-0.003109,False,False,True,0.368696,0.576706,0.415611,0.541667,0.418664,0.086343,0.837412


In [107]:

unused_features_test_df = test_df[['vesselId', 'scaling_factor', "ID"]]

X_test_df = test_df[['time','week_of_the_year',
       'day_of_the_year', 'avg_lat_change_1_step', 'avg_lat_change_2_steps',
       'avg_lon_change_1_step', 'avg_lon_change_2_steps', 'vesselType_14.0',
       'vesselType_21.0', 'vesselType_83.0', 'enginePower', 'CEU', 'GT',
       'breadth', 'length', 'DWT', 'maxSpeed']]  # Features

In [108]:
cog_pred = rf_regressor.predict(X_test_df)
cog_pred_series = pd.Series(cog_pred, index=X_test_df.index, name='cog')

test_df = pd.concat([X_test_df, cog_pred_series, unused_features_test_df])


test_df.columns

Index(['time', 'week_of_the_year', 'day_of_the_year', 'avg_lat_change_1_step',
       'avg_lat_change_2_steps', 'avg_lon_change_1_step',
       'avg_lon_change_2_steps', 'vesselType_14.0', 'vesselType_21.0',
       'vesselType_83.0', 'enginePower', 'CEU', 'GT', 'breadth', 'length',
       'DWT', 'maxSpeed', 'cog', 'vesselId', 'scaling_factor', 'ID'],
      dtype='object')

In [109]:
test_df.to_csv('../data/processed_data/test.csv', index=False)