In [43]:
import pandas as pd
import numpy as np


# Load the three processed CSV files in one pass: train, test, schedules
train_df, test_df, schedules_df = map(
    pd.read_csv,
    (
        "../data/processed_data/train.csv",
        "../data/processed_data/test.csv",
        "../data/processed_data/schedules.csv",
    ),
)

# Row count of the schedule table, followed by a preview of its first rows
print(schedules_df.shape[0])

schedules_df.head()


136250


Unnamed: 0,vesselId,shippingLineId,shippingLineName,arrivalDate,sailingDate,portName,portId,portLatitude,portLongitude
0,0.138889,61a8e672f9cba188601e84ac,Wallenius Wilhelmsen Ocean,2023-10-02 00:00:00+00:00,2023-10-03 00:00:00+00:00,Port of Brunswick,0.411145,31.140556,-81.496667
1,0.138889,61a8e672f9cba188601e84ac,Wallenius Wilhelmsen Ocean,2023-10-27 00:00:00+00:00,2023-10-27 00:00:00+00:00,Port of Southampton,0.385542,50.9025,-1.428889
2,0.138889,61a8e672f9cba188601e84ac,Wallenius Wilhelmsen Ocean,2023-10-19 00:00:00+00:00,2023-10-20 00:00:00+00:00,Port of Bremerhaven,0.125,53.563611,8.554722
3,0.138889,61a8e672f9cba188601e84ac,Wallenius Wilhelmsen Ocean,2023-10-09 00:00:00+00:00,2023-10-10 00:00:00+00:00,Port of New York,0.401355,40.688333,-74.028611
4,0.138889,61a8e672f9cba188601e84ac,Wallenius Wilhelmsen Ocean,2023-09-25 00:00:00+00:00,2023-09-26 00:00:00+00:00,Manzanillo International Terminal,0.304217,9.37237,-79.87979


In [44]:

# Reference window: timestamps are later expressed as seconds since the start
# of this window divided by the window length in seconds.
# TODO: the end time might need to be adjusted.
reference_start_time, reference_end_time = map(
    pd.to_datetime,
    ("2024-01-01 00:00:00", "2024-05-12 23:59:58"),
)
total_time_span = (reference_end_time - reference_start_time).total_seconds()

In [45]:

# Order the schedule rows chronologically by sailing date and renumber the index.
# The parsed timestamps are needed only as a sort key, so the helper column is
# created, used and discarded inside a single chain. Values of sailingDate that
# are missing or cannot be parsed become NaT (errors='coerce') and are placed
# at the end by sort_values.
schedules_df = (
    schedules_df
    .assign(
        sailing_time_converted=lambda frame: pd.to_datetime(frame['sailingDate'], errors='coerce')
    )
    .sort_values(by=['sailing_time_converted'])
    .reset_index(drop=True)
    .drop(columns=['sailing_time_converted'])
)

# NOTE: three row filters were drafted for this cell and are currently disabled:
#   - sailing time <= reference_end_time
#   - sailing time >= reference_start_time
#   - sailing time <  arrival time
# Re-enabling them requires parsed copies of both sailingDate and arrivalDate
# with the timezone stripped (.dt.tz_localize(None)) before comparing.
# TODO: decide whether these timestamps should stay timezone aware instead.

schedules_df.tail()

Unnamed: 0,vesselId,shippingLineId,shippingLineName,arrivalDate,sailingDate,portName,portId,portLatitude,portLongitude
136245,0.020833,61a8e673f9cba188601e84ae,K-Line,2023-10-16 00:00:00+00:00,,,,,
136246,0.026389,61a8e673f9cba188601e84ae,K-Line,2023-09-10 00:00:00+00:00,,,,,
136247,0.013889,61a8e673f9cba188601e84ae,K-Line,2023-08-17 00:00:00+00:00,,,,,
136248,0.604167,61a8e673f9cba188601e84ae,K-Line,2023-10-13 00:00:00+00:00,,,,,
136249,0.013889,61a8e673f9cba188601e84ae,K-Line,2023-08-16 00:00:00+00:00,,,,,


In [46]:
# Training data - parse the two timestamp columns once and reuse the parsed values
# (entries that cannot be parsed become NaT because of errors='coerce')
train_timestamps = pd.to_datetime(train_df['time'], errors='coerce')
last_position_timestamps = pd.to_datetime(train_df['time_of_last_known_position'], errors='coerce')

# Overwrite both columns with the elapsed seconds since reference_start_time,
# expressed as a fraction of total_time_span
train_df['time'] = (train_timestamps - reference_start_time).dt.total_seconds() / total_time_span
train_df['time_of_last_known_position'] = (
    (last_position_timestamps - reference_start_time).dt.total_seconds() / total_time_span
)

# Calendar features taken from the 'time' timestamps and rescaled:
#   ISO week number w  -> (w - 1) / 52   (week 1 maps to 0, week 53 maps to 1)
#   day of the year d  -> (d - 1) / 365  (day 1 maps to 0)
train_df['week_of_the_year'] = (train_timestamps.dt.isocalendar().week - 1) / 52
train_df['day_of_the_year'] = (train_timestamps.dt.dayofyear - 1) / 365

train_df.head()


Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,portId,last_known_latitude,last_known_longitude,time_of_last_known_position,week_of_the_year,day_of_the_year
0,0.087133,308.1,17.1,-6,316,0,01-08 06:00,7.50361,77.5834,0.0,0.143072,,,,0.019231,0.030137
1,0.087255,307.6,17.3,5,313,0,01-14 23:30,7.57302,77.49505,0.0,0.152108,7.50361,77.5834,0.087133,0.019231,0.030137
2,0.087392,306.8,16.9,5,312,0,01-14 23:30,7.65043,77.39404,0.0,0.152108,7.57302,77.49505,0.087255,0.019231,0.030137
3,0.087504,307.9,16.9,6,313,0,01-14 23:30,7.71275,77.31394,0.0,0.152108,7.65043,77.39404,0.087392,0.019231,0.030137
4,0.087614,307.0,16.3,7,313,0,01-14 23:30,7.77191,77.23585,0.0,0.152108,7.71275,77.31394,0.087504,0.019231,0.030137


In [47]:
# Test data - parse the timestamps once; every time feature is derived from them
# (entries that cannot be parsed become NaT because of errors='coerce')
test_timestamps = pd.to_datetime(test_df['time'], errors='coerce')

# Overwrite 'time' with the elapsed seconds since reference_start_time,
# expressed as a fraction of total_time_span
test_df['time'] = (test_timestamps - reference_start_time).dt.total_seconds() / total_time_span

# Calendar features, rescaled:
#   ISO week number w  -> (w - 1) / 52   (week 1 maps to 0, week 53 maps to 1)
#   day of the year d  -> (d - 1) / 365  (day 1 maps to 0)
test_df['week_of_the_year'] = (test_timestamps.dt.isocalendar().week - 1) / 52
test_df['day_of_the_year'] = (test_timestamps.dt.dayofyear - 1) / 365

test_df.head()

Unnamed: 0,ID,vesselId,time,scaling_factor,week_of_the_year,day_of_the_year
0,0,0.123611,0.962423,0.3,0.346154,0.350685
1,1,0.909722,0.962439,0.3,0.346154,0.350685
2,2,0.869444,0.962459,0.3,0.346154,0.350685
3,3,0.790278,0.962461,0.3,0.346154,0.350685
4,4,0.001389,0.962471,0.3,0.346154,0.350685


In [48]:
# Schedule data - apply the same time encoding to both port-call timestamps.
# Each tuple lists: the date column to overwrite, then the names of the
# week-of-year and day-of-year feature columns derived from it.
for date_col, week_col, day_col in (
    ('sailingDate', 'sailing_week_of_the_year', 'sailing_day_of_the_year'),
    ('arrivalDate', 'arrival_week_of_the_year', 'arrival_day_of_the_year'),
):
    # Parse (unparseable -> NaT) and strip the timezone information.
    # TODO: might have to make this timezone aware!!
    parsed = pd.to_datetime(schedules_df[date_col], errors='coerce').dt.tz_localize(None)

    # Elapsed seconds since reference_start_time as a fraction of total_time_span
    schedules_df[date_col] = (parsed - reference_start_time).dt.total_seconds() / total_time_span

    # ISO week w -> (w - 1) / 52 ; day of the year d -> (d - 1) / 365
    schedules_df[week_col] = (parsed.dt.isocalendar().week - 1) / 52
    schedules_df[day_col] = (parsed.dt.dayofyear - 1) / 365

schedules_df.head()


Unnamed: 0,vesselId,shippingLineId,shippingLineName,arrivalDate,sailingDate,portName,portId,portLatitude,portLongitude,sailing_week_of_the_year,sailing_day_of_the_year,arrival_week_of_the_year,arrival_day_of_the_year
0,0.959722,61a8e673f9cba188601e84b3,UECC,-28.53196,-28.5282,,,,,0.596154,0.608219,0.596154,0.608219
1,0.959722,61a8e673f9cba188601e84b3,UECC,-28.53196,-28.5282,,,,,0.596154,0.608219,0.596154,0.608219
2,0.959722,61a8e673f9cba188601e84b3,UECC,-28.53196,-28.5282,,,,,0.596154,0.608219,0.596154,0.608219
3,0.538889,61be24564ea00ae59d0fe37f,ACL,-8.372495,-8.369519,Port of New York,0.401355,40.688333,-74.028611,0.942308,0.950685,0.942308,0.950685
4,0.538889,61be24564ea00ae59d0fe37f,ACL,-8.355233,-8.347464,Port of Baltimore,0.394578,39.2325,-76.558889,0.961538,0.958904,0.961538,0.956164


In [49]:
# Persist the three frames as CSV without writing the index.
# NOTE(review): these are the same paths the frames were read from in the first
# cell, so the inputs are overwritten - confirm that re-running the notebook on
# the already-normalised files is not expected.
for frame, destination in (
    (train_df, '../data/processed_data/train.csv'),
    (test_df, "../data/processed_data/test.csv"),
    (schedules_df, "../data/processed_data/schedules.csv"),
):
    frame.to_csv(destination, index=False)