In [1]:
import pandas as pd

# Load the processed train and test tables in one pass.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/processed_data/train.csv',
        '../data/processed_data/test.csv',
    )
)

# Preview the first rows of the training table.
train_df.head()

Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,...,time_position_1_step_ago,latitude_2_steps_ago,longitude_2_steps_ago,time_position_2_steps_ago,cog_1_step_ago,time_cog_1_step_ago,cog_2_steps_ago,time_cog_2_steps_ago,week_of_the_year,day_of_the_year
0,0.031663,0.858217,17.1,-6,316,0,01-08 06:00,7.50361,77.5834,61e9f38eb937134a3c4bfd8b,...,,,,,,,,,0.019231,0.030137
1,0.031707,0.856825,17.3,5,313,0,01-14 23:30,7.57302,77.49505,61e9f38eb937134a3c4bfd8b,...,0.031663,,,,0.858217,0.031663,,,0.019231,0.030137
2,0.031757,0.854596,16.9,5,312,0,01-14 23:30,7.65043,77.39404,61e9f38eb937134a3c4bfd8b,...,0.031707,7.50361,77.5834,0.031663,0.856825,0.031707,0.858217,0.031663,0.019231,0.030137
3,0.031798,0.85766,16.9,6,313,0,01-14 23:30,7.71275,77.31394,61e9f38eb937134a3c4bfd8b,...,0.031757,7.57302,77.49505,0.031707,0.854596,0.031757,0.856825,0.031707,0.019231,0.030137
4,0.031838,0.855153,16.3,7,313,0,01-14 23:30,7.77191,77.23585,61e9f38eb937134a3c4bfd8b,...,0.031798,7.65043,77.39404,0.031757,0.85766,0.031798,0.854596,0.031757,0.019231,0.030137


In [2]:
import pandas as pd
import numpy as np

# Sort by vesselId and time so that, within each vessel, rows are in chronological order
train_df = train_df.sort_values(by=['vesselId', 'time']).reset_index(drop=True)

# Step 1: Previous positions, shifted inside each vesselId group so the first rows of
# one vessel never pick up positions from the vessel sorted just before it.
# Plain assignment overwrites these columns if the loaded CSV already contains them.
for coord in ('latitude', 'longitude'):
    coord_by_vessel = train_df.groupby('vesselId')[coord]
    train_df[f'{coord}_1_step_ago'] = coord_by_vessel.shift(1)
    train_df[f'{coord}_2_steps_ago'] = coord_by_vessel.shift(2)

# Step 2: Changes between positions (NaN wherever the vessel has no earlier row)
# NOTE(review): longitude differences are plain subtraction; presumably values are
# degrees, in which case a track crossing the +/-180 meridian would show a ~360 jump
# and skew the per-vessel averages below - confirm whether that occurs in this data.
train_df['lat_change_1_step'] = train_df['latitude'] - train_df['latitude_1_step_ago']
train_df['lat_change_2_steps'] = train_df['latitude'] - train_df['latitude_2_steps_ago']

train_df['lon_change_1_step'] = train_df['longitude'] - train_df['longitude_1_step_ago']
train_df['lon_change_2_steps'] = train_df['longitude'] - train_df['longitude_2_steps_ago']

train_df['lat_change_2_to_1_steps'] = train_df['latitude_1_step_ago'] - train_df['latitude_2_steps_ago']
train_df['lon_change_2_to_1_steps'] = train_df['longitude_1_step_ago'] - train_df['longitude_2_steps_ago']

# Steps 3-5: Per-vessel mean of each change, broadcast back onto every row of that vessel.
# transform('mean') yields the same values as groupby().mean() followed by a left merge,
# but assigns by column name. That matters because this notebook later writes train_df
# back to the CSV it was loaded from: on a re-run the avg_* columns already exist, and a
# merge would then emit avg_*_x / avg_*_y duplicates instead of refreshing them.
for change_col in ('lat_change_1_step', 'lat_change_2_steps',
                   'lon_change_1_step', 'lon_change_2_steps'):
    train_df[f'avg_{change_col}'] = (
        train_df.groupby('vesselId')[change_col].transform('mean')
    )

# Display the final dataframe with the new features
train_df.head()

Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,...,lat_change_1_step,lat_change_2_steps,lon_change_1_step,lon_change_2_steps,lat_change_2_to_1_steps,lon_change_2_to_1_steps,avg_lat_change_1_step,avg_lat_change_2_steps,avg_lon_change_1_step,avg_lon_change_2_steps
0,0.031663,0.858217,17.1,-6,316,0,01-08 06:00,7.50361,77.5834,61e9f38eb937134a3c4bfd8b,...,,,,,,,0.073175,0.146541,-0.178895,-0.358466
1,0.031707,0.856825,17.3,5,313,0,01-14 23:30,7.57302,77.49505,61e9f38eb937134a3c4bfd8b,...,0.06941,,-0.08835,,,,0.073175,0.146541,-0.178895,-0.358466
2,0.031757,0.854596,16.9,5,312,0,01-14 23:30,7.65043,77.39404,61e9f38eb937134a3c4bfd8b,...,0.07741,0.14682,-0.10101,-0.18936,0.06941,-0.08835,0.073175,0.146541,-0.178895,-0.358466
3,0.031798,0.85766,16.9,6,313,0,01-14 23:30,7.71275,77.31394,61e9f38eb937134a3c4bfd8b,...,0.06232,0.13973,-0.0801,-0.18111,0.07741,-0.10101,0.073175,0.146541,-0.178895,-0.358466
4,0.031838,0.855153,16.3,7,313,0,01-14 23:30,7.77191,77.23585,61e9f38eb937134a3c4bfd8b,...,0.05916,0.12148,-0.07809,-0.15819,0.06232,-0.0801,0.073175,0.146541,-0.178895,-0.358466


In [3]:
# Remove the per-row 1-step / 2-step change columns; the per-vessel averages and the
# *_2_to_1_steps changes stay in the frame.
train_df = train_df.drop(
    [
        'lat_change_1_step',
        'lon_change_1_step',
        'lat_change_2_steps',
        'lon_change_2_steps',
    ],
    axis='columns',
)

# List the remaining columns, then preview the frame.
print(train_df.columns)

train_df.head()

Index(['time', 'cog', 'sog', 'rot', 'heading', 'navstat', 'etaRaw', 'latitude',
       'longitude', 'vesselId', 'portId', 'latitude_1_step_ago',
       'longitude_1_step_ago', 'time_position_1_step_ago',
       'latitude_2_steps_ago', 'longitude_2_steps_ago',
       'time_position_2_steps_ago', 'cog_1_step_ago', 'time_cog_1_step_ago',
       'cog_2_steps_ago', 'time_cog_2_steps_ago', 'week_of_the_year',
       'day_of_the_year', 'lat_change_2_to_1_steps', 'lon_change_2_to_1_steps',
       'avg_lat_change_1_step', 'avg_lat_change_2_steps',
       'avg_lon_change_1_step', 'avg_lon_change_2_steps'],
      dtype='object')


Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,...,cog_2_steps_ago,time_cog_2_steps_ago,week_of_the_year,day_of_the_year,lat_change_2_to_1_steps,lon_change_2_to_1_steps,avg_lat_change_1_step,avg_lat_change_2_steps,avg_lon_change_1_step,avg_lon_change_2_steps
0,0.031663,0.858217,17.1,-6,316,0,01-08 06:00,7.50361,77.5834,61e9f38eb937134a3c4bfd8b,...,,,0.019231,0.030137,,,0.073175,0.146541,-0.178895,-0.358466
1,0.031707,0.856825,17.3,5,313,0,01-14 23:30,7.57302,77.49505,61e9f38eb937134a3c4bfd8b,...,,,0.019231,0.030137,,,0.073175,0.146541,-0.178895,-0.358466
2,0.031757,0.854596,16.9,5,312,0,01-14 23:30,7.65043,77.39404,61e9f38eb937134a3c4bfd8b,...,0.858217,0.031663,0.019231,0.030137,0.06941,-0.08835,0.073175,0.146541,-0.178895,-0.358466
3,0.031798,0.85766,16.9,6,313,0,01-14 23:30,7.71275,77.31394,61e9f38eb937134a3c4bfd8b,...,0.856825,0.031707,0.019231,0.030137,0.07741,-0.10101,0.073175,0.146541,-0.178895,-0.358466
4,0.031838,0.855153,16.3,7,313,0,01-14 23:30,7.77191,77.23585,61e9f38eb937134a3c4bfd8b,...,0.854596,0.031757,0.019231,0.030137,0.06232,-0.0801,0.073175,0.146541,-0.178895,-0.358466


In [4]:
# Sort by vesselId and time so each vessel's test rows are in chronological order
test_df = test_df.sort_values(by=['vesselId', 'time']).reset_index(drop=True)

# Step 1: Extract one row of average latitude/longitude changes per vessel from train_df.
# Deduplicate on vesselId only, so the lookup has exactly one row per vessel regardless
# of how the float columns compare.
avg_change_cols = ['avg_lat_change_1_step', 'avg_lat_change_2_steps',
                   'avg_lon_change_1_step', 'avg_lon_change_2_steps']
avg_changes = train_df[['vesselId'] + avg_change_cols].drop_duplicates(subset='vesselId')

# Step 2: Merge the averages into test_df based on vesselId.
# Any avg_* columns already present are dropped first: this notebook writes test_df back
# to the CSV it was loaded from, so on a re-run they exist and the merge would otherwise
# produce avg_*_x / avg_*_y duplicates. Vessels absent from train_df get NaN (left join).
test_df = test_df.drop(columns=avg_change_cols, errors='ignore')
test_df = pd.merge(test_df, avg_changes, on='vesselId', how='left',
                   validate='many_to_one')  # guards against multiplying test rows

test_df.head()

Unnamed: 0,ID,vesselId,time,scaling_factor,week_of_the_year,day_of_the_year,avg_lat_change_1_step,avg_lat_change_2_steps,avg_lon_change_1_step,avg_lon_change_2_steps
0,4,61e9f38eb937134a3c4bfd8d,0.34975,0.3,0.346154,0.350685,-0.000462,-0.000924,-0.001555,-0.003109
1,201,61e9f38eb937134a3c4bfd8d,0.349802,0.3,0.346154,0.350685,-0.000462,-0.000924,-0.001555,-0.003109
2,583,61e9f38eb937134a3c4bfd8d,0.349904,0.3,0.346154,0.350685,-0.000462,-0.000924,-0.001555,-0.003109
3,701,61e9f38eb937134a3c4bfd8d,0.349938,0.3,0.346154,0.350685,-0.000462,-0.000924,-0.001555,-0.003109
4,829,61e9f38eb937134a3c4bfd8d,0.349961,0.3,0.346154,0.350685,-0.000462,-0.000924,-0.001555,-0.003109


In [5]:
# Persist both frames without the index column.
# These are the same paths the notebook loads from at the top, so the input files
# are overwritten in place with the feature-augmented versions.
train_df.to_csv(path_or_buf='../data/processed_data/train.csv', index=False)
test_df.to_csv(path_or_buf="../data/processed_data/test.csv", index=False)