# Prediction - 2: Gradient-boosted regression (XGBoost)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
# Read the pipe-separated AIS training data; `train` stays as the raw frame
# and `df_train` is the working copy that later cells modify.
train = pd.read_csv("data/ais_train.csv", sep="|")
df_train = train.copy(deep=True)

In [3]:
# Read the comma-separated AIS test data; `test` stays as the raw frame
# and `df_test` is the working copy that later cells modify.
test = pd.read_csv("data/ais_test.csv", sep=",")
df_test = test.copy(deep=True)

In [4]:
# Parse the 'time' column of both working frames into pandas datetimes
for frame in (df_train, df_test):
    frame['time'] = pd.to_datetime(frame['time'])

In [5]:
# Additional dataframes
# Port lookup table (pipe-separated); later cells read its 'portId',
# 'latitude' and 'longitude' columns.
df_port = pd.read_csv("data/ports.csv",sep="|")

In [6]:
# Rename the port coordinates so they do not clash with the vessel
# 'latitude'/'longitude' columns when merged into the AIS data.
df_port = df_port.rename(columns={'longitude': 'portLong', 'latitude': 'portLat'})

In [7]:
# Filling NaN values in PortId and adding portLat and portLong
#
# Both fills are done per vessel. Chaining `.ffill().bfill()` on the groupby
# result would run the back-fill on the plain (ungrouped) Series, so a vessel
# whose first rows are NaN would inherit the portId of whichever vessel comes
# next in the frame.
# NOTE(review): assumes each vessel's rows are already in chronological order;
# nothing in this notebook sorts them — confirm.
df_train['portId'] = df_train.groupby('vesselId')['portId'].ffill()
df_train['portId'] = df_train.groupby('vesselId')['portId'].bfill()

# Left join keeps every AIS row; rows whose portId has no match in df_port
# get NaN for portLat/portLong.
df_train = df_train.merge(df_port[['portId', 'portLat', 'portLong']], on='portId', how='left')

In [8]:
# Adding moored as a feature
#
# A row is flagged as moored when the vessel's position is identical to its
# PREVIOUS report. Comparing with the next report (shift(-1)) is look-ahead:
# the last row of every vessel has no next report, so the flag would always be
# False for exactly the rows used as the starting point for test predictions.
# The first row of each vessel has no previous report and is flagged False.
prev_position = df_train.groupby('vesselId')[['latitude', 'longitude']].shift(1)

df_train['moored'] = (
    (df_train['latitude'] == prev_position['latitude'])
    & (df_train['longitude'] == prev_position['longitude'])
)


In [9]:
# Remove 'sog' and 'navstat' features
df_train = df_train.drop(columns=['sog', 'navstat'])

In [10]:
# Build (input row, target row) pairs for several look-ahead horizons.
# For every horizon n and every vessel, row i is paired with row i + n of the
# same vessel; the last n rows of a vessel have no target and are left out.
n_arr = [2**n for n in range(1, 11)]  # horizons: 2, 4, ..., 1024 rows ahead

X_list = []
y_list = []

# Rows without a vesselId belong to no group and are dropped. The stable sort
# puts vessels in key order while keeping each vessel's rows in their original
# order, i.e. the same row order a per-group apply would produce.
# NOTE(review): assumes each vessel's rows are already chronological — confirm.
vessel_rows = df_train.dropna(subset=['vesselId']).sort_values('vesselId', kind='stable')
by_vessel = vessel_rows.groupby('vesselId', sort=False)

# Number of rows that follow each row within its own vessel (0 for the last).
rows_after = by_vessel.cumcount(ascending=False)

for n in n_arr:
    # Only rows with at least n later rows in the same vessel have a target.
    has_target = rows_after >= n

    df_copy_x = vessel_rows.loc[has_target].reset_index(drop=True)

    # groupby.shift leaves out the grouping column, so vesselId is restored
    # from the inputs and the original column order is re-applied.
    df_copy_y = by_vessel.shift(-n).loc[has_target].reset_index(drop=True)
    df_copy_y['vesselId'] = df_copy_x['vesselId']
    df_copy_y = df_copy_y[vessel_rows.columns]

    # Calculate the time difference between target and input, in minutes
    df_copy_x['dt'] = (df_copy_y['time'] - df_copy_x['time']).dt.total_seconds() / 60

    X_list.append(df_copy_x)
    y_list.append(df_copy_y)

# Row k of X_tr is the input for row k of y_tr (shared RangeIndex).
X_tr = pd.concat(X_list, ignore_index=True)
y_tr = pd.concat(y_list, ignore_index=True)



In [11]:
# The input position is the vessel's last known position; rename it so it is
# not confused with the target 'latitude'/'longitude' columns.
last_position_names = {'longitude': 'last_longitude', 'latitude': 'last_latitude'}
X_tr = X_tr.rename(columns=last_position_names)

In [12]:
# Splitting the data into training and validation sets
#
# The last `split_index` rows of each vessel (in X_tr row order) are held out.
# One mask computed from X_tr is applied to both X_tr and y_tr, so inputs and
# targets cannot drift apart and no per-group apply is needed.
# NOTE(review): X_tr stacks all horizons, so the held-out rows come from the
# end of each vessel's stacked rows; the same target rows can still appear in
# the training part under other horizons — confirm this overlap is acceptable.
split_index = 10

assert X_tr.index.equals(y_tr.index), "X_tr and y_tr must be row-aligned"

# Position counted from the end of each vessel's rows (0 = last row).
rows_from_end = X_tr.groupby('vesselId').cumcount(ascending=False)
is_val = rows_from_end < split_index

# Copies, so later column drops do not operate on a view of X_tr / y_tr.
X_val = X_tr.loc[is_val].copy()
y_val = y_tr.loc[is_val, ['latitude', 'longitude']].copy()

# Stable sort by vessel keeps the training rows in vessel-major order with the
# within-vessel order unchanged.
train_order = X_tr.loc[~is_val, 'vesselId'].sort_values(kind='stable').index

X_tr = X_tr.loc[train_order].reset_index(drop=True)
y_tr = y_tr.loc[train_order, ['latitude', 'longitude']].reset_index(drop=True)

In [13]:
# Drop non-numeric variables
# Timestamp, identifiers and the raw ETA string are not fed to the model.
# Non-inplace drops avoid mutating X_val, which was taken as a slice of X_tr.
non_feature_cols = ['time', 'portId', 'vesselId', 'etaRaw']

X_tr = X_tr.drop(columns=non_feature_cols)
X_val = X_val.drop(columns=non_feature_cols)


In [14]:
# Fit the XGB model
# Hyperparameters are collected in one place so they are easy to tune.
xgb_params = {
    'n_estimators': 500,
    'max_depth': 7,
    'learning_rate': 0.05,
    'min_child_weight': 7,
    'verbosity': 2,
}
clf = xgb.XGBRegressor(**xgb_params)
# y_tr holds two target columns (latitude, longitude).
clf.fit(X_tr, y_tr)

In [15]:
# Fetching last value per feature per vessel in the training set.
# groupby.tail(1) returns the last row of each vessel as ordinary DataFrame
# rows, so column dtypes are kept and vesselId stays a regular column
# (building one row Series per group with apply does not guarantee either).
last_seen = df_train.groupby('vesselId').tail(1)

# Merge with the test set: test columns that clash get the '_new' suffix, so
# 'time_new' is the requested prediction time and 'time' the last observation.
X_te = df_test.merge(last_seen, on='vesselId', how='left', suffixes=('_new', ''))

# Add the time difference (minutes) as a new feature
X_te['dt'] = (X_te['time_new'] - X_te['time']).dt.total_seconds() / 60

# Rename latitude and longitude features to match the training inputs
X_te = X_te.rename(columns={'longitude': 'last_longitude', 'latitude': 'last_latitude'})


In [16]:
# Predict on the test set
# Select exactly the columns the model was fitted on, in the same order,
# instead of maintaining a separate hard-coded list of columns to drop.
feature_cols = list(X_tr.columns)
y_pred = clf.predict(X_te[feature_cols])

In [17]:
# Export results to CSV
# y_pred has one column per target, in the order (latitude, longitude).
prediction_cols = ['latitude_predicted', 'longitude_predicted']
results_df = pd.DataFrame(y_pred, columns=prediction_cols)
# Both frames carry a default RangeIndex, so IDs line up row by row.
results_df['ID'] = X_te['ID']
results_df.to_csv("results_linear_regression.csv", index=False)