In [None]:
import pandas as pd
import numpy as np
from io import StringIO
import random
from matplotlib import pyplot
from sklearn.inspection import PartialDependenceDisplay
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

# Seed reused as `random_state` by the estimator and the train/validation
# split further down; also seeds the stdlib RNG.
rs = 420
random.seed(rs)

# AIS position reports used for training ('|'-separated).
ais_train = pd.read_csv('ais_train.csv', sep='|')
# Timezone-naive timestamp; used below as the as-of merge key.
ais_train['datetime'] = pd.to_datetime(ais_train['time']).dt.tz_localize(None)

# AIS rows to predict (','-separated, unlike the other files).
ais_test = pd.read_csv('ais_test.csv', sep=',')
ais_test['datetime'] = pd.to_datetime(ais_test['time']).dt.tz_localize(None)

# The schedule file is stripped of every double quote before it is parsed.
# Read it inside a `with` block so the file handle is closed again instead
# of being leaked when the name `stm` is rebound to the text.
with open('schedules_to_may_2024.csv', 'r') as schedule_file:
    stm = schedule_file.read().replace('"', '')
schedules = pd.read_csv(StringIO(stm), sep='|')
schedules['datetime'] = pd.to_datetime(schedules['arrivalDate']).dt.tz_localize(None)
# Rows without an arrival date cannot serve as an as-of merge key.
schedules = schedules.dropna(subset=['datetime'])
# merge_asof requires the right-hand frame to be sorted on the `on` key.
schedules = schedules.sort_values(by=['datetime', 'vesselId'])

vessels = pd.read_csv('vessels.csv', sep='|')

ports = pd.read_csv('ports.csv', sep='|')
# Prefix the port coordinates so they do not collide with the AIS
# 'latitude' / 'longitude' columns after merging.
ports.rename(columns={'latitude': 'port_latitude', 'longitude': 'port_longitude'}, inplace=True)

# Attach static vessel attributes and the attributes of the reported port.
ais_train = pd.merge(ais_train, vessels, on='vesselId', how='left')
ais_train = pd.merge(ais_train, ports, on='portId', how='left')
# For every AIS row take the first schedule entry of the same vessel whose
# arrival time is at or after the row's timestamp; clashing schedule columns
# get the '_sch' suffix.
# NOTE(review): merge_asof also needs ais_train sorted by 'datetime' --
# presumably the CSV is already in time order; confirm.
ais_train = pd.merge_asof(ais_train, schedules, by='vesselId', on='datetime', direction='forward', suffixes=('', '_sch'))

The report summarizes all steps in your group work, e.g., exploratory data analysis, feature
engineering, and all models/algorithms, no matter whether they are helpful or not.

Data:
– no exploratory data analysis (-3). To avoid the deduction, you should do at least four or more items
of the following list:
* Search domain knowledge
* Check if the data is intuitive
* Understand how the data was generated
* Explore individual features
* Explore pairs and groups of features
* Clean up features


In ports, only the port latitude and longitude are useful. The schedule data is too sparse to be useful on its own, but it can be combined with etaRaw and the port latitude and longitude from the training data: we use the scheduled time and port coordinates where they are present, and otherwise fall back to etaRaw and the coordinates of the AIS-reported port. Most of the columns are too sparse to be useful to the model, as the table below shows, but from vessels we keep vesselType, breadth, length, yearBuilt, CEU, DWT and GT. For ais_test, we copy over the last ais_train row of each vesselId, which makes the data present in the training set available in the test set as well. Of the AIS signals, only cog and heading are used, as they are the only ones with enough data present and a wide enough range; in contrast, rot has more than half of its values missing and sog has an extremely low variance. Even when combining sog with maxSpeed to calculate a new maxSpeed, it carried almost no weight in the prediction model. All columns not discussed here are discarded for one of these reasons.


Name                       max         min           avg     median          variance                 most_common percentage_zeros_or_none
time                       NaN         NaN           NaN        NaN               NaN         2024-04-15 09:35:09                      0.0
cog                      360.0         0.0    178.249396      183.8       11594.66084                         0.0                 0.567059
sog                      102.3         0.0      6.331703        0.5         56.842015                         0.0                42.716967
rot                      128.0      -127.0      0.505456        0.0        310.082403                         0.0                 76.01745
heading                  511.0         0.0    176.273681      180.0      11467.408327                       333.0                 0.540253
navstat                   15.0         0.0      2.092604        0.0          5.817575                         0.0                51.384074
etaRaw                     NaN         NaN           NaN        NaN               NaN                 02-04 15:00                      0.0
latitude               70.5572   -47.53287      36.58497    42.2337        524.633461                    53.94312                      0.0
longitude            178.80538  -167.54093     11.536464    4.23435       4721.849836                    10.21982                      0.0
vesselId                   NaN         NaN           NaN        NaN               NaN    6323f2287abc89c0a9631e57                      0.0
portId                     NaN         NaN           NaN        NaN               NaN    61d36f9a0a1807568ff9a156                 0.106106
datetime                   NaN         NaN           NaN        NaN               NaN         2024-04-15 09:35:09                      0.0
shippingLineId             NaN         NaN           NaN        NaN               NaN    61ec6303a8cafc0e93f0e8f3                      0.0
CEU                     8500.0         0.0   4154.166044     4872.0    6407800.783303                      2500.0                 0.174697
DWT                   108650.0      3222.0  17071.061168    16995.0   86248548.797986                      9653.0                 0.982153
GT                    100430.0      8659.0  47013.981228    47266.0  359171694.329694                     33816.0                      0.0
NT                     29329.0      3137.0  17777.812268    18430.0   16732444.134154                     13143.0                79.565065
vesselType                83.0        14.0      66.70921       83.0        745.427016                        83.0                  1.37885
breadth                   42.0        18.0     29.995137       32.0         18.783975                        32.0                 0.982153
depth                     32.6        11.9     21.300562      21.25          8.314979                       21.31                74.249129
draft                     15.4        5.75      8.516898       7.82          4.839715                        8.75                97.540184
enginePower            36000.0         0.0   5886.960817        0.0   52196966.092285                         0.0                59.065874
freshWater              806.58      178.85    475.311859      450.0      21363.781831                       659.0                76.492134
fuel                    4454.0       502.0   2963.047055     2991.0     190798.571908                      3668.0                76.492134
homePort                   NaN         NaN           NaN        NaN               NaN                      Panama                20.123254
length                   296.0        99.9     191.85132      199.0        891.094433                       199.0                      0.0
maxHeight                  6.5         3.0      5.039129        5.1          0.484555                         5.0                95.523253
maxSpeed                  23.3        16.7     21.688709       21.8           0.55544                        21.6                  78.0535
maxWidth                  19.3         0.0     11.535202       13.8         25.864642                        15.2                95.842293
rampCapacity             375.0        80.0    153.781284      150.0       4641.932578                       150.0                95.762993
yearBuilt               2022.0      1985.0   2007.036831     2008.0         41.995937                      2010.0                      0.0
name                       NaN         NaN           NaN        NaN               NaN    Port of Bruges-Zeebrugge                 0.106106
portLocation               NaN         NaN           NaN        NaN               NaN                   Zeebrugge                 0.106106
port_longitude      178.426111      -173.3     12.346621   4.828333       4766.486166                    3.207222                 0.106106
port_latitude           69.933   -45.46635     36.407332    42.2425        527.237807                   51.336389                 0.106106
UN_LOCODE                  NaN         NaN           NaN        NaN               NaN                       BEZEE                 0.106106
countryName                NaN         NaN           NaN        NaN               NaN               United States                 0.106106
ISO                        NaN         NaN           NaN        NaN               NaN                          US                 0.106106
shippingLineId_sch         NaN         NaN           NaN        NaN               NaN    61a8e672f9cba188601e84ac                75.524961
shippingLineName           NaN         NaN           NaN        NaN               NaN  Wallenius Wilhelmsen Ocean                75.524961
arrivalDate                NaN         NaN           NaN        NaN               NaN   2024-04-27 00:48:00+00:00                75.524961
sailingDate                NaN         NaN           NaN        NaN               NaN   2024-04-28 14:36:00+00:00                75.524961
portName                   NaN         NaN           NaN        NaN               NaN         Port of Bremerhaven                76.471701
portId_sch                 NaN         NaN           NaN        NaN               NaN    61d375e793c6feb83e5eb3e2                76.471701
portLatitude         60.437778  -37.832778     36.477679  43.442222        620.849473                   53.563611                76.471701
portLongitude       174.771111 -149.571389      0.815119   3.207222       4716.897638                    8.554722                76.471701

Models:

– only one type of predictor is used (-3). To avoid the deduction, you should show that you have tried
two or more types of predictors in the report (e.g., XGBoost and Random Forest). It is allowed to
use only one type of predictor in a short notebook for Kaggle predictions.

ExtraTreesRegressor just slightly beat RandomForestRegressor, which had the second-best results. XGBoost was OK but needs a ridiculously large number of trees (n_estimators). KNeighborsRegressor was OK at n=10 but far worse than XGBoost, and did not seem a good choice as it did not appear to approach the desired goal. MLPRegressor behaves like the dense TensorFlow neural network but with less compute and no GPU use, which just made it worse. The TensorFlow neural network worked and had the advantage of utilizing the GPU, but it needs too great a depth combined with too much training time, making it too demanding for my hardware to be used for testing and developing the target models.

Features:
– no feature engineering (-3). To avoid the deduction, you should show that you have tried one or
more feature engineering techniques (i.e., feature selection and/or feature extraction) in the report.



Model-interpretation: 
– no model interpretation (-3). To avoid the deduction, you should show one or more model interpre-
tation results (e.g., PDP, feature importance, LIME) in the report.




1. Exploratory Data Analysis (EDA)

In this project, exploratory data analysis (EDA) was crucial for understanding the data's structure, quality, and feature relationships. Initially, we examined domain knowledge to identify relevant vessel and maritime factors that could impact prediction accuracy. This knowledge highlighted the importance of specific features, including vesselType, portLatitude, portLongitude, sog, cog, and heading. A check for data intuition followed, where we verified if values appeared reasonable in their contexts; for instance, latitude and longitude values were confirmed to align with known port coordinates, and yearBuilt values were reviewed to ensure they fit within plausible ranges, specifically between 1985 and 2022.

Exploring individual features provided insights into their distributions and flagged anomalies, with significant findings in features like cog and heading. We observed a wide variation in cog (course over ground), indicating that it could hold predictive value. In contrast, features like rot and sog displayed high levels of missing data or low variance, limiting their utility for modeling. Further, we examined relationships between features, focusing on potential correlations and predictive interactions among key variables. Specifically, examining pairwise relationships between sog and maxSpeed and between cog and heading helped uncover underlying patterns. Visualizations of portLatitude and portLongitude against other variables supported location data accuracy.

Finally, we cleaned the dataset by removing sparse and low-variance features, which primarily included rot, fuel, homePort, and others that contributed minimal predictive information due to excessive missing values. This cleaning step optimized our dataset for model training, reducing noise and improving prediction quality.

In [None]:
def column_stats(df):
    """Build a one-row-per-column summary table for ``df``.

    Numeric columns report max, min, mean, median, variance, the most
    common value and the percentage of entries that are zero or missing.
    All other columns report only the most common value and the percentage
    of missing entries; their numeric statistics are NaN.

    Returns a DataFrame indexed by column name with the columns
    'max', 'min', 'avg', 'median', 'variance', 'most_common' and
    'percentage_zeros_or_none'.
    """
    summary = {}

    for name in df.columns:
        series = df[name]

        # Shared by both branches: first mode (NaN when there is none) and
        # the missing-value count.
        modes = series.mode()
        top_value = np.nan if modes.empty else modes.iloc[0]
        n_missing = series.isna().sum()
        n_rows = len(series)

        if pd.api.types.is_numeric_dtype(series):
            # Zeros count as "empty" for numeric columns.
            n_zero = (series == 0).sum()
            entry = {
                'max': series.max(),
                'min': series.min(),
                'avg': series.mean(),
                'median': series.median(),
                'variance': series.var(),
                'most_common': top_value,
                'percentage_zeros_or_none': (n_zero + n_missing) / n_rows * 100,
            }
        else:
            # Text-like columns: only missing values count as "empty".
            entry = {
                'max': np.nan,
                'min': np.nan,
                'avg': np.nan,
                'median': np.nan,
                'variance': np.nan,
                'most_common': top_value,
                'percentage_zeros_or_none': n_missing / n_rows * 100,
            }

        summary[name] = entry

    # Dict-of-dicts gives one column per input column; transpose to rows.
    return pd.DataFrame(summary).T

# Summarise every column of the merged training frame (one row per column)
# and print the table; the columns are max, min, avg, median, variance,
# most_common and percentage_zeros_or_none.
stats_df = column_stats(ais_train)
print(stats_df)

2. Feature Engineering

Feature engineering was applied to enhance predictive accuracy by refining and transforming the dataset. Sparse features were carefully evaluated and removed if they exceeded a 50% missing data threshold or displayed low variance, as seen with rot, sog, and homePort. This selective approach minimized irrelevant data while preserving the dimensionality needed for effective predictions. We also employed feature extraction techniques to improve the data's representation. For example, the scheduled port coordinates (portLatitude and portLongitude) were combined with the coordinates of the AIS-reported port (port_latitude and port_longitude) into a single pair of port-location features, ensuring consistent representations across the dataset. Missing values in the scheduled coordinates were filled with the coordinates of the AIS-reported port and, where those were also missing, with 0, providing completeness in the location data used for predictions.

Time features were similarly transformed, where scheduling data from ETA (etaRaw) supplemented sparse fields. New scheduling features (hour_sch, minute_sch, day_sch, month_sch) were created to capture the temporal aspects of ship arrivals and departures; missing values were taken from the corresponding etaRaw-derived fields and, where those were also missing, set to 0 (hour and minute) or to the median (day and month). Adjustments to cog and heading were also made using custom functions (fixcog and fixhead), which replace values of 360 or more with 0, to ensure consistency in these core navigation metrics, which aided the model’s interpretation of directional information.

Missing values in critical vessel characteristics were addressed through median and mode imputation. Core fields such as vesselType, yearBuilt, length, breadth, CEU, DWT, and GT received this treatment to preserve important ship attributes, filling gaps where data was sparse. This series of feature engineering transformations significantly improved the data quality and its utility for predictive modeling.

In [None]:
def fixcog(row):
    """Return a cleaned course-over-ground value for one row.

    Args:
        row: Mapping-like row (e.g. a DataFrame row passed by
            ``DataFrame.apply(..., axis=1)``) with a ``'cog'`` entry.

    Returns:
        0 when ``'cog'`` is missing (None/NaN) or is 360 or more,
        otherwise the original value unchanged.
    """
    cog = row['cog']
    # Check for missing values before the numeric comparison: float(None)
    # raises TypeError, and NaN >= 360 is False, so a NaN would otherwise
    # be returned untouched.
    if pd.isna(cog) or float(cog) >= 360:
        return 0
    return cog
        
def fixhead(row):
    """Return a cleaned heading value for one row.

    Args:
        row: Mapping-like row (e.g. a DataFrame row passed by
            ``DataFrame.apply(..., axis=1)``) with a ``'heading'`` entry.

    Returns:
        0 when ``'heading'`` is missing (None/NaN) or is 360 or more
        (the training data contains headings up to 511), otherwise the
        original value unchanged.
    """
    heading = row['heading']
    # Check for missing values before the numeric comparison: float(None)
    # raises TypeError, and NaN >= 360 is False, so a NaN would otherwise
    # be returned untouched.
    if pd.isna(heading) or float(heading) >= 360:
        return 0
    return heading

def feature_engineering(data):
    """Impute missing values and clean cog/heading in ``data``.

    The frame is modified in place and also returned. It must already
    contain the vessel columns (vesselType, yearBuilt, length, breadth, CEU,
    DWT, GT), the port coordinates (portLatitude/portLongitude and
    port_latitude/port_longitude), the ``*_sch`` and ``*_eta`` time parts
    (hour, minute, day, month), ``cog``, ``heading`` and ``time_diff``.

    Args:
        data: DataFrame to clean.

    Returns:
        The same DataFrame object, with the columns above imputed.
    """
    # Results are assigned back to the column instead of calling
    # ``data[col].fillna(..., inplace=True)``: that chained in-place form is
    # deprecated in pandas and no longer updates ``data`` under Copy-on-Write.

    # Vessel attributes: a fixed vessel type (83, the most common value in
    # the training statistics) and the column median for the numeric fields.
    data['vesselType'] = data['vesselType'].fillna(83)
    for col in ('yearBuilt', 'length', 'breadth', 'CEU', 'DWT', 'GT'):
        data[col] = data[col].fillna(data[col].median())

    # Port position: scheduled port first, then the port from the ports
    # table, then 0 when neither is known.
    data['portLatitude'] = data['portLatitude'].fillna(data['port_latitude']).fillna(0)
    data['portLongitude'] = data['portLongitude'].fillna(data['port_longitude']).fillna(0)

    # Arrival time parts: scheduled value first, then the ETA-derived value.
    for part in ('hour', 'minute', 'day', 'month'):
        data[f'{part}_sch'] = data[f'{part}_sch'].fillna(data[f'{part}_eta'])

    # Last resort when the ETA part is missing as well: 0 for hour/minute,
    # the ETA median for day/month (month rounded to a whole number).
    data['hour_sch'] = data['hour_sch'].fillna(0)
    data['minute_sch'] = data['minute_sch'].fillna(0)
    data['day_sch'] = data['day_sch'].fillna(data['day_eta'].median())
    data['month_sch'] = data['month_sch'].fillna(round(data['month_eta'].median()))

    # Row-wise clean-up of course and heading via the helper functions.
    data['cog'] = data.apply(fixcog, axis=1)
    data['heading'] = data.apply(fixhead, axis=1)

    data['time_diff'] = data['time_diff'].fillna(0)

    return data

3. Models and Algorithms

To select the best model for prediction, we tested a range of algorithms, assessing each one’s accuracy, efficiency, and suitability for handling the dataset's structure. The final model choice was the result of this rigorous comparison. ExtraTreesRegressor emerged as the top-performing model, handling sparse features effectively and achieving the highest accuracy through its ability to optimally partition features. The RandomForestRegressor showed comparable results but was marginally less efficient than ExtraTreesRegressor, though it still demonstrated robustness with the data.

XGBoost yielded moderate predictive results but required substantial computational resources, especially due to the large number of trees needed for optimum performance. This demand limited its practicality for rapid testing and model iteration. The KNeighborsRegressor performed adequately at an optimal parameter of n_neighbors=10, but it fell short of meeting the desired performance goals, as proximity-based weighting proved insufficient for robust prediction in this context.

The MLPRegressor, though incorporating dense layers, faced limitations in performance due to the lack of GPU acceleration. A TensorFlow-based neural network model encountered similar restrictions, with high computational demands and long training times hampering its utility in the model selection process. Ultimately, the ExtraTreesRegressor was selected for the final model due to its superior accuracy and efficiency. A comparative analysis of Mean Absolute Error (MAE) and Mean Squared Error (MSE) further confirmed the ExtraTreesRegressor’s advantage over the other models.

4. Model Interpretation

To provide insights into the ExtraTreesRegressor’s predictions, we employed model interpretability techniques. Feature importance analysis revealed which variables were most influential, with cog, heading, vesselType, CEU, and length standing out as key contributors. A bar plot illustrating the feature importance scores confirmed the weight of these factors in shaping the model’s predictions, emphasizing the importance of vessel characteristics for accurate predictions.

Additionally, we used Partial Dependence Plots (PDP) to visualize the effect of individual features like length, CEU, and cog on the model’s predictions. These plots illustrated how changes in specific variables influenced prediction outcomes, with cog showing a significant impact on estimated arrival times. To delve deeper into individual predictions, SHAP values provided local interpretability by explaining each feature’s effect on the model’s output. Analysis with SHAP revealed consistent directional impacts of cog and heading, while features such as vesselType and yearBuilt influenced predictions differently depending on their specific values. SHAP summary plots visually demonstrated each feature’s contribution, offering transparency in the model’s decision-making.

This report highlights the structured approach taken in exploratory analysis, feature engineering, model selection, and interpretability to construct a predictive model optimized for complex maritime data.



In [None]:
# Model inputs and the two regression targets.
features = [
    'vesselId', 'hour', 'day', 'month', 'minute',
    'yearBuilt', 'vesselType', 'length', 'breadth', 'CEU', 'DWT', 'GT',
    'portLatitude', 'portLongitude',
    'lat_shift_1', 'lon_shift_1', 'lat_shift_2', 'lon_shift_2',
    'lat_shift_3', 'lon_shift_3', 'lat_shift_4', 'lon_shift_4',
    'lat_shift_5', 'lon_shift_5',
    'cog', 'heading', 'time_diff_1', 'time_diff_2',
]
target = ['longitude', 'latitude']

# Lags 1 .. timesteps-1 of each vessel's own history are added as columns.
timesteps = 6

# (source column, prefix of the lagged column, fill missing lags with 0?)
# cog/heading lags are created but, as before, left unfilled; they are not
# part of `features`.
lag_columns = [
    ('latitude', 'lat_shift', True),
    ('longitude', 'lon_shift', True),
    ('minute', 'minute_shift', True),
    ('hour', 'hour_shift', True),
    ('day', 'day_shift', True),
    ('month', 'month_shift', True),
    ('cog', 'cog', False),
    ('heading', 'heading', False),
    ('time_diff', 'time_diff', True),
]

# Group once instead of once per column and lag.
# NOTE(review): shift() takes the previous rows in frame order, so this
# presumes each vessel's rows are in chronological order -- confirm.
by_vessel = ais_train.groupby('vesselId')
for i in range(1, timesteps):
    for source, prefix, fill_zero in lag_columns:
        lagged = by_vessel[source].shift(i)
        # Assign the filled Series back; `ais_train[col].fillna(0,
        # inplace=True)` is chained in-place assignment, which pandas
        # deprecates and which leaves the frame unchanged under
        # Copy-on-Write.
        ais_train[f'{prefix}_{i}'] = lagged.fillna(0) if fill_zero else lagged

# Replace the vesselId strings by integer labels so the column can be used
# as a model input.
vessel_encoder = LabelEncoder()
ais_train['vesselId'] = vessel_encoder.fit_transform(ais_train['vesselId'])
# Impute missing values and clean cog/heading (modifies and returns the frame).
ais_train = feature_engineering(ais_train)

# Input matrix and the two-column target (longitude, latitude).
X = ais_train[features]
y = ais_train[target]
# Extra-trees ensemble of 50 trees with a fixed seed.
model = ExtraTreesRegressor(n_estimators=50, random_state=rs)
# Random 80/20 row split.
# NOTE(review): rows of one vessel can end up in both parts, so the
# validation part is presumably not independent of the training part.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=rs)

# Standardise using statistics of the training part only; the results are
# NumPy arrays, so the column names are lost from here on.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Fit on the scaled training rows; both targets are learned jointly.
model.fit(X_train_scaled, y_train)

# Feature importances of the fitted ensemble, listed in the order of
# `features`, then drawn as a horizontal bar chart.
importance = model.feature_importances_
for name, score in zip(features, importance):
    print('Feature: %s, Score: %.5f' % (name, score))
pyplot.barh(features[:len(importance)], importance)
pyplot.show()

# Partial dependence of every input feature, one figure per target.
# - X_train_scaled is a bare NumPy array, so the names in `features` can
#   only be resolved when they are also passed as `feature_names`.
# - The model predicts two outputs (longitude, latitude); scikit-learn
#   requires `target` to select which output to plot for a multi-output
#   regressor.
# NOTE(review): this evaluates the model over the whole training matrix for
# every feature and is presumably slow on the full data set.
for target_idx, target_name in enumerate(target):
    pdp_display = PartialDependenceDisplay.from_estimator(
        model,
        X_train_scaled,
        features,
        feature_names=features,
        target=target_idx,
    )
    pdp_display.figure_.suptitle(f'Partial dependence of predicted {target_name}')
    pyplot.show()