In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
import datetime
import random
# One seed for Python's RNG; the same value is passed as random_state below.
rs = 420
random.seed(rs)

# Print a wall-clock stamp (hour, minute, day of month) when the run starts.
print(datetime.datetime.now().strftime("%HH:%MM %dd"))

# Encoder that maps vessel identifiers to integer codes.
vessel_encoder = LabelEncoder()

# Model input columns.
# An earlier variant used the raw AIS fields instead:
# cog, sog, rot, heading, navstat, latitude, longitude, vesselId, portId,
# hour, day, month, year, minute.
features = [
    'vesselId',
    'hour',
    'day',
    'month',
    'year',
    'minute',
    'yearBuilt',
    'length',
    'vesselType',
    'breadth',
    'CEU',
    'DWT',
    'GT',
    'portLatitude',
    'portLongitude',
    'sch_hour',
    'sch_minute',
]

# Regression targets: the position to predict, latitude first.
target = ['latitude', 'longitude']

def feature_engineering(data):
    """Fill missing values in the vessel/port/schedule columns of *data*.

    The frame is modified in place and also returned, so both
    ``feature_engineering(df)`` and ``df = feature_engineering(df)`` work.

    Fill rules:
      * ``vesselType``                      -> 83
      * ``yearBuilt``, ``length``, ``breadth``,
        ``CEU``, ``DWT``, ``GT``            -> that column's median
      * ``portLatitude``, ``portLongitude``,
        ``sch_hour``, ``sch_minute``        -> 0

    Args:
        data: DataFrame containing all of the columns listed above.

    Returns:
        The same DataFrame object, with the missing values filled.

    Raises:
        KeyError: if any of the expected columns is absent.
    """
    # 83 is the most frequent vesselType according to the original author's
    # note. `data['vesselType'].mode()` could not be passed directly because
    # it returns a Series, not a scalar (`.mode()[0]` would be needed).
    constant_fills = {
        'vesselType': 83,
        'portLatitude': 0,
        'portLongitude': 0,
        'sch_hour': 0,
        'sch_minute': 0,
    }
    median_columns = ['yearBuilt', 'length', 'breadth', 'CEU', 'DWT', 'GT']

    # Assign the filled column back instead of calling
    # `data[col].fillna(..., inplace=True)`: the chained in-place form acts on
    # a temporary object and does not update `data` under pandas
    # Copy-on-Write (and emits a FutureWarning on pandas 2.x).
    for column, value in constant_fills.items():
        data[column] = data[column].fillna(value)
    for column in median_columns:
        data[column] = data[column].fillna(data[column].median())
    return data

# Handle missing values (if any)
#ais_train = ais_train.dropna(subset=features + target) #ais_train.replace(to_replace='None', value=np.nan).dropna()


# ---------------------------------------------------------------------------
# Load the training features.
# The features are spread over x_data_0.csv .. x_data_16.csv and joined side
# by side on the row index (the printed tails suggest one column per file).
# The per-file frames are released right after the join so that they do not
# stay in memory next to the combined frame.
# ---------------------------------------------------------------------------
train_frames = []
for file_no in range(17):
    frame = pd.read_csv(f'x_data_{file_no}.csv', sep=',')
    if file_no < 3:
        # Sanity-print the tail of the first three files (labels p1..p3).
        print(f"p{file_no + 1}:", frame.tail())
    train_frames.append(frame)
X = train_frames[0].join(train_frames[1:])
del train_frames, frame

# Load the targets: latitude first, then longitude. This column order is the
# order in which the fitted model returns its two outputs.
y_lat = pd.read_csv('y_data_1.csv', sep=',')
print("p4:", y_lat.tail())
y_lon = pd.read_csv('y_data_2.csv', sep=',')
print("p5:", y_lon.tail())
y = y_lat.join(y_lon)
del y_lat, y_lon

print("p6:", X.tail())
print("p7:", y.tail())

# Encode vessel ids as integers and fill the missing values.
X['vesselId'] = vessel_encoder.fit_transform(X['vesselId'])
X = feature_engineering(X)
model = RandomForestRegressor(n_estimators=1, random_state=rs)

# Train/validation split (80/20).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=rs)

# Scale the features; the scaler is fitted on the training part only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Train a RandomForest model
model.fit(X_train_scaled, y_train)

# Predict on validation set
y_pred = model.predict(X_val_scaled)

# Evaluate the model. The first value is the mean *squared* error (it was
# previously printed under the label "Mean Absolute Error").
mse = mean_squared_error(y_val, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {np.sqrt(mse)}')
print(f'R2-score: {r2_score(y_val, y_pred)}')
print(f'Variance Score: {explained_variance_score(y_val, y_pred)}')

# ---------------------------------------------------------------------------
# Load the test features and predict.
# ---------------------------------------------------------------------------
test_files = ['x_test.csv', 'x_test_1.csv', 'x_test_2.csv', 'x_test_3.csv']
test_frames = [pd.read_csv(path, sep=',') for path in test_files]
X_test = test_frames[0].join(test_frames[1:])
del test_frames

# NOTE(review): transform() raises ValueError for vessel ids that were not
# present in the training data.
X_test['vesselId'] = vessel_encoder.transform(X_test['vesselId'])

# Use exactly the training columns, in the training order: the scaler and the
# model identify features by position.
# NOTE(review): the test frame is not passed through feature_engineering();
# confirm that the test files contain no missing values.
X_test = X_test[X_train.columns]

# Scale the test set
X_test_scaled = scaler.transform(X_test)

# Predict on the test set
test_predictions = model.predict(X_test_scaled)

# Save the predictions. The model outputs follow the target column order
# (latitude, longitude), so label them that way first; then restore the
# longitude-first column order that the output file has always had.
predictions_df = pd.DataFrame(test_predictions, columns=['latitude_predicted', 'longitude_predicted'])
predictions_df = predictions_df[['longitude_predicted', 'latitude_predicted']]
predictions_df.to_csv('ais_test_predictions.csv', index=True)
print(datetime.datetime.today().strftime("%HH:%MM %dd"))

14H:17M 28d
p1:          minute
1666782      59
1666783      59
1666784      59
1666785      59
1666786      59
p2:          portLongitude
1666782     -81.496667
1666783     -76.558889
1666784     -79.927500
1666785       0.000000
1666786       0.000000
p3:          vesselId
1666782       459
1666783       459
1666784       459
1666785       596
1666786       637
p4:          latitude
1666782  49.71372
1666783  49.71372
1666784  49.71372
1666785  38.27895
1666786  38.98635
p5:          longitude
1666782   -5.22042
1666783   -5.22042
1666784   -5.22042
1666785   10.78280
1666786  -75.13275
p6:          minute  portLongitude  vesselId  breadth     GT  length  month  \
1666782      59     -81.496667       459     38.0  72700  199.96      5   
1666783      59     -76.558889       459     38.0  72700  199.96      5   
1666784      59     -79.927500       459     38.0  72700  199.96      5   
1666785      59       0.000000       596     27.0  25995  186.00      5   
1666786      59       0.0

MemoryError: could not allocate 134217728 bytes