In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
import datetime
from io import StringIO
import random
# Disabled notebook shell command (would install xgboost into the running interpreter):
#!{sys.executable} -m pip install xgboost

# Single seed shared by Python's `random` module and by every `random_state`
# argument further down in the script.
rs = 420
random.seed(rs)

# Print a wall-clock stamp for the start of the run. The format emits the hour,
# minute and day-of-month, each followed by a literal unit letter (H, M, d).
run_started = datetime.datetime.now()
print(run_started.strftime("%HH:%MM %dd"))
# Load datasets

# Names of the model input columns. The same list is later used to select and
# order the test-set columns.
featuers = [
    'vesselId', 'hour', 'day', 'month', 'minute',
    'vesselType', 'yearBuilt', 'length', 'breadth', 'CEU', 'DWT', 'GT',
    'portLatitude', 'portLongitude',
    'hour_sch', 'day_sch', 'month_sch', 'minute_sch',
    'lat_shift_1', 'lon_shift_1', 'minute_shift_1', 'hour_shift_1',
    'day_shift_1', 'month_shift_1', 'cog_1', 'heading_1',
    'lat_shift_2', 'lon_shift_2', 'minute_shift_2', 'hour_shift_2',
    'day_shift_2', 'month_shift_2', 'cog_2', 'heading_2',
    'lat_shift_3', 'lon_shift_3', 'minute_shift_3', 'hour_shift_3',
    'day_shift_3', 'month_shift_3', 'cog_3', 'heading_3',
    'cog', 'heading',
]
vessel_encoder = LabelEncoder()

# The training inputs are stored as x_data_0.csv ... x_data_<n-1>.csv, where n
# is the number of feature names above.
# NOTE(review): presumably file i holds the column featuers[i] -- confirm
# against the code that wrote these files.
feature_frames = [
    pd.read_csv(f'x_data_{i}.csv', sep=',') for i in range(len(featuers))
]

# Stitch the parts together column-wise; DataFrame.join aligns on the row index.
X = feature_frames[0].join(feature_frames[1:])

# The targets come in two files; the columns of y_2 are placed before those of y_1.
y_1 = pd.read_csv('y_data_1.csv', sep=',')
y_2 = pd.read_csv('y_data_2.csv', sep=',')
y = y_2.join(y_1)

# Quick visual sanity check of the last rows of inputs and targets.
print("p6:", X.tail())
print("p7:", y.tail())

# Replace vessel identifiers with integer codes. The fitted encoder is reused
# for the test set later on.
X['vesselId'] = vessel_encoder.fit_transform(X['vesselId'])

# Train-test split: hold out 20% of the rows for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=rs
)

# Scale the features: the scaling statistics are learned from the training
# split only and then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Train a RandomForest model (fit() returns the estimator itself, so the
# construction and the training can be chained).
model = RandomForestRegressor(n_estimators=180, random_state=rs).fit(
    X_train_scaled, y_train
)

# Predict on validation set
y_pred = model.predict(X_val_scaled)

# Evaluate the model on the held-out validation split.
# `mse` holds the mean *squared* error, so it is labelled as such (it was
# previously printed as "Mean Absolute Error"); the RMSE is derived from it.
mse = mean_squared_error(y_val, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {np.sqrt(mse)}')
print(f'R2-score: {r2_score(y_val, y_pred)}')
print(f'Variance Score: {explained_variance_score(y_val, y_pred)}')

# Load the test set, which is stored in five part files, and join the parts
# column-wise on the row index.
test_files = [
    'x_test.csv',
    'x_test_1.csv',
    'x_test_2.csv',
    'x_test_3.csv',
    'x_test_4.csv',
]
test_parts = [pd.read_csv(path, sep=',') for path in test_files]
X_test = test_parts[0].join(test_parts[1:])

# Use the same features as training, in the order given by `featuers`.
# The explicit .copy() makes X_test an independent frame, so the column
# assignment below does not trigger pandas' chained-assignment warning.
X_test = X_test[featuers].copy()

# Encode vessel ids with the encoder fitted on the training data.
# NOTE(review): LabelEncoder.transform raises ValueError for labels it did not
# see during fit -- confirm every test vessel also occurs in the training data.
X_test['vesselId'] = vessel_encoder.transform(X_test['vesselId'])

# Scale the test set with the statistics learned from the training split.
X_test_scaled = scaler.transform(X_test)

# Predict on the test set
test_predictions = model.predict(X_test_scaled)

# Save the predictions. The row index is written as the first CSV column.
# NOTE(review): the column names assume the model's first output is longitude
# and the second latitude, i.e. that y_data_2.csv holds longitude and
# y_data_1.csv latitude -- confirm.
predictions_df = pd.DataFrame(
    test_predictions,
    columns=['longitude_predicted', 'latitude_predicted'],
)
predictions_df.to_csv('ais_test_predictions_2.csv', index=True)

# Print a wall-clock stamp for the end of the run.
print(datetime.datetime.today().strftime("%HH:%MM %dd"))