# In [None]:
import pandas as pd
import xgboost as xgb
import numpy as np

# Load the raw data. The training file is pipe-delimited, the test file is
# comma-delimited.
train_df = pd.read_csv('ais_train.csv', sep='|')
test_df = pd.read_csv('ais_test.csv', sep=',')

# Regression targets: one independent model is fitted per coordinate.
y_lon = train_df['longitude']
y_lat = train_df['latitude']

# Training features are every column except the two targets and the row
# identifier; errors='ignore' tolerates a file that lacks one of them.
X_train = train_df.drop(
    columns=['longitude', 'latitude', 'ID'], errors='ignore'
)

# Keep the test IDs for the submission file, then drop them from the features.
test_ids = test_df['ID']
X_test = test_df.drop(columns=['ID'], errors='ignore')

# Remember where the training rows end *before* X_train is rebound below, so
# the split does not depend on the order of the two slicing statements.
n_train = len(X_train)

# Encode train and test together so both end up with the same dummy columns.
# A column present in only one of the two frames is NaN-filled for the other
# by the concat.
combined = pd.concat([X_train, X_test], axis=0, ignore_index=True)

# One-hot encode string/categorical columns; numeric columns pass through.
# NOTE(review): any high-cardinality string column (e.g. timestamps or vessel
# identifiers, if present) becomes one dummy column per distinct value --
# confirm this is intended for these files.
combined = pd.get_dummies(combined)

# Split the encoded frame back into its train and test parts.
X_train = combined.iloc[:n_train, :]
X_test = combined.iloc[n_train:, :]

# Fit one default-parameter XGBoost regressor per target coordinate.
model_lat = xgb.XGBRegressor()
model_lat.fit(X_train, y_lat)

model_lon = xgb.XGBRegressor()
model_lon.fit(X_train, y_lon)

# Predict both coordinates for the test rows.
predicted_lat = model_lat.predict(X_test)
predicted_lon = model_lon.predict(X_test)

# Build the submission column-wise instead of appending one dict per row.
# The predictions are widened to float64 so the values written to the CSV
# match what building the frame from per-row Python dicts would produce
# (presumably XGBoost returns float32 here -- the cast is a no-op otherwise).
submission_df = pd.DataFrame({
    'ID': test_ids.to_numpy(),
    'longitude_predicted': np.asarray(predicted_lon, dtype=np.float64),
    'latitude_predicted': np.asarray(predicted_lat, dtype=np.float64),
})

# Write the submission without the DataFrame index column.
submission_df.to_csv('submission.csv', index=False)


# In [None]:
# Summary statistics of the submission frame, as a quick sanity check on the
# predicted coordinates. In a notebook the resulting table is shown as the
# cell output; nothing is stored or written.
submission_df.describe()