In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Read the pipe-delimited training export and parse 'time' into timestamps.
train_df = pd.read_csv('ais_train.csv', sep='|')
train_df['time'] = pd.to_datetime(train_df['time'])

# Speed sanity check: only rows whose 'sog' is strictly below 25 survive.
# Rows with a missing 'sog' fail the comparison and are dropped as well.
train_df = train_df.loc[train_df['sog'].lt(25)]

# Fold navstat code 8 into code 0.
train_df = train_df.assign(navstat=train_df['navstat'].replace(8, 0))

# Discard rows whose status contradicts the reported speed:
#   * navstat 1 or 5 combined with any positive speed
#   * navstat 2 combined with a speed above 5
# NOTE(review): presumably these are the standard AIS navigational status
# codes (1/5 = stationary states, 2 = not under command) - confirm against
# the dataset description.
speed = train_df['sog']
status = train_df['navstat']
moving_while_stationary = status.isin([1, 5]) & speed.gt(0)
too_fast_for_status = status.eq(2) & speed.gt(5)
train_df = train_df.loc[~(moving_while_stationary | too_fast_for_status)]

# Remove rows that are exact duplicates of an earlier row.
train_df = train_df.drop_duplicates()

# Read the comma-delimited test file and parse 'time' into timestamps.
test_df = pd.read_csv('ais_test.csv', sep=',')
test_df['time'] = pd.to_datetime(test_df['time'])

# Vessels that appear in both files; only the training frame is narrowed to
# them, the test frame is left as loaded.
train_vessel_ids = set(train_df['vesselId'])
test_vessel_ids = set(test_df['vesselId'])
common_vessel_ids = train_vessel_ids & test_vessel_ids
train_df = train_df.loc[train_df['vesselId'].isin(common_vessel_ids)]

# Collects one record per test row; converted to the submission frame later.
submission_rows = []

# Row offsets for the lagged position features. A lag of k is the position
# k observations earlier for the same vessel (a row shift, not a fixed time
# span such as "k hours").
lags = [1]

# Vessels that could not be modelled because too few complete rows remained.
skipped_vessels = []

# Group once up front instead of re-scanning both frames for every vessel.
train_by_vessel = train_df.groupby('vesselId')
test_by_vessel = test_df.groupby('vesselId')

# Fit one latitude model and one longitude model per vessel, then predict
# every test row of that vessel.
for vessel_id in common_vessel_ids:
    vessel_data = (
        train_by_vessel.get_group(vessel_id)
        .sort_values('time')
        .set_index('time')
    )

    # Explicit copy: adding columns to a bare column selection is what
    # triggered pandas' SettingWithCopyWarning.
    features = vessel_data[['latitude', 'longitude', 'cog', 'sog', 'heading']].copy()

    # Append the lagged position columns.
    for lag in lags:
        for column in ['latitude', 'longitude']:
            features[f'{column}_lag{lag}'] = features[column].shift(lag)

    # Drop rows made incomplete by the shift (or incomplete to begin with).
    features = features.dropna()

    # train_test_split needs at least two rows to leave a non-empty training
    # part; with fewer it raises and would abort the whole run. Such vessels
    # get no rows in the submission.
    if len(features) < 2:
        skipped_vessels.append(vessel_id)
        continue

    # Read the targets from the cleaned frame itself. Re-aligning by index
    # label would multiply rows whenever a vessel has duplicate timestamps.
    # NOTE(review): 'latitude' and 'longitude' are both predictors and
    # targets here, so the models can simply echo their input; consider
    # training on the lagged columns only.
    target_lat = features['latitude']
    target_lon = features['longitude']

    # One split for both targets yields the same partition as two separate
    # calls with the same random_state. The 20% hold-out is not used.
    X_train, _, y_train_lat, _, y_train_lon, _ = train_test_split(
        features, target_lat, target_lon, test_size=0.2, random_state=42
    )

    rf_model_lat = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model_lon = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model_lat.fit(X_train, y_train_lat)
    rf_model_lon.fit(X_train, y_train_lon)

    vessel_test = test_by_vessel.get_group(vessel_id).sort_values('time')
    if vessel_test.empty:
        continue

    # Every query row starts from the vessel's latest complete training row;
    # cog/sog/heading are overridden only when the test file provides them.
    latest_known = features.iloc[-1].to_numpy(dtype=float)
    query = pd.DataFrame(
        np.tile(latest_known, (len(vessel_test), 1)),
        columns=features.columns,
    )
    for column in ('cog', 'sog', 'heading'):
        if column in vessel_test.columns:
            query[column] = vessel_test[column].to_numpy()

    # Predict all rows of the vessel in one call per model.
    predicted_lat = rf_model_lat.predict(query)
    predicted_lon = rf_model_lon.predict(query)

    submission_rows.extend(
        {
            'ID': row_id,
            'longitude_predicted': lon,
            'latitude_predicted': lat,
        }
        for row_id, lon, lat in zip(vessel_test['ID'], predicted_lon, predicted_lat)
    )

if skipped_vessels:
    print(f'Skipped {len(skipped_vessels)} vessel(s) with fewer than 2 usable training rows')

# Build the submission table with a fixed column layout so the CSV always
# carries its header row, even when no prediction was produced.
submission_df = pd.DataFrame(
    submission_rows,
    columns=['ID', 'longitude_predicted', 'latitude_predicted'],
)
submission_df.to_csv('submission.csv', index=False)

print(submission_df)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features.loc[:, f'{column}_lag{lag}'] = features[column].shift(lag)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features.loc[:, f'{column}_lag{lag}'] = features[column].shift(lag)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features.loc[:, f'{column}_lag{lag}'] = features[column].shift(lag)
A