In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, make_scorer, root_mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Training Data Preparation

In [None]:
# Read the challenge set (used for training below) and the submission set (predicted
# at the end of the notebook), then stack them so that every feature step is applied
# to both at once. `challenge_data_len` remembers how many leading rows came from the
# challenge set; later cells use it to split the stacked frame back apart by position.
challenge_data = pd.read_csv("../data/challenge_set.csv")
submission_data = pd.read_csv("../data/final_submission_set.csv")
challenge_data_len = len(challenge_data)
# ignore_index=True gives the stacked frame a fresh 0..n-1 index.
challenge_data = pd.concat([challenge_data, submission_data], axis=0, ignore_index=True)
print(f"{challenge_data_len=}, {challenge_data.shape[0]=}")

In [None]:
# Derived features, computed on the stacked challenge + submission frame.
# The month and hour are taken by character position from the text form of each value
# (presumably ISO-like 'YYYY-MM-DD' and 'YYYY-MM-DD?HH:...' strings - confirm against the data).
month_text = challenge_data['date'].map(str).str[5:7]
hour_text = challenge_data['actual_offblock_time'].map(str).str[11:13]
offblock_dt = pd.to_datetime(challenge_data['actual_offblock_time'])

challenge_data = challenge_data.assign(
    # Flown distance per unit of flight duration.
    speed=challenge_data['flown_distance'] / challenge_data['flight_duration'],
    # Month as an integer (numerical feature) and as text (categorical feature).
    month=month_text.astype('int64'),
    month1=month_text,
    # Hour of the off-block time ("time of day").
    tod=hour_text.astype('int64'),
    actual_offblock_time_dt=offblock_dt,
    day_of_week=offblock_dt.dt.day_name(),
)

preview_columns = [
    'date', 'adep', 'ades', 'actual_offblock_time', 'arrival_time', 'aircraft_type', 'wtc',
    'airline', 'flight_duration', 'taxiout_time', 'flown_distance', 'tow',
    'speed', 'month', 'tod', 'day_of_week',
]
challenge_data[preview_columns].head()

# Load additional features

## Add vertical_rate feature

In [None]:
# Vertical-rate features, joined onto the stacked frame by flight_id.
vertical_rate = pd.read_csv("../data/vertical_rate.csv")
# validate='many_to_one' raises pandas.errors.MergeError if the feature file repeats a
# flight_id. Without it, a left join would silently duplicate rows and shift the
# challenge/submission boundary that later cells recover by position via `challenge_data_len`.
challenge_data = challenge_data.merge(
    vertical_rate, on='flight_id', how='left', validate='many_to_one'
)
challenge_data.dtypes

## Add density_altitude feature

In [None]:
# Density-altitude features, joined onto the stacked frame by flight_id.
density_altitude = pd.read_csv("../data/density_altitude.csv")
# validate='many_to_one' raises pandas.errors.MergeError if the feature file repeats a
# flight_id. Without it, a left join would silently duplicate rows and shift the
# challenge/submission boundary that later cells recover by position via `challenge_data_len`.
challenge_data = challenge_data.merge(
    density_altitude, on='flight_id', how='left', validate='many_to_one'
)
challenge_data.dtypes

## Add additional features

In [None]:
# Additional features, joined onto the stacked frame by flight_id.
additional_features = pd.read_csv("../data/additional_features.csv")
# NOTE(review): the training feature list references `vertical_rate_*_x` and
# `vertical_rate_*_y`, which are pandas' default suffixes for overlapping column names.
# Presumably one of the merged files repeats the vertical_rate columns - confirm which.
#
# validate='many_to_one' raises pandas.errors.MergeError if the feature file repeats a
# flight_id. Without it, a left join would silently duplicate rows and shift the
# challenge/submission boundary that later cells recover by position via `challenge_data_len`.
challenge_data = challenge_data.merge(
    additional_features, on='flight_id', how='left', validate='many_to_one'
)
challenge_data.dtypes

## Airspeed and normalized vertical rate

In [None]:
# Airspeed and normalized vertical-rate features, joined onto the stacked frame by flight_id.
ias_norm_virt_rate = pd.read_csv("../data/ias_norm_virt_rate.csv")
# validate='many_to_one' raises pandas.errors.MergeError if the feature file repeats a
# flight_id. Without it, a left join would silently duplicate rows and shift the
# challenge/submission boundary that later cells recover by position via `challenge_data_len`.
challenge_data = challenge_data.merge(
    ias_norm_virt_rate, on='flight_id', how='left', validate='many_to_one'
)
challenge_data.columns

In [None]:
# Snapshot the fully merged frame to disk. Despite the file name it contains the
# challenge rows followed by the submission rows. Then show summary statistics.
combined_csv_path = "../data/combined_training_data.csv"
challenge_data.to_csv(combined_csv_path, index=False)
challenge_data.describe()

# Train

In [None]:
# Column groups consumed by the preprocessing step.
categorical_features = ['aircraft_type', 'adep', 'ades', 'wtc', 'month1', 'day_of_week', 'airline']

# Numerical columns: a few scalar columns followed by min/max/mean aggregates.
# Each aggregate entry is (base name, suffix) and expands to
# `<base>_min<suffix>`, `<base>_max<suffix>`, `<base>_mean<suffix>`.
# The `_x` / `_y` suffixes are presumably the ones pandas adds when merged frames
# share column names - confirm against the merge cells.
aggregate_stats = ('min', 'max', 'mean')
aggregate_columns = [
    ('density_altitude', ''),
    ('vertical_rate', '_x'),
    ('altitude', ''),
    ('groundspeed', ''),
    ('track', ''),
    ('vertical_rate', '_y'),
    # ('track_unwrapped', ''),  # currently excluded from the model
    ('u_component_of_wind', ''),
    ('v_component_of_wind', ''),
    ('temperature', ''),
    ('specific_humidity', ''),
    ('ias', ''),
    ('norm_vertical_rate', ''),
]
numerical_features = [
    'flight_duration', 'taxiout_time', 'flown_distance', 'speed', 'month', 'tod',
    *(
        f"{base}_{stat}{suffix}"
        for base, suffix in aggregate_columns
        for stat in aggregate_stats
    ),
]

# Fixed category vocabularies: one array per categorical column, in the same order
# as `categorical_features`. They are taken from the whole of `challenge_data`, which
# at this point holds both the challenge and the submission rows.
categories = [challenge_data[column].unique() for column in categorical_features]

# Numerical columns pass through untouched; categorical columns are one-hot encoded
# against the fixed vocabularies above.
one_hot_encoder = OneHotEncoder(categories=categories)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_features),
        ('cat', one_hot_encoder, categorical_features),
    ]
)

In [None]:
# Columns fed to the model (numerical first, then categorical) and the regression target.
features = [*numerical_features, *categorical_features]
target = 'tow'

# The stacked frame holds the challenge rows first and the submission rows after them,
# so the two sets are recovered by position at `challenge_data_len`.
X = challenge_data[features].iloc[:challenge_data_len]
y = challenge_data[target].iloc[:challenge_data_len]
# Fixed: this previously sliced `challenge_data_features`, a name that is never defined,
# so the cell raised NameError on a fresh kernel.
X_sub = challenge_data[features].iloc[challenge_data_len:]

# The positional split is only valid if no earlier step added or dropped rows.
assert len(X_sub) == len(submission_data), (
    f"expected {len(submission_data)} submission rows, got {len(X_sub)}; "
    "a merge may have duplicated rows"
)

print(f"{X.shape=}")

In [None]:
# Fit one model on a 90/10 split of the challenge data and report its RMSE.
def train(i, model, model_name):
    """Fit ``model`` behind the shared preprocessor and report train/test RMSE.

    Reads the notebook-level ``X``, ``y`` and ``preprocessor``. The data is split
    with ``test_size=0.1`` and ``random_state=42 + i``; the pipeline is fitted on
    the larger part and RMSE is printed for both parts.

    Args:
        i: Split index; offsets the split seed and prefixes the log lines.
        model: Estimator used as the final step of the pipeline.
        model_name: Label used in the log lines.

    Returns:
        Tuple ``(pipeline, rmse_test)``: the fitted pipeline and its RMSE on the
        held-out 10% part.
    """
    steps = [('preprocessor', preprocessor), ('model', model)]
    pipeline = Pipeline(steps=steps)

    # Hold out 10% of the rows for evaluation; the seed varies with the split index.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=42 + i
    )

    print(f"{i}. Training {model_name}...")
    pipeline.fit(X_train, y_train)

    # Train RMSE first, then test RMSE; the gap between them hints at over-fitting.
    rmse_train = root_mean_squared_error(y_train, pipeline.predict(X_train))
    rmse_test = root_mean_squared_error(y_test, pipeline.predict(X_test))

    print(f"{i}. Trained {model_name} RMSE (train): {rmse_train} RMSE (test): {rmse_test} RMSE diff {rmse_test - rmse_train}")

    return pipeline, rmse_test
    
# Each entry of `results` is (model_name, split_index, rmse_test, fitted_pipeline).
results = []
# Number of random train/test splits to train on.
splits = 1

# XGBoost settings, kept for the (currently disabled) XGBoost run.
selected_params_xgboost = dict(
    objective='reg:squarederror',
    n_estimators=2000,
    learning_rate=0.1,
    max_depth=6,
    min_child_weight=1,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=1.0,
    reg_alpha=0.01,
    reg_lambda=1.5,
    scale_pos_weight=1.5,
    n_jobs=-1,
    eval_metric='rmse',
)

# CatBoost settings used by the active run below.
catboost_params = dict(
    depth=10,
    od_wait=200,
    l2_leaf_reg=3,
    iterations=50000,  # 200000,
    model_size_reg=0.7,
    learning_rate=0.05,
    random_seed=42,
    # silent=True,
    verbose=False,
)

# Train one CatBoost model per split. The RandomForest and XGBoost runs are disabled;
# to re-enable one, build the model (e.g. RandomForestRegressor(n_estimators=1000,
# n_jobs=-1, random_state=1) or xgb.XGBRegressor(**selected_params_xgboost)), call
# train(i, model, <name>) and append the result in the same way.
for i in range(splits):
    model = CatBoostRegressor(**catboost_params)
    pipeline, rmse = train(i, model, 'CatBoost')
    results.append(('CatBoost', i, rmse, pipeline))

# For every model family, keep the run with the lowest test RMSE.
# (Disabled families: 'RandomForest', 'XGBoost'.)
best_models = {}
for model_name in ['CatBoost']:
    model_results = [entry for entry in results if entry[0] == model_name]
    best_result = min(model_results, key=lambda entry: entry[2])
    best_models[model_name] = best_result

# Pick the family whose best run has the lowest test RMSE overall.
best_model_name = min(best_models, key=lambda name: best_models[name][2])
_, best_split, best_rmse, best_model = best_models[best_model_name]

print(f"{best_model_name=} {best_rmse=} {best_split=}")

In [None]:
def predict(model, model_name=None):
    """Predict the target for the submission rows and print the predictions.

    Reads the notebook-level ``X_sub``.

    Args:
        model: Fitted estimator or pipeline exposing ``predict``.
        model_name: Label for the printed line. Defaults to the notebook-level
            ``best_model_name``. The previous version read a ``model_name``
            global that only existed because it leaked out of an earlier
            ``for`` loop, and which named the last model family iterated
            rather than the selected one.

    Returns:
        The predictions returned by ``model.predict(X_sub)``.
    """
    label = best_model_name if model_name is None else model_name
    predictions = model.predict(X_sub)
    print(f"Predictions for {label}:", predictions)
    return predictions

# Attach the best model's predictions to the submission rows and write the
# two-column submission file (flight_id, tow) without the index.
submission_path = "../submissions/team_exuberant_scooter_v7_5c26054c-2acb-4e7b-8bfa-61a0fe40cb6a.csv"
submission_data['tow'] = predict(best_model)
submission_columns = ['flight_id', 'tow']
submission_data[submission_columns].to_csv(submission_path, index=False)