In [None]:
!pip install scikit-learn xgboost  pandas catboost


In [None]:
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from catboost import CatBoostRegressor
import numpy as np
import joblib

In [None]:
# Read both CSV files from the working directory: the challenge set is the
# training data, the submission set is loaded alongside it.
challenge_data, submission_data = (
    pd.read_csv(csv_file)
    for csv_file in ("challenge_set.csv", "submission_set.csv")
)

In [None]:
# RMSE wrapped as a scikit-learn scorer. greater_is_better=False declares it a
# loss, so the scorer object reports the negated RMSE.
rmse_scorer = make_scorer(
    lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)),
    greater_is_better=False,
)

# Column to predict and the input columns used for training and validation
target = 'tow'
features = ['flight_duration', 'taxiout_time', 'flown_distance', 'aircraft_type']

# Labels and inputs both come from the challenge set
y = challenge_data[target]
X = challenge_data[features]

# The submission set contributes input columns only
X_submission = submission_data[features]

# Seeded random split: 90% of the rows go to the *_full variables and the
# remaining 10% (*_tune) are the portion set aside for hyperparameter tuning.
X_train_full, X_train_tune, y_train_full, y_train_tune = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Define preprocessing for categorical and numerical features
categorical_features = ['aircraft_type']
numerical_features = ['flight_duration', 'taxiout_time', 'flown_distance']

# Numerical columns are passed through unchanged; the categorical column is
# one-hot encoded.
# handle_unknown='ignore' makes the encoder emit an all-zero row for a category
# it did not see during fit. With the default ('error'), predicting on data that
# contains an aircraft type missing from the training split raises a ValueError.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)


In [None]:
# Create one pipeline per model family. Each pipeline runs the shared
# `preprocessor` and then the regressor.
# NOTE: all three pipelines reference the same `preprocessor` object, so fitting
# any of them refits that shared transformer.
#
# Every model gets an explicit seed (42, the same value used for the train/tune
# split) so that a Restart & Run All reproduces the same predictions; an
# unseeded RandomForestRegressor gives different results on every run.
rf_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', RandomForestRegressor(random_state=42))])

xgb_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('model', xgb.XGBRegressor(objective='reg:squarederror',
                                                          random_state=42))])

catboost_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                    ('model', CatBoostRegressor(silent=True,
                                                                random_seed=42))])

In [None]:
# Train a model on the given training data and predict on the submission set
def evaluate_and_predict(model, X_train, y_train, X_submission, model_name):
    """Fit ``model`` and return its predictions for ``X_submission``.

    Despite the name, no evaluation metric is computed here: the model is
    fitted, the submission predictions are printed, and then returned.

    Args:
        model: Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted
            in place.
        X_train: Training inputs passed to ``model.fit``.
        y_train: Training targets passed to ``model.fit``.
        X_submission: Inputs passed to ``model.predict``.
        model_name: Label used in the printed message.

    Returns:
        Whatever ``model.predict(X_submission)`` returns.
    """
    model.fit(X_train, y_train)
    submission_predictions = model.predict(X_submission)
    print(f"Predictions for {model_name}:", submission_predictions)
    return submission_predictions

# Fit each pipeline on the 90% training split and collect its predictions for
# the submission set (RandomForest, then XGBoost, then CatBoost).
rf_predictions = evaluate_and_predict(
    model=rf_pipeline,
    X_train=X_train_full,
    y_train=y_train_full,
    X_submission=X_submission,
    model_name='RandomForest',
)
xgb_predictions = evaluate_and_predict(
    model=xgb_pipeline,
    X_train=X_train_full,
    y_train=y_train_full,
    X_submission=X_submission,
    model_name='XGBoost',
)
catboost_predictions = evaluate_and_predict(
    model=catboost_pipeline,
    X_train=X_train_full,
    y_train=y_train_full,
    X_submission=X_submission,
    model_name='CatBoost',
)

In [None]:
# Save predictions to CSV files, one file per model.
#
# Each output frame is built from the flight_id column plus that model's
# predictions, so `submission_data` itself is never modified. Overwriting
# submission_data["tow"] in place would leave the frame holding whichever
# model was written last and makes re-running cells order-dependent.
# index=False keeps the row index out of the files.
flight_ids = submission_data[["flight_id"]]

flight_ids.assign(tow=catboost_predictions).to_csv("submission_catboost.csv", index=False)
flight_ids.assign(tow=xgb_predictions).to_csv("submission_xgb.csv", index=False)
flight_ids.assign(tow=rf_predictions).to_csv("submission_rf.csv", index=False)

In [None]:
# Persist the fitted pipelines (preprocessing + model) that produced the
# submission files. The names previously used here (best_rf, best_xgb,
# best_catboost) are not defined anywhere in the visible notebook, so on a
# fresh kernel this cell failed with a NameError.
joblib.dump(rf_pipeline, "model_rf.pkl")
joblib.dump(xgb_pipeline, "model_xgb.pkl")
joblib.dump(catboost_pipeline, "model_catboost.pkl")