In [1]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import shap
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import f1_score
from sklearn.metrics import mean_squared_error, make_scorer, root_mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures

# Training Data Preparation

In [24]:
# Read the challenge set and the submission set from ../data.
challenge_data = pd.read_csv("../data/challenge_set.csv")
submission_data = pd.read_csv("../data/submission_set.csv")

# Remember how many rows came from the challenge set: later cells split the
# stacked frame back into its two parts by row position.
challenge_data_len = len(challenge_data)

# Stack the submission rows under the challenge rows with a fresh 0..n-1 index,
# so feature engineering runs once over both sets.
challenge_data = pd.concat([challenge_data, submission_data], ignore_index=True)
print(f"{challenge_data_len=}, {challenge_data.shape[0]=}")

challenge_data_len=369013, challenge_data.shape[0]=474972


In [25]:
# Derived features
# Ratio of distance flown to flight duration.
challenge_data['speed'] = challenge_data['flown_distance'].div(challenge_data['flight_duration'])

# Characters 5-6 of the date string hold the month; keep it both as an integer
# ('month') and as the raw two-character string ('month1', used as a category).
challenge_data['month'] = challenge_data['date'].astype(str).str[5:7].astype('int64')
challenge_data['month1'] = challenge_data['date'].astype(str).str[5:7]

# Characters 11-12 of the off-block timestamp string hold the hour.
challenge_data['tod'] = challenge_data['actual_offblock_time'].astype(str).str[11:13].astype('int64')

# Parsed off-block timestamp and the weekday name derived from it.
challenge_data['actual_offblock_time_dt'] = pd.to_datetime(challenge_data['actual_offblock_time'])
challenge_data['day_of_week'] = challenge_data['actual_offblock_time_dt'].dt.day_name()

# Preview the original columns of interest next to the derived ones.
challenge_data[[
    'date', 'adep', 'ades', 'actual_offblock_time', 'arrival_time',
    'aircraft_type', 'wtc', 'airline',
    'flight_duration', 'taxiout_time', 'flown_distance', 'tow',
    'speed', 'month', 'tod', 'day_of_week',
]].head()

Unnamed: 0,date,adep,ades,actual_offblock_time,arrival_time,aircraft_type,wtc,airline,flight_duration,taxiout_time,flown_distance,tow,speed,month,tod,day_of_week
0,2022-01-01,EGLL,EICK,2022-01-01T13:46:00Z,2022-01-01T15:04:56Z,A320,M,a73f82288988b79be490c6322f4c32ed,61,18,321,54748.0,5.262295,1,13,Saturday
1,2022-01-01,LEBL,KMIA,2022-01-01T09:55:00Z,2022-01-01T19:37:56Z,B772,H,5543e4dc327359ffaf5b9c0e6faaf0e1,570,13,4193,185441.0,7.35614,1,9,Saturday
2,2022-01-01,ESSA,KORD,2022-01-01T09:39:00Z,2022-01-01T19:08:13Z,A333,H,8be5c854fd664bcb97fb543339f74770,554,15,3770,230396.0,6.805054,1,9,Saturday
3,2022-01-01,LSZH,KPHL,2022-01-01T11:04:00Z,2022-01-01T19:32:13Z,B788,H,5543e4dc327359ffaf5b9c0e6faaf0e1,497,11,3607,157615.0,7.257545,1,11,Saturday
4,2022-01-01,EIDW,EGLL,2022-01-01T12:36:00Z,2022-01-01T13:44:32Z,A21N,M,a73f82288988b79be490c6322f4c32ed,55,14,305,70318.447226,5.545455,1,12,Saturday


# Load additional features

## Add vertical_rate feature

In [26]:
# Vertical-rate feature columns, joined onto the flights by flight_id.
vertical_rate = pd.read_csv("../data/vertical_rate.csv")

# Left join keeps every flight even when the feature file has no row for it.
# validate='many_to_one' makes pandas raise MergeError if a flight_id is repeated
# in the feature file: a silent row fan-out would shift the positional boundary
# (challenge_data_len) that later separates challenge rows from submission rows.
challenge_data = challenge_data.merge(
    vertical_rate, on='flight_id', how='left', validate='many_to_one'
)
challenge_data.dtypes

flight_id                                int64
date                                    object
callsign                                object
adep                                    object
name_adep                               object
country_code_adep                       object
ades                                    object
name_ades                               object
country_code_ades                       object
actual_offblock_time                    object
arrival_time                            object
aircraft_type                           object
wtc                                     object
airline                                 object
flight_duration                          int64
taxiout_time                             int64
flown_distance                           int64
tow                                    float64
speed                                  float64
month                                    int64
month1                                  object
tod          

## Add density_altitude feature

In [27]:
# Density-altitude feature columns, joined onto the flights by flight_id.
density_altitude = pd.read_csv("../data/density_altitude.csv")

# Left join keeps every flight even when the feature file has no row for it.
# validate='many_to_one' makes pandas raise MergeError if a flight_id is repeated
# in the feature file: a silent row fan-out would shift the positional boundary
# (challenge_data_len) that later separates challenge rows from submission rows.
challenge_data = challenge_data.merge(
    density_altitude, on='flight_id', how='left', validate='many_to_one'
)
challenge_data.dtypes

flight_id                                int64
date                                    object
callsign                                object
adep                                    object
name_adep                               object
country_code_adep                       object
ades                                    object
name_ades                               object
country_code_ades                       object
actual_offblock_time                    object
arrival_time                            object
aircraft_type                           object
wtc                                     object
airline                                 object
flight_duration                          int64
taxiout_time                             int64
flown_distance                           int64
tow                                    float64
speed                                  float64
month                                    int64
month1                                  object
tod          

## Add additional features

In [28]:
# Further per-flight feature columns, joined onto the flights by flight_id.
additional_features = pd.read_csv("../data/additional_features.csv")

# Left join keeps every flight even when the feature file has no row for it.
# validate='many_to_one' makes pandas raise MergeError if a flight_id is repeated
# in the feature file: a silent row fan-out would shift the positional boundary
# (challenge_data_len) that later separates challenge rows from submission rows.
# NOTE(review): the resulting frame has vertical_rate_*_x and vertical_rate_*_y
# columns, which looks like this file repeating the vertical_rate_* names that
# are already present (pandas' default merge suffixes) - confirm. The suffixed
# names are left as-is because the feature lists below refer to them.
challenge_data = challenge_data.merge(
    additional_features, on='flight_id', how='left', validate='many_to_one'
)
challenge_data.dtypes

flight_id                                 int64
date                                     object
callsign                                 object
adep                                     object
name_adep                                object
country_code_adep                        object
ades                                     object
name_ades                                object
country_code_ades                        object
actual_offblock_time                     object
arrival_time                             object
aircraft_type                            object
wtc                                      object
airline                                  object
flight_duration                           int64
taxiout_time                              int64
flown_distance                            int64
tow                                     float64
speed                                   float64
month                                     int64
month1                                  

In [29]:
# List every column present after the feature merges. Names that collided
# during a merge show up with _x/_y suffixes (e.g. vertical_rate_min_x and
# vertical_rate_min_y).
challenge_data.columns

Index(['flight_id', 'date', 'callsign', 'adep', 'name_adep',
       'country_code_adep', 'ades', 'name_ades', 'country_code_ades',
       'actual_offblock_time', 'arrival_time', 'aircraft_type', 'wtc',
       'airline', 'flight_duration', 'taxiout_time', 'flown_distance', 'tow',
       'speed', 'month', 'month1', 'tod', 'actual_offblock_time_dt',
       'day_of_week', 'vertical_rate_min_x', 'vertical_rate_max_x',
       'vertical_rate_mean_x', 'density_altitude_min', 'density_altitude_max',
       'density_altitude_mean', 'altitude_min', 'altitude_max',
       'altitude_mean', 'groundspeed_min', 'groundspeed_max',
       'groundspeed_mean', 'track_min', 'track_max', 'track_mean',
       'vertical_rate_min_y', 'vertical_rate_max_y', 'vertical_rate_mean_y',
       'track_unwrapped_min', 'track_unwrapped_max', 'track_unwrapped_mean',
       'u_component_of_wind_min', 'u_component_of_wind_max',
       'u_component_of_wind_mean', 'v_component_of_wind_min',
       'v_component_of_wind_max', 

# Train

In [30]:
# Model inputs, split by how the ColumnTransformer below treats them.

# Columns that are one-hot encoded.
categorical_features = ['aircraft_type', 'adep', 'ades', 'wtc', 'month1', 'day_of_week', 'airline']

# Full pool of numeric columns available as candidate inputs. Kept under its own
# name for reference only; it is not what the model is trained on.
candidate_numerical_features = [
    'flight_duration', 'taxiout_time', 'flown_distance', 'speed', 'month', 'tod',
    'density_altitude_min', 'density_altitude_max', 'density_altitude_mean',
    'vertical_rate_min_x', 'vertical_rate_max_x', 'vertical_rate_mean_x',
    'altitude_min', 'altitude_max', 'altitude_mean',
    'groundspeed_min', 'groundspeed_max', 'groundspeed_mean',
    'track_min', 'track_max', 'track_mean',
    'vertical_rate_min_y', 'vertical_rate_max_y', 'vertical_rate_mean_y',
    'track_unwrapped_min', 'track_unwrapped_max', 'track_unwrapped_mean',
    'u_component_of_wind_min', 'u_component_of_wind_max', 'u_component_of_wind_mean',
    'v_component_of_wind_min', 'v_component_of_wind_max', 'v_component_of_wind_mean',
    'temperature_min', 'temperature_max', 'temperature_mean',
    'specific_humidity_min', 'specific_humidity_max', 'specific_humidity_mean'
]

# Numeric columns actually passed to the model. This is the same list that the
# feature-importance cell further down prints (importance > 0.00001).
numerical_features = [
    'density_altitude_max',
    'altitude_mean',
    'vertical_rate_max_y',
    'vertical_rate_max_x',
    'vertical_rate_mean_x',
    'flight_duration',
    'altitude_max',
    'flown_distance'
]

# Fix the one-hot vocabulary of each categorical column from the values present
# in challenge_data. Built from categorical_features so the two lists stay
# aligned: OneHotEncoder pairs categories[i] with the i-th encoded column.
categories = [challenge_data[column].unique() for column in categorical_features]

# Numeric columns pass through unchanged; categorical columns are one-hot encoded.
# Alternative tried for the numeric branch:
#   ('num', PolynomialFeatures(degree=2, include_bias=False), numerical_features)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_features),
        ('cat', OneHotEncoder(categories=categories), categorical_features)
    ]
)

In [31]:
# Model inputs (numeric columns first, then categorical) and the column to predict.
features = numerical_features + categorical_features
target = 'tow'

# The first challenge_data_len rows are the challenge set: they provide the
# training/validation inputs and the target.
X = challenge_data.iloc[:challenge_data_len][features]
y = challenge_data.iloc[:challenge_data_len][target]

# Every row after that belongs to the submission set and is only predicted on.
X_sub = challenge_data.iloc[challenge_data_len:][features]

In [32]:
# Fit one model on a random 90/10 split of the challenge rows and score it.
def train(i, model, model_name):
    """Fit `model` behind the preprocessing step and score it on a hold-out split.

    Reads the notebook-level `X`, `y` and `preprocessor`.

    Args:
        i: split index; offsets the random seed so each split differs, and
            prefixes the progress messages.
        model: unfitted regressor used as the final pipeline step.
        model_name: label used in the progress messages.

    Returns:
        Tuple ``(pipeline, rmse)``: the fitted pipeline and its RMSE on the
        held-out 10% of the rows.
    """
    # Pipeline fits its steps in place, so give every pipeline its own copy of
    # the preprocessor; sharing one instance would let a later fit refit the
    # transformer inside pipelines that were stored earlier.
    pipeline = Pipeline(steps=[('preprocessor', clone(preprocessor)), ('model', model)])

    # Hold out 10% of the rows for scoring; the model is fitted on the other 90%.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42 + i)

    print(f"{i}. Training {model_name}...")

    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    rmse = root_mean_squared_error(y_test, y_pred)

    print(f"{i}. Trained {model_name} RMSE: {rmse}")

    return pipeline, rmse
    
# One record per trained model and split:
# (model_name, split_index, rmse, fitted_pipeline).
splits = 1
results = []
for i in range(splits):
    # model = RandomForestRegressor(n_jobs=-1)
    # pipeline, rmse = train(i, model, 'RandomForest')
    # results.append(('RandomForest', i, rmse, pipeline))

    model = xgb.XGBRegressor(
        objective='reg:squarederror',
        n_estimators=1000,
        max_depth=10,
        n_jobs=-1,
    )
    pipeline, rmse = train(i, model, 'XGBoost')
    results.append(('XGBoost', i, rmse, pipeline))

    # model = CatBoostRegressor(silent=True, thread_count=-1)
    # pipeline, rmse = train(i, model, 'CatBoost')
    # results.append(('CatBoost', i, rmse, pipeline))

# Lowest-RMSE record for each model family.
best_models = {}
# for model_name in ['RandomForest', 'XGBoost', 'CatBoost']:
for model_name in ['XGBoost']:
    best_models[model_name] = min(
        (record for record in results if record[0] == model_name),
        key=lambda record: record[2],
    )

# Overall winner: the family whose best record has the lowest RMSE.
best_model_name = min(best_models, key=lambda name: best_models[name][2])
best_split, best_rmse, best_model = best_models[best_model_name][1:]

print(f"{best_model_name=} {best_rmse=} {best_split=}")

0. Training XGBoost...
0. Trained XGBoost RMSE: 3066.388260565277
best_model_name='XGBoost' best_rmse=3066.388260565277 best_split=0


In [22]:
# Importance of each numeric input, read from the fitted model step. The numeric
# columns are the first transformer in the preprocessor, so the first
# len(numerical_features) importances are paired with them.
numeric_importance = pd.Series(
    best_model.steps[1][1].feature_importances_[:len(numerical_features)],
    index=numerical_features,
).sort_values()

# Names of the numeric features whose importance exceeds the threshold, in
# ascending order of importance.
list(numeric_importance[numeric_importance > 0.00001].index)
# list(sorted(best_model.steps[1][1].feature_importances_))
# explainer = shap.Explainer(best_model.steps[1][1])
# shap_values = explainer(best_model.steps[0][1].transform(X))
# shap.plots.waterfall(shap_values[0])

['density_altitude_max',
 'altitude_mean',
 'vertical_rate_max_y',
 'vertical_rate_max_x',
 'vertical_rate_mean_x',
 'flight_duration',
 'altitude_max',
 'flown_distance']

In [35]:
def predict(model, model_name=None):
    """Predict `tow` for the submission rows held in the notebook-level `X_sub`.

    Args:
        model: fitted estimator or pipeline exposing ``predict``.
        model_name: label used in the printed message. When omitted, the class
            name of the final pipeline step (or of `model` itself when it has
            no ``steps``) is used, so the function never depends on a stray
            notebook global.

    Returns:
        Whatever ``model.predict(X_sub)`` returns.
    """
    if model_name is None:
        steps = getattr(model, "steps", None)
        final_estimator = steps[-1][1] if steps else model
        model_name = type(final_estimator).__name__

    predictions = model.predict(X_sub)
    print(f"Predictions for {model_name}:", predictions)
    return predictions

# Label the output with the name of the model that was actually selected.
submission_data['tow'] = predict(best_model, best_model_name)

# Write only the id and the prediction, without the DataFrame index.
submission_data[['flight_id', 'tow']].to_csv("../submissions/team_exuberant_scooter_v1_5c26054c-2acb-4e7b-8bfa-61a0fe40cb6a.csv", index=False)

Predictions for XGBoost: [ 64684.434 212743.11  220756.75  ...  77851.     61176.47   63738.08 ]


In [34]:
# Display the submission frame (pandas elides the middle rows).
# NOTE(review): the stored output shows an empty `tow` column because this
# cell's execution count (34) is lower than the prediction cell's (35); re-run
# it after predicting to refresh the output.
submission_data

Unnamed: 0,flight_id,date,callsign,adep,name_adep,country_code_adep,ades,name_ades,country_code_ades,actual_offblock_time,arrival_time,aircraft_type,wtc,airline,flight_duration,taxiout_time,flown_distance,tow
0,248753821,2022-01-01,3b3de0f3ad0ee192513995c02f7bf7cf,LTFJ,Istanbul Sabiha Gokcen,TR,LFLL,Lyon,FR,2022-01-01T09:44:00Z,2022-01-01T12:48:33Z,B738,M,6351ec1b849adacc0cbb3b1313d8d39b,170,15,1122,
1,248753822,2022-01-01,e06dd03d4a879ca37d9e18c1bd7cad16,EBBR,Brussels,BE,KJFK,New York JFK,US,2022-01-01T09:45:00Z,2022-01-01T17:49:51Z,A333,H,bdeeef3a675587d530de70a25d7118d2,470,15,3205,
2,248754498,2022-01-01,2d3b1c962c78c4ebeef11bcd51b9e94c,KMIA,Miami,US,EGLL,London Heathrow,GB,2022-01-01T01:52:00Z,2022-01-01T09:55:16Z,B77W,H,5543e4dc327359ffaf5b9c0e6faaf0e1,473,10,3965,
3,248757623,2022-01-01,81564432d3ee97c4bdf4cd8f006753dc,EGCN,Doncaster Sheffield,GB,LEAL,Alicante,ES,2022-01-01T08:20:00Z,2022-01-01T11:06:08Z,B38M,M,3922524069809ac4326134429751e26f,156,10,986,
4,248763603,2022-01-01,84be079d7e660db105d91f600b4b3d59,EIDW,Dublin,IE,LFLL,Lyon,FR,2022-01-01T11:01:00Z,2022-01-01T13:00:43Z,A320,M,a73f82288988b79be490c6322f4c32ed,105,15,686,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105954,258066302,2022-12-31,2d3b4446c4d05a25196a9d52cab936fb,LTFJ,Istanbul Sabiha Gokcen,TR,EKCH,Copenhagen,DK,2022-12-31T09:36:00Z,2022-12-31T13:12:17Z,B38M,M,6351ec1b849adacc0cbb3b1313d8d39b,201,15,1199,
105955,258068609,2022-12-31,253fd692ed441fac523081471c067772,LOWW,Vienna,AT,KIAD,Washington Dulles,US,2022-12-31T09:49:00Z,2022-12-31T19:38:26Z,B763,H,5d407cb11cc29578cc3e292e743f5393,575,14,3937,
105956,258068876,2022-12-31,c9fca302ca2e28acab0eb0bb1b46f11b,LTFM,iGA Istanbul,TR,LSZH,Zurich,CH,2022-12-31T09:25:00Z,2022-12-31T12:24:24Z,A321,M,6351ec1b849adacc0cbb3b1313d8d39b,154,25,988,
105957,258064675,2022-12-31,00f96ad0e382476649574ba044c764fc,EHAM,Amsterdam,NL,EDDF,Frankfurt,DE,2022-12-31T10:04:21Z,2022-12-31T10:55:35Z,A320,M,f502877cab405652cf0dd70c2213e730,42,9,240,


# Save predictions to CSV files

# NOTE(review): catboost_predictions, xgb_predictions and rf_predictions are not
# assigned in any cell visible in this notebook, and these lines carry no
# execution count - this looks like a leftover from an earlier multi-model
# version. Confirm before running; presumably it raises NameError on a fresh kernel.
# Each pair below overwrites submission_data["tow"] with one model's predictions
# and writes the flight_id/tow columns to that model's CSV in the working directory.
submission_data["tow"] = catboost_predictions
submission_data[["flight_id", "tow"]].to_csv("submission_catboost.csv", index=None)
submission_data["tow"] = xgb_predictions
submission_data[["flight_id", "tow"]].to_csv("submission_xgb.csv", index=None)
submission_data["tow"] = rf_predictions
submission_data[["flight_id", "tow"]].to_csv("submission_rf.csv", index=None)

In [None]:
# Persist the selected fitted pipeline to the working directory.
joblib.dump(value=best_model, filename="best_model.pkl")