In [37]:
from pathlib import Path
import joblib
import warnings
from IPython.display import display


import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import LabelEncoder

from catboost import Pool, CatBoostRegressor

In [38]:
# Never truncate the column list when a DataFrame is rendered.
pd.set_option("display.max_columns", None)

In [39]:
# --- Per-flight trajectory directories -------------------------------------
TRAIN_FLIGHTS_DIR = "../prc-2025-datasets/flights_train"
RANK_FLIGHTS_DIR = "../prc-2025-datasets/flights_rank"
FINAL_FLIGHTS_DIR = "../prc-2025-datasets/flights_final"

# --- Flight lists (one parquet file per split) -----------------------------
FLIGHTLIST_TRAIN = "../prc-2025-datasets/flightlist_train.parquet"
FLIGHTLIST_RANK = "../prc-2025-datasets/flightlist_rank.parquet"
FLIGHTLIST_FINAL = "../prc-2025-datasets/flightlist_final.parquet"

# --- Fuel tables: training targets and the two submission templates --------
FUEL_RANK = "../prc-2025-datasets/fuel_rank_submission.parquet"
FUEL_TRAIN = "../prc-2025-datasets/fuel_train.parquet"
FUEL_FINAL = "../prc-2025-datasets/fuel_final_submission.parquet"

# --- Auxiliary inputs -------------------------------------------------------
AIRPORTS = "../prc-2025-datasets/apt.parquet"
PREPARED_TRAIN_DATA = "./prep_train_acropole_test.csv"

In [40]:
# Turn the three trajectory directory strings into pathlib.Path objects.
TRAIN_FLIGHTS_DIR, RANK_FLIGHTS_DIR, FINAL_FLIGHTS_DIR = map(
    Path, (TRAIN_FLIGHTS_DIR, RANK_FLIGHTS_DIR, FINAL_FLIGHTS_DIR)
)

In [41]:
# Flight lists for the three splits.
train_flightlist = pd.read_parquet(FLIGHTLIST_TRAIN)
rank_flightlist = pd.read_parquet(FLIGHTLIST_RANK)
final_flightlist = pd.read_parquet(FLIGHTLIST_FINAL)

# Fuel tables: the training table plus the rank / final submission files
# whose `fuel_kg` column is overwritten with predictions further down.
train_fuel = pd.read_parquet(FUEL_TRAIN)
rank_fuel = pd.read_parquet(FUEL_RANK)
final_fuel = pd.read_parquet(FUEL_FINAL)

# Airport reference table.
airports = pd.read_parquet(AIRPORTS)

In [42]:
# NOTE(review): TRAIN_FLIGHTS_DIR is already wrapped in Path by an earlier cell;
# wrapping it again is a harmless no-op and this cell looks redundant.
TRAIN_FLIGHTS_DIR = Path(TRAIN_FLIGHTS_DIR)

In [43]:
warnings.filterwarnings('ignore', category=RuntimeWarning)
warnings.filterwarnings('ignore', category=UserWarning)

In [44]:
# Load the pre-computed training feature table from the CSV named in the config cell.
prep_train = pd.read_csv(PREPARED_TRAIN_DATA)

In [45]:
def haversine(lat_lon1, lat_lon2):
    """Great-circle distance between two points, in metres.

    Parameters
    ----------
    lat_lon1, lat_lon2 : pair ``(latitude, longitude)`` in degrees
        Each element may be a scalar or an array-like (NumPy array, pandas
        Series); array inputs are processed element-wise.

    Returns
    -------
    Distance in metres (same shape as the inputs), computed on a sphere of
    radius 6371 km.
    """
    # Radius of the Earth in kilometers
    R = 6371.0

    # Convert latitude and longitude from degrees to radians
    lat1, lon1 = map(np.radians, lat_lon1)
    lat2, lon2 = map(np.radians, lat_lon2)

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make sqrt(1 - a) return NaN.
    # NaN inputs still propagate as NaN.
    a = np.minimum(1.0, np.maximum(0.0, a))
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    # R is in kilometres; the factor 1000 converts the result to metres.
    return R * c * 1000

In [46]:
def extra_features(df,datetime_cols=["timestamp"]):
    """Add distance and duration features and drop the raw datetime columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``orig_lat``, ``orig_long``, ``dest_lat``, ``dest_long``,
        ``seg_start``, ``seg_end`` and every column named in
        ``datetime_cols``. The input frame is left unmodified.
    datetime_cols : list of str
        Columns parsed as ISO 8601 timestamps (converted to UTC) and removed
        from the result. ``seg_start`` / ``seg_end`` must either be listed
        here or already hold datetime values, because their difference is
        taken below.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``datetime_cols`` and with two extra columns:
        ``full_flight_distance`` (the value returned by ``haversine`` for the
        origin and destination coordinates) and ``seg_duration`` (segment
        length in minutes).
    """
    datetime_cols = list(datetime_cols)

    # Parse every requested column up front so malformed timestamps still
    # raise, even for columns that are only dropped afterwards.
    parsed = {
        dt_col: pd.to_datetime(df[dt_col], format='ISO8601', utc=True)
        for dt_col in datetime_cols
    }
    seg_start = parsed['seg_start'] if 'seg_start' in parsed else df['seg_start']
    seg_end = parsed['seg_end'] if 'seg_end' in parsed else df['seg_end']

    # `drop` returns a new frame, so the caller's DataFrame is never mutated
    # and re-running the calling cell is idempotent.
    result_df = df.drop(columns=datetime_cols)

    # haversine works element-wise on whole columns, so no row-wise apply is
    # needed.
    result_df['full_flight_distance'] = haversine(
        (df['orig_lat'], df['orig_long']),
        (df['dest_lat'], df['dest_long']),
    )

    # Segment duration in minutes.
    result_df['seg_duration'] = (seg_end - seg_start).dt.total_seconds() / 60

    return result_df

In [47]:
# Timestamp columns handed to extra_features (parsed, then dropped).
dt_feat = ["takeoff", "landed", "seg_start", "seg_end"]

# Columns removed from the feature matrix before modelling.
cols_to_drop = [
    "flight_id", "origin_name", "destination_name", "flight_date",
    "MODEL", "aircraft", "eng_name", "eng_uid",
    "flight_start", "flight_end", "takeoff_date", "land_date",
]

# Columns that are label-encoded.
cols_to_encode = [
    "aircraft_type", "origin_icao", "destination_icao",
    "WAKE", "ENG_NAME", "ENG_MAN", "ENG_MODEL", "Manufacturer",
    "Physical_Class_Engine",
    "AAC", "AAC_minimum", "AAC_maximum", "ADG", "TDG",
    "Main_Gear_Config", "ICAO_WTC", "Class", "FAA_Weight", "CWT",
    "One_Half_Wake_Category",
    "Two_Wake_Category_Appx_A", "Two_Wake_Category_Appx_B",
    "SRS", "flaps_type",
    "engine_type", "engine_mount", "engine_default",
    "eng_type", "eng_manufacturer",
    "fuel_engine", "fuel_aircraft",
]

In [48]:
# Derive full_flight_distance / seg_duration for the training table and drop
# the timestamp columns listed in dt_feat.
train_interact = extra_features(prep_train,dt_feat)

In [49]:
# One LabelEncoder per categorical column, fitted on the string form of the
# training values and kept by column name for re-use on the other splits.
encoder_dict = {
    col: LabelEncoder().fit(train_interact[col].astype('str'))
    for col in cols_to_encode
}

In [50]:
# Replace each categorical training column with its integer codes.
for col in cols_to_encode:
    values_as_text = train_interact[col].astype('str')
    train_interact[col] = encoder_dict[col].transform(values_as_text)

In [51]:
# Target vector, then the feature matrix (target and bookkeeping columns removed).
y = train_interact['fuel_kg']
X = train_interact.drop(columns=['fuel_kg', *cols_to_drop])

# The categorical columns are exactly the label-encoded ones.
cat_cols = cols_to_encode

In [52]:
# Stratification label: "<missing_segment>_<aircraft_type>" as a string per row.
stratifier = (
    train_interact['missing_segment'].astype('str')
    + '_'
    + train_interact['aircraft_type'].astype('str')
)

In [53]:
# Feature columns fed to the model (39 names, order preserved).
sel_ind = [
    "rawseg_diff_mean", "flight_DE_ct", "seg_ENR_dur", "all_seg_phase_dur",
    "unscaled_approx_seg_fuel",
    "start_longitude", "min_longitude",
    "end_altitude", "mean_altitude", "sum_altitude", "total_climb_height",
    "sum_groundspeed", "nancount_mach", "sum_CAS",
    "sum_fuel_flow", "sum_fuel", "sum_drag",
    "start_thrust", "sum_thrust",
    "end_auth_score",
    "sum_cl_fuel", "sum_enr_fuel", "sum_dist_from_ades",
    "start_acp_fuel", "max_acp_fuel", "sum_acp_fuel", "sum_acp_fuelflow",
    "dest_long",
    "Tail_Height_at_OEW_ft", "MALW_lb", "mtow", "wing_mac", "drag_gears",
    "eng_bpr", "eng_ei_co_co", "eng_ei_co_app", "eng_ei_nox_co",
    "full_flight_distance", "seg_duration",
]

In [54]:
# Preview the selected feature columns of the training table.
# NOTE(review): in the rendered output below, sum_CAS is empty for every
# displayed row — confirm whether that column carries any signal.
train_interact[sel_ind]

Unnamed: 0,rawseg_diff_mean,flight_DE_ct,seg_ENR_dur,all_seg_phase_dur,unscaled_approx_seg_fuel,start_longitude,min_longitude,end_altitude,mean_altitude,sum_altitude,total_climb_height,sum_groundspeed,nancount_mach,sum_CAS,sum_fuel_flow,sum_fuel,sum_drag,start_thrust,sum_thrust,end_auth_score,sum_cl_fuel,sum_enr_fuel,sum_dist_from_ades,start_acp_fuel,max_acp_fuel,sum_acp_fuel,sum_acp_fuelflow,dest_long,Tail_Height_at_OEW_ft,MALW_lb,mtow,wing_mac,drag_gears,eng_bpr,eng_ei_co_co,eng_ei_co_app,eng_ei_nox_co,full_flight_distance,seg_duration
0,0.584998,0.033829,1740.0,1740.0,3665.351077,24.199546,19.517282,35999.9280,35991.59680,1.079748e+06,174.9552,14173.0,0,,51.881158,3112.869496,3.586575e+06,143269.755016,4.292722e+06,1.0,0.0,3112.869496,22819.780130,104.282328,106.251690,3144.960076,52.416001,4.763890,56.1,425000.0,254000,6.27,0.014,9.04,0.49,0.62,43.07,1.023664e+07,30.000667
1,0.502928,0.033829,840.0,840.0,1832.103349,19.349037,16.959855,35974.9344,35989.93056,5.398490e+05,74.9808,7243.0,0,,26.212932,1572.775918,1.822873e+06,143081.394720,2.141274e+06,1.0,0.0,1572.775918,8752.121661,104.930070,105.658100,1574.520940,26.242016,4.763890,56.1,425000.0,254000,6.27,0.014,9.04,0.49,0.62,43.07,1.023664e+07,14.995650
2,0.517378,0.033829,1740.0,1740.0,3665.186139,16.793096,11.648848,35999.9280,35985.76496,1.079573e+06,199.9488,15351.0,0,,55.763557,3345.813417,3.878233e+06,142487.225149,4.255467e+06,1.0,0.0,3345.813417,11910.921322,102.785837,105.484288,3105.585350,51.759756,4.763890,56.1,425000.0,254000,6.27,0.014,9.04,0.49,0.62,43.07,1.023664e+07,29.999317
3,0.514650,0.033829,840.0,840.0,1832.885273,11.445160,8.687736,35999.9280,35991.59680,5.398740e+05,124.9680,7595.0,0,,27.454558,1647.273505,1.902048e+06,141681.493689,2.129820e+06,1.0,0.0,1647.273505,3116.474432,102.498843,106.036129,1548.239712,25.803995,4.763890,56.1,425000.0,254000,6.27,0.014,9.04,0.49,0.62,43.07,1.023664e+07,15.002050
4,0.593094,0.033829,300.0,780.0,1832.070769,8.484637,5.778495,15250.0584,28001.63056,4.200245e+05,24.9936,6975.0,0,,19.187005,1151.220303,2.070286e+06,142422.666297,8.479277e+05,1.0,0.0,553.762924,1309.621252,102.385920,104.480462,747.427733,12.457129,4.763890,56.1,425000.0,254000,6.27,0.014,9.04,0.49,0.62,43.07,1.023664e+07,14.995383
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
521,0.710383,0.153846,240.0,240.0,138.511772,-113.753320,-114.170245,37950.0384,37990.02816,1.899501e+05,24.9936,2071.0,0,,2.420718,145.243068,1.903815e+05,47171.985425,2.360698e+05,1.0,0.0,145.243068,1326.889257,36.225957,36.225957,172.318129,2.871969,-118.407997,39.6,148591.0,79000,4.29,0.017,12.28,0.30,5.36,15.30,1.963404e+06,5.008200
522,0.535752,0.153846,840.0,840.0,414.884455,-114.275189,-115.815573,38000.0256,38006.69056,5.701004e+05,99.9744,6218.0,0,,7.186845,431.210680,5.685556e+05,47171.985425,7.077502e+05,1.0,0.0,431.210680,2950.187482,34.147702,35.956240,513.914798,8.565247,-118.407997,39.6,148591.0,79000,4.29,0.017,12.28,0.30,5.36,15.30,1.963404e+06,15.001067
523,0.495078,0.153846,0.0,240.0,137.380140,-116.534168,-117.044640,19675.1448,25420.01520,1.271001e+05,0.0000,2000.0,0,,0.482322,28.939339,1.919253e+05,3235.086246,2.168263e+04,1.0,0.0,7.144680,469.546581,20.719477,20.719477,50.659505,0.844325,-118.407997,39.6,148591.0,79000,4.29,0.017,12.28,0.30,5.36,15.30,1.963404e+06,4.967283
524,0.506581,0.153846,0.0,300.0,290.717906,-117.160894,-117.968781,8225.0280,11327.46528,1.132747e+05,0.0000,3210.0,0,,2.764001,165.840069,3.900771e+05,6422.061429,7.917594e+04,1.0,0.0,0.000000,536.251412,11.811168,41.005801,251.443033,4.190717,-118.407997,39.6,148591.0,79000,4.29,0.017,12.28,0.30,5.36,15.30,1.963404e+06,10.511550


In [None]:
# Number of selected feature columns.
len(sel_ind)

39

In [65]:
# CatBoost hyper-parameters; recorded RMSE for this set: 213.89598165883922
hyp = {
    "iterations": 972,
    "learning_rate": 0.05060963202084587,
    "max_depth": 6,
    "l2_leaf_reg": 1.822777238628044,
    "min_data_in_leaf": 78,
    "colsample_bylevel": 0.6035766248956298,
    "bootstrap_type": "MVS",
    "subsample": 0.46633222868063806,
}

In [66]:
# 5-fold cross-validation of the CatBoost regressor.
# Folds are stratified on `stratifier` and grouped by flight_id so that all
# rows sharing a flight_id end up in the same fold.
scv = StratifiedGroupKFold(n_splits=5)
fit_models = []      # one fitted model per fold (re-used for inference below)
ll_scores = []       # validation RMSE per fold
train_scores = []    # training RMSE per fold

# Per-fold data and predictions, keyed by fold number, kept for inspection.
train_features = {}
train_targets = {}
valid_features = {}
valid_targets = {}
valid_preds = {}
ind = 0

# Loop-invariant values, computed once instead of once per fold.
min_fuel = train_fuel['fuel_kg'].min()
cat_features = [i for i, col in enumerate(sel_ind) if col in cat_cols]

# Groups come from train_interact, the frame X was derived from, so they are
# row-aligned with X. (The previous `train_dt` is not defined anywhere in this
# notebook and broke Restart & Run All.)
for train_index, test_index in scv.split(X, stratifier,
                                         groups=train_interact['flight_id']):

    X_train, X_test = X.iloc[train_index].copy(), X.iloc[test_index].copy()
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    train_features[ind] = X_train
    train_targets[ind] = y_train
    valid_features[ind] = X_test
    valid_targets[ind] = y_test

    preppedtrain_sel = X_train[sel_ind]
    preppedtest_sel = X_test[sel_ind]

    train_pool = Pool(preppedtrain_sel, y_train, cat_features=cat_features)
    test_pool = Pool(preppedtest_sel, y_test, cat_features=cat_features)

    model = CatBoostRegressor(objective='RMSE', task_type='CPU',
                              random_state=42, silent=True, **hyp)

    # NOTE(review): the validation fold is passed as eval_set. If CatBoost
    # selects the best iteration on it (use_best_model), the fold score below
    # is measured on data that influenced the model — confirm this is intended.
    model.fit(train_pool, eval_set=test_pool)
    fit_models.append(model)

    # Negative predictions are replaced by the smallest fuel value in train_fuel.
    y_pred = model.predict(preppedtest_sel)
    y_pred = np.where(y_pred < 0, min_fuel, y_pred)
    valid_preds[ind] = y_pred

    train_pred = model.predict(preppedtrain_sel)
    train_pred = np.where(train_pred < 0, min_fuel, train_pred)

    # RMSE as sqrt(MSE): the `squared=False` keyword of mean_squared_error is
    # deprecated and removed in recent scikit-learn releases.
    ind_cv = np.sqrt(mean_squared_error(y_test, y_pred))
    tr_cv = np.sqrt(mean_squared_error(y_train, train_pred))

    ll_scores.append(ind_cv)
    train_scores.append(tr_cv)

    ind += 1


print('mean 5fold rmse train: ', np.mean(train_scores))
print('mean 5fold rmse test: ', np.mean(ll_scores)) 

mean 5fold rmse train:  135.50833061642936
mean 5fold rmse test:  213.89598165883922


In [67]:
# Load the pre-computed feature table for the rank split.
# NOTE(review): absolute Kaggle input path, unlike the relative paths in the
# config cell — this cell only runs where that mount exists.
prep_rank = pd.read_csv('/kaggle/input/prc2025-accessories/prep_rank_acropole/prep_rank_acropole.csv')

In [None]:
# Same feature engineering as for the training table.
rank_interact = extra_features(prep_rank, dt_feat)

# Apply the training-fitted encoders, but only to categorical columns that
# are actually among the selected model features; each encoded column name
# is printed.
for col in (name for name in cols_to_encode if name in sel_ind):
    print(col)
    rank_interact[col] = encoder_dict[col].transform(rank_interact[col].astype('str'))


In [71]:
# Predict with every fold model, floor each prediction at the smallest fuel
# value in train_fuel, then average the fold predictions.
rank_fuel_floor = train_fuel['fuel_kg'].min()
rank_model_input = rank_interact[sel_ind]
preds = [
    np.clip(model.predict(rank_model_input), a_min=rank_fuel_floor, a_max=None)
    for model in fit_models
]
pred = sum(preds) / len(preds)

In [73]:
# Write the averaged predictions into the rank submission table.
# NOTE(review): assumes the rows of prep_rank are in the same order as the
# rows of rank_fuel — confirm, nothing here checks the alignment.
rank_fuel['fuel_kg'] = pred

In [74]:
# Sanity-check the distribution of the predicted fuel values.
rank_fuel['fuel_kg'].describe()

count    24289.000000
mean       416.292326
std        745.771351
min          0.453592
25%         92.157918
50%        151.403112
75%        394.877050
max      12200.323908
Name: fuel_kg, dtype: float64

In [80]:
# Save the rank submission as a parquet file (path is relative to the working directory).
rank_fuel.to_parquet('wise-watermelon_v26.parquet')

## FINAL

In [107]:
# Load the pre-computed feature table for the final split.
# NOTE(review): absolute Kaggle input path, unlike the relative paths in the
# config cell — this cell only runs where that mount exists.
prep_final = pd.read_csv('/kaggle/input/prc2025-accessories/prep_final_acropole/prep_final_acropole.csv')


  prep_final = pd.read_csv('/kaggle/input/prc2025-accessories/prep_final_acropole/prep_final_acropole.csv')


In [None]:
# Same feature engineering as for the training table, applied to the final split.
final_interact = extra_features(prep_final,dt_feat)

In [None]:
# Apply the training-fitted encoders, but only to categorical columns that
# are actually among the selected model features; each encoded column name
# is printed.
for col in (name for name in cols_to_encode if name in sel_ind):
    print(col)
    final_interact[col] = encoder_dict[col].transform(final_interact[col].astype('str'))


In [110]:
# Predict with every fold model, floor each prediction at the smallest fuel
# value in train_fuel, then average. `preds_final` first holds the list of
# per-model arrays and is then rebound to their element-wise mean.
final_fuel_floor = train_fuel['fuel_kg'].min()
final_model_input = final_interact[sel_ind]
preds_final = [
    np.clip(model.predict(final_model_input), a_min=final_fuel_floor, a_max=None)
    for model in fit_models
]
preds_final = sum(preds_final) / len(preds_final)

In [111]:
# Write the averaged predictions into the final submission table.
# NOTE(review): assumes the rows of prep_final are in the same order as the
# rows of final_fuel — confirm, nothing here checks the alignment.
final_fuel['fuel_kg'] = preds_final

In [112]:
# Sanity-check the distribution of the predicted fuel values.
final_fuel['fuel_kg'].describe()

count    61745.000000
mean       512.410579
std        830.672808
min          0.453592
25%        107.267244
50%        360.601572
75%        471.461127
max      13004.959405
Name: fuel_kg, dtype: float64

In [113]:
# Save the final submission as a parquet file (path is relative to the working directory).
final_fuel.to_parquet('wise-watermelon_final.parquet')