In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [5]:
import joblib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
import category_encoders as ce
from lightgbm import LGBMRegressor, early_stopping, log_evaluation
from xgboost import XGBRegressor, callback
from sklearn.metrics import mean_squared_error

In [3]:
train_df = pd.read_csv('/kaggle/input/burnout-datathon-ieeecsmuj/train.csv')
val_df = pd.read_csv('/kaggle/input/burnout-datathon-ieeecsmuj/val.csv')
test_df = pd.read_csv('/kaggle/input/burnout-datathon-ieeecsmuj/test.csv')

In [4]:
for df in [train_df, val_df, test_df]:
    df['Penalty_Seconds'] = df['Penalty'].map({
        '+3s': 3,
        '+5s': 5,
        'Ride Through': 20,
        'DNS': 0,
        'DNF': 0
    }).fillna(0)

for df in [train_df, val_df, test_df]:
    df['Tire_Combo'] = df['Tire_Compound_Front'] + '_' + df['Tire_Compound_Rear']
    df['Temperature_Diff'] = df['Track_Temperature_Celsius'] - df['Ambient_Temperature_Celsius']

target = 'Lap_Time_Seconds'
features = [
    'Circuit_Length_km', 'Avg_Speed_kmh', 'Corners_per_Lap',
    'Tire_Degradation_Factor_per_Lap', 'Pit_Stop_Duration_Seconds',
    'Ambient_Temperature_Celsius', 'Track_Temperature_Celsius', 'Penalty_Seconds',
    'Temperature_Diff', 'Track_Condition', 'Tire_Compound_Front',
    'Tire_Compound_Rear', 'Tire_Combo', 'Session', 'circuit_name'
]

numerical_features = [
    'Circuit_Length_km', 'Avg_Speed_kmh', 'Corners_per_Lap',
    'Tire_Degradation_Factor_per_Lap', 'Pit_Stop_Duration_Seconds',
    'Ambient_Temperature_Celsius', 'Track_Temperature_Celsius', 'Penalty_Seconds',
    'Temperature_Diff'
]
low_cardinality_categorical = [
    'Track_Condition', 'Tire_Compound_Front', 'Tire_Compound_Rear',
    'Tire_Combo', 'Session'
]
high_cardinality_categorical = ['circuit_name']

for df in [train_df, val_df, test_df]:
    for col in numerical_features:
        df[col] = df[col].astype('float32')
    if target in df.columns:
        df[target] = df[target].astype('float32')

X_train = train_df[features]
y_train = train_df[target]
X_val = val_df[features]
y_val = val_df[target]
X_test = test_df[features]

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

low_cardinality_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True))
])

high_cardinality_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('target', ce.TargetEncoder())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('low_cat', low_cardinality_transformer, low_cardinality_categorical),
        ('high_cat', high_cardinality_transformer, high_cardinality_categorical)
    ])

X_train_transformed = preprocessor.fit_transform(X_train, y_train)
X_val_transformed = preprocessor.transform(X_val)
X_test_transformed = preprocessor.transform(X_test)

In [8]:
xgb_model = XGBRegressor(
    n_estimators=20000,
    learning_rate=1,
    objective='reg:squarederror',
    tree_method='hist',              # use 'gpu_hist' if you have a GPU
    random_state=42,
    verbosity=0                      # use 0 to suppress log duplication
)

# Fit the model with early stopping and log evaluation every 20 rounds
xgb_model.fit(
    X_train_transformed, y_train,
    eval_set=[(X_val_transformed, y_val)],
    eval_metric='rmse',
    callbacks=[
        callback.EarlyStopping(rounds=100, save_best=True, metric_name='rmse'),
    ]
)

[0]	validation_0-rmse:11.48117
[1]	validation_0-rmse:11.45681
[2]	validation_0-rmse:11.42583
[3]	validation_0-rmse:11.40045
[4]	validation_0-rmse:11.37191
[5]	validation_0-rmse:11.34152
[6]	validation_0-rmse:11.31519
[7]	validation_0-rmse:11.29326
[8]	validation_0-rmse:11.27366
[9]	validation_0-rmse:11.24619
[10]	validation_0-rmse:11.21858
[11]	validation_0-rmse:11.19157
[12]	validation_0-rmse:11.16565
[13]	validation_0-rmse:11.13942
[14]	validation_0-rmse:11.11158
[15]	validation_0-rmse:11.09271
[16]	validation_0-rmse:11.06715
[17]	validation_0-rmse:11.04100
[18]	validation_0-rmse:11.01853
[19]	validation_0-rmse:10.99732
[20]	validation_0-rmse:10.97866
[21]	validation_0-rmse:10.96047
[22]	validation_0-rmse:10.93592
[23]	validation_0-rmse:10.92019
[24]	validation_0-rmse:10.89311
[25]	validation_0-rmse:10.86912
[26]	validation_0-rmse:10.84587
[27]	validation_0-rmse:10.82104
[28]	validation_0-rmse:10.79490
[29]	validation_0-rmse:10.77617
[30]	validation_0-rmse:10.76038
[31]	validation_0-

In [9]:
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', xgb_model)
])

In [10]:
y_val_pred = model.predict(X_val)
mean_squared_error(y_val, y_val_pred)
joblib.dump(xgb_model, 'lgbm_model_0.6.pkl')
y_test_val = model.predict(X_test)
results_df = test_df[['Unique ID']].copy()
results_df['Lap_Time_Seconds'] = y_test_val 
results_df.to_csv('submission.csv', index=False)


In [12]:
y_val_pred.shape

(273437,)

In [13]:
results_df.shape

(546874, 2)