In [3]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
import warnings

warnings.filterwarnings('ignore')

# --- 1. Load Data ---
# Updated file paths based on your new code structure.
try:
    df_train = pd.read_csv('training_data/train/train.csv')
    df_transactions = pd.read_csv('training_data/train/transactions.csv')
    df_test = pd.read_csv('testing data/test_8gqdJqH.csv')
    print("All data files loaded successfully.")
except FileNotFoundError as e:
    print(f"Error loading data files: {e}")
    print("Please ensure your folder structure and file names are correct.")
    exit()

# --- 2. Initial Data Cleaning & Type Conversion ---
print("\n--- Starting Data Cleaning and Preprocessing ---")

# Convert date columns to datetime objects
for df in [df_train, df_transactions, df_test]:
    df['doj'] = pd.to_datetime(df['doj'])
if 'doi' in df_transactions.columns:
    df_transactions['doi'] = pd.to_datetime(df_transactions['doi'])

print("Date columns converted to datetime objects.")

# --- 3. Feature Engineering Function ---
# We create a function to apply the same transformations to both train and test data
def create_features(df):
    """
    Creates time-series and categorical features from the dataframe.
    """
    df['month'] = df['doj'].dt.month
    df['year'] = df['doj'].dt.year
    df['day_of_week'] = df['doj'].dt.dayofweek  # Monday=0, Sunday=6
    df['day_of_year'] = df['doj'].dt.dayofyear
    df['week_of_year'] = df['doj'].dt.isocalendar().week.astype(int)
    df['is_weekend'] = (df['doj'].dt.dayofweek >= 5).astype(int) # Saturday or Sunday
    
    # Create a unique route identifier
    df['route'] = df['srcid'].astype(str) + '_' + df['destid'].astype(str)
    
    return df

print("\n--- Engineering Features ---")
# Apply full feature engineering to the transactions data
df_transactions = create_features(df_transactions)

# For the test set, we only need to create the 'route' column for the merge key.
# The other features (month, year, etc.) will come from the transaction data after the merge.
df_test['route'] = df_test['srcid'].astype(str) + '_' + df_test['destid'].astype(str)
print("Feature engineering complete.")

# --- 4. Prepare Training and Test Data ---
print("\n--- Preparing Model Training and Test Sets ---")

# Filter transactions for data available exactly 15 days before departure
dbd_filter = 15
df_transactions_filtered = df_transactions[df_transactions['dbd'] == dbd_filter].copy()

print(f"Filtered transactions for dbd = {dbd_filter}. Shape: {df_transactions_filtered.shape}")

# Create the training set by merging filtered transactions with train labels
df_model_train = pd.merge(
    df_train,
    df_transactions_filtered,
    on=['doj', 'srcid', 'destid'],
    how='inner'
)

print(f"Final training data shape after merge: {df_model_train.shape}")
if df_model_train.shape[0] != df_train.shape[0]:
    print("Warning: Some routes in train.csv did not have a transaction record at dbd=15.")

# Create the test set by merging filtered transactions with test routes
df_model_test = pd.merge(
    df_test,
    df_transactions_filtered,
    on=['doj', 'srcid', 'destid', 'route'],
    how='left'
)

print(f"Final test data shape after merge: {df_model_test.shape}")
if df_model_test.isnull().sum().sum() > 0:
    print("Warning: Some test routes have missing features. Filling with 0.")
    df_model_test.fillna(0, inplace=True)

# --- 5. Model Training (XGBoost) ---
print("\n--- Training XGBoost Model ---")

# Define features (X) and target (y)
# We will one-hot encode the categorical features for XGBoost
features = [
    'srcid', 'destid', 'cumsum_seatcount', 'cumsum_searchcount',
    'month', 'year', 'day_of_week', 'day_of_year', 'week_of_year', 'is_weekend',
    'srcid_region', 'destid_region', 'srcid_tier', 'destid_tier', 'route'
]
categorical_features = ['srcid_region', 'destid_region', 'srcid_tier', 'destid_tier', 'route', 'srcid', 'destid']

X = df_model_train[features]
y_train = df_model_train['final_seatcount']
X_test_prep = df_model_test[features]

# One-Hot Encode categorical features
X_train = pd.get_dummies(X, columns=categorical_features, dummy_na=False)
X_test = pd.get_dummies(X_test_prep, columns=categorical_features, dummy_na=False)

# Align columns - crucial for ensuring test set has same features as train set
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# XGBoost Model Parameters
xgb_params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'n_estimators': 2000,
    'learning_rate': 0.01,
    'max_depth': 8,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 42,
    'n_jobs': -1,
    'tree_method': 'hist', # Use 'hist' for faster training
    'early_stopping_rounds': 100 # Moved parameter here
}

model = xgb.XGBRegressor(**xgb_params)

# Train the model with early stopping
# The early_stopping_rounds parameter from the constructor will be used here.
model.fit(X_train, y_train,
          eval_set=[(X_train, y_train)],
          verbose=False)

print("Model training complete.")

train_predictions = model.predict(X_train)
train_rmse = np.sqrt(mean_squared_error(y_train, train_predictions))
print(f"\nFinal Training RMSE: {train_rmse:.4f}")

# --- 6. Prediction and Submission File Creation ---
print("\n--- Generating Predictions and Submission File ---")

# Predict on the test data
# The model automatically uses the best iteration thanks to early stopping
predictions = model.predict(X_test)

# Post-processing: Ensure predictions are non-negative integers
predictions[predictions < 0] = 0
predictions = np.round(predictions).astype(int)

# Create the submission file
submission_df = pd.DataFrame({'route_key': df_model_test['route_key'], 'final_seatcount': predictions})

# Save the submission file
submission_df.to_csv('submission_file.csv', index=False)

print("Submission file 'submission_file.csv' created successfully.")
print("\nTop 5 rows of the submission file:")
print(submission_df.head())

# --- 7. (Optional) Feature Importance ---
print("\n--- Top 15 Feature Importances ---")
feature_imp = pd.DataFrame({'Value': model.feature_importances_, 'Feature': X_train.columns})
print(feature_imp.sort_values(by="Value", ascending=False).head(15))


All data files loaded successfully.

--- Starting Data Cleaning and Preprocessing ---
Date columns converted to datetime objects.

--- Engineering Features ---
Feature engineering complete.

--- Preparing Model Training and Test Sets ---
Filtered transactions for dbd = 15. Shape: (73100, 18)
Final training data shape after merge: (67200, 19)
Final test data shape after merge: (5900, 19)

--- Training XGBoost Model ---
Model training complete.

Final Training RMSE: 304.6268

--- Generating Predictions and Submission File ---
Submission file 'submission_file.csv' created successfully.

Top 5 rows of the submission file:
          route_key  final_seatcount
0  2025-02-11_46_45             3732
1  2025-01-20_17_23             1629
2  2025-01-08_02_14             1171
3  2025-01-08_08_47              962
4  2025-01-08_09_46             3305

--- Top 15 Feature Importances ---
        Value      Feature
114  0.035489   route_46_9
135  0.030618   route_9_46
144  0.029741      srcid_9
188  0.0