In [10]:
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import warnings
import gc

warnings.filterwarnings('ignore')

# --- 1. Load Data ---
# Updated file paths based on your new code structure.
try:
    df_train = pd.read_csv('training_data/train/train.csv')
    df_transactions = pd.read_csv('training_data/train/transactions.csv')
    df_test = pd.read_csv('testing data/test_8gqdJqH.csv')
    print("All data files loaded successfully.")
except FileNotFoundError as e:
    print(f"Error loading data files: {e}")
    print("Please ensure your folder structure and file names are correct.")
    exit()

# --- 2. Initial Data Cleaning & Type Conversion ---
print("\n--- Starting Data Cleaning and Preprocessing ---")

# Convert date columns to datetime objects
for df in [df_train, df_transactions, df_test]:
    df['doj'] = pd.to_datetime(df['doj'])
if 'doi' in df_transactions.columns:
    # Corrected line: apply to_datetime directly to df_transactions['doi']
    df_transactions['doi'] = pd.to_datetime(df_transactions['doi']) 

print("Date columns converted to datetime objects.")

# --- 3. Feature Engineering Function ---
def create_common_features(df):
    """
    Creates common time-series features from the dataframe's 'doj' column.
    """
    df['month'] = df['doj'].dt.month
    df['year'] = df['doj'].dt.year
    df['day_of_week'] = df['doj'].dt.dayofweek  # Monday=0, Sunday=6
    df['day_of_year'] = df['doj'].dt.dayofyear
    df['week_of_year'] = df['doj'].dt.isocalendar().week.astype(int)
    df['is_weekend'] = (df['doj'].dt.dayofweek >= 5).astype(int) # Saturday or Sunday
    df['day_of_month'] = df['doj'].dt.day
    df['is_month_start'] = df['doj'].dt.is_month_start.astype(int)
    df['is_month_end'] = df['doj'].dt.is_month_end.astype(int)
    
    # Create a unique route identifier
    df['route'] = df['srcid'].astype(str) + '_' + df['destid'].astype(str)
    
    return df

def prepare_transaction_features(df_transactions_raw):
    """
    Prepares features from transactions data, including features from various dbd values.
    """
    print("Preparing transaction-based features (dbd values).")
    
    # Define a set of dbd values to extract features from
    dbd_values_to_include = [5, 10, 15, 20, 25] # Including 15 as per original logic, and surrounding values
    
    # Initialize a list to hold processed transaction segments
    transaction_features_list = []

    # Ensure transaction data is sorted for consistent pivoting/merging later
    df_transactions_raw = df_transactions_raw.sort_values(by=['doj', 'srcid', 'destid', 'dbd']).copy()

    for dbd_val in dbd_values_to_include:
        df_temp = df_transactions_raw[df_transactions_raw['dbd'] == dbd_val].copy()
        
        # Select relevant columns and rename them to include dbd_val suffix
        df_temp = df_temp[['doj', 'srcid', 'destid', 'cumsum_seatcount', 'cumsum_searchcount', 
                           'srcid_region', 'destid_region', 'srcid_tier', 'destid_tier']]
        
        df_temp.rename(columns={
            'cumsum_seatcount': f'cumsum_seatcount_dbd_{dbd_val}',
            'cumsum_searchcount': f'cumsum_searchcount_dbd_{dbd_val}',
            'srcid_region': f'srcid_region_dbd_{dbd_val}', # These will be largely redundant, but for safety
            'destid_region': f'destid_region_dbd_{dbd_val}',
            'srcid_tier': f'srcid_tier_dbd_{dbd_val}',
            'destid_tier': f'destid_tier_dbd_{dbd_val}'
        }, inplace=True)
        transaction_features_list.append(df_temp)

    # Merge all dbd-specific features into a single DataFrame
    # Start with the first dbd value, then iteratively merge others
    if not transaction_features_list:
        return pd.DataFrame(columns=['doj', 'srcid', 'destid']) # Return empty if no dbd values
        
    df_transaction_features = transaction_features_list[0]
    for i in range(1, len(transaction_features_list)):
        df_transaction_features = pd.merge(
            df_transaction_features,
            transaction_features_list[i],
            on=['doj', 'srcid', 'destid'],
            how='outer' # Use outer to keep all unique combinations, then fillna later
        )
    
    # Drop duplicate categorical columns (they should be identical across dbd values if present)
    # Keeping only one set for region/tier features, preferably from dbd=15
    for col in ['srcid_region', 'destid_region', 'srcid_tier', 'destid_tier']:
        if f'{col}_dbd_15' in df_transaction_features.columns:
            df_transaction_features[col] = df_transaction_features[f'{col}_dbd_15']
            for dbd_val in dbd_values_to_include:
                if f'{col}_dbd_{dbd_val}' in df_transaction_features.columns and dbd_val != 15:
                    df_transaction_features.drop(columns=f'{col}_dbd_{dbd_val}', inplace=True)
        else: # If dbd=15 is not present for some reason, pick the first available
             for dbd_val in dbd_values_to_include:
                if f'{col}_dbd_{dbd_val}' in df_transaction_features.columns:
                    df_transaction_features[col] = df_transaction_features[f'{col}_dbd_{dbd_val}']
                    for dv in dbd_values_to_include:
                        if f'{col}_dbd_{dv}' in df_transaction_features.columns and dv != dbd_val:
                            df_transaction_features.drop(columns=f'{col}_dbd_{dv}', inplace=True)
                    break # Break after finding and setting one
    
    return df_transaction_features


print("\n--- Engineering Common and Transaction Features ---")
# Apply common feature engineering to train and test sets
df_train = create_common_features(df_train)
df_test = create_common_features(df_test)

# Prepare transaction features separately
df_transaction_features = prepare_transaction_features(df_transactions)

print("Feature engineering complete.")

# --- 4. Prepare Training and Test Data ---
print("\n--- Preparing Model Training and Test Sets ---")

# Merge transaction features with train and test labels
df_model_train = pd.merge(
    df_train,
    df_transaction_features,
    on=['doj', 'srcid', 'destid'],
    how='inner' # Keep only records present in both
)

print(f"Final training data shape after merge: {df_model_train.shape}")
if df_model_train.shape[0] != df_train.shape[0]:
    print(f"Warning: {df_train.shape[0] - df_model_train.shape[0]} records in train.csv did not have corresponding transaction data for any dbd value. These rows are dropped.")

df_model_test = pd.merge(
    df_test,
    df_transaction_features,
    on=['doj', 'srcid', 'destid'],
    how='left' # Keep all test records, fill missing transaction features with 0
)

print(f"Final test data shape after merge: {df_model_test.shape}")

# Identify columns that are not 'doj', 'srcid', 'destid', or the target
numerical_cols = [col for col in df_model_train.columns if 'cumsum' in col or 'count' in col or col in ['month', 'year', 'day_of_week', 'day_of_year', 'week_of_year', 'is_weekend', 'day_of_month', 'is_month_start', 'is_month_end']]

# Fill missing numerical features (especially in test set after left merge)
# It's crucial to fill missing dbd features after the merge.
for col in numerical_cols:
    if col in df_model_train.columns:
        df_model_train[col].fillna(0, inplace=True) # Fill with 0 for cumulative counts if not present
    if col in df_model_test.columns:
        df_model_test[col].fillna(0, inplace=True)

# Fill missing categorical features (e.g., if a region/tier was missing for a dbd value)
categorical_cols = ['srcid_region', 'destid_region', 'srcid_tier', 'destid_tier']
for col in categorical_cols:
    if col in df_model_train.columns:
        df_model_train[col].fillna('unknown', inplace=True)
    if col in df_model_test.columns:
        df_model_test[col].fillna('unknown', inplace=True)


# --- 5. Model Training and Evaluation ---
print("\n--- Training and Evaluating Multiple ML Models ---")

# Define features (X) and target (y)
# Exclude 'doj' and 'route_key' from features. 'route' is handled by encoding.
excluded_cols = ['doj', 'route_key', 'final_seatcount']
features = [col for col in df_model_train.columns if col not in excluded_cols]

X = df_model_train[features]
y_train = df_model_train['final_seatcount']
X_test_prep = df_model_test[features]

# Identify categorical features for encoding
categorical_features = [col for col in features if X[col].dtype == 'object' or X[col].nunique() < 50] # Heuristic for categorical


# One-Hot Encode categorical features for tree-based models (XGBoost, LightGBM, Random Forest, Gradient Boosting)
# CatBoost handles categoricals internally
X_train_encoded = pd.get_dummies(X, columns=categorical_features, dummy_na=False)
X_test_encoded = pd.get_dummies(X_test_prep, columns=categorical_features, dummy_na=False)

# Align columns - crucial for ensuring test set has same features as train set
X_train_encoded, X_test_encoded = X_train_encoded.align(X_test_encoded, join='left', axis=1, fill_value=0)

# Clean column names for LightGBM/XGBoost compatibility (avoiding problematic characters)
X_train_encoded.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in X_train_encoded.columns]
X_test_encoded.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in X_test_encoded.columns]


# Label encode categorical features for CatBoost (it needs integers)
# Apply Label Encoding to the original categorical features for CatBoost
cat_features_for_catboost = [col for col in categorical_features if col in X.columns] # Ensure they exist in X
for col in cat_features_for_catboost:
    le = LabelEncoder()
    # Fit on combined train and test categories to handle unseen categories in test
    combined_categories = pd.concat([X[col], X_test_prep[col]], axis=0).astype(str).unique()
    le.fit(combined_categories)
    X[col] = le.transform(X[col].astype(str))
    X_test_prep[col] = le.transform(X_test_prep[col].astype(str))

# Define models to evaluate
models = {
    'XGBoost': xgb.XGBRegressor(objective='reg:squarederror', eval_metric='rmse', n_estimators=1000, 
                                learning_rate=0.05, max_depth=7, subsample=0.7, colsample_bytree=0.7, 
                                random_state=42, n_jobs=-1, tree_method='hist', # Added more estimators
                                early_stopping_rounds=50),
    'LightGBM': lgb.LGBMRegressor(objective='regression', metric='rmse', n_estimators=1000, 
                                  learning_rate=0.05, num_leaves=31, max_depth=-1, 
                                  subsample=0.7, colsample_bytree=0.7, random_state=42, n_jobs=-1,
                                  callbacks=[lgb.early_stopping(50, verbose=False)]),
    'CatBoost': cb.CatBoostRegressor(objective='RMSE', iterations=1000, learning_rate=0.05, 
                                    depth=7, l2_leaf_reg=3, random_seed=42, verbose=0,
                                    early_stopping_rounds=50,
                                    cat_features=cat_features_for_catboost),
    'GradientBoosting': GradientBoostingRegressor(n_estimators=500, learning_rate=0.05, 
                                                max_depth=5, subsample=0.7, random_state=42),
    'RandomForest': RandomForestRegressor(n_estimators=300, max_depth=10, 
                                          min_samples_leaf=5, random_state=42, n_jobs=-1)
}

# TimeSeriesSplit for cross-validation
# Ensure splits are large enough, adjusting for data size
tscv = TimeSeriesSplit(n_splits=5) 

# Store RMSE results for each model
model_rmse_scores = {}

# Iterate through models and perform cross-validation
for model_name, model in models.items():
    print(f"\n--- Evaluating {model_name} ---")
    fold_rmses = []

    # Use appropriate X_train and X_test for each model type
    current_X_train = X_train_encoded if model_name not in ['CatBoost'] else X
    current_X_test = X_test_encoded if model_name not in ['CatBoost'] else X_test_prep

    # Prepare data for CatBoost if it's the current model
    if model_name == 'CatBoost':
        # Need to re-create X and X_test_prep with label encoded categoricals each time
        # for CatBoost specifically, as other models use one-hot encoded.
        # This is handled by passing X and X_test_prep (which have been label encoded earlier)
        # and letting CatBoost's `cat_features` parameter do the rest.
        pass # X and X_test_prep already have label encoded categoricals from earlier

    # For models with early stopping, eval_set is used. For others, fit directly.
    requires_eval_set = model_name in ['XGBoost', 'LightGBM', 'CatBoost']

    for fold, (train_index, val_index) in enumerate(tscv.split(current_X_train)):
        X_train_fold, X_val_fold = current_X_train.iloc[train_index], current_X_train.iloc[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

        if requires_eval_set:
            if model_name == 'CatBoost':
                model.fit(X_train_fold, y_train_fold, 
                          eval_set=(X_val_fold, y_val_fold),
                          verbose=False) # Keep verbose for CatBoost as it's directly supported
            else: # XGBoost, LightGBM
                # Removed verbose=False from here to avoid the TypeError
                model.fit(X_train_fold, y_train_fold,
                          eval_set=[(X_val_fold, y_val_fold)]) 
        else: # GradientBoosting, RandomForest
            model.fit(X_train_fold, y_train_fold)

        val_predictions = model.predict(X_val_fold)
        val_rmse = np.sqrt(mean_squared_error(y_val_fold, val_predictions))
        fold_rmses.append(val_rmse)
        print(f"  Fold {fold+1} RMSE: {val_rmse:.4f}")
    
    avg_rmse = np.mean(fold_rmses)
    model_rmse_scores[model_name] = avg_rmse
    print(f"Average {model_name} RMSE: {avg_rmse:.4f}\n")

    # Clear memory for the current model if possible
    del model
    gc.collect() # Garbage collection

# --- 6. Select Best Model and Retrain ---
best_model_name = min(model_rmse_scores, key=model_rmse_scores.get)
print(f"\n--- Best Model Selected: {best_model_name} with average RMSE: {model_rmse_scores[best_model_name]:.4f} ---")

best_model = models[best_model_name] # Retrieve the original model instance

# Retrain the best model on the entire training dataset
print(f"Retraining {best_model_name} on the full training data...")

# Use appropriate X_train for the best model
if best_model_name in ['CatBoost']:
    final_X_train = X
    final_X_test = X_test_prep
    # For CatBoost, the cat_features need to be specified during fit if not in constructor
    # Re-instantiate CatBoost to ensure it uses the full dataset properly
    best_model = cb.CatBoostRegressor(objective='RMSE', iterations=1000, learning_rate=0.05, 
                                    depth=7, l2_leaf_reg=3, random_seed=42, verbose=0,
                                    cat_features=cat_features_for_catboost) # No early stopping on full data
else:
    final_X_train = X_train_encoded
    final_X_test = X_test_encoded
    # For XGBoost/LightGBM, remove early_stopping_rounds for final fit on full data
    if best_model_name == 'XGBoost':
        # Removed early_stopping_rounds for final fit on full data, and verbose=False
        best_model = xgb.XGBRegressor(objective='reg:squareerrot', n_estimators=1000, 
                                    learning_rate=0.05, max_depth=7, subsample=0.7, colsample_bytree=0.7, 
                                    random_state=42, n_jobs=-1, tree_method='hist')
    elif best_model_name == 'LightGBM':
        # Removed callbacks for final fit on full data, and verbose=False
        best_model = lgb.LGBMRegressor(objective='regression', n_estimators=1000, 
                                    learning_rate=0.05, num_leaves=31, max_depth=-1, 
                                    subsample=0.7, colsample_bytree=0.7, random_state=42, n_jobs=-1)
    # For other models, no change needed

best_model.fit(final_X_train, y_train)

train_predictions = best_model.predict(final_X_train)
train_rmse = np.sqrt(mean_squared_error(y_train, train_predictions))
print(f"Final Training RMSE with best model ({best_model_name}): {train_rmse:.4f}")

# --- 7. Prediction and Submission File Creation ---
print("\n--- Generating Predictions and Submission File ---")

predictions = best_model.predict(final_X_test)

# Post-processing: Ensure predictions are non-negative integers
predictions[predictions < 0] = 0
predictions = np.round(predictions).astype(int)

# Create the submission file
submission_df = pd.DataFrame({'route_key': df_model_test['route_key'], 'final_seatcount': predictions})

# Save the submission file
submission_df.to_csv('submission_file.csv', index=False)

print("Submission file 'submission_file.csv' created successfully.")
print("\nTop 5 rows of the submission file:")
print(submission_df.head())

# --- 8. (Optional) Feature Importance for the Best Model ---
print(f"\n--- Top 15 Feature Importances for {best_model_name} ---")
# Feature importance might not be available or directly interpretable for all models
try:
    if hasattr(best_model, 'feature_importances_'):
        feature_imp = pd.DataFrame({'Value': best_model.feature_importances_, 'Feature': final_X_train.columns})
        print(feature_imp.sort_values(by="Value", ascending=False).head(15))
    elif hasattr(best_model, 'coef_'): # For linear models, though not used here
        print("Coefficients are not suitable for feature importance for this model type.")
    else:
        print("Feature importances not directly available for this model type.")
except Exception as e:
    print(f"Could not retrieve feature importances: {e}")


All data files loaded successfully.

--- Starting Data Cleaning and Preprocessing ---
Date columns converted to datetime objects.

--- Engineering Common and Transaction Features ---
Preparing transaction-based features (dbd values).
Feature engineering complete.

--- Preparing Model Training and Test Sets ---
Final training data shape after merge: (67200, 32)
Final test data shape after merge: (5900, 32)

--- Training and Evaluating Multiple ML Models ---

--- Evaluating XGBoost ---
[0]	validation_0-rmse:1055.16459
[1]	validation_0-rmse:1020.90100
[2]	validation_0-rmse:990.24773
[3]	validation_0-rmse:959.76933
[4]	validation_0-rmse:928.58210
[5]	validation_0-rmse:903.98006
[6]	validation_0-rmse:881.36910
[7]	validation_0-rmse:859.92458
[8]	validation_0-rmse:836.69565
[9]	validation_0-rmse:816.47558
[10]	validation_0-rmse:798.26299
[11]	validation_0-rmse:781.77929
[12]	validation_0-rmse:766.08795
[13]	validation_0-rmse:751.23733
[14]	validation_0-rmse:738.34749
[15]	validation_0-rmse:7