In [None]:
# === CELL 1: IMPORTS (V46 - RF/XGB Focus) ===
print("--- Cell 1: Imports ---")
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, FunctionTransformer
from sklearn.metrics import mean_squared_error
# New models
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# --- Notebook-wide runtime / display configuration ---
# NOTE: this silences every warning category for the rest of the session.
warnings.filterwarnings(action='ignore')

# Show all columns of wide frames and render floats with two decimals.
pd.options.display.max_columns = None
pd.options.display.float_format = '{:.2f}'.format

print("✅ Libraries imported (incl. RF & XGB).")

In [None]:
# === CELL 2: DATA LOADING & INITIAL CLEANING ===
# Reads the training CSV into `df` (or leaves `df` as None when loading fails,
# which every later cell uses as its "skip" signal), strips whitespace from the
# column names and parses the two date columns.
print("\n--- Cell 2: Data Loading & Initial Cleaning ---")

df = None  # stays None unless the load below fully succeeds
try:
    df = pd.read_csv('../data/train.csv')
    df.columns = df.columns.str.strip()
    print(f"Loaded train.csv: {df.shape}")
except FileNotFoundError:
    print("Error: ../data/train.csv not found.")
except Exception as e:
    print(f"Error loading train.csv: {e}")
    df = None  # discard a partially prepared frame

if df is not None:
    print("Converting date columns (temporarily)...")
    # errors='coerce' turns unparseable dates into NaT instead of raising.
    for date_col in ('Order_Placed_Date', 'Delivery_Date'):
        df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
    print("✅ Basic loading complete.")

In [None]:
# === CELL 3: V45/V46 FEATURE ENGINEERING & SELECTION (Focus 2) ===
# Builds the modelling inputs from `df` (loaded in Cell 2):
#   * prediction_cap_value_995 - 99.5th percentile of |Transport_Cost|
#   * y                        - log1p(|Transport_Cost|)
#   * X                        - the "Focus 2" feature columns
#   * X_train / X_val / y_train / y_val - 80/20 split of X and y
if df is not None:
    print("\n--- Cell 3: V46 Feature Engineering & Selection ---")

    print("[Step 1/4] Repairing negative values using abs()...")
    # Negative values are repaired by taking the absolute value. Both
    # assignments are idempotent, so re-running this cell is safe.
    df['Delivery_Days_temp'] = (df['Delivery_Date'] - df['Order_Placed_Date']).dt.days
    df['Delivery_Days_temp'] = df['Delivery_Days_temp'].abs() # Repair
    df['Transport_Cost'] = df['Transport_Cost'].abs() # Use abs()

    try:
        prediction_cap_value_995 = df['Transport_Cost'].dropna().quantile(0.995)
        # quantile() of an empty/all-NaN column yields NaN without raising; a NaN
        # cap would turn every clipped prediction into NaN, so use the fallback.
        if not np.isfinite(prediction_cap_value_995):
            raise ValueError("no non-missing Transport_Cost values")
        print(f"   ✓ Calculated Prediction Cap (99.5th percentile): {prediction_cap_value_995:,.2f}")
    except Exception as e:
        print(f"   ! Warning: Could not calculate prediction cap: {e}. Using fallback.")
        prediction_cap_value_995 = 1500000
    print("   ✓ Negatives repaired.")

    print("\n[Step 2/4] Defining target variable (y)...")
    # Rows without a target cannot be used for supervised training: a NaN in y
    # makes the estimator fits in the tuning cells fail.
    has_target = df['Transport_Cost'].notna()
    n_missing_target = int((~has_target).sum())
    if n_missing_target:
        print(f"   ! Warning: Excluding {n_missing_target} rows with missing Transport_Cost from X/y.")
    y = np.log1p(df.loc[has_target, 'Transport_Cost']) # Log-transform of abs(cost)
    print(f"   ✓ Target (y) created: {y.shape}")

    print("\n[Step 3/4] Selecting features (X) - Focus 2...")
    features_to_keep = ['Equipment_Value', 'Equipment_Weight']
    features_present = [col for col in features_to_keep if col in df.columns]
    features_absent = [col for col in features_to_keep if col not in df.columns]
    if features_absent:
        # Make a silently shrinking feature set visible to the reader.
        print(f"   ! Warning: Expected features missing in train data: {features_absent}")
    X = df.loc[has_target, features_present].copy() # Create X with ONLY these columns
    print(f"   ✓ Feature set 'X' created (Focus 2): {X.shape}")
    print(f"   ✓ Features: {X.columns.tolist()}")

    print("\n[Step 4/4] Splitting data into Train/Validation (80/20)...")
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    print("   ✓ Data split complete.")

    print("✅ V46 Feature Engineering/Selection complete.")

else:
    print("   ✗ Feature engineering skipped due to data loading error.")

In [None]:
# === CELL 4: V46 PREPROCESSING PIPELINE DEFINITION & FITTING ===
# Defines `preprocessor`, a ColumnTransformer that applies
# Impute(median) -> Clip(>= 0) -> log1p -> StandardScaler to every column of
# X_train, and fits it on the training split.
if 'X_train' in locals() and not X_train.empty:
    print("\n--- Cell 4: V46 Preprocessing Pipeline (Impute -> Clip -> Log -> Scale) ---")

    # --- Define Transformer Pipeline for the 2 features ---
    # Apply Impute -> Clip(0) -> Log1p -> Scale
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        # np.clip with only a lower bound is equivalent to np.maximum(x, 0).
        # Using a module-level NumPy function instead of a lambda keeps the
        # transformer (and any pipeline containing it) picklable.
        ('clipper', FunctionTransformer(np.clip, kw_args={'a_min': 0, 'a_max': None}, validate=False)), # Clip >= 0
        ('log', FunctionTransformer(np.log1p, validate=False)), # Apply log1p
        ('scaler', StandardScaler()) # Then scale
    ])
    print("   ✓ Transformer defined (Impute -> Clip(0) -> Log1p -> StandardScaler).")

    # --- Assemble Preprocessor ---
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, X_train.columns.tolist()) # Apply to all columns in X_train
        ],
        remainder='drop'
    )
    print("   ✓ Preprocessor assembled.")

    # --- Fit Preprocessor ---
    print("Fitting preprocessor on X_train...")
    preprocessor.fit(X_train) # Fit only on the training part
    print("✅ V46 Preprocessor fitted.")
else:
    print("   ✗ Preprocessor fitting skipped (X_train not defined or empty).")

In [None]:
# === CELL 5: HYPERPARAMETER TUNING (RANDOM FOREST) ===
# Runs a 5-fold RandomizedSearchCV (50 candidates) over a
# preprocessor + RandomForestRegressor pipeline on the training split, then
# scores the best pipeline on the held-out validation split.
# Produces: rf_search, rf_best_params, rf_best_cv_rmse, rf_best_model,
#           rf_val_rmse_log, rf_val_rmse_orig.
if 'X_train' in locals() and 'preprocessor' in locals() and preprocessor is not None:
    print("\n--- Cell 5: Tuning Random Forest (V46 Data) ---")

    # --- RF pipeline ---
    rf_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor), # Use V46 preprocessor
        ('rf', RandomForestRegressor(random_state=42, n_jobs=-1))
    ])

    # --- Hyperparameter grid (for RandomizedSearch) ---
    # Keys use the '<pipeline step>__<parameter>' convention.
    param_dist_rf = {
        'rf__n_estimators': [100, 200, 300, 400, 500],
        'rf__max_depth': [None, 10, 20, 30, 40],
        'rf__min_samples_leaf': [1, 2, 4, 6],
        'rf__min_samples_split': [2, 5, 10],
        'rf__max_features': [1.0, 'sqrt', 'log2'] # 1.0 is all features (equiv. to 'auto')
    }
    # Single backslash: print a real line break, not a literal "\n".
    print(f"RandomizedSearch Parameter Grid (RF):\n{param_dist_rf}")

    # --- RandomizedSearchCV setup ---
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    rf_search = RandomizedSearchCV(
        estimator=rf_pipeline,
        param_distributions=param_dist_rf,
        n_iter=50, # Number of combinations to try
        scoring='neg_root_mean_squared_error',
        cv=kf,
        n_jobs=-1, # Use all cores
        random_state=42,
        verbose=2
    )

    print("Starting RandomizedSearchCV for Random Forest...")
    rf_search.fit(X_train, y_train) # Fit on the 80% training split

    print("\n✅ Random Forest Tuning complete!")
    rf_best_params = rf_search.best_params_
    rf_best_cv_rmse = -rf_search.best_score_ # scorer is negated RMSE
    print(f"   Best hyperparameters (RF): {rf_best_params}")
    print(f"   Best CV RMSE (log-space, RF): {rf_best_cv_rmse:.4f}")

    # --- Evaluate on Validation Set ---
    rf_best_model = rf_search.best_estimator_
    y_val_pred_log = rf_best_model.predict(X_val)
    rf_val_rmse_log = np.sqrt(mean_squared_error(y_val, y_val_pred_log))
    # Undo the log1p target transform to report the error in cost units.
    y_val_orig = np.expm1(y_val)
    y_val_pred_orig = np.maximum(np.expm1(y_val_pred_log), 0) # Clip preds >= 0
    rf_val_rmse_orig = np.sqrt(mean_squared_error(y_val_orig, y_val_pred_orig))
    print("--- RF Validation Set Performance ---")
    print(f"   Validation RMSE (log-space): {rf_val_rmse_log:.4f}")
    print(f"   Validation RMSE (original scale): {rf_val_rmse_orig:,.2f}")

else:
    print("   ✗ RF Tuning skipped - check previous cells.")

In [None]:
# === CELL 6: HYPERPARAMETER TUNING (XGBOOST) ===
# Runs a 5-fold RandomizedSearchCV (50 candidates) over a
# preprocessor + XGBRegressor pipeline on the training split, then scores the
# best pipeline on the held-out validation split.
# Produces: xgb_search, xgb_best_params, xgb_best_cv_rmse, xgb_best_model,
#           xgb_val_rmse_log, xgb_val_rmse_orig.
if 'X_train' in locals() and 'preprocessor' in locals() and preprocessor is not None:
    print("\n--- Cell 6: Tuning XGBoost (V46 Data) ---")

    # --- XGB pipeline ---
    xgb_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor), # Use V46 preprocessor
        ('xgb', XGBRegressor(random_state=42, objective='reg:squarederror', eval_metric='rmse', n_jobs=-1))
    ])

    # --- Hyperparameter grid (for RandomizedSearch) ---
    # Keys use the '<pipeline step>__<parameter>' convention.
    param_dist_xgb = {
        'xgb__n_estimators': [100, 200, 300, 400, 500, 600],
        'xgb__max_depth': [3, 5, 7, 9, 11],
        'xgb__learning_rate': [0.01, 0.02, 0.05, 0.1, 0.15],
        'xgb__subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
        'xgb__colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
        'xgb__gamma': [0, 0.1, 0.2, 0.3] # Regularization
    }
    # Single backslash: print a real line break, not a literal "\n".
    print(f"RandomizedSearch Parameter Grid (XGB):\n{param_dist_xgb}")

    # --- RandomizedSearchCV setup ---
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    xgb_search = RandomizedSearchCV(
        estimator=xgb_pipeline,
        param_distributions=param_dist_xgb,
        n_iter=50, # Number of combinations to try
        scoring='neg_root_mean_squared_error',
        cv=kf,
        n_jobs=-1, # Use all cores
        random_state=42,
        verbose=2
    )

    print("Starting RandomizedSearchCV for XGBoost...")
    xgb_search.fit(X_train, y_train) # Fit on the 80% training split

    print("\n✅ XGBoost Tuning complete!")
    xgb_best_params = xgb_search.best_params_
    xgb_best_cv_rmse = -xgb_search.best_score_ # scorer is negated RMSE
    print(f"   Best hyperparameters (XGB): {xgb_best_params}")
    print(f"   Best CV RMSE (log-space, XGB): {xgb_best_cv_rmse:.4f}")

    # --- Evaluate on Validation Set ---
    xgb_best_model = xgb_search.best_estimator_
    y_val_pred_log = xgb_best_model.predict(X_val)
    xgb_val_rmse_log = np.sqrt(mean_squared_error(y_val, y_val_pred_log))
    # Undo the log1p target transform to report the error in cost units.
    y_val_orig = np.expm1(y_val)
    y_val_pred_orig = np.maximum(np.expm1(y_val_pred_log), 0) # Clip preds >= 0
    xgb_val_rmse_orig = np.sqrt(mean_squared_error(y_val_orig, y_val_pred_orig))
    print("--- XGB Validation Set Performance ---")
    print(f"   Validation RMSE (log-space): {xgb_val_rmse_log:.4f}")
    print(f"   Validation RMSE (original scale): {xgb_val_rmse_orig:,.2f}")

else:
    print("   ✗ XGB Tuning skipped - check previous cells.")

In [None]:
# === CELL 7: TRAIN FINAL MODEL (RF or XGB) ===
# Picks whichever tuned model scored the lower log-space validation RMSE in
# Cells 5/6, rebuilds it with its best hyperparameters and fits it - together
# with a freshly built preprocessor - on the full dataset (X, y).
# Produces: final_model_name, final_params, final_model_obj,
#           final_preprocessor, final_pipeline.

# Check if both tuning cells have run and produced scores
if ('rf_val_rmse_log' in locals() and 'xgb_val_rmse_log' in locals() and
    'X' in locals() and 'y' in locals()):
    print("\n--- Cell 7: Training Final Model on ALL V46 Data ---")

    # --- Select the Best Model based on Validation RMSE ---
    # Ties (and NaN scores, for which `<` is False) fall through to XGBoost.
    if rf_val_rmse_log < xgb_val_rmse_log:
        print(f"   Selecting Random Forest (Val RMSE: {rf_val_rmse_log:.4f}) over XGBoost (Val RMSE: {xgb_val_rmse_log:.4f})")
        final_model_name = 'rf'
        final_params = rf_best_params
    else:
        print(f"   Selecting XGBoost (Val RMSE: {xgb_val_rmse_log:.4f}) over Random Forest (Val RMSE: {rf_val_rmse_log:.4f})")
        final_model_name = 'xgb'
        final_params = xgb_best_params

    # Re-create the model object with best params.
    # best_params_ keys look like '<step>__<param>'. Stripping the step prefix
    # forwards EVERY tuned hyperparameter, so the final model cannot drift
    # from the search grids when a parameter is added or removed there.
    step_prefix = f"{final_model_name}__"
    tuned_kwargs = {
        key[len(step_prefix):]: value
        for key, value in final_params.items()
        if key.startswith(step_prefix)
    }
    if final_model_name == 'rf':
        fixed_kwargs = {'random_state': 42, 'n_jobs': -1}
        final_model_obj = RandomForestRegressor(**{**fixed_kwargs, **tuned_kwargs})
    else:
        fixed_kwargs = {
            'random_state': 42,
            'objective': 'reg:squarederror',
            'eval_metric': 'rmse',
            'n_jobs': -1,
        }
        final_model_obj = XGBRegressor(**{**fixed_kwargs, **tuned_kwargs})

    print(f"   ✓ Using best params for {final_model_name}: {final_params}")

    # --- Re-fit preprocessor on FULL X data ---
    print("   Re-fitting V46 preprocessor on full dataset X...")
    # Rebuild the final preprocessor object using definitions from Cell 4
    log_transformer_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        # np.clip with only a lower bound is equivalent to np.maximum(x, 0).
        # A module-level NumPy function (unlike a lambda) keeps final_pipeline
        # picklable, so it can be saved to disk later.
        ('clipper', FunctionTransformer(np.clip, kw_args={'a_min': 0, 'a_max': None}, validate=False)),
        ('log', FunctionTransformer(np.log1p, validate=False)),
        ('scaler', StandardScaler())
    ])
    final_preprocessor = ColumnTransformer(transformers=[
        ('num', log_transformer_pipeline, X.columns.tolist())
        ], remainder='drop')

    # Pipeline.fit further down fits this step again on the same data; the
    # explicit call is kept so the progress messages stay truthful.
    final_preprocessor.fit(X) # Fit on ALL X data
    print("   ✓ Preprocessor re-fitted on full X dataset.")

    # Create the final pipeline
    final_pipeline = Pipeline(steps=[
        ('preprocessor', final_preprocessor), # Use the re-fitted preprocessor
        (final_model_name, final_model_obj)  # Add the best model
    ])

    # === Fit final model on full V46 dataset ===
    print(f"Fitting final {final_model_name} model on the full V46 dataset (X, y)...")
    print("X shape:", X.shape)
    print("y shape:", y.shape)

    final_pipeline.fit(X, y) # Train on full X and y

    print(f"\n✅ Final (V46 {final_model_name.upper()}) model trained.")
    print("The 'final_pipeline' object is ready for prediction.")

else:
    print("   ✗ Final model training skipped. Required variables not found.")
    if 'rf_val_rmse_log' not in locals(): print("     - Random Forest tuning (Cell 5) not run.")
    if 'xgb_val_rmse_log' not in locals(): print("     - XGBoost tuning (Cell 6) not run.")
    if 'X' not in locals() or 'y' not in locals(): print("     - Full dataset 'X' or 'y' not defined (Cell 3).")

In [None]:
# === CELL 8: V46 TEST DATA PREPARATION FUNCTION ===
# (This is identical to V45 as it only selects the 2 features)

def prepare_features_V46(df_raw, train_cols_expected):
    """Return a copy of ``df_raw`` restricted to the training feature columns.

    Column names are whitespace-stripped first. The result has exactly the
    columns in ``train_cols_expected``, in that order; any expected column that
    is absent from the input is reported and comes back filled with NaN.
    ``df_raw`` itself is not modified.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Raw test data.
    train_cols_expected : iterable of column labels
        Feature columns the fitted pipeline expects.

    Returns
    -------
    pandas.DataFrame or None
        The aligned feature frame, or None if the columns could not be aligned
        (the error is printed rather than raised).
    """
    print("Preparing test features (V46 - Focus 2)...")
    df_test = df_raw.copy()
    df_test.columns = df_test.columns.str.strip()

    # --- Select ONLY the 2 key features ---
    missing_in_test = set(train_cols_expected) - set(df_test.columns)
    if missing_in_test:
        # No explicit fill is needed: reindex() below creates every absent
        # column as an all-NaN column.
        print(f"   ! Warning: Expected features missing in test data: {missing_in_test}. Adding as NaN.")

    try:
        df_test_final = df_test.reindex(columns=train_cols_expected)
    except Exception as e:
        print(f"   ✗ Error ensuring column consistency: {e}")
        return None
    else:
        print(f"Test data prepared. Shape: {df_test_final.shape}")
        return df_test_final


print("\n✅ V46 Test preparation function defined.")

In [None]:
# === CELL 9: GENERATE SUBMISSION (V46 - Final Model) ===
# Loads the test CSV, aligns its features with the training columns, predicts
# with `final_pipeline`, converts predictions back from log space, clips them
# to [0, prediction_cap_value_995] and writes 'model.csv'.

# Check if the final model, prepare function, cap value, and X exist
if ('final_pipeline' in locals() and
    'prepare_features_V46' in globals() and
    'X' in locals() and
    'prediction_cap_value_995' in locals()):
    print("\n--- Cell 9: Generating Submission File (V46 Pipeline) ---")
    try:
        # Load the raw test data
        df_test_raw = pd.read_csv('../data/test.csv')
        # Strip header whitespace exactly as is done for the training data, so
        # the 'Hospital_Id' lookup below does not fail on a padded column name.
        df_test_raw.columns = df_test_raw.columns.str.strip()
        submission_ids = df_test_raw['Hospital_Id']
        print(f"Loaded test.csv: {df_test_raw.shape}")

        # Prepare test features using V46 function
        X_test_final_raw = prepare_features_V46(df_test_raw, X.columns)

        if X_test_final_raw is not None:
            print("Getting predictions from the final V46 model...")
            log_predictions = final_pipeline.predict(X_test_final_raw)

            print("Converting predictions from log scale...")
            # Inverse of the log1p transform applied to the training target.
            final_predictions = np.expm1(log_predictions)

            # --- Apply Aggressive Safety Clip (99.5th Percentile) ---
            print(f"Applying safety clip to predictions at {prediction_cap_value_995:,.2f} (99.5th percentile)")
            final_predictions = np.clip(final_predictions, 0, prediction_cap_value_995)

            print("Creating submission DataFrame...")
            submission_df = pd.DataFrame({
                'Hospital_Id': submission_ids,
                'Transport_Cost': final_predictions
            })

            output_filename = 'model.csv'
            submission_df.to_csv(output_filename, index=False)
            print(f"\n✅ Submission file '{output_filename}' saved successfully.")
            print("Final Predictions Head:")
            print(submission_df.head())

        else:
            print("   ✗ Submission generation failed: Error preparing test features.")

    except FileNotFoundError:
        print("   ✗ Error: ../data/test.csv not found.")
    except Exception as e:
        # Best-effort cell: report the problem instead of aborting the notebook.
        print(f"   ✗ An unexpected error occurred during submission generation: {e}")

else:
    print("\n--- Submission Generation Skipped ---")
    if 'final_pipeline' not in locals(): print("   Reason: Final V46 model not trained (Run Cell 7).")
    if 'prepare_features_V46' not in globals(): print("   Reason: 'prepare_features_V46' function not defined (Run Cell 8).")
    if 'X' not in locals(): print("   Reason: Training features 'X' not defined (Run Cell 3).")
    if 'prediction_cap_value_995' not in locals(): print("   Reason: 'prediction_cap_value_995' not defined (Run Cell 3).")