In [None]:
# === CELL 1: IMPORTS (V55 - Random Forest, 2 Features) ===
print("--- Cell 1: Imports ---")
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Notebook-wide settings.
# NOTE: this silences *every* warning category for the whole session.
warnings.simplefilter('ignore')
# Show all columns and render floats with two decimals when frames are displayed.
pd.options.display.max_columns = None
pd.options.display.float_format = '{:.2f}'.format
print("✅ Libraries imported (incl. RandomForest, RandomizedSearchCV).")

In [None]:
# === CELL 2: DATA LOADING ===
print("\n--- Cell 2: Data Loading ---")

# Read the training CSV. On any failure the problem is reported and `df` is
# set to None, which the later cells check before doing any work.
try:
    df = pd.read_csv('../data/train.csv')
    # Normalise header names by trimming surrounding whitespace.
    df.columns = df.columns.str.strip()
    print(f"Loaded train.csv: {df.shape}")
except FileNotFoundError:
    df = None
    print("Error: ../data/train.csv not found.")
except Exception as load_error:
    df = None
    print(f"Error loading train.csv: {load_error}")

if df is not None:
    # Minimal cleaning: parse the two date columns; values that cannot be
    # parsed become NaT because of errors='coerce'.
    df['Order_Placed_Date'] = pd.to_datetime(df['Order_Placed_Date'], errors='coerce')
    df['Delivery_Date'] = pd.to_datetime(df['Delivery_Date'], errors='coerce')
    print("✅ Basic loading complete.")

In [None]:
# === CELL 3: V55 FEATURE ENGINEERING (Focus 2, abs()) ===
# Builds the modelling inputs from `df`:
#   1. repairs negative Transport_Cost values with abs() and derives the
#      prediction cap (99.5th percentile of the repaired target),
#   2. creates the log1p-transformed target `y`,
#   3. creates the two-column feature frame `X` (abs() applied to both features),
#   4. splits X / y 80/20 into train and validation sets.
# Rows whose target is missing are excluded from X and y (see Step 2).
if df is not None:
    print("\n--- Cell 3: V55 Feature Engineering & Selection ---")

    print("[Step 1/4] Repairing Transport_Cost using abs()...")
    # We return to the V45 method of abs() for the target
    df['Transport_Cost'] = df['Transport_Cost'].abs()

    try:
        prediction_cap_value_995 = df['Transport_Cost'].dropna().quantile(0.995)
        # quantile() returns NaN (it does not raise) when no values are left,
        # so a non-finite cap must be routed to the fallback explicitly;
        # otherwise the NaN would later be used as the clip bound.
        if not np.isfinite(prediction_cap_value_995):
            raise ValueError("no non-missing Transport_Cost values")
        print(f"   ✓ Calculated Prediction Cap (99.5th percentile): {prediction_cap_value_995:,.2f}")
    except Exception as e:
        print(f"   ! Warning: Could not calculate prediction cap: {e}. Using fallback.")
        prediction_cap_value_995 = 1500000
    print("   ✓ Negatives repaired.")

    print("\n[Step 2/4] Defining target variable (y)...")
    # A row without a target cannot be used for supervised training, and a NaN
    # in y makes the estimator's fit() fail. Keep only rows with a known target;
    # the same mask is applied to X below so both stay aligned.
    target_known = df['Transport_Cost'].notna()
    n_missing_target = int((~target_known).sum())
    if n_missing_target:
        print(f"   ! Dropping {n_missing_target} row(s) with missing Transport_Cost.")
    y = np.log1p(df.loc[target_known, 'Transport_Cost'])  # Log-transform of abs(cost)
    print(f"   ✓ Target (y) created: {y.shape}")

    print("\n[Step 3/4] Selecting features (X) - Focus 2...")
    features_to_keep = ['Equipment_Value', 'Equipment_Weight']

    # --- Apply abs() to features ---
    df['Equipment_Value'] = df['Equipment_Value'].abs()
    df['Equipment_Weight'] = df['Equipment_Weight'].abs()

    # Create X with ONLY these columns, restricted to rows that have a target.
    X = df.loc[target_known, features_to_keep].copy()
    print(f"   ✓ Feature set 'X' created (Focus 2): {X.shape}")
    print(f"   ✓ Features: {X.columns.tolist()}")

    print("\n[Step 4/4] Splitting data into Train/Validation (80/20)...")
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    print(f"   ✓ Data split complete.")

    print("✅ V55 Feature Engineering/Selection complete.")

else:
    print("   ✗ Feature engineering skipped due to data loading error.")

In [None]:
# === CELL 4: V55 PREPROCESSING PIPELINE (TREE-FRIENDLY) ===
# Defines and fits `preprocessor`, a ColumnTransformer that median-imputes
# every column of X_train. Skipped when X_train is missing or empty.
if 'X_train' not in locals() or X_train.empty:
    print("   ✗ Preprocessor fitting skipped (X_train not defined or empty).")
else:
    print("\n--- Cell 4: V55 Preprocessing Pipeline (Impute Only) ---")

    # This pipeline targets TREE models: impute only, no log transform and
    # no scaling of the features.
    numeric_transformer = Pipeline([('imputer', SimpleImputer(strategy='median'))])
    print("   ✓ Transformer defined (Impute -> Median).")

    # Apply the imputer to all training columns; anything else is dropped.
    preprocessor = ColumnTransformer(
        [('num', numeric_transformer, X_train.columns.tolist())],
        remainder='drop',
    )
    print("   ✓ Preprocessor assembled.")

    print("Fitting preprocessor on X_train...")
    preprocessor.fit(X_train)
    print("✅ V55 Preprocessor fitted.")

In [None]:
# === CELL 5: HYPERPARAMETER TUNING (RANDOM FOREST) ===
# Runs a randomized hyperparameter search (50 candidates, 5-fold CV) for a
# preprocessor + RandomForestRegressor pipeline on the training split, then
# reports the best CV score and the RMSE on the held-out validation split in
# both log space and the original cost scale.
# Produces: rf_pipeline, param_dist_rf, rf_search, best_params, best_cv_rmse,
# best_model_tuned and the validation metrics.
if 'X_train' in locals() and 'preprocessor' in locals() and preprocessor is not None:
    print("\n--- Cell 5: Tuning Random Forest (V55 Data) ---")

    # --- RF pipeline ---
    rf_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor), # Use V55 preprocessor
        ('rf', RandomForestRegressor(random_state=42, n_jobs=-1))
    ])

    # --- Hyperparameter grid (Wide search for RandomizedSearch) ---
    # Keys carry the 'rf__' prefix so they address the 'rf' pipeline step.
    param_dist_rf = {
        'rf__n_estimators': [100, 200, 300, 400, 500],
        'rf__max_depth': [10, 20, 30, 40, None], # None = no limit
        'rf__min_samples_leaf': [1, 2, 4, 6],
        'rf__min_samples_split': [2, 5, 10],
        'rf__max_features': [1.0, 0.75, 0.5, 'sqrt'] # 1.0 = all features
    }
    # Single backslash: emit a real line break before the grid (the previous
    # '\\n' printed the two characters '\' and 'n' literally).
    print(f"RandomizedSearch Parameter Grid (RF):\n{param_dist_rf}")

    # --- RandomizedSearchCV setup ---
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    rf_search = RandomizedSearchCV(
        estimator=rf_pipeline,
        param_distributions=param_dist_rf,
        n_iter=50, # Try 50 different combinations
        scoring='neg_root_mean_squared_error',
        cv=kf,
        n_jobs=-1, # Use all cores
        random_state=42,
        verbose=2
    )

    print("Starting RandomizedSearchCV for Random Forest...")
    rf_search.fit(X_train, y_train) # Fit on the 80% training split

    print("\n✅ Random Forest Tuning complete!")
    best_params = rf_search.best_params_
    # The scorer is the *negated* RMSE, so flip the sign to report an RMSE.
    best_cv_rmse = -rf_search.best_score_
    print(f"   Best hyperparameters (RF): {best_params}")
    print(f"   Best CV RMSE (log-space, RF): {best_cv_rmse:.4f}")

    # --- Evaluate on Validation Set ---
    best_model_tuned = rf_search.best_estimator_
    y_val_pred_log = best_model_tuned.predict(X_val)
    val_rmse_log = np.sqrt(mean_squared_error(y_val, y_val_pred_log))
    # Undo the log1p transform to measure the error on the original scale.
    y_val_orig = np.expm1(y_val)
    y_val_pred_orig = np.maximum(np.expm1(y_val_pred_log), 0) # Clip preds >= 0
    val_rmse_orig = np.sqrt(mean_squared_error(y_val_orig, y_val_pred_orig))
    print(f"--- RF Validation Set Performance ---")
    print(f"   Validation RMSE (log-space): {val_rmse_log:.4f}")
    print(f"   Validation RMSE (original scale): {val_rmse_orig:,.2f}")

else:
    print("   ✗ RF Tuning skipped - check previous cells.")

In [None]:
# === CELL 6: TRAIN FINAL MODEL (V55 - Random Forest) ===
# Rebuilds the impute-only preprocessor and a RandomForestRegressor configured
# with the tuned hyperparameters, then fits the combined `final_pipeline` on
# the full X / y. Skipped (with a reason) when prerequisites are missing.

if not ('best_params' in locals() and 'X' in locals() and 'y' in locals()):
    print("   ✗ Final model training skipped. Required variables not found.")
    if 'best_params' not in locals():
        print("     - Best parameters not found (Run RandomizedSearch - Cell 5).")
    if 'X' not in locals() or 'y' not in locals():
        print("     - Full dataset 'X' or 'y' not defined (Cell 3).")
else:
    print("\n--- Cell 6: Training Final Random Forest Model on ALL V55 Data --")

    print(f"   ✓ Using best params from V55 RandomizedSearch: {best_params}")

    # --- Fresh preprocessor, fitted on the complete feature frame ---
    print("   Re-fitting V55 (Impute) preprocessor on full dataset X...")
    final_numeric_transformer = Pipeline([('imputer', SimpleImputer(strategy='median'))])
    final_preprocessor = ColumnTransformer(
        [('num', final_numeric_transformer, X.columns.tolist())],
        remainder='drop',
    )
    final_preprocessor.fit(X)
    print("   ✓ Preprocessor re-fitted on full X dataset.")

    # --- Final estimator ---
    # The search reports parameters under pipeline-step names ('rf__...');
    # the bare estimator expects them without that prefix.
    best_params_cleaned = {
        param_name.replace('rf__', ''): param_value
        for param_name, param_value in best_params.items()
    }
    final_rf_obj = RandomForestRegressor(random_state=42, n_jobs=-1, **best_params_cleaned)

    # --- Final pipeline: re-fitted preprocessor followed by the tuned forest ---
    final_pipeline = Pipeline([
        ('preprocessor', final_preprocessor),
        ('rf', final_rf_obj),
    ])

    # === Fit final model on full V55 dataset ===
    print("Fitting final RF model on the full V55 dataset (X, y)...")
    print("X shape:", X.shape)
    print("y shape:", y.shape)

    final_pipeline.fit(X, y)

    print(f"\n✅ Final (V55 RANDOM FOREST) model trained.")
    print("The 'final_pipeline' object is ready for prediction.")

In [None]:
# === CELL 7: V55 TEST DATA PREPARATION FUNCTION ===

def prepare_features_V55(df_raw, train_cols_expected):
    """Build the V55 test feature frame from raw test data.

    Works on a copy of ``df_raw``: trims whitespace from the column names,
    adds any expected column that is absent as an all-NaN column, applies
    ``abs()`` to 'Equipment_Value' and 'Equipment_Weight' when present, and
    finally reindexes to exactly ``train_cols_expected`` (same columns, same
    order).

    Parameters:
        df_raw: raw test DataFrame; it is not modified.
        train_cols_expected: iterable of the column names used for training.

    Returns:
        The prepared DataFrame, or None if the final reindex step raises.
    """
    print("Preparing test features (V55 - Focus 2)...")
    test_frame = df_raw.copy()
    test_frame.columns = test_frame.columns.str.strip()

    # Expected training columns that the test frame does not provide.
    available = {name for name in train_cols_expected if name in test_frame.columns}
    absent = set(train_cols_expected) - available
    if absent:
        print(f"   ! Warning: Expected features missing in test data: {absent}. Adding as NaN.")
        for name in absent:
            test_frame[name] = np.nan

    # Mirror the training-time repair: negative values are made positive.
    for name in ('Equipment_Value', 'Equipment_Weight'):
        if name in test_frame.columns:
            test_frame[name] = test_frame[name].abs()

    # Enforce the training column set and order.
    try:
        aligned = test_frame.reindex(columns=train_cols_expected)
    except Exception as e:
        print(f"   ✗ Error ensuring column consistency: {e}")
        return None
    else:
        print(f"Test data prepared. Shape: {aligned.shape}")
        return aligned


print("\n✅ V55 Test preparation function defined.")

In [None]:
# === CELL 8: GENERATE SUBMISSION (V55 - Random Forest) ===
# Loads the raw test CSV, prepares its features with prepare_features_V55,
# predicts with `final_pipeline`, converts the predictions back from log
# space, clips them to [0, prediction_cap_value_995] and writes 'model.csv'
# with the columns Hospital_Id and Transport_Cost.
# Skipped (with the reasons listed) when a prerequisite is missing.

if ('final_pipeline' in locals() and
    'prepare_features_V55' in globals() and
    'X' in locals() and 
    'prediction_cap_value_995' in locals()):
    print("\n--- Cell 8: Generating Submission File (V55 Pipeline) ---")
    try:
        # Load the raw test data
        df_test_raw = pd.read_csv('../data/test.csv')
        # Trim header whitespace exactly as done for train.csv, so that the
        # 'Hospital_Id' lookup below does not fail on a padded column name.
        df_test_raw.columns = df_test_raw.columns.str.strip()
        submission_ids = df_test_raw['Hospital_Id']
        print(f"Loaded test.csv: {df_test_raw.shape}")

        # Prepare test features using V55 function
        X_test_final_raw = prepare_features_V55(df_test_raw, X.columns) 

        if X_test_final_raw is not None:
            print("Getting predictions from the final V55 model...")
            log_predictions = final_pipeline.predict(X_test_final_raw)

            # The model was trained on log1p(cost); expm1 inverts that.
            print("Converting predictions from log scale...")
            final_predictions = np.expm1(log_predictions)

            # --- Apply Aggressive Safety Clip (from full data) ---
            print(f"Applying safety clip to predictions at {prediction_cap_value_995:,.2f} (99.5th percentile)")
            final_predictions = np.clip(final_predictions, 0, prediction_cap_value_995)

            print("Creating submission DataFrame...")
            submission_df = pd.DataFrame({
                'Hospital_Id': submission_ids,
                'Transport_Cost': final_predictions
            })

            output_filename = 'model.csv'
            submission_df.to_csv(output_filename, index=False)
            print(f"\n✅ Submission file '{output_filename}' saved successfully.")
            print("Final Predictions Head:")
            print(submission_df.head())

        else:
            print("   ✗ Submission generation failed: Error preparing test features.")

    except FileNotFoundError:
        print("   ✗ Error: ../data/test.csv not found.")
    except Exception as e:
        # Any other failure (e.g. a missing 'Hospital_Id' column) is reported
        # here instead of stopping the notebook.
        print(f"   ✗ An unexpected error occurred during submission generation: {e}")

else:
    print("\n--- Submission Generation Skipped ---")
    if 'final_pipeline' not in locals(): print("   Reason: Final V55 model not trained (Run Cell 6).")
    if 'prepare_features_V55' not in globals(): print("   Reason: 'prepare_features_V55' function not defined (Run Cell 7).")
    if 'X' not in locals(): print("   Reason: Training features 'X' not defined (Run Cell 3).")
    if 'prediction_cap_value_995' not in locals(): print("   Reason: 'prediction_cap_value_995' not defined (Run Cell 3).")