### 📋 Preprocessing Summary

* **Filtered bad data** by removing rows with a negative `Transport_Cost`.
* **Engineered `Equipment_Volume`** by multiplying `Height` and `Width`.
* **Cleaned `Delivery_Days`** by keeping all values (distinguishing them from same-day `0`).
* **Normalized features** by log-transforming skewed inputs (`Value`, `Volume`, `Weight`, `Base_Transport_Fee`).
* **Normalized the target** by applying a log-transform (`np.log1p`) to `Transport_Cost`.
* **Removed clutter** by dropping all ID, redundant, and original date columns.
* **Split the data** into an 80% training set and a 20% test set.
* **Established a baseline** by calculating the dollar-scale RMSE of just guessing the mean cost.
* **Built a numeric pipeline** to impute missing values with the `median` and scale with `RobustScaler` (handles outliers).
* **Built a categorical pipeline** to impute missing values with the `most_frequent` and apply `OneHotEncoder`.
* **Prevented data leakage** by `fitting` the preprocessor *only* on the training data and then `transforming` both train and test sets.

In [91]:
# core
import os
import re
import warnings
warnings.filterwarnings('ignore')

# data + plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from scipy.stats import zscore
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
np.set_printoptions(threshold=np.inf)

# sklearn (preprocessing / pipeline / model selection / metrics)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures, PowerTransformer,RobustScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error

# classical models (if you use them elsewhere)
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor

# gradient boosting / lightgbm / xgboost
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# utilities
import joblib   # optional: save/load pipeline

from lightgbm import LGBMRegressor
from sklearn.model_selection import RandomizedSearchCV


In [92]:
# Set plot style
sns.set(style="whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)


# 1️⃣ Load Data
print("Loading data...")
df = pd.read_csv('../data/train.csv')
df.columns = df.columns.str.strip()
display(df.head())
print(f"Initial data shape: {df.shape}")

# 2️⃣ Clean all string/object columns: strip spaces, replace blanks with NaN
print("Cleaning string columns...")
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].astype(str).str.strip()
    df[col] = df[col].replace({'': np.nan, 'nan': np.nan, 'NaN': np.nan})

# 3️⃣ Normalize Yes/No columns to consistent "Yes"/"No"
print("Normalizing Yes/No columns...")
yes_no_cols = ['CrossBorder_Shipping', 'Urgent_Shipping', 'Installation_Service',
               'Fragile_Equipment', 'Rural_Hospital']
for col in yes_no_cols:
    if col in df.columns:
        df[col] = df[col].replace({
            'YES': 'Yes', 'yes': 'Yes', 'Y': 'Yes', 'y': 'Yes',
            'NO': 'No', 'no': 'No', 'N': 'No', 'n': 'No'
        })

# 4️⃣ Convert date columns to datetime
print("Converting date columns...")
df['Order_Placed_Date'] = pd.to_datetime(df['Order_Placed_Date'], errors='coerce')
df['Delivery_Date'] = pd.to_datetime(df['Delivery_Date'], errors='coerce')

# 5️⃣ Create new feature: Delivery_Days (difference in days)
print("Engineering Delivery_Days feature...")
df['Delivery_Days'] = (df['Delivery_Date'] - df['Order_Placed_Date']).dt.days
df['Delivery_Days'] = pd.to_numeric(df['Delivery_Days'], errors='coerce')

# === ADDED: Date Feature Engineering ===
print("Engineering more date features...")
df['Order_Month'] = df['Order_Placed_Date'].dt.month
df['Order_Day_of_Week'] = df['Order_Placed_Date'].dt.dayofweek  # Monday=0, Sunday=6
df['Order_Is_Weekend'] = df['Order_Day_of_Week'].isin([5, 6])
# === END ADDED ===

# 6️⃣ (Original) delete initial date rows
# df = df.dropna(subset=['Order_Placed_Date', 'Delivery_Date'])

# 7️⃣ Drop exact duplicate rows
print("Dropping duplicates...")
before = len(df)
df = df.drop_duplicates()
after = len(df)
print(f"Dropped {before - after} duplicate rows.")

# 8️⃣ Quick check after cleaning
print("\n" + "="*30)
print(" CLEANING & FEATURE ENGINEERING COMPLETE ")
print("="*30)
print(f"After basic cleaning shape: {df.shape}")

print("\nMissing values (raw count):")
print(df.isna().sum())

# === ADDED: Missing Value Percentage View ===
print("\nMissing values (percentage):")
missing_pct = (df.isna().sum() / len(df) * 100).sort_values(ascending=False)
print(missing_pct[missing_pct > 0])
# === END ADDED ===

print("\nDataFrame head:")
display(df.head())
# print(df['Delivery_Days'])

Loading data...


Unnamed: 0,Hospital_Id,Supplier_Name,Supplier_Reliability,Equipment_Height,Equipment_Width,Equipment_Weight,Equipment_Type,Equipment_Value,Base_Transport_Fee,CrossBorder_Shipping,Urgent_Shipping,Installation_Service,Transport_Method,Fragile_Equipment,Hospital_Info,Rural_Hospital,Order_Placed_Date,Delivery_Date,Hospital_Location,Transport_Cost
0,fffe3200360030003700,Jo Valencia,0.44,21.0,6.0,,,3.62,17.13,No,No,No,Roadways,No,Working Class,No,10/20/17,10/20/17,APO AA 33776,179.5
1,fffe3400380037003400,Wanda Warren,0.58,29.0,20.0,1210684.0,Marble,9703.37,35.42,No,Yes,Yes,Roadways,No,Working Class,No,02/22/16,02/24/16,"South Kevin, VT 84493",627732.45
2,fffe3200350036003700,Robert Ackies,0.97,39.0,15.0,3305.0,Aluminium,40.21,18.54,No,No,No,Roadways,No,Working Class,No,01/11/18,01/10/18,"Kevinshire, NE 31279",1565.92
3,fffe3800320034003400,Charlotte Membreno,0.7,8.0,5.0,606.0,Brass,4.55,17.48,No,No,No,Roadways,No,Working Class,No,08/06/16,08/06/16,DPO AP 61572,257.71
4,fffe3600340033003000,Nena Silva,0.66,27.0,13.0,,Marble,2726.8,30.23,Yes,No,No,Roadways,No,Working Class,,12/15/16,12/17/16,"Joshuamouth, AK 01550",8553.52


Initial data shape: (5000, 20)
Cleaning string columns...
Normalizing Yes/No columns...
Converting date columns...
Engineering Delivery_Days feature...
Engineering more date features...
Dropping duplicates...
Dropped 0 duplicate rows.

 CLEANING & FEATURE ENGINEERING COMPLETE 
After basic cleaning shape: (5000, 24)

Missing values (raw count):
Hospital_Id                0
Supplier_Name              0
Supplier_Reliability     587
Equipment_Height         283
Equipment_Width          443
Equipment_Weight         460
Equipment_Type           599
Equipment_Value            0
Base_Transport_Fee         0
CrossBorder_Shipping       0
Urgent_Shipping            0
Installation_Service       0
Transport_Method        1071
Fragile_Equipment          0
Hospital_Info              0
Rural_Hospital           586
Order_Placed_Date          0
Delivery_Date              0
Hospital_Location          0
Transport_Cost             0
Delivery_Days              0
Order_Month                0
Order_Day_of_Wee

Unnamed: 0,Hospital_Id,Supplier_Name,Supplier_Reliability,Equipment_Height,Equipment_Width,Equipment_Weight,Equipment_Type,Equipment_Value,Base_Transport_Fee,CrossBorder_Shipping,...,Hospital_Info,Rural_Hospital,Order_Placed_Date,Delivery_Date,Hospital_Location,Transport_Cost,Delivery_Days,Order_Month,Order_Day_of_Week,Order_Is_Weekend
0,fffe3200360030003700,Jo Valencia,0.44,21.0,6.0,,,3.62,17.13,No,...,Working Class,No,2017-10-20,2017-10-20,APO AA 33776,179.5,0,10,4,False
1,fffe3400380037003400,Wanda Warren,0.58,29.0,20.0,1210684.0,Marble,9703.37,35.42,No,...,Working Class,No,2016-02-22,2016-02-24,"South Kevin, VT 84493",627732.45,2,2,0,False
2,fffe3200350036003700,Robert Ackies,0.97,39.0,15.0,3305.0,Aluminium,40.21,18.54,No,...,Working Class,No,2018-01-11,2018-01-10,"Kevinshire, NE 31279",1565.92,-1,1,3,False
3,fffe3800320034003400,Charlotte Membreno,0.7,8.0,5.0,606.0,Brass,4.55,17.48,No,...,Working Class,No,2016-08-06,2016-08-06,DPO AP 61572,257.71,0,8,5,True
4,fffe3600340033003000,Nena Silva,0.66,27.0,13.0,,Marble,2726.8,30.23,Yes,...,Working Class,,2016-12-15,2016-12-17,"Joshuamouth, AK 01550",8553.52,2,12,3,False


In [93]:

print("Preprocessing script started...")

# ==============================================================================
# PART 1: PRE-SPLIT (Final Cleaning & Feature Engineering)
# ==============================================================================

# 1. Filter Bad Data
initial_rows = len(df)
print(initial_rows)
df = df[df['Transport_Cost'] >= 0]
print(f"Filtered {initial_rows - len(df)} rows with negative cost.")

# 2. Equipment Feature Engineering
df['Equipment_Volume'] = df['Equipment_Height'] * df['Equipment_Width']

# 3. Delivery_Days Feature Engineering (THE FIX)
# (!!! THIS IS THE FIX !!!)
# Create the flag column *before* modifying Delivery_Days
df['Is_Negative_Delivery'] = (df['Delivery_Days'] < 0).astype(int) 
print("Created 'Is_Negative_Delivery' flag.")

# Now, modify Delivery_Days (your suggestion)
df['Delivery_Days'] = np.where(df['Delivery_Days'] < 0, -1, df['Delivery_Days'])
# print("Set negative Delivery_Days to -1.")
# no change to Delivery_Days as per latest discussion

# 4. Log-Transform Skewed *Features*
df['Equipment_Value'] = np.log1p(df['Equipment_Value'])
df['Equipment_Volume'] = np.log1p(df['Equipment_Volume'])
df['Equipment_Weight'] = np.log1p(df['Equipment_Weight'])
df['Base_Transport_Fee'] = np.log1p(df['Base_Transport_Fee'])
print("Log-transformed skewed numeric features.")

# 5. Define Target (y) and Features (X)
y_original = df['Transport_Cost'] # Keep original for baseline
y = np.log1p(y_original)         # This is the target we will train on
print("Applied log1p to target variable 'Transport_Cost'.")

# Define X by dropping the target and all redundant/ID/replaced columns.
X = df.drop(columns=[
    'Transport_Cost',       # Target
    'Equipment_Height',     # Replaced by Volume
    'Equipment_Width',      # Replaced by Volume
    'Hospital_Id',          # ID
    'Supplier_Name',        # ID
    'Hospital_Location',    # ID
    'Order_Placed_Date',    # Replaced by date features
    'Delivery_Date'         # Replaced by date features
])

print(f"Total features for modeling: {len(X.columns)}")

# 6. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 7. Calculate Baseline RMSE (in DOLLARS)
print("\nCalculating baseline...")
y_train_original_mean = y_original.loc[y_train.index].mean()
y_test_original = y_original.loc[y_test.index]

y_test_pred_baseline = np.full_like(y_test_original, y_train_original_mean)
baseline_rmse = np.sqrt(mean_squared_error(y_test_original, y_test_pred_baseline))
print(f"Baseline RMSE (predicting mean cost): ${baseline_rmse:,.2f}")

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")


# ==============================================================================
# PART 2: POST-SPLIT (Pipelines & ColumnTransformer)
# ==============================================================================

# 1. Define Feature Lists
numeric_features = [
    'Supplier_Reliability',
    'Equipment_Value',      # Log-transformed
    'Base_Transport_Fee',   # Log-transformed
    'Delivery_Days',        # Clipped at -1
    'Equipment_Volume',     # Log-transformed
    'Equipment_Weight'      # Log-transformed
]

categorical_features = [
    'Equipment_Type',
    'CrossBorder_Shipping',
    'Urgent_Shipping',
    'Installation_Service',
    'Transport_Method',
    'Fragile_Equipment',
    'Hospital_Info',
    'Rural_Hospital',
    'Order_Month',
    'Order_Day_of_Week',
    'Order_Is_Weekend',
    'Is_Negative_Delivery'  # <-- This column now exists
]

# 2. Create the Numeric Pipeline
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])

# 3. Create the Categorical Pipeline
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# 4. Create the Full Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop' 
)

# 5. Apply the Preprocessor
print("\nFitting preprocessor on X_train...")
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test) 

print("Preprocessing complete.")
print(f"Processed X_train shape: {X_train_processed.shape}")
print(f"Processed X_test shape: {X_test_processed.shape}")

Preprocessing script started...
5000
Filtered 493 rows with negative cost.
Created 'Is_Negative_Delivery' flag.
Log-transformed skewed numeric features.
Applied log1p to target variable 'Transport_Cost'.
Total features for modeling: 18

Calculating baseline...
Baseline RMSE (predicting mean cost): $143,852.70
Training set shape: (3605, 18)
Test set shape: (902, 18)

Fitting preprocessor on X_train...
Preprocessing complete.
Processed X_train shape: (3605, 51)
Processed X_test shape: (902, 51)


In [95]:
# ==============================
# 1️⃣ Create full pipeline
# ==============================
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),        # handle numeric + categorical
    ('poly', PolynomialFeatures()),        # now safe
    ('regressor', Lasso(max_iter=10000, random_state=42))
])

# ==============================
# 2️⃣ Define hyperparameter grid
# ==============================
param_grid = {
    'poly__degree': [1,2],             # polynomial degree
    'regressor__alpha': [0.01, 0.1, 1, 10]  # Lasso regularization
}

# ==============================
# 3️⃣ Setup GridSearchCV with K-Fold
# ==============================
kf = KFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=kf,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=2
)

# ==============================
# 4️⃣ Fit GridSearch on training data
# ==============================
grid_search.fit(X_train, y_train)

# ==============================
# 5️⃣ Best parameters and CV RMSE
# ==============================
best_params = grid_search.best_params_
best_rmse = np.sqrt(-grid_search.best_score_)

print("Best parameters:", best_params)
print(f"Best CV RMSE: {best_rmse:.4f}")

# ==============================
# 6️⃣ Evaluate on test set
# ==============================
y_test_pred = grid_search.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
print(f"Test RMSE: {test_rmse:.4f}")

# ==============================
# 7️⃣ Feature importance: Top 30 by absolute coefficient
# ==============================
best_model = grid_search.best_estimator_.named_steps['regressor']
preprocessor = grid_search.best_estimator_.named_steps['preprocessor']
poly = grid_search.best_estimator_.named_steps['poly']

# numeric + categorical feature names
num_features = numeric_features
cat_features = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_features)
all_features = np.concatenate([num_features, cat_features])

# polynomial feature names
poly_feature_names = poly.get_feature_names_out(all_features)

# match coefficients to feature names
coef = best_model.coef_
feature_coef_df = pd.DataFrame({
    'feature': poly_feature_names,
    'coefficient': coef
})

# Top 30 features by absolute value
top_30 = feature_coef_df.reindex(feature_coef_df['coefficient'].abs().sort_values(ascending=False).index).head(30)
display(top_30)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] END ..............poly__degree=1, regressor__alpha=0.01; total time=   0.0s
[CV] END ..............poly__degree=1, regressor__alpha=0.01; total time=   0.0s
[CV] END ..............poly__degree=1, regressor__alpha=0.01; total time=   0.0s
[CV] END ..............poly__degree=1, regressor__alpha=0.01; total time=   0.0s
[CV] END ...............poly__degree=1, regressor__alpha=0.1; total time=   0.0s
[CV] END ..............poly__degree=1, regressor__alpha=0.01; total time=   0.0s
[CV] END ...............poly__degree=1, regressor__alpha=0.1; total time=   0.0s
[CV] END ...............poly__degree=1, regressor__alpha=0.1; total time=   0.0s
[CV] END ...............poly__degree=1, regressor__alpha=0.1; total time=   0.0s
[CV] END ...............poly__degree=1, regressor__alpha=0.1; total time=   0.0s
[CV] END .................poly__degree=1, regressor__alpha=1; total time=   0.0s
[CV] END .................poly__degree=1, regress

Unnamed: 0,feature,coefficient
1,Supplier_Reliability,0.658071
3,Base_Transport_Fee,0.490455
103,Equipment_Value^2,0.415364
118,Equipment_Value Urgent_Shipping_Yes,0.284314
2,Equipment_Value,0.282179
6,Equipment_Weight,0.271746
56,Supplier_Reliability Equipment_Volume,0.128342
166,Base_Transport_Fee Urgent_Shipping_No,0.119615
5,Equipment_Volume,0.111701
156,Base_Transport_Fee Equipment_Weight,-0.067655


In [85]:
# === CELL 4 (V6): GRIDSEARCHCV FOR EXTREME ROBUSTNESS ===

print("🚀 Starting V6 GridSearchCV for XGBoost (Focusing on EXTREME ROBUSTNESS)...")

# --- 1. CV splitter ---
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# --- 2. XGB pipeline ---
# 'preprocessor' is your V4 preprocessor (with RobustScaler)
# This MUST be the V4 preprocessor object, already in your memory
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor), 
    ('xgb', XGBRegressor(
        objective='reg:squarederror',
        random_state=42,
        n_jobs=-1,
        learning_rate=0.05 # Slower learning rate for more stability
    ))
])

# --- 3. V6 Hyperparameter grid (Shallow, Regularized) ---
param_grid = {
    'xgb__n_estimators': [1000, 1500],
    'xgb__max_depth': [3],               # Force shallow trees
    'xgb__reg_alpha': [10, 50, 100],     # Aggressive L1
    'xgb__reg_lambda': [10, 50, 100]     # Aggressive L2
}

# --- 4. GridSearchCV ---
grid_search = GridSearchCV(
    estimator=xgb_pipeline,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=kf,
    n_jobs=-1,
    verbose=2
)

# --- 5. Run grid search on the V4 TRAINING DATA ---
# (X_train and y_train are from your V4 preprocessing cell)
grid_search.fit(X_train, y_train)

# --- 6. Best params & CV score ---
print("\n✅ V6 GridSearch complete!")
best_params = grid_search.best_params_
best_cv_rmse = -grid_search.best_score_
print(f"   Best hyperparameters: {best_params}")
print(f"   Best CV RMSE (log-space): {best_cv_rmse:.4f}")

# --- 7. Evaluate on V4 test set ---
final_xgb_bulletproof_v6 = grid_search.best_estimator_
y_test_pred_log = final_xgb_bulletproof_v6.predict(X_test)
y_test_pred_orig = np.expm1(y_test_pred_log)

rmse_test_log = np.sqrt(mean_squared_error(y_test, y_test_pred_log))
rmse_test_orig = np.sqrt(mean_squared_error(np.expm1(y_test), y_test_pred_orig))

print(f"\n   Test RMSE (log-space)      : {rmse_test_log:.4f}")
print(f"   Test RMSE (original scale) : {rmse_test_orig:.2f}")

🚀 Starting V6 GridSearchCV for XGBoost (Focusing on EXTREME ROBUSTNESS)...
Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV] END xgb__max_depth=3, xgb__n_estimators=1000, xgb__reg_alpha=10, xgb__reg_lambda=10; total time=   0.1s
[CV] END xgb__max_depth=3, xgb__n_estimators=1000, xgb__reg_alpha=10, xgb__reg_lambda=10; total time=   0.2s
[CV] END xgb__max_depth=3, xgb__n_estimators=1000, xgb__reg_alpha=10, xgb__reg_lambda=10; total time=   0.2s
[CV] END xgb__max_depth=3, xgb__n_estimators=1000, xgb__reg_alpha=10, xgb__reg_lambda=10; total time=   0.2s
[CV] END xgb__max_depth=3, xgb__n_estimators=1000, xgb__reg_alpha=10, xgb__reg_lambda=10; total time=   0.2s
[CV] END xgb__max_depth=3, xgb__n_estimators=1000, xgb__reg_alpha=10, xgb__reg_lambda=50; total time=   0.2s
[CV] END xgb__max_depth=3, xgb__n_estimators=1000, xgb__reg_alpha=10, xgb__reg_lambda=50; total time=   0.2s
[CV] END xgb__max_depth=3, xgb__n_estimators=1000, xgb__reg_alpha=10, xgb__reg_lambda=50; total time=

In [None]:
print("🚀 Starting GridSearchCV for Polynomial Ridge...")

# --- 1. CV splitter ---
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# --- 2. Polynomial Ridge pipeline ---
# We are adding a new step: 'poly'
# The pipeline is now: Preprocessor -> PolynomialFeatures -> Ridge
poly_ridge_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor), 
    ('poly', PolynomialFeatures(include_bias=False)), # <-- NEW STEP
    ('ridge', Ridge(random_state=42))
])

# --- 3. Hyperparameter grid ---
# We now tune both the polynomial degree and the ridge alpha
param_grid = {
    'poly__degree': [1], # Degree 1 = linear (your old model), Degree 2 = quadratic
    'ridge__alpha': [1, 10, 20,50, 100] # Test a wide range of L2 regularization
}

# --- 4. GridSearchCV ---
# Using a new variable name 'grid_search_poly'
grid_search_poly = GridSearchCV(
    estimator=poly_ridge_pipeline,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=kf,
    n_jobs=-1,
    verbose=2
)

# --- 5. Run grid search ---
# We fit on the raw X_train and y_train
grid_search_poly.fit(X_train, y_train)

# --- 6. Best params & CV score ---
print("\n✅ Polynomial Ridge GridSearch complete!")
best_params_poly = grid_search_poly.best_params_
best_cv_rmse_poly = -grid_search_poly.best_score_
print(f"   Best hyperparameters: {best_params_poly}")
print(f"   Best CV RMSE (log-space): {best_cv_rmse_poly:.4f}")

# --- 7. Evaluate on V4 test set ---
final_poly_ridge = grid_search_poly.best_estimator_
y_test_pred_log_poly = final_poly_ridge.predict(X_test)
y_test_pred_orig_poly = np.expm1(y_test_pred_log_poly)

# (Safety check: set any potential negative predictions to 0)
y_test_pred_orig_poly[y_test_pred_orig_poly < 0] = 0

# We use the original y_test_original from your baseline cell
rmse_test_log_poly = np.sqrt(mean_squared_error(y_test, y_test_pred_log_poly))
rmse_test_orig_poly = np.sqrt(mean_squared_error(y_test_original, y_test_pred_orig_poly))

print(f"\n   Test RMSE (log-space)      : {rmse_test_log_poly:.4f}")
print(f"   Test RMSE (original scale) : ${rmse_test_orig_poly:,.2f}")

In [None]:
print("🚀 Retraining final Polynomial Ridge model on ALL available data...")

# === 1. Get Best Hyperparameters (DYNAMICALLY) ===
try:
    # Get the best params from the 'grid_search_poly' object
    best_params_poly = grid_search_poly.best_params_
    best_degree = best_params_poly['poly__degree']
    best_alpha = best_params_poly['ridge__alpha']
    print(f"   Using best hyperparameters: {best_params_poly}")

except NameError as e:
    print("\nERROR: Could not run. 'grid_search_poly' not found.")
    print("Please make sure you have run the 'Polynomial Ridge GridSearchCV' cell successfully first.")
    print(f"Details: {e}")
    # We'll stop the script if the params aren't found
    raise

# === 2. Re-define the Preprocessor (for clarity and safety) ===
# This is the exact same preprocessor from your "V4" script
numeric_features = [
    'Supplier_Reliability', 'Equipment_Value', 'Base_Transport_Fee',
    'Delivery_Days', 'Equipment_Volume', 'Equipment_Weight'
]
categorical_features = [
    'Equipment_Type', 'CrossBorder_Shipping', 'Urgent_Shipping',
    'Installation_Service', 'Transport_Method', 'Fragile_Equipment',
    'Hospital_Info', 'Rural_Hospital', 'Order_Month',
    'Order_Day_of_Week', 'Order_Is_Weekend', 'Is_Negative_Delivery'
]

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

final_preprocessor_poly = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop' 
)

# === 3. Build the Final Model and Pipeline ===
# Create the final pipeline with the 'poly' step and best params
final_model_pipeline_poly = Pipeline(steps=[
    ('preprocessor', final_preprocessor_poly),
    ('poly', PolynomialFeatures(degree=best_degree, include_bias=False)), # <-- Uses best_degree
    ('model', Ridge(alpha=best_alpha, random_state=42))                   # <-- Uses best_alpha
])

# === 4. Fit on the ENTIRE dataset ===
# We use 'X' and 'y' (the full, pre-split data)
print(f"\nFitting pipeline on {X.shape[0]} rows of data...")
final_model_pipeline_poly.fit(X, y)

print("\n✅ Final Polynomial Ridge model trained on entire dataset!")
print("You can now use this 'final_model_pipeline_poly' for all predictions.")

In [86]:
print("🚀 Retraining final model on ALL available data...")

# === 1. Get Best Hyperparameters ===
# Get the best params found by GridSearchCV (V6)
# The keys will be like 'xgb__n_estimators', we need to clean them
best_params_raw = grid_search.best_params_
best_params = {key.replace('xgb__', ''): value for key, value in best_params_raw.items()}

print(f"   Using best hyperparameters: {best_params}")

# === 2. Re-define the Preprocessor (for clarity and safety) ===
# This is the exact same preprocessor from your "V4" script
numeric_features = [
    'Supplier_Reliability', 'Equipment_Value', 'Base_Transport_Fee',
    'Delivery_Days', 'Equipment_Volume', 'Equipment_Weight'
]
categorical_features = [
    'Equipment_Type', 'CrossBorder_Shipping', 'Urgent_Shipping',
    'Installation_Service', 'Transport_Method', 'Fragile_Equipment',
    'Hospital_Info', 'Rural_Hospital', 'Order_Month',
    'Order_Day_of_Week', 'Order_Is_Weekend', 'Is_Negative_Delivery'
]

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

final_preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop' 
)

# === 3. Build the Final Model and Pipeline ===
# Create a new XGBoost model instance with the best params
final_xgb_model = XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
    n_jobs=-1,
    learning_rate=0.05, # This was fixed in your grid search
    **best_params       # This unpacks all the best params (n_estimators, max_depth, etc.)
)

# Create the final pipeline
final_model_pipeline = Pipeline(steps=[
    ('preprocessor', final_preprocessor),
    ('model', final_xgb_model)
])

# === 4. Fit on the ENTIRE dataset ===
# We use 'X' and 'y' (the full, pre-split data)
print(f"\nFitting pipeline on {X.shape[0]} rows of data...")
final_model_pipeline.fit(X, y)

print("\n✅ Final XGBoost model trained on entire dataset!")
print("You can now use this 'final_model_pipeline' for all predictions.")

🚀 Retraining final model on ALL available data...
   Using best hyperparameters: {'max_depth': 3, 'n_estimators': 1000, 'reg_alpha': 10, 'reg_lambda': 50}

Fitting pipeline on 4507 rows of data...

✅ Final XGBoost model trained on entire dataset!
You can now use this 'final_model_pipeline' for all predictions.


In [87]:
def prepare_features(df_raw):
    """
    Applies all manual cleaning and feature engineering
    to match the data used for model training.
    
    Takes a raw DataFrame (like test.csv) and returns
    a DataFrame ready for the preprocessor pipeline.
    """
    
    print(f"Preparing {len(df_raw)} new rows...")
    
    # Make a copy to avoid changing the original data
    df = df_raw.copy()
    
    # 1. Clean column names (from your training script)
    df.columns = df.columns.str.strip()

    # 2. Clean all string/object columns (from your training script)
    for col in df.select_dtypes(include='object').columns:
        df[col] = df[col].astype(str).str.strip()
        df[col] = df[col].replace({'': np.nan, 'nan': np.nan, 'NaN': np.nan})

    # 3. Normalize Yes/No columns (from your training script)
    yes_no_cols = ['CrossBorder_Shipping', 'Urgent_Shipping', 'Installation_Service',
                   'Fragile_Equipment', 'Rural_Hospital']
    for col in yes_no_cols:
        if col in df.columns:
            df[col] = df[col].replace({
                'YES': 'Yes', 'yes': 'Yes', 'Y': 'Yes', 'y': 'Yes',
                'NO': 'No', 'no': 'No', 'N': 'No', 'n': 'No'
            })

    # 4. Convert date columns (from your training script)
    df['Order_Placed_Date'] = pd.to_datetime(df['Order_Placed_Date'], errors='coerce')
    df['Delivery_Date'] = pd.to_datetime(df['Delivery_Date'], errors='coerce')

    # 5. Engineer Date Features (from your training script)
    df['Delivery_Days'] = (df['Delivery_Date'] - df['Order_Placed_Date']).dt.days
    df['Delivery_Days'] = pd.to_numeric(df['Delivery_Days'], errors='coerce')
    
    df['Order_Month'] = df['Order_Placed_Date'].dt.month.astype(str)
    df['Order_Day_of_Week'] = df['Order_Placed_Date'].dt.dayofweek.astype(str)
    df['Order_Is_Weekend'] = (df['Order_Placed_Date'].dt.dayofweek >= 5).astype(int)
    
    # 6. Engineer Equipment_Volume (from your training script)
    df['Equipment_Volume'] = df['Equipment_Height'] * df['Equipment_Width']
    
    # 7. Engineer Delivery_Days features (from your training script)
    # This *must* be in this order
    df['Is_Negative_Delivery'] = (df['Delivery_Days'] < 0).astype(int)
    df['Delivery_Days'] = np.where(df['Delivery_Days'] < 0, -1, df['Delivery_Days'])

    # 8. Log-Transform Skewed Features (from your training script)
    df['Equipment_Value'] = np.log1p(df['Equipment_Value'])
    df['Equipment_Volume'] = np.log1p(df['Equipment_Volume'])
    df['Equipment_Weight'] = np.log1p(df['Equipment_Weight'])
    df['Base_Transport_Fee'] = np.log1p(df['Base_Transport_Fee'])
    
    print("Feature preparation complete.")
    
    # 9. Return the feature-engineered DataFrame
    # The preprocessor pipeline will select the columns it needs from this.
    return df

In [88]:
# Assume 'final_model_pipeline' and 'prepare_features' are in memory
# If not, you would load your model:
# import joblib
# final_model_pipeline = joblib.load('final_xgb_model.pkl') 

# 1. Load your new, raw test data
print("Loading new test data...")
df_new_test = pd.read_csv('../data/test.csv') 

# 2. Save IDs for the final submission
# We need to map our predictions back to the original IDs
submission_ids = df_new_test['Hospital_Id']

# 3. Apply the *exact same* feature engineering
print("Applying feature engineering to new data...")
X_new_prepared = prepare_features(df_new_test)

# 4. Get predictions
print("Getting predictions from the final model...")
log_predictions = final_model_pipeline.predict(X_new_prepared)

# 5. Convert predictions back from log-scale! (THE FIX)
# Remember, you trained on log(Transport_Cost + 1)
final_predictions = np.expm1(log_predictions)

# (Safety check: ensure no negative predictions)
# final_predictions[final_predictions < 0] = 0

# 6. Create the final submission file
submission_df = pd.DataFrame({
    'Hospital_Id': submission_ids,
    'Transport_Cost': final_predictions
})

# Display the first few predictions
print("\nFinal Predictions (in $):")
display(submission_df.head())

# Save to CSV
submission_df.to_csv('xgb.csv', index=False)
print("Submission file 'xgb.csv' created successfully.")

Loading new test data...
Applying feature engineering to new data...
Preparing 500 new rows...
Feature preparation complete.
Getting predictions from the final model...

Final Predictions (in $):


Unnamed: 0,Hospital_Id,Transport_Cost
0,fffe33003400,450.034637
1,fffe3700330036003600,284.725525
2,fffe3300390038003400,2074.483643
3,fffe310030003900,211.848755
4,fffe3700330031003200,1021.790283


Submission file 'xgb.csv' created successfully.
