In [50]:
# core
import os
import re
import warnings
warnings.filterwarnings('ignore')

# data + plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from scipy.stats import zscore
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
np.set_printoptions(threshold=np.inf)

# sklearn (preprocessing / pipeline / model selection / metrics)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures, PowerTransformer,RobustScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error

# classical models (if you use them elsewhere)
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor

# gradient boosting / lightgbm / xgboost
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# utilities
import joblib   # optional: save/load pipeline

from lightgbm import LGBMRegressor
from sklearn.model_selection import RandomizedSearchCV


In [51]:
# Plot defaults shared by every figure in this notebook.
sns.set(style="whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)


# 1️⃣ Load data
print("Loading data...")
df = pd.read_csv('../data/train.csv')
df.columns = df.columns.str.strip()  # header names may carry stray whitespace
display(df.head())
print(f"Initial data shape: {df.shape}")

# 2️⃣ Clean all string/object columns: strip spaces, replace blanks with NaN
print("Cleaning string columns...")
for col in df.select_dtypes(include='object').columns:
    # astype(str) turns missing values into the literal 'nan';
    # the replace() below maps those (and empty strings) back to NaN.
    df[col] = df[col].astype(str).str.strip()
    df[col] = df[col].replace({'': np.nan, 'nan': np.nan, 'NaN': np.nan})

# 3️⃣ Normalize Yes/No columns to consistent "Yes"/"No"
# Matching is case-insensitive so any casing ('YES', 'yEs', 'n', ...) is handled,
# not just a fixed list of spellings. Values outside the map are left untouched.
print("Normalizing Yes/No columns...")
yes_no_cols = ['CrossBorder_Shipping', 'Urgent_Shipping', 'Installation_Service',
               'Fragile_Equipment', 'Rural_Hospital']
yes_no_map = {'yes': 'Yes', 'y': 'Yes', 'no': 'No', 'n': 'No'}
for col in yes_no_cols:
    if col in df.columns:
        lowered = df[col].astype(str).str.lower()
        known = lowered.isin(list(yes_no_map))
        # Only overwrite recognised values; NaN and unexpected labels stay as they are.
        df.loc[known, col] = lowered[known].map(yes_no_map)

# 4️⃣ Convert date columns to datetime (unparseable values become NaT)
print("Converting date columns...")
df['Order_Placed_Date'] = pd.to_datetime(df['Order_Placed_Date'], errors='coerce')
df['Delivery_Date'] = pd.to_datetime(df['Delivery_Date'], errors='coerce')

# 5️⃣ Create new feature: Delivery_Days (difference in whole days)
# `.dt.days` is already numeric, so no extra conversion is required.
# The value can be negative when Delivery_Date precedes Order_Placed_Date.
print("Engineering Delivery_Days feature...")
df['Delivery_Days'] = (df['Delivery_Date'] - df['Order_Placed_Date']).dt.days

# Additional calendar features derived from the order date
print("Engineering more date features...")
df['Order_Month'] = df['Order_Placed_Date'].dt.month
df['Order_Day_of_Week'] = df['Order_Placed_Date'].dt.dayofweek  # Monday=0, Sunday=6
df['Order_Is_Weekend'] = df['Order_Day_of_Week'].isin([5, 6])

# 6️⃣ (Original) delete initial date rows -- intentionally disabled
# df = df.dropna(subset=['Order_Placed_Date', 'Delivery_Date'])

# 7️⃣ Drop exact duplicate rows
print("Dropping duplicates...")
before = len(df)
df = df.drop_duplicates()
after = len(df)
print(f"Dropped {before - after} duplicate rows.")

# 8️⃣ Quick check after cleaning
print("\n" + "="*30)
print(" CLEANING & FEATURE ENGINEERING COMPLETE ")
print("="*30)
print(f"After basic cleaning shape: {df.shape}")

print("\nMissing values (raw count):")
print(df.isna().sum())

# Same information as a percentage, showing only columns that have gaps
print("\nMissing values (percentage):")
missing_pct = (df.isna().sum() / len(df) * 100).sort_values(ascending=False)
print(missing_pct[missing_pct > 0])

print("\nDataFrame head:")
display(df.head())

Loading data...


Unnamed: 0,Hospital_Id,Supplier_Name,Supplier_Reliability,Equipment_Height,Equipment_Width,Equipment_Weight,Equipment_Type,Equipment_Value,Base_Transport_Fee,CrossBorder_Shipping,Urgent_Shipping,Installation_Service,Transport_Method,Fragile_Equipment,Hospital_Info,Rural_Hospital,Order_Placed_Date,Delivery_Date,Hospital_Location,Transport_Cost
0,fffe3200360030003700,Jo Valencia,0.44,21.0,6.0,,,3.62,17.13,No,No,No,Roadways,No,Working Class,No,10/20/17,10/20/17,APO AA 33776,179.5
1,fffe3400380037003400,Wanda Warren,0.58,29.0,20.0,1210684.0,Marble,9703.37,35.42,No,Yes,Yes,Roadways,No,Working Class,No,02/22/16,02/24/16,"South Kevin, VT 84493",627732.45
2,fffe3200350036003700,Robert Ackies,0.97,39.0,15.0,3305.0,Aluminium,40.21,18.54,No,No,No,Roadways,No,Working Class,No,01/11/18,01/10/18,"Kevinshire, NE 31279",1565.92
3,fffe3800320034003400,Charlotte Membreno,0.7,8.0,5.0,606.0,Brass,4.55,17.48,No,No,No,Roadways,No,Working Class,No,08/06/16,08/06/16,DPO AP 61572,257.71
4,fffe3600340033003000,Nena Silva,0.66,27.0,13.0,,Marble,2726.8,30.23,Yes,No,No,Roadways,No,Working Class,,12/15/16,12/17/16,"Joshuamouth, AK 01550",8553.52


Initial data shape: (5000, 20)
Cleaning string columns...
Normalizing Yes/No columns...
Converting date columns...
Engineering Delivery_Days feature...
Engineering more date features...
Dropping duplicates...
Dropped 0 duplicate rows.

 CLEANING & FEATURE ENGINEERING COMPLETE 
After basic cleaning shape: (5000, 24)

Missing values (raw count):
Hospital_Id                0
Supplier_Name              0
Supplier_Reliability     587
Equipment_Height         283
Equipment_Width          443
Equipment_Weight         460
Equipment_Type           599
Equipment_Value            0
Base_Transport_Fee         0
CrossBorder_Shipping       0
Urgent_Shipping            0
Installation_Service       0
Transport_Method        1071
Fragile_Equipment          0
Hospital_Info              0
Rural_Hospital           586
Order_Placed_Date          0
Delivery_Date              0
Hospital_Location          0
Transport_Cost             0
Delivery_Days              0
Order_Month                0
Order_Day_of_Wee

Unnamed: 0,Hospital_Id,Supplier_Name,Supplier_Reliability,Equipment_Height,Equipment_Width,Equipment_Weight,Equipment_Type,Equipment_Value,Base_Transport_Fee,CrossBorder_Shipping,...,Hospital_Info,Rural_Hospital,Order_Placed_Date,Delivery_Date,Hospital_Location,Transport_Cost,Delivery_Days,Order_Month,Order_Day_of_Week,Order_Is_Weekend
0,fffe3200360030003700,Jo Valencia,0.44,21.0,6.0,,,3.62,17.13,No,...,Working Class,No,2017-10-20,2017-10-20,APO AA 33776,179.5,0,10,4,False
1,fffe3400380037003400,Wanda Warren,0.58,29.0,20.0,1210684.0,Marble,9703.37,35.42,No,...,Working Class,No,2016-02-22,2016-02-24,"South Kevin, VT 84493",627732.45,2,2,0,False
2,fffe3200350036003700,Robert Ackies,0.97,39.0,15.0,3305.0,Aluminium,40.21,18.54,No,...,Working Class,No,2018-01-11,2018-01-10,"Kevinshire, NE 31279",1565.92,-1,1,3,False
3,fffe3800320034003400,Charlotte Membreno,0.7,8.0,5.0,606.0,Brass,4.55,17.48,No,...,Working Class,No,2016-08-06,2016-08-06,DPO AP 61572,257.71,0,8,5,True
4,fffe3600340033003000,Nena Silva,0.66,27.0,13.0,,Marble,2726.8,30.23,Yes,...,Working Class,,2016-12-15,2016-12-17,"Joshuamouth, AK 01550",8553.52,2,12,3,False


In [52]:

print("Preprocessing script started...")

# ==============================================================================
# PART 1: PRE-SPLIT (Final Cleaning & Feature Engineering)
# ==============================================================================

# This cell mutates `df` in place (abs, log1p). Running it twice on the same
# frame would silently log-transform the features a second time, so refuse to
# continue if the column this cell creates is already present.
if 'Equipment_Volume' in df.columns:
    raise RuntimeError(
        "Preprocessing has already been applied to 'df'. "
        "Re-run the data loading/cleaning cell before running this cell again."
    )

# 1. Repair bad target values
# Negative costs are not dropped; they are replaced by 1 so every row is kept.
initial_rows = len(df)
negative_cost_mask = df['Transport_Cost'] < 0
df['Transport_Cost'] = np.where(negative_cost_mask, 1, df['Transport_Cost'])
print(
    f"Replaced {int(negative_cost_mask.sum())} negative Transport_Cost values with 1 "
    f"(no rows dropped; {initial_rows} rows kept)."
)

# 2. Equipment Feature Engineering
# Named "volume" in this notebook, but computed as height * width.
df['Equipment_Volume'] = df['Equipment_Height'] * df['Equipment_Width']

# 3. Delivery_Days Feature Engineering
# Negative durations (delivery dated before the order) are folded to their
# absolute value. No separate indicator column is created, so the sign is
# not available to the models.
negative_delivery_mask = df['Delivery_Days'] < 0
df['Delivery_Days'] = df['Delivery_Days'].abs()
print(
    f"Converted {int(negative_delivery_mask.sum())} negative Delivery_Days values "
    "to their absolute value."
)

# 4. Log-Transform Skewed *Features*
for skewed_col in ['Equipment_Value', 'Equipment_Volume', 'Equipment_Weight', 'Base_Transport_Fee']:
    df[skewed_col] = np.log1p(df[skewed_col])
print("Log-transformed skewed numeric features.")

# 5. Define Target (y) and Features (X)
y_original = df['Transport_Cost']  # Original scale, kept for the baseline
y = np.log1p(y_original)           # Models are trained on log1p(cost)
print("Applied log1p to target variable 'Transport_Cost'.")

# Define X by dropping the target and all redundant/ID/replaced columns.
X = df.drop(columns=[
    'Transport_Cost',       # Target
    'Equipment_Height',     # Replaced by Volume
    'Equipment_Width',      # Replaced by Volume
    'Hospital_Id',          # ID
    'Supplier_Name',        # ID
    'Hospital_Location',    # ID
    'Order_Placed_Date',    # Replaced by date features
    'Delivery_Date'         # Replaced by date features
])

print(f"Total features for modeling: {len(X.columns)}")

# 6. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 7. Calculate Baseline RMSE (in DOLLARS)
# NOTE: this baseline is on the original cost scale, whereas the RMSE values
# reported by the model cells are on the log1p scale of `y`.
print("\nCalculating baseline...")
y_train_original_mean = y_original.loc[y_train.index].mean()
y_test_original = y_original.loc[y_test.index]

# np.full always yields a float array, independent of the target's dtype.
y_test_pred_baseline = np.full(len(y_test_original), y_train_original_mean)
baseline_rmse = np.sqrt(mean_squared_error(y_test_original, y_test_pred_baseline))
print(f"Baseline RMSE (predicting mean cost): ${baseline_rmse:,.2f}")

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")


# ==============================================================================
# PART 2: POST-SPLIT (Pipelines & ColumnTransformer)
# ==============================================================================

# 1. Define Feature Lists
numeric_features = [
    'Supplier_Reliability',
    'Equipment_Value',      # Log-transformed
    'Base_Transport_Fee',   # Log-transformed
    'Delivery_Days',        # Absolute value
    'Equipment_Volume',     # Log-transformed
    'Equipment_Weight'      # Log-transformed
]

categorical_features = [
    'Equipment_Type',
    'CrossBorder_Shipping',
    'Urgent_Shipping',
    'Installation_Service',
    'Transport_Method',
    'Fragile_Equipment',
    'Hospital_Info',
    'Rural_Hospital',
    'Order_Month',
    'Order_Day_of_Week',
    'Order_Is_Weekend'
]

# 2. Numeric pipeline: median imputation followed by robust scaling
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])

# 3. Categorical pipeline: mode imputation followed by one-hot encoding
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# 4. Full preprocessor; columns not listed above are discarded
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop'
)

# 5. Apply the Preprocessor (fit on the training split only)
print("\nFitting preprocessor on X_train...")
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

print("Preprocessing complete.")
print(f"Processed X_train shape: {X_train_processed.shape}")
print(f"Processed X_test shape: {X_test_processed.shape}")

Preprocessing script started...
5000
Filtered 0 rows with negative cost.
Created 'Is_Negative_Delivery' flag.
Log-transformed skewed numeric features.
Applied log1p to target variable 'Transport_Cost'.
Total features for modeling: 17

Calculating baseline...
Baseline RMSE (predicting mean cost): $48,764.21
Training set shape: (4000, 17)
Test set shape: (1000, 17)

Fitting preprocessor on X_train...
Preprocessing complete.
Processed X_train shape: (4000, 49)
Processed X_test shape: (1000, 49)


In [None]:
# ==============================
# 1️⃣ Create full pipeline
# ==============================
# include_bias=False matches the final retraining pipeline: Lasso fits its own
# intercept, so the constant column from PolynomialFeatures is redundant.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),                      # numeric + categorical handling
    ('poly', PolynomialFeatures(include_bias=False)),    # degree is tuned below
    ('regressor', Lasso(max_iter=10000, random_state=42))
])

# ==============================
# 2️⃣ Define hyperparameter grid
# ==============================
param_grid = {
    'poly__degree': [1, 2],                   # polynomial degree
    'regressor__alpha': [0.01, 0.1, 1, 10]    # Lasso regularization strength
}

# ==============================
# 3️⃣ Setup GridSearchCV with K-Fold
# ==============================
kf = KFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=kf,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=2
)

# ==============================
# 4️⃣ Fit GridSearch on training data
# ==============================
grid_search.fit(X_train, y_train)

# ==============================
# 5️⃣ Best parameters and CV RMSE
# ==============================
# Scores are negative MSE on the log1p-transformed target, hence the sign flip.
best_params = grid_search.best_params_
best_rmse = np.sqrt(-grid_search.best_score_)

print("Best parameters:", best_params)
print(f"Best CV RMSE: {best_rmse:.4f}")

# ==============================
# 6️⃣ Evaluate on test set
# ==============================
# RMSE here is on the log1p scale of `y_test`, not in original cost units.
y_test_pred = grid_search.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
print(f"Test RMSE: {test_rmse:.4f}")

# ==============================
# 7️⃣ Feature importance: Top 30 by absolute coefficient
# ==============================
# The fitted steps get their own names so the shared, module-level
# `preprocessor` used by later cells is not overwritten.
best_model = grid_search.best_estimator_.named_steps['regressor']
fitted_preprocessor = grid_search.best_estimator_.named_steps['preprocessor']
poly = grid_search.best_estimator_.named_steps['poly']

# numeric + categorical feature names, in ColumnTransformer output order
num_features = numeric_features
cat_features = fitted_preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_features)
all_features = np.concatenate([num_features, cat_features])

# polynomial feature names
poly_feature_names = poly.get_feature_names_out(all_features)

# match coefficients to feature names
coef = best_model.coef_
feature_coef_df = pd.DataFrame({
    'feature': poly_feature_names,
    'coefficient': coef
})

# Top 30 features by absolute value
top_30 = feature_coef_df.reindex(feature_coef_df['coefficient'].abs().sort_values(ascending=False).index).head(30)
display(top_30)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
import pandas as pd

# ------------------------------------------------------------------
# Step 1: reuse the Lasso alpha selected by the first grid search.
# It stays fixed inside the feature selector during this search.
# ------------------------------------------------------------------
try:
    best_lasso_alpha = grid_search.best_params_['regressor__alpha']
except NameError:
    print("ERROR: 'grid_search' object (from Lasso) not found.")
    raise
else:
    print(f"--- Using fixed Lasso Alpha for selector: {best_lasso_alpha} ---")

# ------------------------------------------------------------------
# Step 2: assemble the hybrid pipeline
#   preprocess -> polynomial expansion -> Lasso-based selection -> Ridge
# The selector's threshold is left unset here; the grid below supplies it.
# prefit=False means the selector's Lasso is fitted as part of the pipeline.
# ------------------------------------------------------------------
lasso_selector = SelectFromModel(
    Lasso(alpha=best_lasso_alpha, max_iter=10000, random_state=42),
    prefit=False,
)

hybrid_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(include_bias=False)),
    ('selector', lasso_selector),
    ('regressor', Ridge(random_state=42)),
])

# ------------------------------------------------------------------
# Step 3: search space -- polynomial degree, selection threshold, Ridge alpha
# ------------------------------------------------------------------
hybrid_param_grid = dict(
    poly__degree=[1, 2, 3],
    selector__threshold=['mean', 'median', '1.25*mean'],
    regressor__alpha=[0.1, 1, 10, 100],
)

# ------------------------------------------------------------------
# Step 4: run the search, reusing the `kf` splitter from the Lasso cell
# ------------------------------------------------------------------
hybrid_grid_search = GridSearchCV(
    estimator=hybrid_pipeline,
    param_grid=hybrid_param_grid,
    scoring='neg_mean_squared_error',
    cv=kf,
    n_jobs=-1,
    verbose=2,
)

print("\n--- Fitting Hybrid Model (Tuning Degree, Threshold, AND Ridge Alpha) ---")
hybrid_grid_search.fit(X_train, y_train)

# ------------------------------------------------------------------
# Step 5: report the winning configuration and its cross-validated RMSE
# ------------------------------------------------------------------
hybrid_best_params = hybrid_grid_search.best_params_
hybrid_best_rmse = np.sqrt(-hybrid_grid_search.best_score_)

print("\n--- New Hybrid Model Results ---")
print(f"Best parameters: {hybrid_best_params}")
print(f"Best CV RMSE: {hybrid_best_rmse:.4f}")

# ------------------------------------------------------------------
# Step 6: held-out evaluation and side-by-side comparison
# ------------------------------------------------------------------
y_test_pred_hybrid = hybrid_grid_search.predict(X_test)
test_rmse_hybrid = np.sqrt(mean_squared_error(y_test, y_test_pred_hybrid))

print(f"\n--- Final Model Comparison (Test Set) ---")

# Earlier scores may be absent if their cells were not run; report that
# instead of failing.
try:
    lasso_line = f"Original Lasso RMSE:      {test_rmse:.4f}"
except NameError:
    lasso_line = "Original Lasso RMSE:      (variable 'test_rmse' not found)"
print(lasso_line)

try:
    ridge_fs_line = f"Ridge (w/ Lasso FS) RMSE: {test_rmse_ridge_fs:.4f} (Old)"
except NameError:
    ridge_fs_line = "Ridge (w/ Lasso FS) RMSE: (variable 'test_rmse_ridge_fs' not found)"
print(ridge_fs_line)

print(f"New Hybrid RMSE:          {test_rmse_hybrid:.4f} (New)")

In [None]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, RobustScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Lasso
from sklearn.exceptions import NotFittedError

# ===================================================================
# 🚀 Retraining final Polynomial Lasso model on ALL available data...
# ===================================================================
from sklearn.base import clone  # used to copy the shared preprocessor unfitted

# === 1. Get Best Hyperparameters (DYNAMICALLY) ===
try:
    # Read the winning configuration from the Lasso grid search cell
    best_params_lasso = grid_search.best_params_
    best_degree_lasso = best_params_lasso['poly__degree']
    best_alpha_lasso = best_params_lasso['regressor__alpha']
    print(f"   Using best hyperparameters: {best_params_lasso}")

except (NameError, AttributeError, NotFittedError) as e:
    print("\nERROR: Could not run. 'grid_search' object not found or not fitted.")
    print("Please make sure you have run the 'Polynomial Lasso GridSearchCV' cell successfully first.")
    print(f"Details: {e}")
    # Stop here: the rest of the cell cannot work without the tuned parameters
    raise

# === 2. Reuse the tuned preprocessor definition ===
# clone() returns an unfitted copy with identical settings (same feature lists,
# imputers, scaler and encoder), so the final model is guaranteed to use the
# preprocessing that was cross-validated, without a hand-copied definition
# that could drift out of sync.
final_preprocessor_lasso = clone(preprocessor)

# === 3. Build the Final Model and Pipeline ===
final_model_pipeline_lasso = Pipeline(steps=[
    ('preprocessor', final_preprocessor_lasso),
    ('poly', PolynomialFeatures(degree=best_degree_lasso, include_bias=False)),  # tuned degree
    ('model', Lasso(alpha=best_alpha_lasso, max_iter=10000, random_state=42))     # tuned alpha
])

# === 4. Fit on the ENTIRE dataset ===
# 'X' and 'y' are the full, pre-split data (y is log1p of Transport_Cost)
print(f"\nFitting pipeline on {X.shape[0]} rows of data...")
final_model_pipeline_lasso.fit(X, y)

print("\n✅ Final Polynomial Lasso model trained on entire dataset!")
print("You can now use this 'final_model_pipeline_lasso' for all predictions.")

In [None]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, RobustScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Lasso, Ridge
from sklearn.feature_selection import SelectFromModel
from sklearn.exceptions import NotFittedError

# =====================================================================
# 🚀 Retraining final Hybrid (Lasso-Select + Ridge) model on ALL data...
# =====================================================================
from sklearn.base import clone  # used to copy the shared preprocessor unfitted

# === 1. Get Best Hyperparameters (DYNAMICALLY) ===

# --- 1a. Parameters tuned by the hybrid grid search ---
try:
    best_params_hybrid = hybrid_grid_search.best_params_
    best_degree_hybrid = best_params_hybrid['poly__degree']
    best_alpha_ridge = best_params_hybrid['regressor__alpha']
    best_threshold_hybrid = best_params_hybrid['selector__threshold']

    print("   Using best HYBRID hyperparameters:")
    print(f"   - {best_params_hybrid}")

except (NameError, AttributeError, NotFittedError) as e:
    print("\nERROR: Could not run. 'hybrid_grid_search' object not found or not fitted.")
    print("Please make sure you have run the 'Hybrid GridSearchCV' cell successfully first.")
    print(f"Details: {e}")
    raise

# --- 1b. The selector's alpha comes from the ORIGINAL Lasso grid search ---
try:
    # Fixed alpha used *inside* the SelectFromModel step
    best_lasso_alpha = grid_search.best_params_['regressor__alpha']
    print(f"\n   Using best SELECTOR alpha from original Lasso grid search:")
    print(f"   - Lasso Alpha for Selector: {best_lasso_alpha}")
except (NameError, AttributeError, NotFittedError) as e:
    print("\nERROR: Could not run. 'grid_search' (original Lasso) object not found or not fitted.")
    print("This is needed for the selector's 'alpha' parameter.")
    print(f"Details: {e}")
    raise

# === 2. Reuse the tuned preprocessor definition ===
# clone() returns an unfitted copy with identical settings, so the final model
# uses exactly the preprocessing that was cross-validated instead of a
# hand-copied definition that could drift out of sync.
final_preprocessor_hybrid = clone(preprocessor)

# === 3. Build the Final Model and Pipeline ===
final_model_pipeline_hybrid = Pipeline(steps=[
    ('preprocessor', final_preprocessor_hybrid),

    ('poly', PolynomialFeatures(degree=best_degree_hybrid, include_bias=False)),  # tuned degree

    ('selector', SelectFromModel(
        Lasso(alpha=best_lasso_alpha, max_iter=10000, random_state=42),
        threshold=best_threshold_hybrid,  # tuned threshold
        prefit=False
    )),

    ('regressor', Ridge(alpha=best_alpha_ridge, random_state=42))  # tuned Ridge alpha
])

# === 4. Fit on the ENTIRE dataset ===
# 'X' and 'y' are the full, pre-split data (y is log1p of Transport_Cost)
print(f"\nFitting hybrid pipeline on {X.shape[0]} rows of data...")
final_model_pipeline_hybrid.fit(X, y)

print("\n✅ Final Hybrid (Lasso-Select + Ridge) model trained on entire dataset!")
print("You can now use this 'final_model_pipeline_hybrid' for all predictions.")

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd

# ==============================
# 1️⃣ Create the Random Forest Pipeline
# ==============================
# Reuses the shared 'preprocessor'; GridSearchCV clones the pipeline for each fit.
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42, n_jobs=-1))
])

# ==============================
# 2️⃣ Define a WIDE Hyperparameter Grid
# ==============================
rf_param_grid = {
    'regressor__n_estimators': [100, 300, 500],
    'regressor__max_depth': [10, 20, None],     # limited-depth vs. fully-grown trees
    'regressor__max_features': ['sqrt', 1.0],   # 'sqrt' = classic RF, 1.0 = all features
    'regressor__min_samples_split': [2, 5]      # default (2) vs. slight regularization (5)
}
# 3 * 3 * 2 * 2 = 36 combinations; with the 5-fold `kf` that is 180 fits.

# ==============================
# 3️⃣ Setup and Run GridSearchCV
# ==============================
# Relies on 'kf' defined in the Lasso grid search cell
rf_grid_search = GridSearchCV(
    rf_pipeline,
    rf_param_grid,
    cv=kf,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=2
)

print("\n--- 🌳 Fitting Random Forest Model (Wide Grid Search) 🌳 ---")
rf_grid_search.fit(X_train, y_train)

# ==============================
# 4️⃣ Best parameters and CV RMSE
# ==============================
rf_best_params = rf_grid_search.best_params_
rf_best_rmse = np.sqrt(-rf_grid_search.best_score_)

print("\n--- Random Forest Results ---")
print(f"Best parameters: {rf_best_params}")
print(f"Best CV RMSE (RF): {rf_best_rmse:.4f}")

# ==============================
# 5️⃣ Evaluate on Test Set
# ==============================
y_test_pred_rf = rf_grid_search.predict(X_test)
test_rmse_rf = np.sqrt(mean_squared_error(y_test, y_test_pred_rf))

print(f"\n--- 📊 Final Model Comparison (Test Set) ---")
# Each earlier score is looked up independently, so one missing variable no
# longer hides the others. For the hybrid model the name set by the hybrid
# grid search cell ('test_rmse_hybrid') is preferred, with the older
# 'test_rmse_ridge_fs' as a fallback.
previous_scores = [
    ("Lasso RMSE:             ", ('test_rmse',)),
    ("Hybrid Ridge RMSE:      ", ('test_rmse_hybrid', 'test_rmse_ridge_fs')),
    ("XGBoost RMSE:           ", ('test_rmse_xgb',)),
]
found_previous_score = False
for score_label, candidate_names in previous_scores:
    available = [name for name in candidate_names if name in globals()]
    if not available:
        continue
    print(f"{score_label}{globals()[available[0]]:.4f}")
    found_previous_score = True
if not found_previous_score:
    print("Could not find previous model scores for comparison.")

print(f"Random Forest RMSE:     {test_rmse_rf:.4f} (New)")

# ==============================
# 6️⃣ Feature Importance from Random Forest
# ==============================
# Best-effort display: a failure here is reported but does not stop the notebook.
try:
    # Column names after one-hot encoding, in ColumnTransformer output order
    best_preprocessor = rf_grid_search.best_estimator_.named_steps['preprocessor']
    ohe = best_preprocessor.named_transformers_['cat'].named_steps['onehot']
    cat_fnames = ohe.get_feature_names_out(categorical_features)
    all_feature_names = np.concatenate([numeric_features, cat_fnames])

    # Importances from the refitted best model
    best_rf_model = rf_grid_search.best_estimator_.named_steps['regressor']
    importances = best_rf_model.feature_importances_

    rf_feature_df = pd.DataFrame({
        'Feature': all_feature_names,
        'Importance': importances
    }).sort_values(by='Importance', ascending=False)

    print(f"\n--- Random Forest Feature Importances (All {len(rf_feature_df)} Features) ---")

    # Show every row/column rather than pandas' truncated view
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        display(rf_feature_df)

except Exception as e:
    print(f"\nCould not generate feature importances: {e}")

In [None]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.exceptions import NotFittedError

# ===================================================================
# 🌳 Retraining final Random Forest model on ALL available data...
# ===================================================================
from sklearn.base import clone  # used to copy the shared preprocessor unfitted

# === 1. Get Best Hyperparameters (DYNAMICALLY) ===
try:
    # Read the winning configuration from the Random Forest grid search cell
    best_params_rf = rf_grid_search.best_params_

    best_n_estimators = best_params_rf['regressor__n_estimators']
    best_max_depth = best_params_rf['regressor__max_depth']
    best_max_features = best_params_rf['regressor__max_features']
    best_min_samples_split = best_params_rf['regressor__min_samples_split']

    print(f"   Using best hyperparameters: {best_params_rf}")

except (NameError, AttributeError, NotFittedError) as e:
    print("\nERROR: Could not run. 'rf_grid_search' object not found or not fitted.")
    print("Please make sure you have run the 'Random Forest GridSearchCV' cell successfully first.")
    print(f"Details: {e}")
    # Stop here: the rest of the cell cannot work without the tuned parameters
    raise

# === 2. Reuse the tuned preprocessor definition ===
# clone() returns an unfitted copy with identical settings, so the final model
# uses exactly the preprocessing that was cross-validated instead of a
# hand-copied definition that could drift out of sync.
final_preprocessor_rf = clone(preprocessor)

# === 3. Build the Final Model and Pipeline ===
# There is no 'poly' step for the Random Forest.
final_model_pipeline_rf = Pipeline(steps=[
    ('preprocessor', final_preprocessor_rf),
    ('model', RandomForestRegressor(
        n_estimators=best_n_estimators,
        max_depth=best_max_depth,
        max_features=best_max_features,
        min_samples_split=best_min_samples_split,
        random_state=42,
        n_jobs=-1
    ))
])

# === 4. Fit on the ENTIRE dataset ===
# 'X' and 'y' are the full, pre-split data (y is log1p of Transport_Cost)
print(f"\nFitting pipeline on {X.shape[0]} rows of data...")
final_model_pipeline_rf.fit(X, y)

print("\n✅ Final Random Forest model trained on entire dataset!")
print("You can now use this 'final_model_pipeline_rf' for all predictions.")

In [53]:
import pandas as pd
import numpy as np
import joblib
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# ==============================
# XGBoost: grid search, held-out evaluation and feature importances
# ==============================
# Purpose: tune an XGBRegressor inside a preprocessing pipeline with
# GridSearchCV, report the best cross-validated RMSE, score the winner on the
# held-out test split, and list the per-feature importances.
#
# Kernel state this cell reads (must already exist from earlier cells):
#   preprocessor, kf, X_train, y_train, X_test, y_test,
#   numeric_features, categorical_features,
#   and optionally test_rmse / test_rmse_ridge_fs for the comparison printout.
# Kernel state this cell creates: xgb_pipeline, xgb_param_grid,
#   xgb_grid_search, xgb_best_params, xgb_best_rmse, y_test_pred_xgb,
#   test_rmse_xgb, xgb_feature_df (plus the intermediates used to build it).

# ==============================
# 1. Create the XGBoost pipeline
# ==============================
# Reuses the existing 'preprocessor' object, which (per the original notes)
# includes a RobustScaler. Referencing 'preprocessor' is the only thing in the
# try body that can raise NameError; the handler prints a hint and re-raises
# so the cell still stops.
try:
    xgb_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor), # shared object defined in an earlier cell
        ('regressor', XGBRegressor(random_state=42, n_jobs=-1,
                                 objective='reg:squarederror'))
    ])
except NameError:
    print("ERROR: 'preprocessor' object not found.")
    print("Please run the preprocessing cell (section 5) from your linear model script first.")
    raise

# ==============================
# 2. Define the hyperparameter grid
# ==============================
# Keys use the '<step name>__<parameter>' form so they are routed to the
# 'regressor' step. 3 x 2 x 3 = 18 candidate combinations.
xgb_param_grid = {
    'regressor__n_estimators': [100, 300, 500],
    'regressor__learning_rate': [0.01, 0.1],
    'regressor__max_depth': [3, 5, 7]
}

# ==============================
# 3. Set up and run GridSearchCV
# ==============================
# 'kf' is the cross-validation splitter defined in an earlier cell.
# Scoring is negated MSE (higher is better), so the sign is flipped again
# below when converting to RMSE.
try:
    xgb_grid_search = GridSearchCV(
        xgb_pipeline,
        xgb_param_grid,
        cv=kf,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        verbose=2
    )
except NameError:
    print("ERROR: 'kf' object (KFold) not found.")
    print("Please run the cell that defines 'kf' first.")
    raise

print("\n--- 🌳 Fitting XGBoost Model (with RobustScaler) 🌳 ---")
# Only the training split is used for the search; the test split is kept
# aside for section 5.
try:
    xgb_grid_search.fit(X_train, y_train)
except NameError:
    print("ERROR: 'X_train' or 'y_train' not found.")
    print("Please run your Train-Test Split cell first.")
    raise

# ==============================
# 4. Best parameters and CV RMSE
# ==============================
# best_score_ is the mean negated MSE of the best candidate, so this is the
# square root of the mean fold MSE (not the mean of per-fold RMSEs).
xgb_best_params = xgb_grid_search.best_params_
xgb_best_rmse = np.sqrt(-xgb_grid_search.best_score_)

print("\n--- XGBoost Results ---")
print(f"Best parameters: {xgb_best_params}")
print(f"Best CV RMSE (XGBoost): {xgb_best_rmse:.4f}")

# ==============================
# 5. Evaluate on the test set
# ==============================
# Predicting through the search object uses the refitted best estimator
# (GridSearchCV's default refit=True is not overridden above).
# NOTE(review): the submission cell states the target was log1p-transformed;
# if so, every RMSE printed here is in log units — confirm before comparing
# against figures in original cost units.
y_test_pred_xgb = xgb_grid_search.predict(X_test)
test_rmse_xgb = np.sqrt(mean_squared_error(y_test, y_test_pred_xgb))

print(f"\n--- 📊 Final Model Comparison (Test Set) ---")
# The linear-model scores are optional: if either name is missing the
# comparison lines are replaced by a notice and the cell continues.
try:
    print(f"Lasso RMSE:             {test_rmse:.4f} (Original)")
    print(f"Hybrid Ridge RMSE:      {test_rmse_ridge_fs:.4f} (Your Best Linear)")
except NameError:
    print("Could not find previous model scores ('test_rmse' or 'test_rmse_ridge_fs') for comparison.")

print(f"XGBoost RMSE:           {test_rmse_xgb:.4f} (New)")

# ==============================
# 6. Feature importances from XGBoost (all features)
# ==============================
# Best-effort section: any failure is reported and swallowed so the metrics
# above are not lost.
try:
    # Expanded one-hot column names come from the fitted encoder inside the
    # best estimator's 'cat' transformer.
    best_preprocessor = xgb_grid_search.best_estimator_.named_steps['preprocessor']
    ohe = best_preprocessor.named_transformers_['cat'].named_steps['onehot']
    cat_fnames = ohe.get_feature_names_out(categorical_features)
    # Numeric names come from the kernel-level 'numeric_features' list.
    # NOTE(review): this assumes 'preprocessor' emits the numeric block first
    # and was built from these same two lists — confirm against the cell that
    # defines it; a mismatch surfaces as a length error caught below.
    all_feature_names = np.concatenate([numeric_features, cat_fnames])

    # Importances as reported by the fitted XGBRegressor, one per output column
    best_xgb_model = xgb_grid_search.best_estimator_.named_steps['regressor']
    importances = best_xgb_model.feature_importances_

    xgb_feature_df = pd.DataFrame({
        'Feature': all_feature_names,
        'Importance': importances
    }).sort_values(by='Importance', ascending=False)

    print(f"\n--- XGBoost Feature Importances (All {len(xgb_feature_df)} Features) ---")
    
    # Lift the pandas row/column display limits only for this one display call
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        display(xgb_feature_df)

except Exception as e:
    print(f"\nCould not generate feature importances: {e}")


--- 🌳 Fitting XGBoost Model (with RobustScaler) 🌳 ---
Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV] END regressor__learning_rate=0.01, regressor__max_depth=3, regressor__n_estimators=100; total time=   0.0s
[CV] END regressor__learning_rate=0.01, regressor__max_depth=3, regressor__n_estimators=100; total time=   0.0s
[CV] END regressor__learning_rate=0.01, regressor__max_depth=3, regressor__n_estimators=100; total time=   0.1s
[CV] END regressor__learning_rate=0.01, regressor__max_depth=3, regressor__n_estimators=100; total time=   0.0s
[CV] END regressor__learning_rate=0.01, regressor__max_depth=3, regressor__n_estimators=100; total time=   0.1s
[CV] END regressor__learning_rate=0.01, regressor__max_depth=3, regressor__n_estimators=300; total time=   0.1s
[CV] END regressor__learning_rate=0.01, regressor__max_depth=3, regressor__n_estimators=300; total time=   0.1s
[CV] END regressor__learning_rate=0.01, regressor__max_depth=3, regressor__n_estimators=300; total t

Unnamed: 0,Feature,Importance
1,Equipment_Value,0.298215
2,Base_Transport_Fee,0.074989
43,Order_Day_of_Week_3,0.048988
26,Rural_Hospital_No,0.047939
0,Supplier_Reliability,0.043991
21,Transport_Method_Waterways,0.036947
6,Equipment_Type_Aluminium,0.034886
3,Delivery_Days,0.028688
4,Equipment_Volume,0.027506
41,Order_Day_of_Week_1,0.026078


In [54]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor
from sklearn.exceptions import NotFittedError

# ===================================================================
# Final XGBoost refit: tuned hyperparameters, every available row
# ===================================================================
# Reads from the kernel: xgb_grid_search (fitted), X, y.
# Produces: final_model_pipeline_xgb (fitted) plus the preprocessing objects
# and best_* hyperparameter names defined below.

# --- 1. Pull the winning hyperparameters out of the grid search ---
try:
    best_params_xgb = xgb_grid_search.best_params_
except (NameError, AttributeError, NotFittedError) as e:
    # No fitted search means nothing to retrain with: explain, then abort the cell.
    print("\nERROR: Could not run. 'xgb_grid_search' object not found or not fitted.")
    print("Please make sure you have run the 'XGBoost GridSearchCV' cell successfully first.")
    print(f"Details: {e}")
    raise
else:
    # Grid keys carry the 'regressor__' step prefix used during the search.
    best_n_estimators = best_params_xgb['regressor__n_estimators']
    best_learning_rate = best_params_xgb['regressor__learning_rate']
    best_max_depth = best_params_xgb['regressor__max_depth']

    print(f"   Using best hyperparameters: {best_params_xgb}")

# --- 2. Declare the preprocessing explicitly for the final model ---
# Same column lists as the other retraining cells.
numeric_features = [
    'Supplier_Reliability',
    'Equipment_Value',
    'Base_Transport_Fee',
    'Delivery_Days',
    'Equipment_Volume',
    'Equipment_Weight',
]
categorical_features = [
    'Equipment_Type',
    'CrossBorder_Shipping',
    'Urgent_Shipping',
    'Installation_Service',
    'Transport_Method',
    'Fragile_Equipment',
    'Hospital_Info',
    'Rural_Hospital',
    'Order_Month',
    'Order_Day_of_Week',
    'Order_Is_Weekend',
]

# Numeric branch: median imputation followed by robust scaling.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler()),
])

# Categorical branch: mode imputation, then dense one-hot encoding that
# ignores categories not seen during fit.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

# Columns not named in either list are dropped.
final_preprocessor_xgb = ColumnTransformer(
    [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='drop',
)

# --- 3. Assemble preprocessing + tuned regressor ---
final_model_pipeline_xgb = Pipeline([
    ('preprocessor', final_preprocessor_xgb),
    ('model', XGBRegressor(
        objective='reg:squarederror',
        n_estimators=best_n_estimators,
        learning_rate=best_learning_rate,
        max_depth=best_max_depth,
        n_jobs=-1,
        random_state=42,
    )),
])

# --- 4. Fit on the complete (pre-split) data ---
print(f"\nFitting pipeline on {X.shape[0]} rows of data...")
final_model_pipeline_xgb.fit(X, y)

print("\n✅ Final XGBoost model trained on entire dataset!")
print("You can now use this 'final_model_pipeline_xgb' for all predictions.")

   Using best hyperparameters: {'regressor__learning_rate': 0.01, 'regressor__max_depth': 3, 'regressor__n_estimators': 500}

Fitting pipeline on 5000 rows of data...

✅ Final XGBoost model trained on entire dataset!
You can now use this 'final_model_pipeline_xgb' for all predictions.


In [55]:
def prepare_features(df_raw):
    """
    Rebuild the cleaned and engineered columns on a raw DataFrame.

    The input (for example the contents of test.csv) is copied, never
    modified. On the copy this function:

    * strips whitespace from column names and from every object column,
      turning '', 'nan' and 'NaN' into real missing values;
    * maps the common spellings of yes/no in the shipping/service flag
      columns onto 'Yes' / 'No';
    * parses 'Order_Placed_Date' and 'Delivery_Date' (unparseable -> NaT);
    * adds 'Delivery_Days' (absolute day difference), 'Order_Month',
      'Order_Day_of_Week' (both as strings), 'Order_Is_Weekend' (0/1) and
      'Equipment_Volume' (height * width);
    * applies log1p to 'Equipment_Value', 'Equipment_Volume',
      'Equipment_Weight' and 'Base_Transport_Fee'.

    Returns the full transformed DataFrame; column selection is left to the
    downstream preprocessor.
    """
    print(f"Preparing {len(df_raw)} new rows...")

    # Lookup tables shared by the cleaning steps below.
    blank_markers = {'': np.nan, 'nan': np.nan, 'NaN': np.nan}
    yes_no_aliases = {
        'YES': 'Yes', 'yes': 'Yes', 'Y': 'Yes', 'y': 'Yes',
        'NO': 'No', 'no': 'No', 'N': 'No', 'n': 'No'
    }
    flag_columns = ['CrossBorder_Shipping', 'Urgent_Shipping', 'Installation_Service',
                    'Fragile_Equipment', 'Rural_Hospital']
    skewed_columns = ('Equipment_Value', 'Equipment_Volume',
                      'Equipment_Weight', 'Base_Transport_Fee')

    # Work on a copy so the caller's frame is left untouched.
    prepared = df_raw.copy()
    prepared.columns = prepared.columns.str.strip()

    # Object columns: trim, then convert blank / 'nan' placeholders to NaN.
    for name in prepared.select_dtypes(include='object').columns:
        trimmed = prepared[name].astype(str).str.strip()
        prepared[name] = trimmed.replace(blank_markers)

    # Flag columns that are present get a single canonical spelling.
    for name in (flag for flag in flag_columns if flag in prepared.columns):
        prepared[name] = prepared[name].replace(yes_no_aliases)

    # Parse both date columns; anything unparseable becomes NaT.
    for date_col in ('Order_Placed_Date', 'Delivery_Date'):
        prepared[date_col] = pd.to_datetime(prepared[date_col], errors='coerce')

    placed = prepared['Order_Placed_Date']
    weekday = placed.dt.dayofweek
    elapsed_days = (prepared['Delivery_Date'] - placed).dt.days

    # New columns are added in this order: Delivery_Days, Order_Month,
    # Order_Day_of_Week, Order_Is_Weekend, Equipment_Volume.
    prepared['Delivery_Days'] = pd.to_numeric(elapsed_days, errors='coerce').abs()
    prepared['Order_Month'] = placed.dt.month.astype(str)
    prepared['Order_Day_of_Week'] = weekday.astype(str)
    prepared['Order_Is_Weekend'] = (weekday >= 5).astype(int)
    prepared['Equipment_Volume'] = prepared['Equipment_Height'] * prepared['Equipment_Width']

    # Compress the skewed magnitudes with log(1 + x).
    for name in skewed_columns:
        prepared[name] = np.log1p(prepared[name])

    print("Feature preparation complete.")

    return prepared

In [56]:
# Build the submission file from the final XGBoost pipeline.
#
# Requires 'final_model_pipeline_xgb' (fitted) and 'prepare_features' to be
# in memory. If the pipeline was persisted with joblib it could be loaded
# instead, e.g.:
# import joblib
# final_model_pipeline_xgb = joblib.load('final_xgb_model.pkl')

# Single source of truth for the output file, so the file that is written and
# the confirmation message below can never disagree.
submission_path = 'mixed_hybrid_xxgb.csv'

# 1. Load the new, raw test data
print("Loading new test data...")
df_new_test = pd.read_csv('../data/test.csv')

# 2. Keep the IDs so each prediction can be mapped back to its row
submission_ids = df_new_test['Hospital_Id']

# 3. Apply the same cleaning / feature engineering used for training
print("Applying feature engineering to new data...")
X_new_prepared = prepare_features(df_new_test)

# 4. Get predictions (on the scale the model was trained on)
print("Getting predictions from the final model...")
log_predictions = final_model_pipeline_xgb.predict(X_new_prepared)

# 5. Convert predictions back from log scale.
# Per the training notes the target was log(Transport_Cost + 1); expm1 is the
# exact inverse of that transform.
final_predictions = np.expm1(log_predictions)

# Optional safety check, intentionally left disabled: clamp negative
# predictions to zero.
# final_predictions[final_predictions < 0] = 0

# 6. Assemble the submission table
submission_df = pd.DataFrame({
    'Hospital_Id': submission_ids,
    'Transport_Cost': final_predictions
})

# Display the first few predictions
print("\nFinal Predictions (in $):")
display(submission_df.head())

# Save to CSV and report the file that was actually written
submission_df.to_csv(submission_path, index=False)
print(f"Submission file '{submission_path}' created successfully.")

Loading new test data...
Applying feature engineering to new data...
Preparing 500 new rows...
Feature preparation complete.
Getting predictions from the final model...

Final Predictions (in $):


Unnamed: 0,Hospital_Id,Transport_Cost
0,fffe33003400,278.333984
1,fffe3700330036003600,192.789917
2,fffe3300390038003400,1107.906128
3,fffe310030003900,131.184555
4,fffe3700330031003200,644.596252


Submission file 'mixed_hybrid_ridge.csv' created successfully.
