In [60]:
# core
import os
import re
import warnings
warnings.filterwarnings('ignore')

# data + plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from scipy.stats import zscore
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
np.set_printoptions(threshold=np.inf)

# sklearn (preprocessing / pipeline / model selection / metrics)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures, PowerTransformer,RobustScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error

# classical models (if you use them elsewhere)
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor

# gradient boosting / lightgbm / xgboost
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# utilities
import joblib   # optional: save/load pipeline

from lightgbm import LGBMRegressor
from sklearn.model_selection import RandomizedSearchCV


In [61]:

# ---------------------------------------------------------------
# Data loading and cleaning.
# Reads the training CSV, tidies text and Yes/No columns, swaps
# order/delivery dates that are in the wrong order, replaces invalid
# Transport_Cost values, drops duplicate rows and prints sanity checks.
# ---------------------------------------------------------------

# Plot appearance (cosmetic only)
sns.set(style="whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

# Read the raw file; header names are stripped of surrounding whitespace
df = pd.read_csv('../data/train.csv')
df.columns = df.columns.str.strip()
print(f"Initial data shape: {df.shape}")

# Text columns: cast to str, trim, then map blank / stringified-NaN tokens back to NaN.
# astype(str) renders missing cells as the literal 'nan', hence the mapping below.
missing_tokens = {'': np.nan, 'nan': np.nan, 'NaN': np.nan}
for text_col in df.select_dtypes(include='object').columns:
    trimmed = df[text_col].astype(str).str.strip()
    df[text_col] = trimmed.replace(missing_tokens)

# Yes/No flags: collapse the spelling variants onto 'Yes' / 'No'
yes_no_cols = [
    'CrossBorder_Shipping', 'Urgent_Shipping', 'Installation_Service',
    'Fragile_Equipment', 'Rural_Hospital'
]
yes_no_aliases = {
    'YES': 'Yes', 'yes': 'Yes', 'Y': 'Yes', 'y': 'Yes',
    'NO': 'No', 'no': 'No', 'N': 'No', 'n': 'No'
}
for flag_col in yes_no_cols:
    if flag_col not in df.columns:
        continue
    df[flag_col] = df[flag_col].replace(yes_no_aliases)

# Parse both date columns; unparseable entries become NaT
date_order_col = 'Order_Placed_Date'
date_delivery_col = 'Delivery_Date'
for date_col in (date_order_col, date_delivery_col):
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')


def _delivery_days(frame):
    """Return the whole-day difference (delivery date minus order date) per row."""
    return (frame[date_delivery_col] - frame[date_order_col]).dt.days


# Temporary helper column, used only to find rows whose dates are reversed
df['Delivery_Days'] = _delivery_days(df)

neg_mask = (df['Delivery_Days'] < 0) & df['Delivery_Days'].notna()
print(f"Rows with negative Delivery_Days before swap: {neg_mask.sum()}")

# Where delivery precedes order, exchange the two dates
if neg_mask.any():
    original_order_dates = df.loc[neg_mask, date_order_col].copy()
    df.loc[neg_mask, date_order_col] = df.loc[neg_mask, date_delivery_col]
    df.loc[neg_mask, date_delivery_col] = original_order_dates

# Recompute and confirm nothing negative is left
df['Delivery_Days'] = _delivery_days(df)
neg_after = ((df['Delivery_Days'] < 0) & df['Delivery_Days'].notna()).sum()
print(f"Rows with negative Delivery_Days after swap: {neg_after}")

# The helper column is not a model input, so remove it again
df = df.drop(columns=['Delivery_Days'])

# Transport_Cost: coerce to numeric, blank out negatives, fill gaps with the positive mean.
# NOTE(review): Transport_Cost is used as the prediction target further down, so this
# step imputes labels with a mean taken over all rows (before any train/test split).
# Confirm that is intended rather than dropping those rows.
cost_col = 'Transport_Cost'
if cost_col not in df.columns:
    print("Warning: 'Transport_Cost' column not found!")
else:
    df[cost_col] = pd.to_numeric(df[cost_col], errors='coerce')

    negative_cost = df[cost_col] < 0
    neg_cost_count = negative_cost.sum()
    print(f"Negative Transport_Cost entries before fix: {neg_cost_count}")

    # Negative costs are treated as missing
    df.loc[negative_cost, cost_col] = np.nan

    # Mean over strictly positive costs only
    positive_mean = df.loc[df[cost_col] > 0, cost_col].mean()
    print(f"Mean of positive Transport_Cost values: {positive_mean:.4f}")

    # Every missing cost (original or just blanked) receives that mean
    df[cost_col] = df[cost_col].fillna(positive_mean)

    # Re-count negatives after the fill
    neg_cost_after = (df[cost_col] < 0).sum()
    print(f"Negative Transport_Cost entries after fix: {neg_cost_after}")

# Remove exact duplicate rows and report how many went away
before = len(df)
df = df.drop_duplicates()
after = len(df)
print(f"Dropped {before - after} duplicate rows.")

# Final sanity checks on the cleaned frame
print("\n=== FINAL DATA VALIDATION ===")
final_delivery_gap = pd.to_datetime(df['Delivery_Date']) - pd.to_datetime(df['Order_Placed_Date'])
neg_delivery_check = (final_delivery_gap.dt.days < 0).sum()
print(f"Check → Negative Delivery_Days remaining: {neg_delivery_check}")

if cost_col in df.columns:
    neg_cost_check = (df[cost_col] < 0).sum()
    print(f"Check → Negative Transport_Cost remaining: {neg_cost_check}")

print("=====================================")
print("Cleaning complete. Final shape:", df.shape)

# Preview of the first rows
display(df.head())

Initial data shape: (5000, 20)
Rows with negative Delivery_Days before swap: 1964
Rows with negative Delivery_Days after swap: 0
Negative Transport_Cost entries before fix: 493
Mean of positive Transport_Cost values: 20528.6990
Negative Transport_Cost entries after fix: 0
Dropped 0 duplicate rows.

=== FINAL DATA VALIDATION ===
Check → Negative Delivery_Days remaining: 0
Check → Negative Transport_Cost remaining: 0
Cleaning complete. Final shape: (5000, 20)


Unnamed: 0,Hospital_Id,Supplier_Name,Supplier_Reliability,Equipment_Height,Equipment_Width,Equipment_Weight,Equipment_Type,Equipment_Value,Base_Transport_Fee,CrossBorder_Shipping,Urgent_Shipping,Installation_Service,Transport_Method,Fragile_Equipment,Hospital_Info,Rural_Hospital,Order_Placed_Date,Delivery_Date,Hospital_Location,Transport_Cost
0,fffe3200360030003700,Jo Valencia,0.44,21.0,6.0,,,3.62,17.13,No,No,No,Roadways,No,Working Class,No,2017-10-20,2017-10-20,APO AA 33776,179.5
1,fffe3400380037003400,Wanda Warren,0.58,29.0,20.0,1210684.0,Marble,9703.37,35.42,No,Yes,Yes,Roadways,No,Working Class,No,2016-02-22,2016-02-24,"South Kevin, VT 84493",627732.45
2,fffe3200350036003700,Robert Ackies,0.97,39.0,15.0,3305.0,Aluminium,40.21,18.54,No,No,No,Roadways,No,Working Class,No,2018-01-10,2018-01-11,"Kevinshire, NE 31279",1565.92
3,fffe3800320034003400,Charlotte Membreno,0.7,8.0,5.0,606.0,Brass,4.55,17.48,No,No,No,Roadways,No,Working Class,No,2016-08-06,2016-08-06,DPO AP 61572,257.71
4,fffe3600340033003000,Nena Silva,0.66,27.0,13.0,,Marble,2726.8,30.23,Yes,No,No,Roadways,No,Working Class,,2016-12-15,2016-12-17,"Joshuamouth, AK 01550",8553.52


In [62]:


# ==============================
# Feature/target setup, train-test split, preprocessing and mean baseline.
# Produces: X, y, X_train/X_test, y_train/y_test, `preprocessor`,
# the processed matrices, and `baseline_rmse`.
# ==============================

# ------------------------------
# 1. Define target and features
# ------------------------------
target_col = 'Transport_Cost'
y = df[target_col]
X = df.drop(columns=[target_col])

# ------------------------------
# 2. Convert dates to numeric timestamps (float seconds since the Unix epoch)
# ------------------------------
# Subtracting the epoch and dividing by one second keeps unparseable/missing
# dates (NaT) as NaN, so the median imputer below can fill them. Casting NaT
# with .astype(int) would instead produce the int64 sentinel (a huge negative
# number) or raise, depending on the pandas version and platform int width.
date_cols = ['Order_Placed_Date', 'Delivery_Date']
unix_epoch = pd.Timestamp("1970-01-01")
one_second = pd.Timedelta(seconds=1)
for col in date_cols:
    parsed_dates = pd.to_datetime(X[col], errors='coerce')
    X[col] = (parsed_dates - unix_epoch) / one_second

# ------------------------------
# 3. Explicitly define numeric and categorical features
# ------------------------------
# Columns not listed here are discarded by the ColumnTransformer (remainder='drop').
numeric_features = [
    'Supplier_Reliability',
    'Equipment_Height',
    'Equipment_Width',
    'Equipment_Weight',
    'Equipment_Value',
    'Base_Transport_Fee',
    'Order_Placed_Date',
    'Delivery_Date'
]

categorical_features = [
    'Equipment_Type',
    'CrossBorder_Shipping',
    'Urgent_Shipping',
    'Installation_Service',
    'Transport_Method',
    'Fragile_Equipment',
    'Hospital_Info',
    'Rural_Hospital'
]

# ------------------------------
# 4. Train-test split (80/20, fixed seed)
# ------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ------------------------------
# 5. Preprocessing pipelines
# ------------------------------
# Numeric: median imputation, then robust scaling.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])

# Categorical: most-frequent imputation, then dense one-hot encoding;
# categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop'
)

# ------------------------------
# 6. Fit on the training split only, then transform both splits
# ------------------------------
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# ------------------------------
# 7. Baseline RMSE (predict the training mean for every test row)
# ------------------------------
train_mean = y_train.mean()
y_test_pred_baseline = np.full(shape=len(y_test), fill_value=train_mean)
baseline_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred_baseline))
print(f"Baseline RMSE (test, original-scale): {baseline_rmse:.4f}")
print(f"Processed X_train shape: {X_train_processed.shape}")
print(f"Processed X_test shape: {X_test_processed.shape}")

Baseline RMSE (test, original-scale): 48940.9269
Processed X_train shape: (4000, 30)
Processed X_test shape: (1000, 30)


In [63]:
# ==============================
# Lasso baseline: preprocessing -> polynomial features -> Lasso,
# tuned with 5-fold GridSearchCV and evaluated on the held-out test split.
# Produces: pipeline, kf, grid_search, test_rmse, feature_coef_df, top_30.
# ==============================

# ------------------------------
# 1. Create full pipeline
# ------------------------------
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),        # numeric + categorical handling
    ('poly', PolynomialFeatures()),        # default settings; degree is set by the grid
    ('regressor', Lasso(max_iter=10000, random_state=42))
])

# ------------------------------
# 2. Define hyperparameter grid
# ------------------------------
param_grid = {
    'poly__degree': [1],                    # polynomial degree
    'regressor__alpha': [0.01, 0.1, 1, 10]  # Lasso regularization strength
}

# ------------------------------
# 3. Setup GridSearchCV with K-Fold
# ------------------------------
kf = KFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=kf,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=2
)

# ------------------------------
# 4. Fit GridSearch on training data
# ------------------------------
grid_search.fit(X_train, y_train)

# ------------------------------
# 5. Best parameters and CV RMSE
# ------------------------------
best_params = grid_search.best_params_
# The scorer is negative MSE, so negate before taking the square root
best_rmse = np.sqrt(-grid_search.best_score_)

print("Best parameters:", best_params)
print(f"Best CV RMSE: {best_rmse:.4f}")

# ------------------------------
# 6. Evaluate on test set
# ------------------------------
y_test_pred = grid_search.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
print(f"Test RMSE: {test_rmse:.4f}")

# ------------------------------
# 7. Feature importance: top 30 by absolute coefficient
# ------------------------------
best_model = grid_search.best_estimator_.named_steps['regressor']
# Use a separate name for the fitted step: rebinding the global `preprocessor`
# would make every later pipeline share (and refit) the object that lives
# inside grid_search.best_estimator_.
fitted_preprocessor = grid_search.best_estimator_.named_steps['preprocessor']
poly = grid_search.best_estimator_.named_steps['poly']

# Column names after preprocessing: numeric columns first, then one-hot columns
num_features = numeric_features
cat_features = fitted_preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_features)
all_features = np.concatenate([num_features, cat_features])

# Names of the columns produced by the polynomial expansion
poly_feature_names = poly.get_feature_names_out(all_features)

# Pair each expanded feature with its Lasso coefficient
coef = best_model.coef_
feature_coef_df = pd.DataFrame({
    'feature': poly_feature_names,
    'coefficient': coef
})

# Largest absolute coefficients first
top_30 = feature_coef_df.sort_values(
    by='coefficient', key=lambda s: s.abs(), ascending=False
).head(30)
display(top_30)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END .................poly__degree=1, regressor__alpha=1; total time=   0.0s
[CV] END .................poly__degree=1, regressor__alpha=1; total time=   0.0s
[CV] END ..............poly__degree=1, regressor__alpha=0.01; total time=   0.4s
[CV] END ..............poly__degree=1, regressor__alpha=0.01; total time=   0.4s
[CV] END ...............poly__degree=1, regressor__alpha=0.1; total time=   0.4s
[CV] END ...............poly__degree=1, regressor__alpha=0.1; total time=   0.5s
[CV] END .................poly__degree=1, regressor__alpha=1; total time=   0.4s
[CV] END ...............poly__degree=1, regressor__alpha=0.1; total time=   0.5s
[CV] END ................poly__degree=1, regressor__alpha=10; total time=   0.0s
[CV] END ................poly__degree=1, regressor__alpha=10; total time=   0.0s
[CV] END ..............poly__degree=1, regressor__alpha=0.01; total time=   0.5s
[CV] END ................poly__degree=1, regresso

Unnamed: 0,feature,coefficient
18,Urgent_Shipping_No,-25247.04
1,Supplier_Reliability,22689.51
12,Equipment_Type_Clay,-20661.74
13,Equipment_Type_Marble,13860.67
25,Fragile_Equipment_No,-13701.06
6,Base_Transport_Fee,-13093.13
7,Order_Placed_Date,-10150.75
3,Equipment_Width,10074.71
15,Equipment_Type_Wood,-9910.517
16,CrossBorder_Shipping_No,-8567.859


In [64]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
import pandas as pd

# ==============================
# Hybrid model: preprocessing -> polynomial features -> Lasso-based feature
# selection -> Ridge regression. Polynomial degree and Ridge alpha are tuned
# jointly; the selector's Lasso alpha is fixed to the best value found by the
# earlier Lasso grid search.
# Produces: hybrid_pipeline, hybrid_grid_search, test_rmse_hybrid.
# ==============================

# ------------------------------
# 1. Get the best Lasso alpha (for the selector)
# ------------------------------
try:
    # Best alpha from the first (Lasso) grid search
    best_lasso_alpha = grid_search.best_params_['regressor__alpha']
    print(f"--- Using fixed Lasso Alpha for selector: {best_lasso_alpha} ---")
except NameError:
    print("ERROR: 'grid_search' object (from Lasso) not found.")
    raise

# ------------------------------
# 2. Build the full hybrid pipeline
# ------------------------------
hybrid_pipeline = Pipeline(steps=[
    # GridSearchCV clones the whole pipeline, so this step is re-fitted
    # on each training fold regardless of its current state.
    ('preprocessor', preprocessor),

    ('poly', PolynomialFeatures(include_bias=False)),  # degree is tuned by the grid

    ('selector', SelectFromModel(
        Lasso(alpha=best_lasso_alpha, max_iter=10000, random_state=42),
        threshold=1e-5,
        prefit=False  # the selector's Lasso is fitted inside the pipeline, per candidate
    )),

    ('regressor', Ridge(random_state=42))  # alpha is tuned by the grid
])

# ------------------------------
# 3. Define the hyperparameter grid
# ------------------------------
hybrid_param_grid = {
    'poly__degree': [1, 2, 3],             # polynomial degrees to try
    'regressor__alpha': [0.1, 1, 10, 100]  # Ridge regularization strengths to try
}

# ------------------------------
# 4. Setup and run the grid search (re-uses the `kf` splitter defined earlier)
# ------------------------------
hybrid_grid_search = GridSearchCV(
    hybrid_pipeline,
    hybrid_param_grid,
    cv=kf,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=2
)

print("\n--- Fitting Hybrid Model (Tuning Degree AND Ridge Alpha) ---")
hybrid_grid_search.fit(X_train, y_train)

# ------------------------------
# 5. Best parameters and CV RMSE
# ------------------------------
hybrid_best_params = hybrid_grid_search.best_params_
# The scorer is negative MSE, so negate before taking the square root
hybrid_best_rmse = np.sqrt(-hybrid_grid_search.best_score_)

print("\n--- New Hybrid Model Results ---")
print(f"Best parameters: {hybrid_best_params}")
print(f"Best CV RMSE: {hybrid_best_rmse:.4f}")

# ------------------------------
# 6. Evaluate on test set
# ------------------------------
y_test_pred_hybrid = hybrid_grid_search.predict(X_test)
test_rmse_hybrid = np.sqrt(mean_squared_error(y_test, y_test_pred_hybrid))

print(f"\n--- Final Model Comparison (Test Set) ---")
print(f"Original Lasso RMSE:      {test_rmse:.4f}")
# `test_rmse_ridge_fs` comes from an older Ridge-with-feature-selection experiment
# that is not defined in this cell; report it only when it exists so a fresh
# run does not fail after the grid search has already finished.
try:
    print(f"Ridge (w/ Lasso FS) RMSE: {test_rmse_ridge_fs:.4f} (Old)")
except NameError:
    print("Ridge (w/ Lasso FS) RMSE: n/a ('test_rmse_ridge_fs' not found)")
print(f"New Hybrid RMSE:          {test_rmse_hybrid:.4f} (New)")

--- Using fixed Lasso Alpha for selector: 10 ---

--- Fitting Hybrid Model (Tuning Degree AND Ridge Alpha) ---
Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END ...............poly__degree=1, regressor__alpha=0.1; total time=   0.0s
[CV] END ...............poly__degree=1, regressor__alpha=0.1; total time=   0.0s
[CV] END ...............poly__degree=1, regressor__alpha=0.1; total time=   0.0s
[CV] END ...............poly__degree=1, regressor__alpha=0.1; total time=   0.0s
[CV] END .................poly__degree=1, regressor__alpha=1; total time=   0.0s
[CV] END .................poly__degree=1, regressor__alpha=1; total time=   0.0s
[CV] END .................poly__degree=1, regressor__alpha=1; total time=   0.0s
[CV] END .................poly__degree=1, regressor__alpha=1; total time=   0.0s
[CV] END ................poly__degree=1, regressor__alpha=10; total time=   0.0s
[CV] END ...............poly__degree=1, regressor__alpha=0.1; total time=   0.0s
[CV] END .........

In [66]:
import pandas as pd
import numpy as np

# ==============================
# Inspect the winning hybrid pipeline: which expanded features the
# selector kept and the Ridge coefficient attached to each of them.
# ==============================
print("\n--- Inspecting Final Hybrid Model Features & Coefficients ---")

# Refitted best pipeline from the hybrid grid search
try:
    best_pipeline = hybrid_grid_search.best_estimator_
except NameError:
    print("ERROR: 'hybrid_grid_search' object not found.")
    raise

# Pull out the four fitted steps in pipeline order
preproc, poly, selector, final_ridge_model = (
    best_pipeline.named_steps[step_name]
    for step_name in ('preprocessor', 'poly', 'selector', 'regressor')
)

print(f"Best parameters found: {hybrid_grid_search.best_params_}")
print(f"Best polynomial degree used: {poly.degree}")

# Column names leaving the preprocessor: numeric columns, then one-hot columns
try:
    num_fnames = numeric_features
    ohe = preproc.named_transformers_['cat'].named_steps['onehot']
    cat_fnames = ohe.get_feature_names_out(categorical_features)
    input_feature_names = np.concatenate([num_fnames, cat_fnames])
except NameError as e:
    print(f"ERROR: Could not find 'numeric_features' or 'categorical_features'. {e}")
    raise

# Names after polynomial expansion, then the subset the selector retained
poly_feature_names = poly.get_feature_names_out(input_feature_names)
support_mask = selector.get_support()
selected_feature_names = poly_feature_names[support_mask]

# Coefficients of the final Ridge model
final_ridge_coefs = final_ridge_model.coef_

# Summary counts
separator = "-------------------------------------------"
print(separator)
print(f"Total features created by polynomial step: {len(poly_feature_names)}")
print(f"Features kept by Lasso selector: {len(selected_feature_names)}")
print(f"Features passed to Ridge model: {len(final_ridge_coefs)}")
print(separator)

# Build the table only when names and coefficients line up one-to-one
if len(selected_feature_names) == len(final_ridge_coefs):
    final_features_df = pd.DataFrame({
        'Feature': selected_feature_names,
        'Ridge_Coefficient': final_ridge_coefs
    })

    # Order rows by coefficient magnitude, largest first
    magnitude_order = (
        final_features_df['Ridge_Coefficient']
        .abs()
        .sort_values(ascending=False)
        .index
    )
    final_features_df = final_features_df.reindex(magnitude_order)

    print("\n--- Top 30 Features Used by Final Hybrid Model ---")
    with pd.option_context('display.max_rows', 30):
        display(final_features_df.head(30))
else:
    print("!! WARNING: Mismatch in feature count and coefficient count!")


--- Inspecting Final Hybrid Model Features & Coefficients ---
Best parameters found: {'poly__degree': 1, 'regressor__alpha': 100}
Best polynomial degree used: 1
-------------------------------------------
Total features created by polynomial step: 30
Features kept by Lasso selector: 21
Features passed to Ridge model: 21
-------------------------------------------

--- Top 30 Features Used by Final Hybrid Model ---


Unnamed: 0,Feature,Ridge_Coefficient
14,Urgent_Shipping_No,-21574.978146
0,Supplier_Reliability,21447.595582
10,Equipment_Type_Marble,11723.447805
5,Base_Transport_Fee,-10746.719109
2,Equipment_Width,9493.488675
6,Order_Placed_Date,-9318.165673
9,Equipment_Type_Clay,-8982.839896
13,CrossBorder_Shipping_No,-7404.267429
12,Equipment_Type_Wood,-7242.207814
19,Hospital_Info_Wealthy,-5884.503335


In [65]:
import pandas as pd
import numpy as np
import joblib
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    PolynomialFeatures, RobustScaler, OneHotEncoder
)
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Lasso, Ridge
from sklearn.feature_selection import SelectFromModel

# ==============================
# Final hybrid model: rebuild the hybrid pipeline from scratch with the
# tuned hyperparameters and fit it on the full dataset (X, y).
# ==============================

# ------------------------------
# 1. Get all best hyperparameters
# ------------------------------
try:
    # Lasso alpha for the selector: from the first (Lasso) grid search
    best_lasso_params = grid_search.best_params_
    best_lasso_alpha = best_lasso_params['regressor__alpha']

    # Polynomial degree and Ridge alpha: from the hybrid grid search, which
    # tuned the two jointly on the same architecture that is rebuilt below.
    best_hybrid_params = hybrid_grid_search.best_params_
    best_poly_degree = best_hybrid_params['poly__degree']
    best_ridge_alpha = best_hybrid_params['regressor__alpha']

    print("--- Hyperparameters Found ---")
    print(f"   Best Polynomial Degree: {best_poly_degree}")
    print(f"   Best Lasso Alpha (for selection): {best_lasso_alpha}")
    print(f"   Best Ridge Alpha (for regression): {best_ridge_alpha}")

except NameError as e:
    print("\nERROR: Could not find 'grid_search' or 'hybrid_grid_search'.")
    print("Please make sure you have run the previous GridSearchCV cells.")
    print(f"Details: {e}")
    raise

# ------------------------------
# 2. Re-define the preprocessor (fresh, unfitted objects for a clean pipeline)
# ------------------------------
# Same configuration as the preprocessor built in the train/test split cell.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

final_preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop'
)

# ------------------------------
# 3. Build the final hybrid pipeline
# ------------------------------
# Steps, in order:
#   1. preprocess
#   2. create polynomial features
#   3. select features with a new Lasso model (best_lasso_alpha)
#   4. regress with a new Ridge model (best_ridge_alpha)
final_hybrid_pipeline = Pipeline(steps=[
    ('preprocessor', final_preprocessor),

    ('poly', PolynomialFeatures(degree=best_poly_degree, include_bias=False)),

    ('selector', SelectFromModel(
        Lasso(alpha=best_lasso_alpha, max_iter=10000, random_state=42),
        threshold=1e-5  # drop features whose Lasso coefficient is near zero
    )),

    ('regressor', Ridge(alpha=best_ridge_alpha, random_state=42))
])

print("\n--- Final Pipeline Architecture ---")
display(final_hybrid_pipeline)

# ------------------------------
# 4. Fit on the entire dataset (X and y)
# ------------------------------
print(f"\nFitting final hybrid model on all {X.shape[0]} rows...")
final_hybrid_pipeline.fit(X, y)

print("✅ Final hybrid model is trained and ready!")

# ==============================
# 5️⃣ (Optional) Save Your Model
# ==============================


--- Hyperparameters Found ---
   Best Polynomial Degree: 1
   Best Lasso Alpha (for selection): 10
   Best Ridge Alpha (for regression): 100

--- Final Pipeline Architecture ---


0,1,2
,steps,"[('preprocessor', ...), ('poly', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,degree,1
,interaction_only,False
,include_bias,False
,order,'C'

0,1,2
,estimator,Lasso(alpha=1...ndom_state=42)
,threshold,1e-05
,prefit,False
,norm_order,1
,max_features,
,importance_getter,'auto'

0,1,2
,alpha,10
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,10000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,42
,selection,'cyclic'

0,1,2
,alpha,100
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,42



Fitting final hybrid model on all 5000 rows...
✅ Final hybrid model is trained and ready!


In [None]:
import pandas as pd
import numpy as np
import joblib
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# ==============================
# XGBoost model: shared preprocessor -> XGBRegressor, tuned with GridSearchCV
# on the training split and evaluated on the held-out test split.
# Produces: xgb_pipeline, xgb_grid_search, test_rmse_xgb, xgb_feature_df.
# ==============================

# ------------------------------
# 1. Create the XGBoost pipeline
# ------------------------------
# Re-uses the existing `preprocessor` object (which includes the RobustScaler).
try:
    xgb_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', XGBRegressor(random_state=42, n_jobs=-1,
                                 objective='reg:squarederror'))
    ])
except NameError:
    print("ERROR: 'preprocessor' object not found.")
    print("Please run the preprocessing cell (section 5) from your linear model script first.")
    raise

# ------------------------------
# 2. Define hyperparameter grid for XGBoost
# ------------------------------
xgb_param_grid = {
    'regressor__n_estimators': [100, 300, 500],
    'regressor__learning_rate': [0.01, 0.1],
    'regressor__max_depth': [3, 5, 7]
}

# ------------------------------
# 3. Setup and run GridSearchCV (re-uses the `kf` splitter defined earlier)
# ------------------------------
try:
    xgb_grid_search = GridSearchCV(
        xgb_pipeline,
        xgb_param_grid,
        cv=kf,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        verbose=2
    )
except NameError:
    print("ERROR: 'kf' object (KFold) not found.")
    print("Please run the cell that defines 'kf' first.")
    raise

print("\n--- 🌳 Fitting XGBoost Model (with RobustScaler) 🌳 ---")
try:
    xgb_grid_search.fit(X_train, y_train)
except NameError:
    print("ERROR: 'X_train' or 'y_train' not found.")
    print("Please run your Train-Test Split cell first.")
    raise

# ------------------------------
# 4. Best parameters and CV RMSE
# ------------------------------
xgb_best_params = xgb_grid_search.best_params_
# The scorer is negative MSE, so negate before taking the square root
xgb_best_rmse = np.sqrt(-xgb_grid_search.best_score_)

print("\n--- XGBoost Results ---")
print(f"Best parameters: {xgb_best_params}")
print(f"Best CV RMSE (XGBoost): {xgb_best_rmse:.4f}")

# ------------------------------
# 5. Evaluate on test set
# ------------------------------
y_test_pred_xgb = xgb_grid_search.predict(X_test)
test_rmse_xgb = np.sqrt(mean_squared_error(y_test, y_test_pred_xgb))

print(f"\n--- 📊 Final Model Comparison (Test Set) ---")
# Each earlier score is looked up on its own, so one missing variable does not
# hide the others. The hybrid Ridge score is stored in `test_rmse_hybrid` by
# the hybrid grid-search cell.
earlier_scores = [
    ("Lasso RMSE:             ", "test_rmse", " (Original)"),
    ("Hybrid Ridge RMSE:      ", "test_rmse_hybrid", " (Your Best Linear)"),
]
for score_label, score_var, score_note in earlier_scores:
    if score_var in globals():
        print(f"{score_label}{globals()[score_var]:.4f}{score_note}")
    else:
        print(f"{score_label}n/a ('{score_var}' not found)")

print(f"XGBoost RMSE:           {test_rmse_xgb:.4f} (New)")

# ------------------------------
# 6. Feature importance from XGBoost (all features)
# ------------------------------
# Best-effort reporting: a failure here is printed instead of aborting the cell.
try:
    # Column names after preprocessing: numeric columns first, then one-hot columns
    best_preprocessor = xgb_grid_search.best_estimator_.named_steps['preprocessor']
    ohe = best_preprocessor.named_transformers_['cat'].named_steps['onehot']
    cat_fnames = ohe.get_feature_names_out(categorical_features)
    all_feature_names = np.concatenate([numeric_features, cat_fnames])

    # Importances from the refitted best model
    best_xgb_model = xgb_grid_search.best_estimator_.named_steps['regressor']
    importances = best_xgb_model.feature_importances_

    xgb_feature_df = pd.DataFrame({
        'Feature': all_feature_names,
        'Importance': importances
    }).sort_values(by='Importance', ascending=False)

    print(f"\n--- XGBoost Feature Importances (All {len(xgb_feature_df)} Features) ---")

    # Temporarily lift the row/column display limits so every feature is shown
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        display(xgb_feature_df)

except Exception as e:
    print(f"\nCould not generate feature importances: {e}")

In [53]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd

# ==============================
# Random Forest model: shared preprocessor -> RandomForestRegressor, tuned
# with GridSearchCV on the training split and evaluated on the test split.
# Produces: rf_pipeline, rf_grid_search, test_rmse_rf, rf_feature_df.
# ==============================

# ------------------------------
# 1. Create the Random Forest pipeline
# ------------------------------
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),  # existing preprocessor object
    ('regressor', RandomForestRegressor(random_state=42, n_jobs=-1))
])

# ------------------------------
# 2. Define a wide hyperparameter grid
# ------------------------------
rf_param_grid = {
    'regressor__n_estimators': [100, 300, 500],
    'regressor__max_depth': [10, 20, None],    # limited-depth vs. fully-grown trees
    'regressor__max_features': ['sqrt', 1.0],  # 'sqrt' = classic RF, 1.0 = all features (bagging-like)
    'regressor__min_samples_split': [2, 5]     # default (2) vs. slight regularization (5)
}
# 3 (n_est) * 3 (depth) * 2 (max_feat) * 2 (min_split) = 36 combinations;
# 36 combinations * 5 folds = 180 fits, so this cell takes a while to run.

# ------------------------------
# 3. Setup and run GridSearchCV (re-uses the `kf` splitter defined earlier)
# ------------------------------
rf_grid_search = GridSearchCV(
    rf_pipeline,
    rf_param_grid,
    cv=kf,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=2
)

print("\n--- 🌳 Fitting Random Forest Model (Wide Grid Search) 🌳 ---")
rf_grid_search.fit(X_train, y_train)

# ------------------------------
# 4. Best parameters and CV RMSE
# ------------------------------
rf_best_params = rf_grid_search.best_params_
# The scorer is negative MSE, so negate before taking the square root
rf_best_rmse = np.sqrt(-rf_grid_search.best_score_)

print("\n--- Random Forest Results ---")
print(f"Best parameters: {rf_best_params}")
print(f"Best CV RMSE (RF): {rf_best_rmse:.4f}")

# ------------------------------
# 5. Evaluate on test set
# ------------------------------
y_test_pred_rf = rf_grid_search.predict(X_test)
test_rmse_rf = np.sqrt(mean_squared_error(y_test, y_test_pred_rf))

print(f"\n--- 📊 Final Model Comparison (Test Set) ---")
# Each earlier score is looked up on its own, so one missing variable does not
# suppress the rest. The hybrid Ridge score is stored in `test_rmse_hybrid` by
# the hybrid grid-search cell.
earlier_scores = [
    ("Lasso RMSE:             ", "test_rmse"),
    ("Hybrid Ridge RMSE:      ", "test_rmse_hybrid"),
    ("XGBoost RMSE:           ", "test_rmse_xgb"),
]
for score_label, score_var in earlier_scores:
    if score_var in globals():
        print(f"{score_label}{globals()[score_var]:.4f}")
    else:
        print(f"{score_label}n/a ('{score_var}' not found)")

print(f"Random Forest RMSE:     {test_rmse_rf:.4f} (New)")

# ------------------------------
# 6. Feature importance from Random Forest
# ------------------------------
# Best-effort reporting: a failure here is printed instead of aborting the cell.
try:
    # Column names after preprocessing: numeric columns first, then one-hot columns
    best_preprocessor = rf_grid_search.best_estimator_.named_steps['preprocessor']
    ohe = best_preprocessor.named_transformers_['cat'].named_steps['onehot']
    cat_fnames = ohe.get_feature_names_out(categorical_features)
    all_feature_names = np.concatenate([numeric_features, cat_fnames])

    # Importances from the refitted best model
    best_rf_model = rf_grid_search.best_estimator_.named_steps['regressor']
    importances = best_rf_model.feature_importances_

    rf_feature_df = pd.DataFrame({
        'Feature': all_feature_names,
        'Importance': importances
    }).sort_values(by='Importance', ascending=False)

    print(f"\n--- Random Forest Feature Importances (All {len(rf_feature_df)} Features) ---")

    # Temporarily lift the row/column display limits so every feature is shown
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        display(rf_feature_df)

except Exception as e:
    print(f"\nCould not generate feature importances: {e}")


--- 🌳 Fitting Random Forest Model (Wide Grid Search) 🌳 ---
Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END regressor__max_depth=10, regressor__max_features=sqrt, regressor__min_samples_split=2, regressor__n_estimators=100; total time=   0.2s
[CV] END regressor__max_depth=10, regressor__max_features=sqrt, regressor__min_samples_split=2, regressor__n_estimators=100; total time=   0.2s
[CV] END regressor__max_depth=10, regressor__max_features=sqrt, regressor__min_samples_split=2, regressor__n_estimators=100; total time=   0.2s
[CV] END regressor__max_depth=10, regressor__max_features=sqrt, regressor__min_samples_split=2, regressor__n_estimators=100; total time=   0.2s
[CV] END regressor__max_depth=10, regressor__max_features=sqrt, regressor__min_samples_split=2, regressor__n_estimators=100; total time=   0.2s
[CV] END regressor__max_depth=10, regressor__max_features=sqrt, regressor__min_samples_split=5, regressor__n_estimators=100; total time=   0.4s
[CV] END regre

Unnamed: 0,Feature,Importance
4,Equipment_Value,0.2282
3,Equipment_Weight,0.198731
5,Base_Transport_Fee,0.087648
1,Equipment_Height,0.085045
2,Equipment_Width,0.072902
0,Supplier_Reliability,0.069927
7,Delivery_Date,0.062363
6,Order_Placed_Date,0.042915
19,Installation_Service_No,0.02072
16,CrossBorder_Shipping_Yes,0.016892


In [55]:
import pandas as pd
import numpy as np
import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

# ------------------------------------------------------------------
# Step 1: read the winning hyperparameters off the RF grid search
# ------------------------------------------------------------------
# best_params_ is keyed by pipeline path, e.g.
# {'regressor__max_depth': None, 'regressor__max_features': 'sqrt', ...}
try:
    best_rf_params = rf_grid_search.best_params_
except NameError as e:
    # The search object only exists once its cell has been executed.
    print("\nERROR: Could not find 'rf_grid_search'.")
    print("Please make sure you have run the Random Forest GridSearchCV cell first.")
    print(f"Details: {e}")
    raise
else:
    print("--- Using best hyperparameters from GridSearchCV ---")
    print(best_rf_params)

# ------------------------------------------------------------------
# Step 2: assemble an unfitted preprocessor + Random Forest pipeline
# ------------------------------------------------------------------
# The step must be called 'regressor' so the 'regressor__*' keys in
# best_rf_params resolve. 'preprocessor' has to be in memory already.
try:
    final_rf_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(n_jobs=-1, random_state=42)),
    ])
except NameError:
    print("\nERROR: 'preprocessor' object not found.")
    print("Please make sure you have run the initial preprocessing cell.")
    raise

# ------------------------------------------------------------------
# Step 3: push the tuned values into the pipeline (nothing hardcoded)
# ------------------------------------------------------------------
final_rf_pipeline.set_params(**best_rf_params)

print("\n--- Final Pipeline Architecture ---")
display(final_rf_pipeline)

# ------------------------------------------------------------------
# Step 4: refit on the full X / y
# ------------------------------------------------------------------
# 'X' and 'y' (full datasets) have to be in memory already.
try:
    print(f"\nFitting final Random Forest model on all {X.shape[0]} rows...")
    final_rf_pipeline.fit(X, y)
    print(X.shape)
except NameError:
    print("\nERROR: 'X' or 'y' (full datasets) not found.")
    print("Please make sure you have run the initial data loading cell.")
    raise

print("✅ Final Random Forest model is trained and ready!")

# ------------------------------------------------------------------
# Step 5 (optional): persist the model - intentionally left empty
# ------------------------------------------------------------------


--- Using best hyperparameters from GridSearchCV ---
{'regressor__max_depth': None, 'regressor__max_features': 'sqrt', 'regressor__min_samples_split': 2, 'regressor__n_estimators': 100}

--- Final Pipeline Architecture ---


0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True



Fitting final Random Forest model on all 5000 rows...
(5000, 19)
✅ Final Random Forest model is trained and ready!


In [None]:
import pandas as pd
import numpy as np
import joblib
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline

# ------------------------------------------------------------------
# Step 1: read the winning hyperparameters off the XGBoost grid search
# ------------------------------------------------------------------
# best_params_ is keyed by pipeline path, e.g.
# {'regressor__learning_rate': 0.01, 'regressor__max_depth': 3, ...}
try:
    best_xgb_params = xgb_grid_search.best_params_
except NameError as e:
    # The search object only exists once its cell has been executed.
    print("\nERROR: Could not find 'xgb_grid_search'.")
    print("Please make sure you have run the XGBoost GridSearchCV cell first.")
    print(f"Details: {e}")
    raise
else:
    print("--- Using best hyperparameters from GridSearchCV ---")
    print(best_xgb_params)

# ------------------------------------------------------------------
# Step 2: assemble an unfitted preprocessor + XGBoost pipeline
# ------------------------------------------------------------------
# The step must be called 'regressor' so the 'regressor__*' keys in
# best_xgb_params resolve. 'preprocessor' has to be in memory already.
final_xgb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(objective='reg:squarederror',
                               n_jobs=-1,
                               random_state=42)),
])

# ------------------------------------------------------------------
# Step 3: push the tuned values into the pipeline (nothing hardcoded)
# ------------------------------------------------------------------
final_xgb_pipeline.set_params(**best_xgb_params)

print("\n--- Final Pipeline Architecture ---")
display(final_xgb_pipeline)

# ------------------------------------------------------------------
# Step 4: refit on the full X / y
# ------------------------------------------------------------------
# 'X' and 'y' (full datasets) have to be in memory already.
print(f"\nFitting final XGBoost model on all {X.shape[0]} rows...")
final_xgb_pipeline.fit(X, y)

print("✅ Final XGBoost model is trained and ready!")

# ------------------------------------------------------------------
# Step 5 (optional): persist the model - intentionally left empty
# ------------------------------------------------------------------


In [None]:
print("🚀 Retraining final Polynomial Lasso model on ALL available data...")

# --- Step 1: tuned hyperparameters, read from the Lasso grid search ---
try:
    best_params_lasso = grid_search.best_params_
except NameError as e:
    # 'grid_search' only exists once the Lasso search cell has been executed.
    print("\nERROR: Could not run. 'grid_search' not found.")
    print("Please make sure you have run the Lasso GridSearchCV cell successfully first.")
    print(f"Details: {e}")
    raise
else:
    # Keys follow the pipeline step names used below ('poly', 'regressor').
    best_degree = best_params_lasso['poly__degree']
    best_alpha = best_params_lasso['regressor__alpha']
    print(f"   Using best hyperparameters: {best_params_lasso}")

# --- Step 2: preprocessing, spelled out again so the cell reads standalone ---
# Numeric columns: median imputation, then robust scaling.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler()),
])

# Categorical columns: mode imputation, then dense one-hot encoding that
# ignores categories unseen at fit time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

# Any column not listed in numeric_features / categorical_features is dropped.
final_preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='drop',
)

# --- Step 3: preprocess -> polynomial expansion -> Lasso ---
final_pipeline = Pipeline([
    ('preprocessor', final_preprocessor),
    ('poly', PolynomialFeatures(degree=best_degree, include_bias=False)),
    ('regressor', Lasso(alpha=best_alpha, random_state=42, max_iter=10000)),
])

# --- Step 4: fit on the full X / y ---
print(f"\nFitting pipeline on {X.shape[0]} rows of data...")
final_pipeline.fit(X, y)

print("\n✅ Final Polynomial Lasso model trained on entire dataset!")
print("You can now use 'final_pipeline' for predictions on new data.")

In [56]:

def prepare_features(df_raw):
    """
    Clean a raw feature DataFrame so it can be passed to the model pipeline.

    The input is copied first, so the caller's frame is left untouched. The
    cleaning steps are applied in this order:

    1. Strip whitespace from the column names and from every object-dtype
       column, turning '', 'nan' and 'NaN' strings into real missing values.
    2. Normalise the Yes/No flag columns to exactly 'Yes' / 'No'.
    3. Parse 'Order_Placed_Date' and 'Delivery_Date'; where the delivery date
       precedes the order date, the two values are swapped.
    4. Drop fully duplicated rows.
    5. Convert both date columns to float seconds since the Unix epoch.

    Progress messages are printed along the way.

    Args:
        df_raw (pd.DataFrame): Raw data, e.g. straight from ``pd.read_csv``.
            Must contain 'Order_Placed_Date' and 'Delivery_Date' (surrounding
            whitespace in the column names is tolerated).

    Returns:
        pd.DataFrame: Cleaned copy of the input. Index labels are preserved,
        so rows removed as duplicates appear as gaps in the index. Missing or
        unparseable dates are NaN in the two date columns.

    Raises:
        KeyError: If either date column is absent.
    """

    # Work on a copy so the caller's DataFrame is never mutated
    df = df_raw.copy()

    print(f"Initial shape: {df.shape}")

    # Strip column names
    df.columns = df.columns.str.strip()

    # Clean string/object columns
    for col in df.select_dtypes(include='object').columns:
        df[col] = df[col].astype(str).str.strip()
        # astype(str) stringifies real NaNs to 'nan'; map those (and blanks) back
        df[col] = df[col].replace({'': np.nan, 'nan': np.nan, 'NaN': np.nan})

    # Normalize Yes/No columns
    yes_no_cols = [
        'CrossBorder_Shipping', 'Urgent_Shipping', 'Installation_Service',
        'Fragile_Equipment', 'Rural_Hospital'
    ]
    for col in yes_no_cols:
        if col in df.columns:
            df[col] = df[col].replace({
                'YES': 'Yes', 'yes': 'Yes', 'Y': 'Yes', 'y': 'Yes',
                'NO': 'No', 'no': 'No', 'N': 'No', 'n': 'No'
            })

    # Convert dates (unparseable values become NaT)
    date_order_col = 'Order_Placed_Date'
    date_delivery_col = 'Delivery_Date'
    df[date_order_col] = pd.to_datetime(df[date_order_col], errors='coerce')
    df[date_delivery_col] = pd.to_datetime(df[date_delivery_col], errors='coerce')

    # Compute temporary Delivery_Days to find errors
    df['Delivery_Days'] = (df[date_delivery_col] - df[date_order_col]).dt.days

    # Detect and fix negative delivery days (swap dates)
    neg_mask = df['Delivery_Days'].notna() & (df['Delivery_Days'] < 0)
    print(f"Found and swapping {neg_mask.sum()} rows with incorrect date order.")

    if neg_mask.any():
        tmp = df.loc[neg_mask, date_order_col].copy()
        df.loc[neg_mask, date_order_col] = df.loc[neg_mask, date_delivery_col]
        df.loc[neg_mask, date_delivery_col] = tmp

    # Drop temporary column
    df = df.drop(columns=['Delivery_Days'])

    # NOTE: We skip all 'Transport_Cost' processing because it's the target
    # variable and will not be present in new test data.

    # Drop duplicates. Index labels are kept, so a caller that needs to map
    # predictions back to the raw rows should align on the returned index.
    before = len(df)
    df = df.drop_duplicates()
    after = len(df)
    print(f"Dropped {before - after} duplicate rows.")

    # ==============================
    # Convert dates to numeric timestamps (seconds since 1970-01-01).
    # This MUST be done before sending to the pipeline, which treats the
    # date columns as numeric features.
    #
    # Subtracting the epoch and dividing by one second keeps NaT as NaN.
    # The previous `.astype(int) / 10**9` turned NaT into the int64 minimum
    # (about -9.22e9 "seconds"), which a NaN-based imputer cannot recognise
    # as missing, and it also assumed nanosecond resolution.
    # NOTE(review): if the training data is converted with `.astype(int)`
    # elsewhere, switch it to this form too and refit so both sides agree.
    # ==============================
    unix_epoch = pd.Timestamp("1970-01-01")
    one_second = pd.Timedelta(seconds=1)
    date_cols = ['Order_Placed_Date', 'Delivery_Date']
    for col in date_cols:
        # Re-apply to_datetime so the column is datetime64 whatever the swap left
        parsed = pd.to_datetime(df[col], errors='coerce')
        df[col] = (parsed - unix_epoch) / one_second

    print(f"Cleaning complete. Final shape: {df.shape}")

    return df

In [58]:
# Build the submission file from the final Random Forest pipeline.
# Requires 'final_rf_pipeline' (already fit) and 'prepare_features' to be in
# kernel memory, i.e. their cells must have been run first.

# 1. Load your new, raw test data
print("Loading new test data...")
df_new_test = pd.read_csv('../data/test.csv') 

# 2. Apply the *exact same* feature engineering
print("Applying feature engineering to new data...")
X_new_prepared = prepare_features(df_new_test)

# 3. Collect the IDs for the final submission
# prepare_features can drop duplicate rows but keeps the original index
# labels, so select the IDs by that index. Taking the column from the raw
# frame up front would leave IDs and predictions with different lengths
# whenever a row is dropped.
submission_ids = df_new_test.loc[X_new_prepared.index, 'Hospital_Id']

# 4. Get predictions
print("Getting predictions from the final model...")
final_predictions = final_rf_pipeline.predict(X_new_prepared)

# 5. No inverse transform is applied here: predictions are written out
# exactly as the pipeline returns them.
# NOTE(review): this presumes the model was fit on the untransformed
# Transport_Cost. If 'y' is ever log1p-transformed before fitting, add
# `final_predictions = np.expm1(final_predictions)` at this point.

# 6. Create the final submission file
submission_df = pd.DataFrame({
    'Hospital_Id': submission_ids,
    'Transport_Cost': final_predictions
})

# Display the first few predictions
print("\nFinal Predictions (in $):")
display(submission_df.head())

# Save to CSV
submission_df.to_csv('newnewRF.csv', index=False)
print("Submission file 'newnewRF.csv' created successfully.")

Loading new test data...
Applying feature engineering to new data...
Initial shape: (500, 19)
Found and swapping 206 rows with incorrect date order.
Dropped 0 duplicate rows.
Cleaning complete. Final shape: (500, 19)
Getting predictions from the final model...

Final Predictions (in $):


Unnamed: 0,Hospital_Id,Transport_Cost
0,fffe33003400,1858.061632
1,fffe3700330036003600,5443.515216
2,fffe3300390038003400,10993.321106
3,fffe310030003900,844.493571
4,fffe3700330031003200,5942.457364


Submission file 'newnewRF.csv' created successfully.
