In [None]:
# core
import os
import re
import warnings
warnings.filterwarnings('ignore')

# data + plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from scipy.stats import zscore
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
np.set_printoptions(threshold=np.inf)

# sklearn (preprocessing / pipeline / model selection / metrics)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures, PowerTransformer
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error

# classical models (if you use them elsewhere)
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor

# gradient boosting / lightgbm / xgboost
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# utilities
import joblib   # optional: save/load pipeline

from lightgbm import LGBMRegressor
from sklearn.model_selection import RandomizedSearchCV


In [None]:
# --- Plot defaults shared by every figure in the notebook ---
sns.set(style="whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)


# 1️⃣ Load the raw training data; header names are stripped of stray whitespace
print("Loading data...")
df = pd.read_csv('../data/train.csv')
df.columns = df.columns.str.strip()
display(df.head())
print(f"Initial data shape: {df.shape}")

# 2️⃣ Clean every object column: trim whitespace, then turn blank / textual-NaN
#    placeholders back into real missing values.
print("Cleaning string columns...")
blank_tokens = {'': np.nan, 'nan': np.nan, 'NaN': np.nan}
for col in df.select_dtypes(include='object').columns:
    # astype(str) renders missing cells as the text 'nan'; the replacement
    # below converts that text (and empty strings) to np.nan again.
    stripped = df[col].astype(str).str.strip()
    df[col] = stripped.replace(blank_tokens)

# 3️⃣ Collapse the spelling variants of the Yes/No flags onto "Yes"/"No"
print("Normalizing Yes/No columns...")
yes_no_cols = ['CrossBorder_Shipping', 'Urgent_Shipping', 'Installation_Service',
               'Fragile_Equipment', 'Rural_Hospital']
yes_no_map = {variant: 'Yes' for variant in ('YES', 'yes', 'Y', 'y')}
yes_no_map.update({variant: 'No' for variant in ('NO', 'no', 'N', 'n')})
for col in yes_no_cols:
    if col not in df.columns:
        continue  # tolerate datasets that lack one of the flag columns
    df[col] = df[col].replace(yes_no_map)

# 4️⃣ Parse both date columns; entries that cannot be parsed become NaT
print("Converting date columns...")
for date_col in ('Order_Placed_Date', 'Delivery_Date'):
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

# 5️⃣ Delivery_Days = whole days elapsed between order and delivery
print("Engineering Delivery_Days feature...")
delivery_lag = df['Delivery_Date'] - df['Order_Placed_Date']
df['Delivery_Days'] = pd.to_numeric(delivery_lag.dt.days, errors='coerce')

# === ADDED: Date Feature Engineering ===
print("Engineering more date features...")
order_dates = df['Order_Placed_Date']
df['Order_Month'] = order_dates.dt.month
df['Order_Day_of_Week'] = order_dates.dt.dayofweek  # Monday=0, Sunday=6
# NOTE(review): rows whose order date failed to parse have a NaN day-of-week,
# and isin() reports False for them — confirm "not weekend" is the intended
# label for missing dates.
df['Order_Is_Weekend'] = df['Order_Day_of_Week'].isin([5, 6])
# === END ADDED ===

# 6️⃣ (Original) delete initial date rows
# df = df.dropna(subset=['Order_Placed_Date', 'Delivery_Date'])

# 7️⃣ Remove rows that are exact duplicates across every column
print("Dropping duplicates...")
before = len(df)
df = df.drop_duplicates()
after = len(df)
print(f"Dropped {before - after} duplicate rows.")

# 8️⃣ Post-cleaning summary
banner = "=" * 30
print("\n" + banner)
print(" CLEANING & FEATURE ENGINEERING COMPLETE ")
print(banner)
print(f"After basic cleaning shape: {df.shape}")

# Per-column missing counts, reused for the percentage view below
missing_counts = df.isna().sum()
print("\nMissing values (raw count):")
print(missing_counts)

# === ADDED: Missing Value Percentage View ===
print("\nMissing values (percentage):")
missing_pct = (missing_counts / len(df) * 100).sort_values(ascending=False)
print(missing_pct[missing_pct > 0])  # only columns that actually have gaps
# === END ADDED ===

print("\nDataFrame head:")
display(df.head())
# print(df['Delivery_Days'])

In [None]:

# ==============================================================================
# 📊 START OF EXPLORATORY DATA ANALYSIS (EDA)
# ==============================================================================

print("\n" + "="*30)
print(" STARTING EDA ")
print("="*30)

# 🔹 Column groups used throughout the EDA
date_num_features = ['Order_Month', 'Order_Day_of_Week', 'Delivery_Days']

# The target and the date-derived columns are analysed in their own sections,
# so they are held out of the general-purpose numeric list.
held_out = {'Transport_Cost', *date_num_features}
num_cols = [
    name
    for name in df.select_dtypes(include=['float64', 'int64']).columns
    if name not in held_out
]

# Object columns are the categoricals; the boolean weekend flag is analysed
# alongside them.
cat_cols = df.select_dtypes(include='object').columns.tolist()
if 'Order_Is_Weekend' in df.columns:
    cat_cols.append('Order_Is_Weekend')

print(f"Numeric features identified: {num_cols}")
print(f"Categorical features identified: {cat_cols}")
print(f"Date-derived features identified: {date_num_features}")


# === ADDED: 1. Target Variable Analysis (Transport_Cost) ===
print("\n===== 1. TARGET VARIABLE ANALYSIS: Transport_Cost =====")
target_fig, (raw_ax, log_ax) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: the target on its original scale
sns.histplot(df['Transport_Cost'], kde=True, bins=40, ax=raw_ax)
raw_ax.set_title('Distribution of Transport_Cost (Original)')
raw_ax.set_xlabel('Transport_Cost')

# Right panel: log1p of the target (log1p so that zero costs stay finite)
log_target = np.log1p(df['Transport_Cost'])
sns.histplot(log_target, kde=True, bins=40, color='green', ax=log_ax)
log_ax.set_title('Distribution of log(Transport_Cost + 1)')
log_ax.set_xlabel('log(Transport_Cost + 1)')

target_fig.suptitle('Target Variable Distribution Analysis', fontsize=16)
plt.tight_layout(rect=[0, 0.03, 1, 0.95])  # leave head-room for the suptitle
plt.show()

print(f"Skewness of Transport_Cost: {df['Transport_Cost'].skew():.4f}")
print(f"Skewness of log(Transport_Cost + 1): {log_target.skew():.4f}")
# === END ADDED ===


print("\n===== 2. NUMERIC FEATURE ANALYSIS =====")
print("===== BASIC NUMERIC STATISTICS =====")
if num_cols:
    display(df[num_cols].describe().T)

    print("\n===== SKEWNESS =====")
    display(df[num_cols].skew())
else:
    print("No numeric columns found to describe (excluding target/dates).")

# 🔹 Histogram + boxplot for every numeric column, with Delivery_Days and the
#    target added back in for plotting only.
print("\nGenerating numeric distribution plots...")
analysis_num_cols = [*num_cols, 'Delivery_Days']
if 'Transport_Cost' not in analysis_num_cols:
    analysis_num_cols.append('Transport_Cost')

for col in analysis_num_cols:
    if col not in df.columns:
        print(f"Warning: Column '{col}' not found for plotting.")
        continue

    dist_fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 4))

    sns.histplot(df[col], kde=True, bins=30, ax=hist_ax)
    hist_ax.set_title(f'{col} distribution')

    sns.boxplot(x=df[col], ax=box_ax)
    box_ax.set_title(f'{col} boxplot')

    plt.tight_layout()
    plt.show()


print("\n===== 3. CORRELATION ANALYSIS =====")
# 🔹 Pairwise correlations between the numeric features, the target and
#    Delivery_Days, drawn as an annotated heatmap.
corr_columns = [*num_cols, 'Transport_Cost', 'Delivery_Days']
corr = df[corr_columns].corr()

plt.figure(figsize=(10,8))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Heatmap')
plt.show()


print("\n===== 4. CATEGORICAL FEATURE ANALYSIS =====")
# 🔹 Level counts for every categorical column; a countplot is drawn only when
#    the column has at most 20 distinct levels, otherwise it is deferred to the
#    high-cardinality summary below.
print("\nGenerating categorical distribution plots...")
high_cardinality_cols = []
for col in cat_cols:
    print(f"\n===== Column: {col} =====")
    print(df[col].value_counts(dropna=False))

    nunique = df[col].nunique()
    if nunique > 20:
        high_cardinality_cols.append(col)
        print(f"SKIPPING countplot for {col} (High Cardinality: {nunique} unique values)")
        continue

    plt.figure(figsize=(8,4))
    sns.countplot(y=col, data=df, order=df[col].value_counts().index)
    plt.title(f'Count of {col}')
    plt.tight_layout()
    plt.show()

# === ADDED: 4a. High-Cardinality Column Summary ===
print("\n===== 4a. HIGH-CARDINALITY CATEGORICAL SUMMARY =====")
if high_cardinality_cols:
    print(f"High-cardinality features detected: {high_cardinality_cols}")
    for col in high_cardinality_cols:
        # Count levels with the same NaN policy as the table that is printed,
        # so the "other values" figure is exactly the number of rows not shown
        # (previously NaN was listed in the table but excluded from the total).
        level_counts = df[col].value_counts(dropna=False)
        top_levels = level_counts.head(10)
        print(f"\n--- Top 10 values for: {col} ---")
        print(top_levels)
        print(f"...and {len(level_counts) - len(top_levels)} other unique values.")
else:
    print("No high-cardinality categorical features detected (threshold > 20).")
# === END ADDED ===


print("\n===== 5. BIVARIATE ANALYSIS (FEATURES vs. TARGET) =====")
# 🔹 Numeric features vs target: one scatter plot per column
print("\nGenerating numeric features vs. Transport_Cost...")
for col in num_cols + ['Delivery_Days']:
    if col in df.columns:
        plt.figure(figsize=(6,4))
        sns.scatterplot(x=df[col], y=df['Transport_Cost'])
        plt.title(f'{col} vs Transport_Cost')
        plt.tight_layout()
        plt.show()

# 🔹 Categorical features vs target (low-cardinality only).
# The cut-off is "<= 20" so it is the exact complement of the "> 20"
# high-cardinality rule used for the countplots; a column with exactly 20
# levels is therefore plotted in both places.
print("\nGenerating categorical features vs. Transport_Cost...")
plotted_vs_target = set()  # remembers columns already drawn in this section
for col in cat_cols:
    if col in df.columns and df[col].nunique() <= 20:
        plt.figure(figsize=(10,4))
        sns.boxplot(x=col, y='Transport_Cost', data=df)
        plt.title(f'{col} vs Transport_Cost')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()
        plotted_vs_target.add(col)

# === ADDED: 5a. Date-Derived Features vs. Target ===
print("\nGenerating date-derived features vs. Transport_Cost...")
date_features_to_plot = ['Order_Month', 'Order_Day_of_Week', 'Order_Is_Weekend']
day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
for col in date_features_to_plot:
    # Skip missing columns, and columns the categorical loop already plotted
    # (e.g. a weekend flag that is also listed in cat_cols).
    if col not in df.columns or col in plotted_vs_target:
        continue

    plt.figure(figsize=(10, 4))
    sns.boxplot(x=col, y='Transport_Cost', data=df)
    plt.title(f'{col} vs Transport_Cost')
    if col == 'Order_Day_of_Week':
        # Boxes are drawn at positions 0..k-1 for the sorted day codes that are
        # actually present, so the labels are derived from those codes rather
        # than assuming all seven days occur in the data.
        present_days = sorted(df[col].dropna().unique())
        plt.xticks(
            ticks=range(len(present_days)),
            labels=[day_names[int(day)] for day in present_days],
        )
    plt.tight_layout()
    plt.show()
# === END ADDED ===


print("\n===== 6. OUTLIER DETECTION =====")
# 🔹 Count, per column, the values whose |z-score| exceeds 3.
#    nan_policy='omit' makes zscore ignore missing values when computing the
#    column mean and standard deviation.
outlier_columns = [*num_cols, 'Transport_Cost', 'Delivery_Days']
try:
    z_scores = df[outlier_columns].apply(zscore, nan_policy='omit')
    outliers = z_scores.abs().gt(3).sum()
    print("\n===== NUMBER OF OUTLIERS PER COLUMN (Z-score > 3) =====")
    print(outliers[outliers > 0].sort_values(ascending=False))
except ValueError as e:
    print(f"Could not calculate Z-scores, likely due to all-NaN column. Error: {e}")


print("\n===== 7. MISSING VALUE VISUALIZATION =====")
# 🔹 Render both missingno views of the frame, one figure each.
missing_views = [
    (msno.matrix, 'matrix', 'Missing Value Matrix'),
    (msno.bar, 'bar chart', 'Missing Value Bar Chart'),
]
for draw_view, view_label, view_title in missing_views:
    print(f"\nGenerating missing value {view_label}...")
    draw_view(df)
    plt.title(view_title)
    plt.show()


print("\n" + "="*30)
print(" EDA COMPLETE ")
print("="*30)

In [None]:
def report_negative_values(frame, column, noun, hint=None):
    """Print a short summary of the negative entries of ``frame[column]``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to inspect.
    column : str
        Name of the column to check for values below zero.
    noun : str
        Word used in the "Found N negative <noun> rows." message.
    hint : str, optional
        Extra line printed when the column cannot be checked.

    Returns
    -------
    pandas.Series or None
        The negative values, or ``None`` when the column is missing or cannot
        be compared with zero.
    """
    try:
        negatives = frame.loc[frame[column] < 0, column]
    except (KeyError, TypeError) as e:
        # KeyError: column absent. TypeError: column is not numeric.
        # Anything else is unexpected and is allowed to surface.
        print(f"Error checking {column}: {e}")
        if hint:
            print(hint)
        return None

    print(f"Found {len(negatives)} negative {noun} rows.")
    if negatives.empty:
        return negatives  # nothing further to summarise

    print("\n--- Top 10 most common negative values: ---")
    print(negatives.value_counts().head(10))

    print("\n--- Stats for negative values (min, max, mean): ---")
    print(negatives.describe())
    return negatives


print("===== 1. INVESTIGATING NEGATIVE DURATION =====")
# We must use the original 'df' before any rows are dropped
neg_days = report_negative_values(
    df, 'Delivery_Days', 'duration',
    hint="Hint: Make sure 'df' is your original DataFrame and 'Delivery_Days' is created.",
)


print("\n\n===== 2. INVESTIGATING NEGATIVE COST =====")
neg_costs = report_negative_values(df, 'Transport_Cost', 'cost')

In [None]:
def report_feature_pair(frame, first, second, label):
    """Print the correlation of two columns and how much of each is unusable.

    A value counts as "missing" here when it is either NaN or exactly 0.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding both columns.
    first, second : str
        Names of the two columns to compare.
    label : str
        Human-readable pair name used in the printed messages
        (e.g. ``"Weight vs. Value"``).
    """
    try:
        correlation = frame[first].corr(frame[second])
        print(f"Correlation ({label}): {correlation:.4f}")

        print(f"\n--- Missing Data Stats ---")
        # Pad "name:" to a common width so the percentages line up
        label_width = max(len(first), len(second)) + 1
        for name in (first, second):
            zeros = (frame[name] == 0).sum()
            nans = frame[name].isna().sum()
            missing_pct = (zeros + nans) / len(frame) * 100
            print(f"{name + ':':<{label_width}} {missing_pct:.2f}% missing (as 0 or NaN)")
    except (KeyError, TypeError, ValueError) as e:
        # Missing or non-numeric columns are reported; other errors surface.
        print(f"Error checking {label}: {e}")


print("===== 3. INVESTIGATING WEIGHT vs. VALUE =====")
report_feature_pair(df, 'Equipment_Weight', 'Equipment_Value', 'Weight vs. Value')


print("\n\n===== 4. INVESTIGATING HEIGHT vs. WIDTH =====")
report_feature_pair(df, 'Equipment_Height', 'Equipment_Width', 'Height vs. Width')

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

print("="*70)
print(" ROBUST PREPROCESSING SCRIPT - MEDICAL EQUIPMENT TRANSPORT")
print("="*70)

# ==============================================================================
# PART 1: PRE-SPLIT DATA CLEANING & FEATURE ENGINEERING
# (Assuming 'df' is loaded from cell 7f4617a9, with 'Delivery_Days' created)
#
# Every step below is written so that re-running this cell on the same 'df'
# produces the same result (no transform is applied twice).
# ==============================================================================

print("\n[1/10] Repairing impossible negative values (Problem 1)...")
# Based on EDA, negative values are typos/swapped dates, not missing data.
# We will use .abs() to repair them and preserve all rows.
neg_days = (df['Delivery_Days'] < 0).sum()
if neg_days > 0:
    df['Delivery_Days'] = df['Delivery_Days'].abs()
    print(f"   ✓ Repaired {neg_days} rows with negative 'Delivery_Days' using .abs()")
else:
    print("   ✓ No negative 'Delivery_Days' found.")

# Note: 'Transport_Cost' is repaired in Step 4

# ==============================================================================
print("\n[2/10] Feature Engineering - Volume (Problem 4)...")
# Combine correlated Height & Width into a single 'Volume' feature.
# It is recomputed from the untouched Height/Width columns on every run.
df['Equipment_Volume'] = df['Equipment_Height'] * df['Equipment_Width']
print(f"   ✓ Created 'Equipment_Volume' from Height * Width")

# ==============================================================================
print("\n[3/10] Log-transforming heavily skewed features...")
# 'Equipment_Value' is overwritten with its log, so the untransformed values are
# parked in 'Equipment_Value_Raw' the first time this cell runs and the log is
# always taken from that copy. Without this, each re-run of the cell would
# apply log1p to an already-logged column.
if 'Equipment_Value_Raw' not in df.columns:
    df['Equipment_Value_Raw'] = df['Equipment_Value']
df['Equipment_Value'] = np.log1p(df['Equipment_Value_Raw'])
df['Equipment_Volume'] = np.log1p(df['Equipment_Volume'])
print(f"   ✓ Log-transformed 'Equipment_Value'")
print(f"   ✓ Log-transformed 'Equipment_Volume'")

# ==============================================================================
print("\n[4/10] Defining target variable (y) (Problem 2)...")
# Repair negative 'Transport_Cost' typos FIRST, then log-transform.
neg_costs = (df['Transport_Cost'] < 0).sum()
if neg_costs > 0:
    print(f"   ✓ Repairing {neg_costs} rows with negative 'Transport_Cost' using .abs()")

# Apply .abs() to fix typos, then np.log1p() to normalize the target.
# The raw column in 'df' is left as-is; only 'y' carries the repaired values.
y = np.log1p(df['Transport_Cost'].abs())

print(f"   ✓ Target (y) created using log1p(abs(Transport_Cost))")
print(f"   ✓ Target shape: {y.shape}")

# ==============================================================================
print("\n[5/10] Selecting features (X) for modeling (Problem 3 & 4)...")
# Define the final feature set, dropping redundant/replaced columns.
drop_cols = [
    # Target
    'Transport_Cost',

    # Replaced by Equipment_Volume
    'Equipment_Height',
    'Equipment_Width',

    # Redundant & less reliable (Problem 3)
    'Equipment_Weight',

    # High-cardinality IDs / Unused
    'Hospital_Id',
    'Supplier_Name',
    'Hospital_Location',

    # Replaced by engineered date features
    'Order_Placed_Date',
    'Delivery_Date'
]

# 'Equipment_Value_Raw' is bookkeeping for step 3 and must never reach the
# model, so it is dropped here without being added to 'drop_cols'.
X = df.drop(columns=drop_cols + ['Equipment_Value_Raw'])
print(f"   ✓ Dropped {len(drop_cols)} columns (incl. Weight, Height, Width)")
print(f"   ✓ Remaining features: {X.shape[1]}")

# ==============================================================================
print("\n[6/10] Train-test split (80/20)...")
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts
print(f"   ✓ Training set: {X_train.shape}")
print(f"   ✓ Test set:     {X_test.shape}")

# ==============================================================================
print("\n[7/10] Baseline performance (predicting mean)...")
# The baseline predicts the training-set mean of the log target for every row.
train_mean = y_train.mean()
y_test_pred_baseline = np.full_like(y_test, train_mean)

# Baseline RMSE in log-space
baseline_mse_log = mean_squared_error(y_test, y_test_pred_baseline)
baseline_rmse_log = np.sqrt(baseline_mse_log)
print(f"   ✓ Baseline RMSE (log-space):    {baseline_rmse_log:.4f}")

# Baseline RMSE in original scale: undo log1p on both sides before scoring
y_test_actual_orig = np.expm1(y_test)
y_test_baseline_orig = np.expm1(y_test_pred_baseline)
baseline_mse_orig = mean_squared_error(y_test_actual_orig, y_test_baseline_orig)
baseline_rmse_orig = np.sqrt(baseline_mse_orig)
print(f"   ✓ Baseline RMSE (original-scale): ${baseline_rmse_orig:,.2f}")

# ==============================================================================
# PART 2: POST-SPLIT PIPELINES (Prevents Data Leakage)
# ==============================================================================
print("\n" + "="*70)
print(" BUILDING PREPROCESSING PIPELINES")
print("="*70)

print("\n[8/10] Configuring feature transformers (Problem 5)...")

# --- Numeric branch: fill gaps with the median, then standardise ---
# 'Equipment_Weight' is not listed; 'Equipment_Volume' takes the place of
# Height/Width.
numeric_features = [
    'Supplier_Reliability',
    'Equipment_Value',      # Log-transformed
    'Base_Transport_Fee',
    'Delivery_Days',        # Repaired
    'Equipment_Volume'      # Engineered & Log-transformed
]
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
print(f"   ✓ Numeric features ({len(numeric_features)}): median imputation + scaling")

# --- Categorical branch: fill gaps with the mode, then one-hot encode ---
# The date-derived columns are encoded as categories as well.
categorical_features = [
    'Equipment_Type',
    'CrossBorder_Shipping',
    'Urgent_Shipping',
    'Installation_Service',
    'Transport_Method',
    'Fragile_Equipment',
    'Hospital_Info',
    'Rural_Hospital',
    'Order_Month',
    'Order_Day_of_Week',
    'Order_Is_Weekend'
]
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # handle_unknown='ignore': levels unseen during fit encode as all zeros
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])
print(f"   ✓ Categorical features ({len(categorical_features)}): mode imputation + one-hot")

# ==============================================================================
print("\n[9/10] Assembling ColumnTransformer...")
# Routes each column group to its branch; unlisted columns are discarded.
preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='drop',
)
print(f"   ✓ ColumnTransformer configured")

# ==============================================================================
print("\n[10/10] Applying preprocessing (fitting on train, transforming both)...")

# Statistics (medians, modes, scaling, category levels) are learned from the
# training split only and then applied unchanged to the test split.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

print(f"   ✓ Training set processed: {X_train_processed.shape}")
print(f"   ✓ Test set processed:     {X_test_processed.shape}")

# ==============================================================================
# Closing summary of the preprocessing decisions made above.
print("\n" + "="*70)
print(" PREPROCESSING COMPLETE! ✓")
print("="*70)
print("\nKey Decisions Implemented:")
print("   1. ✓ REPAIRED all negative costs/durations using .abs()")
# Report the actual row count instead of a hard-coded figure, which would be
# wrong whenever the cleaned frame does not hold exactly 5,000 rows.
print(f"   2. ✓ KEPT all {len(X):,} training rows (no data loss)")
print("   3. ✓ DROPPED 'Equipment_Weight' (redundant with 'Value')")
print("   4. ✓ COMBINED 'Height'/'Width' into 'Equipment_Volume'")
print("   5. ✓ USED robust 'median'/'mode' imputation")
print("   6. ✓ LOG-TRANSFORMED target and skewed features")
print("="*70)

In [None]:

# ----- 1) Shuffled 5-fold splitter with a fixed seed -----
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# ----- 2) Preprocessing + ordinary least squares in one estimator -----
lr_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LinearRegression()),
])

# ----- 3) Cross-validate on the raw training features / log target -----
cv_scores = cross_val_score(
    estimator=lr_pipeline,
    X=X_train,
    y=y_train,
    scoring='neg_root_mean_squared_error',
    cv=kf,
    n_jobs=1,
)

# The scorer returns negated RMSE values; take magnitudes before averaging
avg_rmse = np.abs(cv_scores).mean()
print(f"Linear Regression 5-Fold Avg. RMSE (log-space): {avg_rmse:.4f}")

# ----- 4) Fit on the whole training split -----
lr_pipeline.fit(X_train, y_train)

# ----- 5) Score on the held-out test split -----
y_test_pred_log = lr_pipeline.predict(X_test)
y_test_pred_orig = np.expm1(y_test_pred_log)  # undo log1p

log_residuals = y_test - y_test_pred_log
orig_residuals = np.expm1(y_test) - y_test_pred_orig
rmse_test_log = np.sqrt(np.mean(log_residuals**2))
rmse_test_orig = np.sqrt(np.mean(orig_residuals**2))

print(f"Test RMSE (log-space)      : {rmse_test_log:.4f}")  
print(f"Test RMSE (original scale) : {rmse_test_orig:.2f}")

In [None]:

# ----- 1) Set up 5-Fold -----
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# ----- 2) Create the pipeline -----
poly_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),         # your ColumnTransformer
    ('poly', PolynomialFeatures()),         # degree is tuned by the grid below
    ('model', LinearRegression())
])

# ----- 3) Set up GridSearch for hyperparameter tuning -----
param_grid = {
    'poly__degree': [2,3]   # you can expand to 5 if dataset is small
}

grid_search = GridSearchCV(
    poly_pipeline,
    param_grid,
    scoring='neg_root_mean_squared_error',
    cv=kf,
    n_jobs=1
)

# ----- 4) Run GridSearch -----
grid_search.fit(X_train, y_train)

# Best degree (best_score_ is a negated RMSE, hence the sign flip)
best_degree = grid_search.best_params_['poly__degree']
best_rmse = -grid_search.best_score_
print(f"Best polynomial degree: {best_degree}")
print(f"Best CV RMSE (log-space): {best_rmse:.4f}")

# ----- 5) Final model -----
# GridSearchCV is used with its default refit=True, so best_estimator_ has
# already been refitted on the full training set; fitting it a second time
# would only repeat that work.
final_poly_model = grid_search.best_estimator_
print("Polynomial Regression final model trained on full training set.")

# ----- 6) Predict on test set -----
y_test_pred_log = final_poly_model.predict(X_test)
y_test_pred_orig = np.expm1(y_test_pred_log)  # back to original scale

rmse_test_log = np.sqrt(np.mean((y_test - y_test_pred_log)**2))
rmse_test_orig = np.sqrt(np.mean((np.expm1(y_test) - y_test_pred_orig)**2))

print(f"Test RMSE (log-space)      : {rmse_test_log:.4f}")
print(f"Test RMSE (original scale) : {rmse_test_orig:.2f}")

In [None]:

# ----- 1) Create the pipeline -----
ridge_poly_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),  # your ColumnTransformer
    ('poly', PolynomialFeatures()),  # polynomial expansion
    ('ridge', Ridge())               # ridge regression
])

# ----- 2) Set hyperparameter grid -----
param_grid = {
    'poly__degree': [2,3],                   # polynomial degrees to compare
    'ridge__alpha': [ 0.01,0.1,1,10,100]     # regularization strengths
}

# ----- 3) Grid Search with 5-Fold CV -----
# Use the same shuffled, seeded splitter as the other model cells. A bare
# cv=5 would build unshuffled folds, so the CV RMSE reported here would be
# measured on different splits and not be comparable with the other models.
# The splitter is defined in this cell so the cell does not depend on another
# cell having run first.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    ridge_poly_pipeline,
    param_grid,
    scoring='neg_root_mean_squared_error',
    cv=kf,
    n_jobs=1
)

# ----- 4) Fit on training data -----
grid_search.fit(X_train, y_train)

# ----- 5) Best hyperparameters -----
print("Best hyperparameters:", grid_search.best_params_)
print("Best CV RMSE (log-space):", -grid_search.best_score_)

# ----- 6) Predict on test set -----
# grid_search.predict delegates to the refitted best estimator
y_test_pred_log = grid_search.predict(X_test)
y_test_pred_orig = np.expm1(y_test_pred_log)

rmse_test_log = np.sqrt(np.mean((y_test - y_test_pred_log)**2))
rmse_test_orig = np.sqrt(np.mean((np.expm1(y_test) - y_test_pred_orig)**2))

print(f"Test RMSE (log-space)      : {rmse_test_log:.4f}")
print(f"Test RMSE (original scale) : {rmse_test_orig:.2f}")

In [None]:

# ----- 1) Create the pipeline -----
lasso_poly_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),  # your ColumnTransformer
    ('poly', PolynomialFeatures()),  # polynomial expansion
    ('lasso', Lasso(max_iter=10000)) # Lasso regression (raised iteration cap)
])

# ----- 2) Set hyperparameter grid -----
param_grid = {
    'poly__degree': [2,3],                 # polynomial degrees to compare
    'lasso__alpha': [0.001, 0.01, 0.1, 1]  # regularization strengths
}

# ----- 3) Grid Search with 5-Fold CV -----
# Use the same shuffled, seeded splitter as the other model cells. A bare
# cv=5 would build unshuffled folds, so the CV RMSE reported here would be
# measured on different splits and not be comparable with the other models.
# The splitter is defined in this cell so the cell does not depend on another
# cell having run first.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    lasso_poly_pipeline,
    param_grid,
    scoring='neg_root_mean_squared_error',
    cv=kf,
    n_jobs=-1
)

# ----- 4) Fit on training data -----
grid_search.fit(X_train, y_train)

# ----- 5) Best hyperparameters -----
print("Best hyperparameters:", grid_search.best_params_)
print("Best CV RMSE (log-space):", -grid_search.best_score_)

# ----- 6) Predict on test set -----
# grid_search.predict delegates to the refitted best estimator
y_test_pred_log = grid_search.predict(X_test)
y_test_pred_orig = np.expm1(y_test_pred_log)

rmse_test_log = np.sqrt(np.mean((y_test - y_test_pred_log)**2))
rmse_test_orig = np.sqrt(np.mean((np.expm1(y_test) - y_test_pred_orig)**2))

print(f"Test RMSE (log-space)      : {rmse_test_log:.4f}")
print(f"Test RMSE (original scale) : {rmse_test_orig:.2f}")

In [None]:

# ----- 1) Shuffled 5-fold splitter with a fixed seed -----
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# ----- 2) Pipeline: preprocessor -> polynomial expansion -> elastic net -----
enet_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(include_bias=False)),
    ('enet', ElasticNet(max_iter=20000, random_state=42)),
])

# ----- 3) Search space -----
param_grid = dict(
    poly__degree=[2, 3],              # polynomial degrees to compare
    enet__alpha=[0.01, 0.1, 1],       # overall regularization strength
    enet__l1_ratio=[0.01, 0.1, 0.2],  # share of the penalty that is L1
)

# ----- 4) Exhaustive search, run in a single process -----
grid_search = GridSearchCV(
    estimator=enet_pipeline,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=kf,
    n_jobs=1,
    verbose=2,
)

# ----- 5) Run the search -----
print("Starting GridSearchCV for ElasticNet + PolynomialFeatures ...")
grid_search.fit(X_train, y_train)

# ----- 6) Winning configuration (best_score_ is a negated RMSE) -----
best_params = grid_search.best_params_
best_cv_rmse = -grid_search.best_score_
print(f"\nBest hyperparameters: {best_params}")
print(f"Best CV RMSE (log-space): {best_cv_rmse:.4f}")

# ----- 7) Best pipeline found by the search -----
final_enet = grid_search.best_estimator_

# ----- 8) Held-out evaluation, in log space and on the original scale -----
y_test_pred_log = final_enet.predict(X_test)
y_test_pred_orig = np.expm1(y_test_pred_log)

mse_test_log = mean_squared_error(y_test, y_test_pred_log)
mse_test_orig = mean_squared_error(np.expm1(y_test), y_test_pred_orig)
rmse_test_log = np.sqrt(mse_test_log)
rmse_test_orig = np.sqrt(mse_test_orig)

print(f"\nTest RMSE (log-space)      : {rmse_test_log:.4f}")
print(f"Test RMSE (original scale) : {rmse_test_orig:.2f}")

In [None]:

# ----- 1) Shuffled 5-fold splitter with a fixed seed -----
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# ----- 2) Pipeline: preprocessor -> XGBoost regressor -----
xgb_regressor = XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
    n_jobs=-1,
    tree_method='hist',
)
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('xgb', xgb_regressor),
])

# ----- 3) Search space -----
param_grid = dict(
    xgb__n_estimators=[100, 300],
    xgb__max_depth=[3, 6],
    xgb__learning_rate=[0.01, 0.1],
    xgb__subsample=[0.8, 1.0],
)

# ----- 4) Exhaustive search; the search itself runs in a single process -----
grid_search = GridSearchCV(
    xgb_pipeline,
    param_grid,
    scoring='neg_root_mean_squared_error',
    cv=kf,
    n_jobs=1,
    verbose=2,
)

# ----- 5) Run the search -----
print("Starting GridSearchCV for XGBoost...")
grid_search.fit(X_train, y_train)

# ----- 6) Winning configuration (best_score_ is a negated RMSE) -----
best_params = grid_search.best_params_
best_cv_rmse = -grid_search.best_score_
print("\nBest hyperparameters:", best_params)
print(f"Best CV RMSE (log-space): {best_cv_rmse:.4f}")

# ----- 7) Best pipeline found by the search -----
final_xgb = grid_search.best_estimator_

# ----- 8) Held-out evaluation, in log space and on the original scale -----
y_test_pred_log = final_xgb.predict(X_test)
y_test_pred_orig = np.expm1(y_test_pred_log)

mse_test_log = mean_squared_error(y_test, y_test_pred_log)
mse_test_orig = mean_squared_error(np.expm1(y_test), y_test_pred_orig)
rmse_test_log = np.sqrt(mse_test_log)
rmse_test_orig = np.sqrt(mse_test_orig)

print(f"\nTest RMSE (log-space)      : {rmse_test_log:.4f}")
print(f"Test RMSE (original scale) : {rmse_test_orig:.2f}")

# ----- 9) (Optional) Feature importances mapped to feature names -----
pre = final_xgb.named_steps['preprocessor']
xgb_model = final_xgb.named_steps['xgb']

# xgboost stores feature importances by index (0..n-1)
importances = xgb_model.feature_importances_

# Ask the fitted ColumnTransformer for its output column names instead of
# rebuilding them by hand, so the names always follow the columns it actually
# emitted. The transformer prefixes each name with '<branch>__' (e.g. 'num__',
# 'cat__'); the prefix is stripped for display.
feature_names = np.array(
    [name.split('__', 1)[-1] for name in pre.get_feature_names_out()]
)

# Defensive check: only build the table when both sides line up
if len(importances) == len(feature_names):
    fi_df = pd.DataFrame({
        'feature': feature_names,
        'importance': importances
    }).sort_values('importance', ascending=False)
    print("\nTop 20 XGBoost feature importances:")
    print(fi_df.head(20).to_string(index=False))
else:
    print("\nFeature importance length does not match derived feature name length. Skipping feature-name mapping.")

In [None]:
from sklearn.model_selection import GridSearchCV, KFold
from xgboost import XGBRegressor

print("🚀 Starting GridSearchCV for XGBoost...")

# --- 1. Shuffled 5-fold splitter with a fixed seed ---
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# --- 2. Pipeline: the shared `preprocessor` feeding an XGBoost regressor ---
xgb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('xgb', XGBRegressor(objective='reg:squarederror', random_state=42, n_jobs=-1)),
])

# --- 3. Search space: 3 x 3 x 2 = 18 candidates ---
param_grid = dict(
    xgb__n_estimators=[100, 300, 500],
    xgb__max_depth=[3, 5, 7],
    xgb__learning_rate=[0.01, 0.1],
)

# --- 4. Exhaustive search scored by negated RMSE, parallelised over all cores ---
grid_search = GridSearchCV(
    xgb_pipeline,
    param_grid,
    scoring='neg_root_mean_squared_error',
    cv=kf,
    n_jobs=-1,
    verbose=2,
)

# --- 5. Fit on the training split only; X_test stays untouched until step 7 ---
grid_search.fit(X_train, y_train)

# --- 6. Report the winner (score is negated, so flip the sign back) ---
print("\n✅ GridSearch complete!")
best_params = grid_search.best_params_
best_cv_rmse = -grid_search.best_score_
print(f"   Best hyperparameters: {best_params}")
print(f"   Best CV RMSE (log-space): {best_cv_rmse:.4f}")

# --- 7. Hold-out evaluation in log space and after expm1 ---
final_xgb = grid_search.best_estimator_
y_test_pred_log = final_xgb.predict(X_test)
y_test_pred_orig = np.expm1(y_test_pred_log)

rmse_test_log = np.sqrt(
    mean_squared_error(y_true=y_test, y_pred=y_test_pred_log)
)
rmse_test_orig = np.sqrt(
    mean_squared_error(y_true=np.expm1(y_test), y_pred=y_test_pred_orig)
)

print(f"\n   Test RMSE (log-space)      : {rmse_test_log:.4f}")
print(f"   Test RMSE (original scale) : {rmse_test_orig:.2f}")

In [None]:
from xgboost import XGBRegressor

print("\nTraining final XGBoost model on all data...")
print("--- THIS IS YOUR NEW BEST PERFORMING MODEL ---")

# === 1. Columns routed to each branch of the preprocessor ===
numeric_features = [
    'Supplier_Reliability',
    'Equipment_Value',
    'Base_Transport_Fee',
    'Delivery_Days',
    'Equipment_Volume',
]
categorical_features = [
    'Equipment_Type',
    'CrossBorder_Shipping',
    'Urgent_Shipping',
    'Installation_Service',
    'Transport_Method',
    'Fragile_Equipment',
    'Hospital_Info',
    'Rural_Hospital',
    'Order_Month',
    'Order_Day_of_Week',
    'Order_Is_Weekend',
]

# === 2. Per-branch transformers (fresh, unfitted objects) ===
# Numeric branch: median imputation, then standardisation.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: mode imputation, then dense one-hot encoding that
# ignores categories not seen during fit.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# === 3. Route columns to branches; anything not listed is dropped ===
final_preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='drop',
)

# === 4. Final pipeline with hard-coded hyperparameters ===
# The three tuned values below are typed in by hand; the message echoes them.
print("   ✓ Using best params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 500}")

final_xgb_pipeline = Pipeline([
    ('preprocessor', final_preprocessor),
    ('xgb', XGBRegressor(
        objective='reg:squarederror',
        random_state=42,
        n_jobs=-1,
        learning_rate=0.1,
        max_depth=3,
        n_estimators=500,
    )),
])

# === 5. Fit preprocessor and model on the full (X, y) ===
print("X shape:", X.shape)
print("y shape:", y.shape)

final_xgb_pipeline.fit(X, y)

print("\n✅ Final (BEST) XGBoost model trained on entire dataset.")
print("You can now use 'final_xgb_pipeline' for predictions.")
print("Remember: predictions will be in log1p space; use np.expm1() to convert if needed.")

In [None]:

# Shuffled 5-fold splitter with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Shared `preprocessor` followed by a LightGBM regressor.
lgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('lgb', LGBMRegressor(random_state=42, n_jobs=-1)),
])

# 2 x 2 x 2 x 2 = 16 candidates.
param_grid = dict(
    lgb__n_estimators=[100, 300],
    lgb__max_depth=[4, 8],
    lgb__learning_rate=[0.01, 0.1],
    lgb__num_leaves=[31, 63],
)

# The search itself runs serially (n_jobs=1); the regressor keeps n_jobs=-1.
grid = GridSearchCV(
    lgb_pipeline,
    param_grid,
    scoring='neg_root_mean_squared_error',
    cv=kf,
    n_jobs=1,
    verbose=2,
)

print("Starting GridSearchCV for LightGBM...")
grid.fit(X_train, y_train)

print("Best params:", grid.best_params_)
# Scores are negated RMSE, so flip the sign for reporting.
best_cv_rmse = -grid.best_score_
print(f"Best CV RMSE (log-space): {best_cv_rmse:.4f}")

# The best estimator is the whole pipeline (preprocessor + model).
final_lgb = grid.best_estimator_

# Hold-out evaluation in log space and after expm1.
y_test_pred_log = final_lgb.predict(X_test)
y_test_pred_orig = np.expm1(y_test_pred_log)

rmse_test_log = np.sqrt(
    mean_squared_error(y_true=y_test, y_pred=y_test_pred_log)
)
rmse_test_orig = np.sqrt(
    mean_squared_error(y_true=np.expm1(y_test), y_pred=y_test_pred_orig)
)

print(f"Test RMSE (log-space): {rmse_test_log:.4f}")
print(f"Test RMSE (original scale): {rmse_test_orig:.2f}")

# Optional: label importances with names rebuilt from the fitted preprocessor
# (numeric column list + one-hot output names).
pre = final_lgb['preprocessor']
ohe = pre.named_transformers_['cat']['onehot']
num_names = numeric_features
cat_names = list(ohe.get_feature_names_out(categorical_features))
feature_names = np.concatenate([num_names, cat_names])

lgb_model = final_lgb['lgb']
importances = lgb_model.feature_importances_

# Only tabulate when names and importances line up one-to-one.
if len(importances) != len(feature_names):
    print("Warning: feature importance length != feature name length. Skipping mapping.")
else:
    fi_df = pd.DataFrame(
        {'feature': feature_names, 'importance': importances}
    ).sort_values('importance', ascending=False)
    print(fi_df.head(20).to_string(index=False))

In [None]:
# ----- 1) Pipeline: shared `preprocessor` followed by a random forest -----
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('rf', RandomForestRegressor(random_state=42, n_jobs=-1))
])

# ----- 2) Candidate values (every entry is a finite list) -----
param_dist = {
    'rf__n_estimators': [100, 300, 500, 700],
    'rf__max_depth': [None, 10, 20, 30],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4],
    'rf__max_features': ['sqrt', 'log2', 0.5, 1.0]
}

# ----- 3) 5-Fold CV -----
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# ----- 4) Randomized Search -----
# Because all entries are lists, the space is a finite grid whose size is the
# product of the list lengths (4*4*3*3*4 = 576). The requested budget of 10000
# is larger than that; scikit-learn then samples without replacement and caps
# the number of iterations at the grid size, emitting only a warning -- which
# this notebook suppresses via warnings.filterwarnings('ignore'). Clamp
# explicitly and report it so the real cost (every candidate x 5 folds) is
# visible. Lower `rf_search_budget` for a genuinely randomized subset.
rf_search_budget = 10000
n_candidates = int(np.prod([len(values) for values in param_dist.values()]))
n_iter = min(rf_search_budget, n_candidates)
print(f"Search space: {n_candidates} candidates; evaluating {n_iter} "
      f"({n_iter * kf.get_n_splits()} fits).")

random_search = RandomizedSearchCV(
    rf_pipeline,
    param_distributions=param_dist,
    n_iter=n_iter,
    scoring='neg_root_mean_squared_error',
    cv=kf,
    n_jobs=-1,
    verbose=1,
    random_state=42
)

# ----- 5) Train on the training split only -----
print("🚀 Starting RandomizedSearchCV for Random Forest...")
random_search.fit(X_train, y_train)
print("✅ RandomizedSearchCV complete.")

# ----- 6) Evaluate the refitted best pipeline on the hold-out split -----
best_model = random_search.best_estimator_
print("🔹 Best model selected. Predicting on test set...")
y_pred_log = best_model.predict(X_test)
y_pred_orig = np.expm1(y_pred_log)

rmse_log = np.sqrt(mean_squared_error(y_test, y_pred_log))
rmse_orig = np.sqrt(mean_squared_error(np.expm1(y_test), y_pred_orig))

# ----- 7) Final Results (CV score is negated RMSE, so flip the sign) -----
print("\n===== FINAL RESULTS =====")
print("✅ Best Parameters:", random_search.best_params_)
print(f"✅ CV RMSE (log-space): {-random_search.best_score_:.4f}")
print(f"✅ Test RMSE (log-space): {rmse_log:.4f}")
print(f"✅ Test RMSE (original scale): {rmse_orig:.2f}")

In [None]:
print("\n" + "="*30)
print(" TRAINING FINAL MODEL ON ALL DATA ")
print("="*30)

print("You've found the best parameters. Now, we'll train a new model using")
print("these parameters on the *entire* dataset (X and y) to create")
print("the final, production-ready model.")

# --- 1. Fresh, unfitted preprocessor ---
# Everything is rebuilt from scratch so the imputers, scaler and encoder are
# fitted on the full X rather than reusing state fitted on the training split.
numeric_features = [
    'Supplier_Reliability',
    'Equipment_Value',
    'Base_Transport_Fee',
    'Delivery_Days',
    'Equipment_Volume',
]
categorical_features = [
    'Equipment_Type',
    'CrossBorder_Shipping',
    'Urgent_Shipping',
    'Installation_Service',
    'Transport_Method',
    'Fragile_Equipment',
    'Hospital_Info',
    'Rural_Hospital',
    'Order_Month',
    'Order_Day_of_Week',
    'Order_Is_Weekend',
]

# Numeric branch: median imputation, then standardisation.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: mode imputation, then dense one-hot encoding.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Columns not listed in either branch are dropped.
final_preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='drop',
)

# --- 2. Best parameters recorded by the `grid_search` object currently in memory ---
best_params = grid_search.best_params_
print(f"\nUsing best parameters: {best_params}")

# --- 3. Unfitted pipeline with those parameters applied via set_params ---
final_model_pipeline = Pipeline(steps=[
    ('preprocessor', final_preprocessor),
    ('xgb', XGBRegressor(
        objective='reg:squarederror',
        random_state=42,
        n_jobs=-1,
        tree_method='hist',
    )),
]).set_params(**best_params)

# --- 4. Fit preprocessor and model together on ALL of (X, y) ---
print("Fitting final model on the entire (X, y) dataset...")
final_model_pipeline.fit(X, y)

print("\nTraining complete!")
print("The 'final_model_pipeline' object is now your fully-trained model,")
print("ready to be saved and used for predictions.")

# --- 5. (Optional) Persist the fitted pipeline for later use ---
# e.g. joblib.dump(final_model_pipeline, 'final_xgb_model.pkl')

In [None]:
print("\nTraining final Random Forest model on all data...")

# === 1. Columns routed to each branch of the preprocessor ===
numeric_features = [
    'Supplier_Reliability',
    'Equipment_Value',
    'Base_Transport_Fee',
    'Delivery_Days',
    'Equipment_Volume',
]
categorical_features = [
    'Equipment_Type',
    'CrossBorder_Shipping',
    'Urgent_Shipping',
    'Installation_Service',
    'Transport_Method',
    'Fragile_Equipment',
    'Hospital_Info',
    'Rural_Hospital',
    'Order_Month',
    'Order_Day_of_Week',
    'Order_Is_Weekend',
]

# === 2. Per-branch transformers (fresh, unfitted objects) ===
# Numeric branch: median imputation, then standardisation.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: mode imputation, then dense one-hot encoding.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# === 3. Route columns to branches; anything not listed is dropped ===
final_preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='drop',
)

# === 4. Final pipeline with hard-coded forest hyperparameters ===
final_rf_pipeline = Pipeline([
    ('preprocessor', final_preprocessor),
    ('rf', RandomForestRegressor(
        n_estimators=700,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        max_features=1.0,
        random_state=42,
        n_jobs=-1,
    )),
])

# === 5. Fit preprocessor and model on the full (X, y) ===
print("X shape:", X.shape)
print("y shape:", y.shape)

final_rf_pipeline.fit(X, y)

print("\n✅ Final Random Forest model trained on entire dataset.")
print("You can now use final_rf_pipeline.predict(new_X) for predictions.")
print("Remember: predictions will be in log1p space; use np.expm1() to convert if needed.")

In [None]:
print("\nTraining final Polynomial Regression (Degree 2) model on all data...")

# === 1. Columns routed to each branch of the preprocessor ===
numeric_features = [
    'Supplier_Reliability',
    'Equipment_Value',
    'Base_Transport_Fee',
    'Delivery_Days',
    'Equipment_Volume',
]
categorical_features = [
    'Equipment_Type',
    'CrossBorder_Shipping',
    'Urgent_Shipping',
    'Installation_Service',
    'Transport_Method',
    'Fragile_Equipment',
    'Hospital_Info',
    'Rural_Hospital',
    'Order_Month',
    'Order_Day_of_Week',
    'Order_Is_Weekend',
]

# === 2. Per-branch transformers (fresh, unfitted objects) ===
# Numeric branch: median imputation, then standardisation.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: mode imputation, then dense one-hot encoding.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# === 3. Route columns to branches; anything not listed is dropped ===
final_preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='drop',
)

# === 4. Preprocess -> degree-2 polynomial expansion -> ordinary least squares ===
# The expansion is applied to the whole preprocessed matrix (scaled numerics
# and one-hot columns alike); no bias column is added by PolynomialFeatures.
final_poly_pipeline = Pipeline([
    ('preprocessor', final_preprocessor),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('model', LinearRegression(n_jobs=-1)),
])

# === 5. Fit every step on the full (X, y) ===
print("X shape:", X.shape)
print("y shape:", y.shape)

final_poly_pipeline.fit(X, y)

print("\n✅ Final Polynomial Regression (Degree 2) model trained on entire dataset.")
print("You can now use final_poly_pipeline.predict(new_X) for predictions.")
print("Remember: predictions will be in log1p space; use np.expm1() to convert if needed.")

In [None]:
print("\nTraining final Ridge(polynomial) model on all data...")

# === 1. Feature groups ===
numeric_features = [
    'Supplier_Reliability', 'Equipment_Value', 'Base_Transport_Fee',
    'Delivery_Days', 'Equipment_Volume'
]
categorical_features = [
    'Equipment_Type', 'CrossBorder_Shipping', 'Urgent_Shipping',
    'Installation_Service', 'Transport_Method', 'Fragile_Equipment',
    'Hospital_Info', 'Rural_Hospital', 'Order_Month',
    'Order_Day_of_Week', 'Order_Is_Weekend'
]

# === 2. Define transformers (fresh, unfitted objects) ===
# Numeric branch: median imputation, then standardisation.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical branch: mode imputation, then dense one-hot encoding.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# === 3. Combine them; columns not listed are dropped ===
final_preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop'
)

# === 4. Build final pipeline ===
# Single source of truth for the hyperparameters, reused by the status message
# below so the message can never disagree with the fitted model.
# NOTE(review): this cell previously fitted degree 2 while its comment and its
# success message both claimed degree 3. The degree actually fitted (2) is
# kept; confirm against the grid-search output and change `poly_degree` here
# if 3 was really the winner.
poly_degree = 2
ridge_alpha = 100

final_ridge_pipeline = Pipeline(steps=[
    ('preprocessor', final_preprocessor),
    ('poly', PolynomialFeatures(degree=poly_degree, include_bias=False)),
    ('ridge', Ridge(alpha=ridge_alpha))
])

# === 5. Fit on full dataset ===
print("X shape:", X.shape)
print("y shape:", y.shape)

final_ridge_pipeline.fit(X, y)

print(f"\n✅ Final Ridge (poly={poly_degree}, alpha={ridge_alpha}) model trained on entire dataset.")
print("You can now use final_ridge_pipeline.predict(new_X) for predictions.")
print("Remember: predictions will be in log1p space; use np.expm1() to convert.")

In [None]:
def prepare_features(df_raw):
    """
    Re-apply the notebook's manual cleaning and feature engineering to a raw frame.

    Works on a copy of ``df_raw`` and returns it with cleaned string columns,
    parsed dates and the engineered columns ``Delivery_Days``, ``Order_Month``,
    ``Order_Day_of_Week``, ``Order_Is_Weekend`` and ``Equipment_Volume``.
    ``Equipment_Value`` and ``Equipment_Volume`` are returned log1p-transformed.
    No columns are removed here. Prints how many negative ``Delivery_Days``
    were flipped to positive.
    """
    prepared = df_raw.copy()

    # 1. Header names: trim surrounding whitespace.
    prepared.columns = prepared.columns.str.strip()

    # 2. Object columns: trim values and turn blank / 'nan' / 'NaN' text into NaN.
    missing_tokens = {'': np.nan, 'nan': np.nan, 'NaN': np.nan}
    for name in prepared.select_dtypes(include='object').columns:
        trimmed = prepared[name].astype(str).str.strip()
        prepared[name] = trimmed.replace(missing_tokens)

    # 3. Yes/No flags: collapse spelling variants onto 'Yes' / 'No'.
    yes_no_map = {variant: 'Yes' for variant in ('YES', 'yes', 'Y', 'y')}
    yes_no_map.update({variant: 'No' for variant in ('NO', 'no', 'N', 'n')})
    yes_no_cols = ['CrossBorder_Shipping', 'Urgent_Shipping', 'Installation_Service',
                   'Fragile_Equipment', 'Rural_Hospital']
    for name in yes_no_cols:
        if name not in prepared.columns:
            continue
        prepared[name] = prepared[name].replace(yes_no_map)

    # 4. Dates: parse both columns; unparseable entries become NaT.
    for name in ('Order_Placed_Date', 'Delivery_Date'):
        prepared[name] = pd.to_datetime(prepared[name], errors='coerce')

    # 5-6. Delivery duration in whole days. Negative durations are counted for
    # the log line and then replaced by their absolute value.
    elapsed_days = (prepared['Delivery_Date'] - prepared['Order_Placed_Date']).dt.days
    num_bad_delivery_days = (elapsed_days < 0).sum()
    prepared['Delivery_Days'] = elapsed_days.abs()
    print(f"   ✓ Repaired {num_bad_delivery_days} invalid Delivery_Days using .abs().")

    # Calendar features taken from the order date (Saturday=5, Sunday=6).
    order_date = prepared['Order_Placed_Date']
    prepared['Order_Month'] = order_date.dt.month
    prepared['Order_Day_of_Week'] = order_date.dt.dayofweek
    prepared['Order_Is_Weekend'] = prepared['Order_Day_of_Week'].isin([5, 6])

    # 7-8. Height x width product, then log1p on both skewed columns.
    prepared['Equipment_Volume'] = np.log1p(
        prepared['Equipment_Height'] * prepared['Equipment_Width']
    )
    prepared['Equipment_Value'] = np.log1p(prepared['Equipment_Value'])

    # 9. Column selection is left to whatever consumes this frame.
    return prepared

In [None]:
# Score the raw test file with `final_xgb_pipeline` (fitted on all of X, y in
# an earlier cell) and write the predictions to a submission CSV.

# Single definition of the output file so the path written and the path
# reported in the final message cannot disagree.
submission_path = 'submission1.csv'

# 1. Load the raw test data
print("Loading new test data...")
df_new_test = pd.read_csv('../data/test.csv')
# Normalise headers exactly as the training load does, so 'Hospital_Id' is
# found even if the CSV header carries stray whitespace.
df_new_test.columns = df_new_test.columns.str.strip()

# 2. Keep the IDs so predictions can be mapped back to their rows
submission_ids = df_new_test['Hospital_Id']

# 3. Apply the same manual cleaning / feature engineering used for training
print("Applying feature engineering to new data...")
X_new_prepared = prepare_features(df_new_test)

# 4. Predict. The pipeline's preprocessor selects the columns it was
#    configured with and applies its fitted imputers, scaler and encoder
#    before the XGBoost step.
print("Getting predictions from the final model...")
log_predictions = final_xgb_pipeline.predict(X_new_prepared)

# 5. The target was trained as log1p(Transport_Cost); invert with expm1
final_predictions = np.expm1(log_predictions)

# 6. Assemble the submission frame
submission_df = pd.DataFrame({
    'Hospital_Id': submission_ids,
    'Transport_Cost': final_predictions
})

# Show the first few predictions
print("\nFinal Predictions:")
display(submission_df.head())

# Save to CSV and report the file that was actually written
submission_df.to_csv(submission_path, index=False)
print(f"Submission file '{submission_path}' created successfully.")

This is the 2nd attempt: a separate workflow that is not to be included with (or run together with) the cells above.

In [106]:
# === CELL 1: IMPORTS ===

# core
import os
import re
import warnings
warnings.filterwarnings('ignore')

# data + plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn (preprocessing / pipeline / model selection / metrics)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
# NEW: Import RobustScaler
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error

# models
from xgboost import XGBRegressor

In [107]:
# === CELL 2: INITIAL DATA LOAD & CLEANING ===
# Loads the training CSV into `df`, cleans string columns, normalises the
# Yes/No flags, parses the two date columns and derives the date features.

print("Loading data...")
try:
    df = pd.read_csv('../data/train.csv')
except Exception as e:
    print(f"Error loading '../data/train.csv'. Make sure the file is in the correct path.")
    print(e)
    # Stop here. Carrying on would either hit a NameError on `df` below or,
    # worse, silently clean a stale `df` left in the kernel by earlier cells.
    raise
df.columns = df.columns.str.strip()
print(f"Initial data shape: {df.shape}")

# Clean all string/object columns: trim whitespace, then map blank / 'nan' /
# 'NaN' text (including NaN stringified by astype(str)) back to real NaN.
print("Cleaning string columns...")
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].astype(str).str.strip()
    df[col] = df[col].replace({'': np.nan, 'nan': np.nan, 'NaN': np.nan})

# Normalize Yes/No columns: collapse spelling variants onto 'Yes' / 'No'.
# Only columns that are still object-typed are touched.
print("Normalizing Yes/No columns...")
yes_no_cols = ['CrossBorder_Shipping', 'Urgent_Shipping', 'Installation_Service',
               'Fragile_Equipment', 'Rural_Hospital']
for col in df.select_dtypes(include='object').columns:
    if col in yes_no_cols:
        df[col] = df[col].replace({
            'YES': 'Yes', 'yes': 'Yes', 'Y': 'Yes', 'y': 'Yes',
            'NO': 'No', 'no': 'No', 'N': 'No', 'n': 'No'
        })

# Convert date columns; unparseable entries become NaT.
print("Converting date columns...")
df['Order_Placed_Date'] = pd.to_datetime(df['Order_Placed_Date'], errors='coerce')
df['Delivery_Date'] = pd.to_datetime(df['Delivery_Date'], errors='coerce')

# Create new features: delivery duration in whole days (may be negative at
# this point) plus calendar features of the order date (Saturday=5, Sunday=6).
print("Engineering Delivery_Days and date features...")
df['Delivery_Days'] = (df['Delivery_Date'] - df['Order_Placed_Date']).dt.days
df['Order_Month'] = df['Order_Placed_Date'].dt.month
df['Order_Day_of_Week'] = df['Order_Placed_Date'].dt.dayofweek
df['Order_Is_Weekend'] = df['Order_Day_of_Week'].isin([5, 6])

print("\n✅ Initial load and feature engineering complete.")

Loading data...
Initial data shape: (5000, 20)
Cleaning string columns...
Normalizing Yes/No columns...
Converting date columns...
Engineering Delivery_Days and date features...

✅ Initial load and feature engineering complete.


In [None]:
# === CELL 3: V4 ROBUST PREPROCESSING (USING ROBUSTSCALER) ===
# Produces X / y, the 80/20 split and the `preprocessor` ColumnTransformer
# used by the modelling cells.
#
# All repairs and transforms are applied to a copy (`df_v4`) instead of `df`.
# Mutating `df` in place made this cell non-idempotent: running it a second
# time applied log1p to an already log-transformed 'Equipment_Value'.
# `df` therefore keeps the state CELL 2 left it in.

print("="*70)
print(" V4 - ROBUST PREPROCESSING (NO CLIPPING, USING ROBUSTSCALER)")
print("="*70)

# ==============================================================================
# PART 1: PRE-SPLIT DATA CLEANING & FEATURE ENGINEERING
# ==============================================================================

df_v4 = df.copy()

print("\n[1/10] Repairing impossible negative values...")
# Negative durations and costs are replaced by their absolute value.
df_v4['Delivery_Days'] = df_v4['Delivery_Days'].abs()
df_v4['Transport_Cost'] = df_v4['Transport_Cost'].abs()
print("   ✓ Repaired negative costs and durations using .abs()")

# ==============================================================================
print("\n[2/10] Engineering features...")
# Height x width product (named "volume" by the notebook).
df_v4['Equipment_Volume'] = df_v4['Equipment_Height'] * df_v4['Equipment_Width']
print("   ✓ Created 'Equipment_Volume'")

# ==============================================================================
print("\n[3/10] Log-transforming skewed features...")
# No clipping: the values are only log1p-transformed.
df_v4['Equipment_Value'] = np.log1p(df_v4['Equipment_Value'])
df_v4['Equipment_Volume'] = np.log1p(df_v4['Equipment_Volume'])
print(f"   ✓ Log-transformed 'Equipment_Value' and 'Equipment_Volume'")

# ==============================================================================
print("\n[4/10] Defining target variable (y)...")
# Target is log1p of the repaired cost; predictions must be inverted with expm1.
y = np.log1p(df_v4['Transport_Cost'])
print(f"   ✓ Target (y) created using log1p(abs(Transport_Cost))")

# ==============================================================================
print("\n[5/10] Selecting features (X) for modeling...")
# Drop the target, the raw inputs of engineered features, identifiers and
# the raw date columns.
drop_cols = [
    'Transport_Cost', 'Equipment_Height', 'Equipment_Width', 'Equipment_Weight',
    'Hospital_Id', 'Supplier_Name', 'Hospital_Location',
    'Order_Placed_Date', 'Delivery_Date'
]
X = df_v4.drop(columns=drop_cols)
print(f"   ✓ Selected {X.shape[1]} features.")

# ==============================================================================
print("\n[6/10] Train-test split (80/20)...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(f"   ✓ Training set: {X_train.shape}")
print(f"   ✓ Test set:     {X_test.shape}")

# ==============================================================================
# PART 2: POST-SPLIT PIPELINES
# ==============================================================================
print("\n" + "="*70)
print(" BUILDING V4 ROBUST PIPELINES (USING ROBUSTSCALER)")
print("="*70)

print("\n[8/10] Configuring feature transformers...")

# --- Numeric Features: median imputation, then RobustScaler ---
numeric_features = [
    'Supplier_Reliability', 'Equipment_Value', 'Base_Transport_Fee',
    'Delivery_Days', 'Equipment_Volume'
]
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])
print(f"   ✓ Numeric features: median imputation + RobustScaler")

# --- Categorical Features: NaN becomes its own 'Missing' category, then
# dense one-hot encoding that ignores categories unseen during fit ---
categorical_features = [
    'Equipment_Type', 'CrossBorder_Shipping', 'Urgent_Shipping',
    'Installation_Service', 'Transport_Method', 'Fragile_Equipment',
    'Hospital_Info', 'Rural_Hospital', 'Order_Month',
    'Order_Day_of_Week', 'Order_Is_Weekend'
]
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])
print(f"   ✓ Categorical features: imputing NaNs as 'Missing' + one-hot")

# ==============================================================================
print("\n[9/10] Assembling ColumnTransformer...")
# Columns not listed in either branch are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop'
)
print(f"   ✓ ColumnTransformer 'preprocessor' configured")

# ==============================================================================
print("\n[10/10] Applying preprocessing...")
# Fit on the training split only; the test split is transformed with the
# statistics learned from training data.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)
print(f"   ✓ V4 Preprocessing complete.")
print(f"   ✓ Training set processed: {X_train_processed.shape}")
print(f"   ✓ Test set processed:     {X_test_processed.shape}")

In [None]:
from sklearn.linear_model import Ridge

# === CELL 4 (V8): GRIDSEARCHCV FOR A SIMPLE RIDGE MODEL ===

print("🚀 Starting V8 GridSearchCV for Ridge (Focusing on EXTREME SIMPLICITY)...")

# --- 1. Shuffled 5-fold splitter with a fixed seed ---
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# --- 2. Pipeline: the `preprocessor` currently in memory feeding a Ridge model ---
ridge_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('ridge', Ridge(random_state=42)),
])

# --- 3. Only the L2 regularisation strength is tuned (7 candidates) ---
param_grid = dict(ridge__alpha=[1, 10, 50, 100, 200, 500, 1000])

# --- 4. Exhaustive search scored by negated RMSE ---
grid_search = GridSearchCV(
    ridge_pipeline,
    param_grid,
    scoring='neg_root_mean_squared_error',
    cv=kf,
    n_jobs=-1,
    verbose=2,
)

# --- 5. Fit on the training split only ---
grid_search.fit(X_train, y_train)

# --- 6. Report the winner (score is negated, so flip the sign back) ---
print("\n✅ V8 (Ridge) GridSearch complete!")
best_params = grid_search.best_params_
best_cv_rmse = -grid_search.best_score_
print(f"   Best hyperparameters: {best_params}")
print(f"   Best CV RMSE (log-space): {best_cv_rmse:.4f}")

# --- 7. Hold-out evaluation in log space and after expm1 ---
final_ridge_v8 = grid_search.best_estimator_
y_test_pred_log = final_ridge_v8.predict(X_test)
y_test_pred_orig = np.expm1(y_test_pred_log)

rmse_test_log = np.sqrt(
    mean_squared_error(y_true=y_test, y_pred=y_test_pred_log)
)
rmse_test_orig = np.sqrt(
    mean_squared_error(y_true=np.expm1(y_test), y_pred=y_test_pred_orig)
)

print(f"\n   Test RMSE (log-space)      : {rmse_test_log:.4f}")
print(f"   Test RMSE (original scale) : {rmse_test_orig:.2f}")

In [None]:
# === CELL 4 (V6): GRIDSEARCHCV FOR EXTREME ROBUSTNESS ===

print("🚀 Starting V6 GridSearchCV for XGBoost (Focusing on EXTREME ROBUSTNESS)...")

# --- 1. Shuffled 5-fold splitter with a fixed seed ---
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# --- 2. Pipeline: the `preprocessor` currently in memory feeding XGBoost ---
# The learning rate is fixed at 0.05 and is not part of the search.
xgb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('xgb', XGBRegressor(
        objective='reg:squarederror',
        random_state=42,
        n_jobs=-1,
        learning_rate=0.05,
    )),
])

# --- 3. Search space: depth pinned to 3, large L1/L2 penalties (2 x 1 x 3 x 3 = 18) ---
param_grid = dict(
    xgb__n_estimators=[1000, 1500],
    xgb__max_depth=[3],
    xgb__reg_alpha=[10, 50, 100],
    xgb__reg_lambda=[10, 50, 100],
)

# --- 4. Exhaustive search scored by negated RMSE ---
grid_search = GridSearchCV(
    xgb_pipeline,
    param_grid,
    scoring='neg_root_mean_squared_error',
    cv=kf,
    n_jobs=-1,
    verbose=2,
)

# --- 5. Fit on the training split only ---
grid_search.fit(X_train, y_train)

# --- 6. Report the winner (score is negated, so flip the sign back) ---
print("\n✅ V6 GridSearch complete!")
best_params = grid_search.best_params_
best_cv_rmse = -grid_search.best_score_
print(f"   Best hyperparameters: {best_params}")
print(f"   Best CV RMSE (log-space): {best_cv_rmse:.4f}")

# --- 7. Hold-out evaluation in log space and after expm1 ---
final_xgb_bulletproof_v6 = grid_search.best_estimator_
y_test_pred_log = final_xgb_bulletproof_v6.predict(X_test)
y_test_pred_orig = np.expm1(y_test_pred_log)

rmse_test_log = np.sqrt(
    mean_squared_error(y_true=y_test, y_pred=y_test_pred_log)
)
rmse_test_orig = np.sqrt(
    mean_squared_error(y_true=np.expm1(y_test), y_pred=y_test_pred_orig)
)

print(f"\n   Test RMSE (log-space)      : {rmse_test_log:.4f}")
print(f"   Test RMSE (original scale) : {rmse_test_orig:.2f}")

In [None]:
# === CELL 5: DEFINE THE V4 'PREPARE_FEATURES' FUNCTION ===

# V4 recipe: values are cleaned and log-transformed but never clipped.
def prepare_features_V4(df_raw):
    """
    Return a cleaned, feature-engineered copy of a raw shipment frame (V4 recipe).

    Steps, in order:
      1. strip whitespace from column names;
      2. strip whitespace in every object column and turn '', 'nan', 'NaN' into NaN;
      3. map the listed Yes/No spelling variants onto 'Yes' / 'No';
      4. parse 'Order_Placed_Date' and 'Delivery_Date' (unparseable -> NaT);
      5. derive 'Delivery_Days' (absolute day difference), 'Order_Month',
         'Order_Day_of_Week' and 'Order_Is_Weekend';
      6. add 'Equipment_Volume' = log1p(height * width) and overwrite
         'Equipment_Value' with log1p of itself.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Raw frame containing at least the date columns, 'Equipment_Height',
        'Equipment_Width' and 'Equipment_Value'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every original column plus the engineered ones.

    Notes
    -----
    * Not idempotent: calling it on its own output applies log1p a second time.
    * 'Order_Is_Weekend' is False for rows whose order date could not be parsed.
    """
    df = df_raw.copy()

    # 1. Column names
    df.columns = df.columns.str.strip()

    # 2. Text columns: trim, then convert blank / stringified-NaN markers to real NaN
    blank_markers = {'': np.nan, 'nan': np.nan, 'NaN': np.nan}
    for text_col in df.select_dtypes(include='object').columns:
        trimmed = df[text_col].astype(str).str.strip()
        df[text_col] = trimmed.replace(blank_markers)

    # 3. Yes/No flags: collapse spelling variants onto the canonical labels
    flag_columns = ('CrossBorder_Shipping', 'Urgent_Shipping', 'Installation_Service',
                    'Fragile_Equipment', 'Rural_Hospital')
    flag_aliases = dict.fromkeys(('YES', 'yes', 'Y', 'y'), 'Yes')
    flag_aliases.update(dict.fromkeys(('NO', 'no', 'N', 'n'), 'No'))
    for text_col in df.select_dtypes(include='object').columns:
        if text_col not in flag_columns:
            continue
        df[text_col] = df[text_col].replace(flag_aliases)

    # 4. Dates
    for date_col in ('Order_Placed_Date', 'Delivery_Date'):
        df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

    # 5. Date-derived features (abs() repairs deliveries dated before the order)
    order_date = df['Order_Placed_Date']
    df['Delivery_Days'] = (df['Delivery_Date'] - order_date).dt.days.abs()
    df['Order_Month'] = order_date.dt.month
    df['Order_Day_of_Week'] = order_date.dt.dayofweek
    df['Order_Is_Weekend'] = df['Order_Day_of_Week'].isin([5, 6])

    # 6. Size / value features on a log scale
    footprint = df['Equipment_Height'] * df['Equipment_Width']
    df['Equipment_Volume'] = np.log1p(footprint)
    df['Equipment_Value'] = np.log1p(df['Equipment_Value'])

    return df

print("   ✓ V4 `prepare_features_V4` function created.")

In [None]:
# === CELL 6: TRAIN YOUR FINAL, BEST V8 (RIDGE) MODEL ON ALL V4 DATA ===
# Requires notebook globals defined by earlier cells:
#   * best_params - result of the Ridge GridSearch (must contain 'ridge__alpha')
#   * X, y        - full V4 training frame and target
# NOTE(review): RobustScaler is not among the imports of the notebook's first
# cell; confirm it has been imported before this cell runs.
from sklearn.linear_model import Ridge

print("\nTraining final V8 (Ridge) model on all V4 data...")
print("--- THIS IS OUR 'ROBUST & SIMPLE' MODEL ---")

# === 1. Feature groups (from our V4 robust script) ===
numeric_features = [
    'Supplier_Reliability', 'Equipment_Value', 'Base_Transport_Fee',
    'Delivery_Days', 'Equipment_Volume'
]
# Order_Month / Order_Day_of_Week / Order_Is_Weekend are one-hot encoded as
# categories even though they are numeric / boolean columns.
categorical_features = [
    'Equipment_Type', 'CrossBorder_Shipping', 'Urgent_Shipping',
    'Installation_Service', 'Transport_Method', 'Fragile_Equipment',
    'Hospital_Info', 'Rural_Hospital', 'Order_Month',
    'Order_Day_of_Week', 'Order_Is_Weekend'
]

# === 2. Define V4 transformers (with RobustScaler) ===
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler()) # CRITICAL: Use RobustScaler
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# === 3. Combine them ===
final_preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop'
)

# === 4. Build final pipeline with BEST V8 hyperparameters ===
# (This automatically uses the 'best_params' variable from Cell 4)
print(f"   ✓ Using best params from V8 GridSearch: {best_params}")

# `best_params` is a shared global that every GridSearch cell overwrites. Fail
# with an actionable message if it currently holds another model's results,
# instead of an opaque KeyError from the lookup below.
if 'ridge__alpha' not in best_params:
    raise KeyError(
        "'ridge__alpha' is missing from best_params "
        f"(found keys: {sorted(best_params)}). "
        "Re-run the V8 Ridge GridSearch cell before this cell."
    )

final_ridge_v8_pipeline = Pipeline(steps=[
    ('preprocessor', final_preprocessor),
    ('ridge', Ridge(
        alpha=best_params['ridge__alpha'], # Unpacks {'ridge__alpha': 10}
        random_state=42
    ))
])

# === 5. Fit on full V4 dataset ===
# (X and y are from Cell 3)
print("X shape:", X.shape)
print("y shape:", y.shape)

final_ridge_v8_pipeline.fit(X, y)

print("\n✅ Final (V8 RIDGE) model trained on entire dataset.")
print("The 'final_ridge_v8_pipeline' object is ready for prediction.")

In [None]:
# === CELL 6: TRAIN YOUR FINAL, BEST V4 MODEL ON ALL V4 DATA ===
# Requires notebook globals defined by earlier cells:
#   * best_params - result of an XGBoost GridSearch (keys prefixed with 'xgb__')
#   * X, y        - full V4 training frame and target

print("\nTraining final V4 XGBoost model on all V4 data...")
print("--- THIS IS YOUR NEW BEST/MOST STABLE MODEL ---")

# === 1. Feature groups (from our V4 robust script) ===
numeric_features = [
    'Supplier_Reliability', 'Equipment_Value', 'Base_Transport_Fee',
    'Delivery_Days', 'Equipment_Volume'
]
categorical_features = [
    'Equipment_Type', 'CrossBorder_Shipping', 'Urgent_Shipping',
    'Installation_Service', 'Transport_Method', 'Fragile_Equipment',
    'Hospital_Info', 'Rural_Hospital', 'Order_Month',
    'Order_Day_of_Week', 'Order_Is_Weekend'
]

# === 2. Define V4 transformers (with RobustScaler) ===
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler()) # CRITICAL: Use RobustScaler
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# === 3. Combine them ===
final_preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop'
)

# === 4. Build final pipeline with BEST V4 hyperparameters ===
# (This automatically uses the 'best_params' variable from Cell 4)
print(f"   ✓ Using best params from V4 GridSearch: {best_params}")

# `best_params` is a shared global that every GridSearch cell overwrites. Refuse
# to continue if it holds anything other than 'xgb__*' entries (e.g. a Ridge
# search result); otherwise the foreign keys would be forwarded to XGBRegressor.
XGB_STEP_PREFIX = 'xgb__'
unexpected_keys = [key for key in best_params if not key.startswith(XGB_STEP_PREFIX)]
if unexpected_keys or not best_params:
    raise ValueError(
        "best_params must contain only 'xgb__*' entries from the XGBoost GridSearch, "
        f"but has keys {sorted(best_params)}. Re-run the matching GridSearch cell first."
    )

# Map the 'xgb__' keys to the model's expected keys
# NOTE(review): parameters that were fixed on the estimator during the search
# (instead of being listed in its param_grid) are not part of best_params and are
# therefore NOT carried over here - XGBoost defaults apply to them. Verify that
# learning_rate is among the keys printed above.
model_params = {
    key[len(XGB_STEP_PREFIX):]: value
    for key, value in best_params.items()
}

final_xgb_robust_pipeline_V4 = Pipeline(steps=[
    ('preprocessor', final_preprocessor),
    ('xgb', XGBRegressor(
        objective='reg:squarederror',
        random_state=42,
        n_jobs=-1,
        **model_params # Unpacks the tuned values, e.g. {'max_depth': ..., 'n_estimators': ...}
    ))
])

# === 5. Fit on full V4 dataset ===
# (X and y are from Cell 3)
print("X shape:", X.shape)
print("y shape:", y.shape)

final_xgb_robust_pipeline_V4.fit(X, y)

print("\n✅ Final (V4 ROBUST) XGBoost model trained on entire dataset.")
print("The 'final_xgb_robust_pipeline_V4' object is ready for prediction.")

In [None]:
# === CELL 7: GENERATE YOUR FINAL V4 SUBMISSION ===
# Requires `prepare_features_V4` and the fitted `final_xgb_robust_pipeline_V4`
# from earlier cells.

# 1. Load your new, raw test data
print("Loading new test data (test.csv)...")
df_new_test = pd.read_csv('../data/test.csv')
# Strip header whitespace, exactly as is done for train.csv, so that the
# 'Hospital_Id' lookup below cannot fail on a padded column name.
df_new_test.columns = df_new_test.columns.str.strip()

# 2. Save IDs
submission_ids = df_new_test['Hospital_Id']

# 3. Apply the *V4* feature engineering
print("Applying V4 (RobustScaler) feature engineering...")
# This uses the 'prepare_features_V4' function you defined in Cell 5
X_new_prepared = prepare_features_V4(df_new_test)

# 4. Get predictions FROM THE ROBUST V4 MODEL
print("Getting predictions from the final V4 XGBoost model...")
# This uses the 'final_xgb_robust_pipeline_V4' model from Cell 6
log_predictions = final_xgb_robust_pipeline_V4.predict(X_new_prepared)

# 5. Convert predictions back from log-scale
# (expm1 is the inverse of the log1p applied to the training target)
final_predictions = np.expm1(log_predictions)

# 6. Create the final submission file
submission_df = pd.DataFrame({
    'Hospital_Id': submission_ids,
    'Transport_Cost': final_predictions
})

# Display the first few predictions
print("\nFinal Predictions:")
display(submission_df.head())

# Save to CSV
submission_df.to_csv('submission_XGB_V4_RobustScaler.csv', index=False)
print("\n✅ Submission file 'submission_XGB_V4_RobustScaler.csv' created successfully.")
print("THIS IS THE ONE. UPLOAD THIS FILE TO KAGGLE!")

Attempt 3: sanity check with "dumb" (single-feature) models

In [None]:
# === CELL 1: V4 ROBUST PREPROCESSING (Run this again) ===

print("="*70)
print(" V4 - ROBUST PREPROCESSING (USING ROBUSTSCALER)")
print("="*70)

# ==============================================================================
# PART 1: PRE-SPLIT DATA CLEANING & FEATURE ENGINEERING
# ==============================================================================

print("\n[1/10] Loading and cleaning data...")
try:
    df = pd.read_csv('../data/train.csv')
    df.columns = df.columns.str.strip()
except Exception as e:
    print(f"Error loading '../data/train.csv'. Make sure the file is in the correct path.")
    print(e)
    # Stop here. Continuing would either raise a NameError on `df` or, worse,
    # silently keep working on a stale `df` left in the kernel by an earlier cell.
    raise

# Clean, engineer and split the training frame, then declare the V4 preprocessor.
# Everything below mutates the global `df` in place, so running these statements
# again without reloading `df` would apply abs()/log1p a second time.

# Trim every object column; blank strings and stringified NaNs become real NaN.
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].astype(str).str.strip()
    df[col] = df[col].replace({'': np.nan, 'nan': np.nan, 'NaN': np.nan})
    
# Collapse the listed Yes/No spelling variants onto 'Yes' / 'No'.
yes_no_cols = ['CrossBorder_Shipping', 'Urgent_Shipping', 'Installation_Service',
               'Fragile_Equipment', 'Rural_Hospital']
for col in df.select_dtypes(include='object').columns:
    if col in yes_no_cols:
        df[col] = df[col].replace({
            'YES': 'Yes', 'yes': 'Yes', 'Y': 'Yes', 'y': 'Yes',
            'NO': 'No', 'no': 'No', 'N': 'No', 'n': 'No'
        })

# Unparseable dates become NaT (errors='coerce').
df['Order_Placed_Date'] = pd.to_datetime(df['Order_Placed_Date'], errors='coerce')
df['Delivery_Date'] = pd.to_datetime(df['Delivery_Date'], errors='coerce')

df['Delivery_Days'] = (df['Delivery_Date'] - df['Order_Placed_Date']).dt.days
df['Order_Month'] = df['Order_Placed_Date'].dt.month
df['Order_Day_of_Week'] = df['Order_Placed_Date'].dt.dayofweek
# dayofweek 5 and 6 are Saturday and Sunday.
df['Order_Is_Weekend'] = df['Order_Day_of_Week'].isin([5, 6])
print("   ✓ Initial load and cleaning complete.")

print("\n[2/10] Repairing negative values...")
# NOTE(review): negative durations/costs are treated as sign errors and flipped
# to positive - this is a modelling assumption; confirm it matches the data.
df['Delivery_Days'] = df['Delivery_Days'].abs()
df['Transport_Cost'] = df['Transport_Cost'].abs()
print("   ✓ Repaired negative costs and durations.")

print("\n[3/10] Engineering & Log-transforming features...")
# 'Equipment_Volume' is the product height * width, then log1p-transformed.
df['Equipment_Volume'] = df['Equipment_Height'] * df['Equipment_Width']
df['Equipment_Value'] = np.log1p(df['Equipment_Value'])
df['Equipment_Volume'] = np.log1p(df['Equipment_Volume'])
print("   ✓ Engineered and log-transformed features.")

print("\n[4/10] Defining target variable (y)...")
# The target is modelled in log1p space; predictions need np.expm1 to invert it.
y = np.log1p(df['Transport_Cost'])
print(f"   ✓ Target (y) created.")

print("\n[5/10] Selecting features (X)...")
# Drops the target, the raw columns replaced by engineered features, and the
# identifier / free-text columns.
drop_cols = [
    'Transport_Cost', 'Equipment_Height', 'Equipment_Width', 'Equipment_Weight',
    'Hospital_Id', 'Supplier_Name', 'Hospital_Location',
    'Order_Placed_Date', 'Delivery_Date'
]
X = df.drop(columns=drop_cols)

print("\n[6/10] Train-test split (80/20)...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ==============================================================================
# PART 2: POST-SPLIT PIPELINES
# ==============================================================================
print("\n" + "="*70)
print(" BUILDING V4 ROBUST PIPELINES (USING ROBUSTSCALER)")
print("="*70)

# --- Numeric Features ---
# NOTE(review): RobustScaler is not among the imports of the notebook's first
# cell; confirm it has been imported before this cell runs.
numeric_features = [
    'Supplier_Reliability', 'Equipment_Value', 'Base_Transport_Fee',
    'Delivery_Days', 'Equipment_Volume'
]
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler()) 
])

# --- Categorical Features ---
# Order_Month / Order_Day_of_Week / Order_Is_Weekend are one-hot encoded as
# categories even though they are numeric / boolean columns.
categorical_features = [
    'Equipment_Type', 'CrossBorder_Shipping', 'Urgent_Shipping',
    'Installation_Service', 'Transport_Method', 'Fragile_Equipment',
    'Hospital_Info', 'Rural_Hospital', 'Order_Month',
    'Order_Day_of_Week', 'Order_Is_Weekend'
]
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# --- V4 Preprocessor (This is the one we will modify) ---
# Unfitted here; columns not listed above are dropped (remainder='drop').
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop'
)

# --- V4 Prepare_features function (This is the one we will use) ---
def prepare_features_V4(df_raw):
    """
    Apply the V4 cleaning and feature engineering to a raw frame; return a copy.

    The input is left untouched. The result keeps every original column
    (header whitespace stripped) and adds 'Delivery_Days', 'Order_Month',
    'Order_Day_of_Week', 'Order_Is_Weekend' and 'Equipment_Volume';
    'Equipment_Value' is replaced by its log1p.

    NOTE: defining this function replaces any earlier `prepare_features_V4`
    already present in the kernel. Not idempotent: log1p is applied on each call.
    """
    df = df_raw.copy()
    df.columns = df.columns.str.strip()

    # Trim text cells; blank strings and stringified NaNs become real NaN.
    for col in df.select_dtypes(include='object').columns:
        as_text = df[col].astype(str).str.strip()
        df[col] = as_text.replace({'': np.nan, 'nan': np.nan, 'NaN': np.nan})

    # Canonicalise the Yes/No flags.
    yes_no_cols = ['CrossBorder_Shipping', 'Urgent_Shipping', 'Installation_Service', 'Fragile_Equipment', 'Rural_Hospital']
    canonical = {}
    for variant in ('YES', 'yes', 'Y', 'y'):
        canonical[variant] = 'Yes'
    for variant in ('NO', 'no', 'N', 'n'):
        canonical[variant] = 'No'
    object_cols = df.select_dtypes(include='object').columns
    for col in (c for c in object_cols if c in yes_no_cols):
        df[col] = df[col].replace(canonical)

    # Parse dates (unparseable -> NaT) and derive the calendar features.
    placed = pd.to_datetime(df['Order_Placed_Date'], errors='coerce')
    delivered = pd.to_datetime(df['Delivery_Date'], errors='coerce')
    df['Order_Placed_Date'] = placed
    df['Delivery_Date'] = delivered
    df['Delivery_Days'] = (delivered - placed).dt.days.abs()
    day_of_week = placed.dt.dayofweek
    df['Order_Month'] = placed.dt.month
    df['Order_Day_of_Week'] = day_of_week
    df['Order_Is_Weekend'] = day_of_week.isin([5, 6])

    # Log-scale size and value features ('Equipment_Volume' is height * width).
    df['Equipment_Volume'] = np.log1p(df['Equipment_Height'] * df['Equipment_Width'])
    df['Equipment_Value'] = np.log1p(df['Equipment_Value'])
    return df

print("   ✓ V4 Preprocessor and V4 Prepare_Features function are in memory.")

In [None]:
# === CELL 2: PROBE 1 (SUPPLIER_RELIABILITY) ===
# Fits a Ridge model on the single feature 'Supplier_Reliability' and writes a
# submission from it. Requires `X`, `y` and `prepare_features_V4` from the
# preprocessing cell. Also creates `X_new_prepared` and `submission_ids`, which
# the following probe cells reuse.
from sklearn.linear_model import Ridge

print("🚀 Starting Probe 1: Supplier_Reliability")

# 1. Create a preprocessor for ONLY this feature
probe_1_features = ['Supplier_Reliability']
probe_1_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])
probe_1_preprocessor = ColumnTransformer(
    transformers=[('num', probe_1_transformer, probe_1_features)],
    remainder='drop'
)

# 2. Build the final pipeline
probe_1_pipeline = Pipeline(steps=[
    ('preprocessor', probe_1_preprocessor),
    ('ridge', Ridge(alpha=10)) # Use alpha=10 from our V8 test
])

# 3. Fit on ALL V4 data
print("Fitting Probe 1 model on all data...")
probe_1_pipeline.fit(X, y)

# 4. Load test data and apply V4 preparation
print("Loading and preparing test data...")
df_new_test = pd.read_csv('../data/test.csv')
# Strip header whitespace, exactly as is done for train.csv, so that the
# 'Hospital_Id' lookup below cannot fail on a padded column name.
df_new_test.columns = df_new_test.columns.str.strip()
submission_ids = df_new_test['Hospital_Id']
X_new_prepared = prepare_features_V4(df_new_test)

# 5. Get predictions
print("Getting predictions...")
log_predictions = probe_1_pipeline.predict(X_new_prepared)
# y was built with log1p, so expm1 maps predictions back to the original scale.
final_predictions = np.expm1(log_predictions)

# 6. Create submission file
submission_df = pd.DataFrame({'Hospital_Id': submission_ids, 'Transport_Cost': final_predictions})
submission_df.to_csv('submission_PROBE_1_Reliability.csv', index=False)
print("✅ 'submission_PROBE_1_Reliability.csv' created. PLEASE SUBMIT THIS.")

In [None]:
# === CELL 3: PROBE 2 (BASE_TRANSPORT_FEE) ===
# Same probe as the previous cell, but using only 'Base_Transport_Fee'.
# Hidden-state dependency: `X`, `y`, `X_new_prepared` and `submission_ids` are
# not defined here and must already exist in the kernel.
from sklearn.linear_model import Ridge

print("🚀 Starting Probe 2: Base_Transport_Fee")

# 1. Create a preprocessor for ONLY this feature
probe_2_features = ['Base_Transport_Fee']
probe_2_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler()) 
])
# remainder='drop' discards every column of X except the probed one.
probe_2_preprocessor = ColumnTransformer(
    transformers=[('num', probe_2_transformer, probe_2_features)],
    remainder='drop'
)

# 2. Build the final pipeline
probe_2_pipeline = Pipeline(steps=[
    ('preprocessor', probe_2_preprocessor),
    ('ridge', Ridge(alpha=10))
])

# 3. Fit on ALL V4 data
print("Fitting Probe 2 model on all data...")
probe_2_pipeline.fit(X, y)

# 4. Load test data (already prepared)
print("Using prepared test data...")
# (X_new_prepared and submission_ids are already in memory from Cell 2)

# 5. Get predictions
print("Getting predictions...")
log_predictions = probe_2_pipeline.predict(X_new_prepared)
# Predictions are mapped back to the original scale with expm1.
final_predictions = np.expm1(log_predictions)

# 6. Create submission file
submission_df = pd.DataFrame({'Hospital_Id': submission_ids, 'Transport_Cost': final_predictions})
submission_df.to_csv('submission_PROBE_2_BaseFee.csv', index=False)
print("✅ 'submission_PROBE_2_BaseFee.csv' created. PLEASE SUBMIT THIS.")

In [None]:
# === CELL 4: PROBE 3 (DELIVERY_DAYS) ===
# Same probe as the two previous cells, but using only 'Delivery_Days'.
# Hidden-state dependency: `X`, `y`, `X_new_prepared` and `submission_ids` are
# not defined here and must already exist in the kernel.
from sklearn.linear_model import Ridge

print("🚀 Starting Probe 3: Delivery_Days")

# 1. Create a preprocessor for ONLY this feature
probe_3_features = ['Delivery_Days']
probe_3_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler()) 
])
# remainder='drop' discards every column of X except the probed one.
probe_3_preprocessor = ColumnTransformer(
    transformers=[('num', probe_3_transformer, probe_3_features)],
    remainder='drop'
)

# 2. Build the final pipeline
probe_3_pipeline = Pipeline(steps=[
    ('preprocessor', probe_3_preprocessor),
    ('ridge', Ridge(alpha=10))
])

# 3. Fit on ALL V4 data
print("Fitting Probe 3 model on all data...")
probe_3_pipeline.fit(X, y)

# 4. Load test data (already prepared)
print("Using prepared test data...")
# (X_new_prepared and submission_ids are already in memory from Cell 2)

# 5. Get predictions
print("Getting predictions...")
log_predictions = probe_3_pipeline.predict(X_new_prepared)
# Predictions are mapped back to the original scale with expm1.
final_predictions = np.expm1(log_predictions)

# 6. Create submission file
submission_df = pd.DataFrame({'Hospital_Id': submission_ids, 'Transport_Cost': final_predictions})
submission_df.to_csv('submission_PROBE_3_DeliveryDays.csv', index=False)
print("✅ 'submission_PROBE_3_DeliveryDays.csv' created. PLEASE SUBMIT THIS.")

In [None]:
# Attempt 4: drop the features probed above (Supplier_Reliability, Base_Transport_Fee, Delivery_Days) and retrain.

In [None]:
# === CELL 1: V10 "SAFE FEATURES ONLY" PREPROCESSING ===

print("="*70)
print(" V10 - 'SAFE FEATURES ONLY' PREPROCESSING")
print("="*70)

# ==============================================================================
# PART 1: PRE-SPLIT DATA CLEANING & FEATURE ENGINEERING
# ==============================================================================

print("\n[1/10] Loading and cleaning data...")
try:
    df = pd.read_csv('../data/train.csv')
    df.columns = df.columns.str.strip()
except Exception as e:
    print(f"Error loading '../data/train.csv'. Make sure the file is in the correct path.")
    print(e)
    # Stop here. Continuing would either raise a NameError on `df` or, worse,
    # silently keep working on a stale `df` left in the kernel by an earlier cell.
    raise

# Clean, engineer and split the training frame for the V10 experiment, which
# excludes Supplier_Reliability, Base_Transport_Fee and Delivery_Days from X.
# Everything below mutates the global `df` in place, so running these statements
# again without reloading `df` would apply abs()/log1p a second time.

# Trim every object column; blank strings and stringified NaNs become real NaN.
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].astype(str).str.strip()
    df[col] = df[col].replace({'': np.nan, 'nan': np.nan, 'NaN': np.nan})
    
# Collapse the listed Yes/No spelling variants onto 'Yes' / 'No'.
yes_no_cols = ['CrossBorder_Shipping', 'Urgent_Shipping', 'Installation_Service',
               'Fragile_Equipment', 'Rural_Hospital']
for col in df.select_dtypes(include='object').columns:
    if col in yes_no_cols:
        df[col] = df[col].replace({
            'YES': 'Yes', 'yes': 'Yes', 'Y': 'Yes', 'y': 'Yes',
            'NO': 'No', 'no': 'No', 'N': 'No', 'n': 'No'
        })

# Unparseable dates become NaT (errors='coerce').
df['Order_Placed_Date'] = pd.to_datetime(df['Order_Placed_Date'], errors='coerce')
df['Delivery_Date'] = pd.to_datetime(df['Delivery_Date'], errors='coerce')

# Delivery_Days is still computed here but is dropped from X further down.
df['Delivery_Days'] = (df['Delivery_Date'] - df['Order_Placed_Date']).dt.days
df['Order_Month'] = df['Order_Placed_Date'].dt.month
df['Order_Day_of_Week'] = df['Order_Placed_Date'].dt.dayofweek
# dayofweek 5 and 6 are Saturday and Sunday.
df['Order_Is_Weekend'] = df['Order_Day_of_Week'].isin([5, 6])
print("   ✓ Initial load and cleaning complete.")

print("\n[2/10] Repairing negative costs/durations...")
# We still repair Delivery_Days in case it's used for something (it's not, but good practice)
# NOTE(review): flipping negative costs to positive is a modelling assumption;
# confirm it matches the data.
df['Delivery_Days'] = df['Delivery_Days'].abs() 
df['Transport_Cost'] = df['Transport_Cost'].abs()
print("   ✓ Repaired negative costs and durations.")

print("\n[3/10] Engineering & Log-transforming 'safe' features...")
# 'Equipment_Volume' is the product height * width, then log1p-transformed.
df['Equipment_Volume'] = df['Equipment_Height'] * df['Equipment_Width']
df['Equipment_Value'] = np.log1p(df['Equipment_Value'])
df['Equipment_Volume'] = np.log1p(df['Equipment_Volume'])
print("   ✓ Engineered and log-transformed 'safe' features (Value and Volume).")

print("\n[4/10] Defining target variable (y)...")
# The target is modelled in log1p space; predictions need np.expm1 to invert it.
y = np.log1p(df['Transport_Cost'])
print(f"   ✓ Target (y) created.")

print("\n[5/10] Selecting features (X)...")
# We drop the "bomb" features from our training data!
drop_cols = [
    'Transport_Cost', 'Equipment_Height', 'Equipment_Width', 'Equipment_Weight',
    'Hospital_Id', 'Supplier_Name', 'Hospital_Location',
    'Order_Placed_Date', 'Delivery_Date',
    
    # === DROPPING THE BOMBS ===
    'Supplier_Reliability',
    'Base_Transport_Fee',
    'Delivery_Days'
]
X = df.drop(columns=drop_cols)
print(f"   ✓ 'X' created. 'Bomb' features (Reliability, Base_Fee, Days) have been DROPPED.")

print("\n[6/10] Train-test split (80/20)...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ==============================================================================
# PART 2: POST-SPLIT PIPELINES
# ==============================================================================
print("\n" + "="*70)
print(" BUILDING V10 'SAFE FEATURES ONLY' PIPELINES")
print("="*70)

print("\n[8/10] Configuring feature transformers...")

# --- Numeric Features (SAFE FEATURES ONLY) ---
numeric_features = [
    'Equipment_Value',  # Log-transformed, safe
    'Equipment_Volume'  # Log-transformed, safe
]
# We can use StandardScaler now, it's fine for log-transformed data
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()) 
])
print(f"   ✓ Numeric features (SAFE ONLY): median imputation + StandardScaler")

# --- Categorical Features (ALL) ---
# Order_Month / Order_Day_of_Week / Order_Is_Weekend are one-hot encoded as
# categories even though they are numeric / boolean columns.
categorical_features = [
    'Equipment_Type', 'CrossBorder_Shipping', 'Urgent_Shipping',
    'Installation_Service', 'Transport_Method', 'Fragile_Equipment',
    'Hospital_Info', 'Rural_Hospital', 'Order_Month',
    'Order_Day_of_Week', 'Order_Is_Weekend'
]
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])
print(f"   ✓ Categorical features: imputing NaNs as 'Missing' + one-hot")

# ==============================================================================
print("\n[9/10] Assembling ColumnTransformer...")
# This rebinds the global `preprocessor`, replacing any earlier one in the kernel.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop'
)
print(f"   ✓ ColumnTransformer 'preprocessor' configured")

# ==============================================================================
print("\n[10/10] Applying preprocessing...")
# Fitted on the training split only; the test split is transformed with the
# statistics learned from the training split.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)
print(f"   ✓ V10 Preprocessing complete.")
print(f"   ✓ Training set processed: {X_train_processed.shape}")
print(f"   ✓ Test set processed:     {X_test_processed.shape}")

In [None]:
# === CELL 2: V10 GRIDSEARCHCV (FOR 'SAFE FEATURES ONLY' DATA) ===
# Requires `preprocessor`, `X_train`, `y_train`, `X_test` and `y_test` from the
# V10 preprocessing cell. Rebinds the notebook-wide globals `grid_search` and
# `best_params`.

print("🚀 Starting V10 GridSearchCV for XGBoost (on 'Safe' data)...")

# --- 1. CV splitter ---
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# --- 2. XGB pipeline ---
# 'preprocessor' is the new V10 one from Cell 1
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('xgb', XGBRegressor(
        objective='reg:squarederror',
        random_state=42,
        n_jobs=-1
    ))
])

# --- 3. Hyperparameter grid (we can be less aggressive) ---
# 2 x 3 x 1 x 2 x 1 = 12 candidates, each fitted on the 5 folds of `kf`.
# learning_rate is part of the grid here, so it is included in best_params_.
param_grid = {
    'xgb__n_estimators': [300, 500],
    'xgb__max_depth': [3, 5, 7],
    'xgb__learning_rate': [0.1],
    'xgb__reg_alpha': [0.1, 1],  # Less regularization needed now
    'xgb__reg_lambda': [1]
}

# --- 4. GridSearchCV ---
grid_search = GridSearchCV(
    estimator=xgb_pipeline,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=kf,
    n_jobs=-1,
    verbose=2
)

# --- 5. Run grid search on the V10 TRAINING DATA ---
grid_search.fit(X_train, y_train)

# --- 6. Best params & CV score ---
# The scorer is the *negated* RMSE, so the sign is flipped back for reporting.
print("\n✅ V10 GridSearch complete!")
best_params = grid_search.best_params_
best_cv_rmse = -grid_search.best_score_
print(f"   Best hyperparameters: {best_params}")
print(f"   Best CV RMSE (log-space): {best_cv_rmse:.4f}")

# --- 7. Evaluate on V10 test set ---
# y is in log1p space, so expm1 maps predictions and targets back to the
# original scale for the second RMSE.
final_xgb_safe_v10 = grid_search.best_estimator_
y_test_pred_log = final_xgb_safe_v10.predict(X_test)
y_test_pred_orig = np.expm1(y_test_pred_log)

rmse_test_log = np.sqrt(mean_squared_error(y_test, y_test_pred_log))
rmse_test_orig = np.sqrt(mean_squared_error(np.expm1(y_test), y_test_pred_orig))

print(f"\n   Test RMSE (log-space)      : {rmse_test_log:.4f}")
print(f"   Test RMSE (original scale) : {rmse_test_orig:.2f}")

In [None]:
# === CELL 3: DEFINE THE V10 'PREPARE_FEATURES' FUNCTION ===

# V10 recipe: same cleaning as before, but no 'Delivery_Days' is engineered.
def prepare_features_V10(df_raw):
    """
    Return a cleaned copy of a raw frame with the V10 feature set added.

    Adds 'Order_Month', 'Order_Day_of_Week', 'Order_Is_Weekend' and
    'Equipment_Volume' (log1p of height * width), and replaces
    'Equipment_Value' with its log1p. 'Delivery_Days' is deliberately not
    created. Columns already present in the input (including
    'Supplier_Reliability' and 'Base_Transport_Fee', if any) are kept as they
    are; the function itself drops nothing.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Raw frame; it is copied, never modified.

    Returns
    -------
    pandas.DataFrame
        The cleaned, feature-engineered copy.

    Notes
    -----
    Not idempotent: calling it on its own output applies log1p a second time.
    """
    df = df_raw.copy()

    # Column names: drop surrounding whitespace
    df.columns = df.columns.str.strip()

    # Text cells: trim, then turn blank / stringified-NaN markers into NaN
    missing_tokens = {'': np.nan, 'nan': np.nan, 'NaN': np.nan}
    for name in df.select_dtypes(include='object').columns:
        cleaned = df[name].astype(str).str.strip().replace(missing_tokens)
        df[name] = cleaned

    # Yes/No flags: map each spelling variant onto the canonical label
    yes_no_cols = {'CrossBorder_Shipping', 'Urgent_Shipping', 'Installation_Service',
                   'Fragile_Equipment', 'Rural_Hospital'}
    to_canonical = {
        **{spelling: 'Yes' for spelling in ('YES', 'yes', 'Y', 'y')},
        **{spelling: 'No' for spelling in ('NO', 'no', 'N', 'n')},
    }
    flag_cols = [name for name in df.select_dtypes(include='object').columns
                 if name in yes_no_cols]
    for name in flag_cols:
        df[name] = df[name].replace(to_canonical)

    # Dates: both columns are parsed, only the order date feeds features
    df['Order_Placed_Date'] = pd.to_datetime(df['Order_Placed_Date'], errors='coerce')
    df['Delivery_Date'] = pd.to_datetime(df['Delivery_Date'], errors='coerce')

    order_dt = df['Order_Placed_Date'].dt
    df['Order_Month'] = order_dt.month
    df['Order_Day_of_Week'] = order_dt.dayofweek
    df['Order_Is_Weekend'] = df['Order_Day_of_Week'].isin([5, 6])

    # Size / value features on a log scale
    df['Equipment_Volume'] = np.log1p(df['Equipment_Height'] * df['Equipment_Width'])
    df['Equipment_Value'] = np.log1p(df['Equipment_Value'])

    return df

print("   ✓ V10 `prepare_features_V10` ('Safe Features Only') function created.")

In [None]:
# === CELL 4: TRAIN YOUR FINAL, BEST V10 MODEL ON ALL V10 DATA ===
# Requires notebook globals defined by earlier cells:
#   * best_params - result of the V10 XGBoost GridSearch ('xgb__*' keys)
#   * X, y        - full V10 training frame and target

print("\nTraining final V10 XGBoost model on all V10 ('Safe') data...")

# === 1. Feature groups (V10 SAFE FEATURES) ===
numeric_features = [
    'Equipment_Value',
    'Equipment_Volume'
]
categorical_features = [
    'Equipment_Type', 'CrossBorder_Shipping', 'Urgent_Shipping',
    'Installation_Service', 'Transport_Method', 'Fragile_Equipment',
    'Hospital_Info', 'Rural_Hospital', 'Order_Month',
    'Order_Day_of_Week', 'Order_Is_Weekend'
]

# === 2. Define V10 transformers ===
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# === 3. Combine them ===
final_preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop'
)

# === 4. Build final pipeline with BEST V10 hyperparameters ===
# (This automatically uses the 'best_params' variable from Cell 2)
print(f"   ✓ Using best params from V10 GridSearch: {best_params}")

# `best_params` is a shared global that every GridSearch cell overwrites. Refuse
# to continue if it holds anything other than 'xgb__*' entries; otherwise the
# foreign keys would be forwarded to XGBRegressor.
XGB_STEP_PREFIX = 'xgb__'
unexpected_keys = [key for key in best_params if not key.startswith(XGB_STEP_PREFIX)]
if unexpected_keys or not best_params:
    raise ValueError(
        "best_params must contain only 'xgb__*' entries from the XGBoost GridSearch, "
        f"but has keys {sorted(best_params)}. Re-run the V10 GridSearch cell first."
    )

# Strip the pipeline-step prefix so the keys match XGBRegressor's own argument names.
model_params = {
    key[len(XGB_STEP_PREFIX):]: value
    for key, value in best_params.items()
}

final_xgb_safe_pipeline_V10 = Pipeline(steps=[
    ('preprocessor', final_preprocessor),
    ('xgb', XGBRegressor(
        objective='reg:squarederror',
        random_state=42,
        n_jobs=-1,
        **model_params # Unpacks the best params here
    ))
])

# === 5. Fit on full V10 dataset ===
# (X and y are from Cell 1)
print("X shape:", X.shape)
print("y shape:", y.shape)

final_xgb_safe_pipeline_V10.fit(X, y)

print("\n✅ Final (V10 'SAFE') XGBoost model trained on entire dataset.")
print("The 'final_xgb_safe_pipeline_V10' object is ready for prediction.")

In [None]:
# === CELL 5: GENERATE YOUR FINAL V10 SUBMISSION ===
# Requires `prepare_features_V10` and the fitted `final_xgb_safe_pipeline_V10`
# from earlier cells.

# 1. Load your new, raw test data
print("Loading new test data (test.csv)...")
df_new_test = pd.read_csv('../data/test.csv')
# Strip header whitespace, exactly as is done for train.csv, so that the
# 'Hospital_Id' lookup below cannot fail on a padded column name.
df_new_test.columns = df_new_test.columns.str.strip()

# 2. Save IDs
submission_ids = df_new_test['Hospital_Id']

# 3. Apply the *V10* feature engineering
print("Applying V10 ('Safe Features Only') feature engineering...")
X_new_prepared = prepare_features_V10(df_new_test) # Using the new V10 function

# 4. Get predictions FROM THE ROBUST V10 MODEL
print("Getting predictions from the final V10 XGBoost model...")
log_predictions = final_xgb_safe_pipeline_V10.predict(X_new_prepared)

# 5. Convert predictions back from log-scale
# (expm1 is the inverse of the log1p applied to the training target)
final_predictions = np.expm1(log_predictions)

# 6. Create the final submission file
submission_df = pd.DataFrame({
    'Hospital_Id': submission_ids,
    'Transport_Cost': final_predictions
})

# Display the first few predictions
print("\nFinal Predictions:")
display(submission_df.head())

# Save to CSV
submission_df.to_csv('submission_XGB_V10_SafeFeaturesOnly.csv', index=False)
print("\n✅ Submission file 'submission_XGB_V10_SafeFeaturesOnly.csv' created successfully.")
print("UPLOAD THIS FILE. This is the ultimate test.")

Attempt 5: LGBM with RobustScaler


In [110]:
# === CELL 1: IMPORTS ===

# core
import os
import re
import warnings
warnings.filterwarnings('ignore')

# data + plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn (preprocessing / pipeline / model selection / metrics)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
# NEW: Import RobustScaler
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error

# models
from xgboost import XGBRegressor

In [111]:
# === CELL 2: INITIAL DATA LOAD & CLEANING ===
# Loads the training CSV, normalises text and Yes/No columns, parses the two
# date columns and derives Delivery_Days plus order-date calendar features.

print("Loading data...")
try:
    df = pd.read_csv('../data/train.csv')
    df.columns = df.columns.str.strip()
    print(f"Initial data shape: {df.shape}")
except Exception as e:
    print(f"Error loading '../data/train.csv'. Make sure the file is in the correct path.")
    print(e)
    # Stop the cell here. Continuing would either fail with a NameError on
    # `df` (fresh kernel) or silently clean a stale `df` left behind by an
    # earlier cell, which is worse because nothing would look wrong.
    raise

# Clean all string/object columns: trim whitespace, then turn blanks and the
# literal strings produced by astype(str) on missing values back into NaN.
print("Cleaning string columns...")
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].astype(str).str.strip()
    df[col] = df[col].replace({'': np.nan, 'nan': np.nan, 'NaN': np.nan})

# Normalize Yes/No columns to the canonical spellings "Yes" / "No".
# Only columns that are still object-typed after cleaning are touched.
print("Normalizing Yes/No columns...")
yes_no_cols = ['CrossBorder_Shipping', 'Urgent_Shipping', 'Installation_Service',
               'Fragile_Equipment', 'Rural_Hospital']
for col in df.select_dtypes(include='object').columns:
    if col in yes_no_cols:
        df[col] = df[col].replace({
            'YES': 'Yes', 'yes': 'Yes', 'Y': 'Yes', 'y': 'Yes',
            'NO': 'No', 'no': 'No', 'N': 'No', 'n': 'No'
        })

# Convert date columns; unparseable values become NaT instead of raising.
print("Converting date columns...")
df['Order_Placed_Date'] = pd.to_datetime(df['Order_Placed_Date'], errors='coerce')
df['Delivery_Date'] = pd.to_datetime(df['Delivery_Date'], errors='coerce')

# Create new features from the parsed dates.
print("Engineering Delivery_Days and date features...")
df['Delivery_Days'] = (df['Delivery_Date'] - df['Order_Placed_Date']).dt.days
df['Order_Month'] = df['Order_Placed_Date'].dt.month
df['Order_Day_of_Week'] = df['Order_Placed_Date'].dt.dayofweek
# dayofweek is 0=Monday .. 6=Sunday, so 5 and 6 are Saturday and Sunday.
# Rows whose order date is NaT end up False here (isin never matches NaN).
df['Order_Is_Weekend'] = df['Order_Day_of_Week'].isin([5, 6])

print("\n✅ Initial load and feature engineering complete.")

Loading data...
Initial data shape: (5000, 20)
Cleaning string columns...
Normalizing Yes/No columns...
Converting date columns...
Engineering Delivery_Days and date features...

✅ Initial load and feature engineering complete.


In [112]:
# === CELL 1: V12 PREPROCESSING (Same as V4/V6) ===
# Part 1 reloads the training data from disk, repeats the cleaning and date
# feature engineering, repairs negative values, log-transforms skewed columns
# and produces X / y plus an 80/20 train-test split.

print("="*70)
print(" V12 - ROBUST PREPROCESSING (USING ROBUSTSCALER + OHE)")
print("="*70)

# ==============================================================================
# PART 1: PRE-SPLIT DATA CLEANING & FEATURE ENGINEERING
# ==============================================================================

print("\n[1/10] Loading and cleaning data...")
try:
    df = pd.read_csv('../data/train.csv')
    df.columns = df.columns.str.strip()
except Exception as e:
    print(f"Error loading '../data/train.csv'. Make sure the file is in the correct path.")
    print(e)
    # Stop the cell here. Continuing would either fail with a NameError on
    # `df` (fresh kernel) or silently train on a stale `df` left behind by an
    # earlier experiment in this notebook.
    raise

# Trim whitespace in every object column and map blank / 'nan' strings to NaN.
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].astype(str).str.strip()
    df[col] = df[col].replace({'': np.nan, 'nan': np.nan, 'NaN': np.nan})

# Collapse the spelling variants of the Yes/No flags to "Yes" / "No".
yes_no_cols = ['CrossBorder_Shipping', 'Urgent_Shipping', 'Installation_Service',
               'Fragile_Equipment', 'Rural_Hospital']
for col in df.select_dtypes(include='object').columns:
    if col in yes_no_cols:
        df[col] = df[col].replace({
            'YES': 'Yes', 'yes': 'Yes', 'Y': 'Yes', 'y': 'Yes',
            'NO': 'No', 'no': 'No', 'N': 'No', 'n': 'No'
        })

# Unparseable dates become NaT rather than raising.
df['Order_Placed_Date'] = pd.to_datetime(df['Order_Placed_Date'], errors='coerce')
df['Delivery_Date'] = pd.to_datetime(df['Delivery_Date'], errors='coerce')

df['Delivery_Days'] = (df['Delivery_Date'] - df['Order_Placed_Date']).dt.days
df['Order_Month'] = df['Order_Placed_Date'].dt.month
df['Order_Day_of_Week'] = df['Order_Placed_Date'].dt.dayofweek
# dayofweek is 0=Monday .. 6=Sunday; rows with a NaT order date end up False.
df['Order_Is_Weekend'] = df['Order_Day_of_Week'].isin([5, 6])
print("   ✓ Initial load and cleaning complete.")

# Negative durations and costs are treated as sign errors and flipped.
print("\n[2/10] Repairing negative values...")
df['Delivery_Days'] = df['Delivery_Days'].abs()
df['Transport_Cost'] = df['Transport_Cost'].abs()
print("   ✓ Repaired negative costs and durations.")

# 'Equipment_Volume' is the height x width product (no third dimension is
# used). Both it and Equipment_Value are replaced by their log1p values.
print("\n[3/10] Engineering & Log-transforming features...")
df['Equipment_Volume'] = df['Equipment_Height'] * df['Equipment_Width']
df['Equipment_Value'] = np.log1p(df['Equipment_Value'])
df['Equipment_Volume'] = np.log1p(df['Equipment_Volume'])
print("   ✓ Engineered and log-transformed features.")

# The model is trained on log1p(cost); predictions must be mapped back with expm1.
print("\n[4/10] Defining target variable (y)...")
y = np.log1p(df['Transport_Cost'])
print(f"   ✓ Target (y) created.")

# Drop the target, the raw dimension columns, identifier/free-text columns and
# the raw dates; everything else stays in X.
print("\n[5/10] Selecting features (X)...")
drop_cols = [
    'Transport_Cost', 'Equipment_Height', 'Equipment_Width', 'Equipment_Weight',
    'Hospital_Id', 'Supplier_Name', 'Hospital_Location',
    'Order_Placed_Date', 'Delivery_Date'
]
X = df.drop(columns=drop_cols)

# Fixed random_state keeps the hold-out split reproducible between runs.
print("\n[6/10] Train-test split (80/20)...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ==============================================================================
# PART 2: POST-SPLIT PIPELINES
# ==============================================================================
# Builds the numeric and categorical sub-pipelines, combines them in a
# ColumnTransformer, fits it on the training split and applies it to the
# hold-out split.
print("\n" + "=" * 70)
print(" BUILDING V12 ROBUST PIPELINES (USING ROBUSTSCALER + OHE)")
print("=" * 70)

print("\n[8/10] Configuring feature transformers...")

# Numeric branch: fill gaps with the column median, then scale with RobustScaler.
numeric_features = [
    'Supplier_Reliability',
    'Equipment_Value',
    'Base_Transport_Fee',
    'Delivery_Days',
    'Equipment_Volume',
]
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler()),
])
print(f"   ✓ Numeric features: median imputation + RobustScaler")

# Categorical branch: missing values become the literal category 'Missing',
# then every column is one-hot encoded into a dense array. Categories unseen
# at fit time are ignored at transform time instead of raising.
categorical_features = [
    'Equipment_Type',
    'CrossBorder_Shipping',
    'Urgent_Shipping',
    'Installation_Service',
    'Transport_Method',
    'Fragile_Equipment',
    'Hospital_Info',
    'Rural_Hospital',
    'Order_Month',
    'Order_Day_of_Week',
    'Order_Is_Weekend',
]
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])
print(f"   ✓ Categorical features: 'Missing' imputation + OneHotEncoder")

# ==============================================================================
# Columns not named in either feature list are dropped (remainder='drop').
print("\n[9/10] Assembling ColumnTransformer...")
preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='drop',
)
print(f"   ✓ ColumnTransformer 'preprocessor' configured")

# ==============================================================================
# Fit on the training split only; the hold-out split is transformed with the
# statistics learned from the training split.
print("\n[10/10] Applying preprocessing...")
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)
print(f"   ✓ V12 Preprocessing complete.")
print(f"   ✓ Training set processed: {X_train_processed.shape}")
print(f"   ✓ Test set processed:     {X_test_processed.shape}")

 V12 - ROBUST PREPROCESSING (USING ROBUSTSCALER + OHE)

[1/10] Loading and cleaning data...
   ✓ Initial load and cleaning complete.

[2/10] Repairing negative values...
   ✓ Repaired negative costs and durations.

[3/10] Engineering & Log-transforming features...
   ✓ Engineered and log-transformed features.

[4/10] Defining target variable (y)...
   ✓ Target (y) created.

[5/10] Selecting features (X)...

[6/10] Train-test split (80/20)...

 BUILDING V12 ROBUST PIPELINES (USING ROBUSTSCALER + OHE)

[8/10] Configuring feature transformers...
   ✓ Numeric features: median imputation + RobustScaler
   ✓ Categorical features: 'Missing' imputation + OneHotEncoder

[9/10] Assembling ColumnTransformer...
   ✓ ColumnTransformer 'preprocessor' configured

[10/10] Applying preprocessing...
   ✓ V12 Preprocessing complete.
   ✓ Training set processed: (4000, 51)
   ✓ Test set processed:     (1000, 51)


In [113]:
# === CELL 2: V12 GRIDSEARCHCV (FOR "DUMBER" XGBOOST) ===
# Grid-searches a shallow, heavily regularised XGBoost regressor on the
# training split and reports CV and hold-out RMSE.

print("🚀 Starting V12 GridSearchCV for XGBoost (Focusing on MAX STABILITY)...")

# Shuffled 5-fold splitter with a fixed seed, shared by every candidate.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Preprocessor and regressor live in one pipeline; the search below is given
# the raw X_train, so preprocessing happens inside the pipeline.
xgb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('xgb', XGBRegressor(objective='reg:squarederror', random_state=42, n_jobs=-1)),
])

# Tree count, depth and learning rate are pinned; only the row/column
# subsampling fractions and the L1/L2 penalties vary (2*2*2*2 = 16 candidates).
param_grid = {
    'xgb__n_estimators': [1000],
    'xgb__max_depth': [2],
    'xgb__learning_rate': [0.05],
    'xgb__subsample': [0.5, 0.7],
    'xgb__colsample_bytree': [0.5, 0.7],
    'xgb__reg_alpha': [10, 100],
    'xgb__reg_lambda': [10, 100],
}

# The scorer is negated RMSE (higher is better), hence the sign flip below.
grid_search = GridSearchCV(
    xgb_pipeline,
    param_grid,
    scoring='neg_root_mean_squared_error',
    cv=kf,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Report the winning configuration and its cross-validated error.
print("\n✅ V12 GridSearch complete!")
best_params = grid_search.best_params_
best_cv_rmse = -grid_search.best_score_
print(f"   Best hyperparameters: {best_params}")
print(f"   Best CV RMSE (log-space): {best_cv_rmse:.4f}")

# Hold-out evaluation: once on the log1p scale the model was trained on, once
# after mapping both truth and prediction back with expm1.
final_xgb_v12 = grid_search.best_estimator_
y_test_pred_log = final_xgb_v12.predict(X_test)
rmse_test_log = np.sqrt(mean_squared_error(y_test, y_test_pred_log))

y_test_pred_orig = np.expm1(y_test_pred_log)
rmse_test_orig = np.sqrt(mean_squared_error(np.expm1(y_test), y_test_pred_orig))

print(f"\n   Test RMSE (log-space)      : {rmse_test_log:.4f}")
print(f"   Test RMSE (original scale) : {rmse_test_orig:.2f}")

🚀 Starting V12 GridSearchCV for XGBoost (Focusing on MAX STABILITY)...
Fitting 5 folds for each of 16 candidates, totalling 80 fits

✅ V12 GridSearch complete!
   Best hyperparameters: {'xgb__colsample_bytree': 0.7, 'xgb__learning_rate': 0.05, 'xgb__max_depth': 2, 'xgb__n_estimators': 1000, 'xgb__reg_alpha': 10, 'xgb__reg_lambda': 10, 'xgb__subsample': 0.7}
   Best CV RMSE (log-space): 0.3771

   Test RMSE (log-space)      : 0.3606
   Test RMSE (original scale) : 34062.36


In [114]:
# === CELL 3: DEFINE THE V12 'PREPARE_FEATURES' FUNCTION ===

# Per the notebook's own note, this is the same recipe as V4/V6/V8.
def prepare_features_V12(df_raw):
    """
    Return a cleaned, feature-engineered copy of a raw frame (V12 recipe).

    The input frame is left untouched. On the copy this function:
      * strips whitespace from column names and from every object column,
        mapping '', 'nan' and 'NaN' to NaN;
      * collapses spelling variants of the Yes/No flag columns to 'Yes'/'No';
      * parses Order_Placed_Date / Delivery_Date (invalid values become NaT);
      * adds Delivery_Days (absolute day difference), Order_Month,
        Order_Day_of_Week and Order_Is_Weekend;
      * replaces Equipment_Value with its log1p and adds Equipment_Volume as
        log1p(Equipment_Height * Equipment_Width).

    No clipping is applied here; outlier handling is left to the downstream
    RobustScaler pipeline.
    """
    df = df_raw.copy()
    df.columns = df.columns.str.strip()

    # Text clean-up: astype(str) turns missing values into the string 'nan',
    # which is mapped back to NaN together with empty strings.
    blank_tokens = {'': np.nan, 'nan': np.nan, 'NaN': np.nan}
    for col in df.select_dtypes(include='object').columns:
        df[col] = df[col].astype(str).str.strip().replace(blank_tokens)

    # Yes/No flags: only columns that are still object-typed are normalised.
    yes_no_cols = ['CrossBorder_Shipping', 'Urgent_Shipping', 'Installation_Service',
                   'Fragile_Equipment', 'Rural_Hospital']
    yes_no_map = {
        'YES': 'Yes', 'yes': 'Yes', 'Y': 'Yes', 'y': 'Yes',
        'NO': 'No', 'no': 'No', 'N': 'No', 'n': 'No'
    }
    object_columns = set(df.select_dtypes(include='object').columns)
    for col in yes_no_cols:
        if col in object_columns:
            df[col] = df[col].replace(yes_no_map)

    # Dates: parse once, store the parsed columns, then reuse the parsed series.
    order_dt = pd.to_datetime(df['Order_Placed_Date'], errors='coerce')
    delivery_dt = pd.to_datetime(df['Delivery_Date'], errors='coerce')
    df['Order_Placed_Date'] = order_dt
    df['Delivery_Date'] = delivery_dt

    # abs() flips negative durations (delivery recorded before the order).
    df['Delivery_Days'] = (delivery_dt - order_dt).dt.days.abs()
    df['Order_Month'] = order_dt.dt.month
    weekday = order_dt.dt.dayofweek  # 0=Monday .. 6=Sunday
    df['Order_Day_of_Week'] = weekday
    df['Order_Is_Weekend'] = weekday.isin([5, 6])

    # Log-transform the value column and the height x width product.
    df['Equipment_Value'] = np.log1p(df['Equipment_Value'])
    df['Equipment_Volume'] = np.log1p(df['Equipment_Height'] * df['Equipment_Width'])

    return df

print("   ✓ V12 `prepare_features_V12` function created.")

   ✓ V12 `prepare_features_V12` function created.


In [118]:
# === CELL 4: TRAIN YOUR FINAL, BEST V12 MODEL ON ALL V12 DATA ===
# Rebuilds the V12 preprocessing from scratch, plugs in the hyperparameters
# chosen by the grid search and fits the whole pipeline on the full X / y.

print("\nTraining final V12 XGBoost model on all V12 data...")

# Same column groups as the V12 preprocessing cell.
numeric_features = [
    'Supplier_Reliability',
    'Equipment_Value',
    'Base_Transport_Fee',
    'Delivery_Days',
    'Equipment_Volume',
]
categorical_features = [
    'Equipment_Type',
    'CrossBorder_Shipping',
    'Urgent_Shipping',
    'Installation_Service',
    'Transport_Method',
    'Fragile_Equipment',
    'Hospital_Info',
    'Rural_Hospital',
    'Order_Month',
    'Order_Day_of_Week',
    'Order_Is_Weekend',
]

# Fresh, unfitted transformers: median + RobustScaler for numbers,
# 'Missing' + dense one-hot encoding for categories.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler()),
])
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Any column outside the two lists is dropped before modelling.
final_preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='drop',
)

# `best_params` comes from the grid-search cell; its keys carry the pipeline
# step prefix 'xgb__', which has to go before they can be passed to the
# XGBRegressor constructor.
print(f"   ✓ Using best params from V12 GridSearch: {best_params}")

model_params = dict(
    (key.replace('xgb__', ''), value) for key, value in best_params.items()
)

final_xgb_v12_pipeline = Pipeline([
    ('preprocessor', final_preprocessor),
    ('xgb', XGBRegressor(
        objective='reg:squarederror',
        random_state=42,
        n_jobs=-1,
        **model_params,
    )),
])

# X and y are the full (pre-split) feature frame and log1p target built in
# the V12 preprocessing cell.
print("X shape:", X.shape)
print("y shape:", y.shape)

final_xgb_v12_pipeline.fit(X, y)

print('\n✅ Final (V12 "DUMBER" XGBOOST) model trained on entire dataset.')

print("The 'final_xgb_v12_pipeline' object is ready for prediction.")


Training final V12 XGBoost model on all V12 data...
   ✓ Using best params from V12 GridSearch: {'xgb__colsample_bytree': 0.7, 'xgb__learning_rate': 0.05, 'xgb__max_depth': 2, 'xgb__n_estimators': 1000, 'xgb__reg_alpha': 10, 'xgb__reg_lambda': 10, 'xgb__subsample': 0.7}
X shape: (5000, 16)
y shape: (5000,)

✅ Final (V12 "DUMBER" XGBOOST) model trained on entire dataset.
The 'final_xgb_v12_pipeline' object is ready for prediction.


In [119]:
# === CELL 5: GENERATE YOUR FINAL V12 SUBMISSION ===
# Scores the raw test file with the fitted V12 pipeline and writes the
# predictions to a submission CSV.

# Read the untouched test file; feature engineering is applied further down.
print("Loading new test data (test.csv)...")
df_new_test = pd.read_csv('../data/test.csv')

# Hold on to the identifiers so they can be paired with the predictions.
submission_ids = df_new_test['Hospital_Id']

# Run the raw frame through the V12 feature-engineering function.
print("Applying V12 (RobustScaler) feature engineering...")
X_new_prepared = prepare_features_V12(df_new_test)

# The pipeline was fitted on a log1p target, so expm1 restores the cost scale.
print("Getting predictions from the final V12 XGBoost model...")
log_predictions = final_xgb_v12_pipeline.predict(X_new_prepared)
final_predictions = np.expm1(log_predictions)

# Two-column submission frame: identifier first, predicted cost second.
submission_df = pd.DataFrame({'Hospital_Id': submission_ids}).assign(
    Transport_Cost=final_predictions
)

# Preview the first rows before writing anything to disk.
print("\nFinal Predictions:")
display(submission_df.head())

# Persist without the index so the file contains only the two columns above.
submission_df.to_csv('submission_XGB_V12_Dumber.csv', index=False)
print("\n✅ Submission file 'submission_XGB_V12_Dumber.csv' created successfully.")
print("UPLOAD THIS FILE TO KAGGLE!")

Loading new test data (test.csv)...
Applying V12 (RobustScaler) feature engineering...
Getting predictions from the final V12 XGBoost model...

Final Predictions:


Unnamed: 0,Hospital_Id,Transport_Cost
0,fffe33003400,485.060547
1,fffe3700330036003600,252.996292
2,fffe3300390038003400,2019.174194
3,fffe310030003900,204.991043
4,fffe3700330031003200,1029.840088



✅ Submission file 'submission_XGB_V12_Dumber.csv' created successfully.
UPLOAD THIS FILE TO KAGGLE!
