In [None]:
# === CELL 1: IMPORTS ===

# core
import os
import re
import warnings
warnings.filterwarnings('ignore')

# data + plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn (preprocessing / pipeline / model selection / metrics)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
# NEW: Import RobustScaler
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error

# models
from xgboost import XGBRegressor

In [None]:
# === CELL 2: INITIAL DATA LOAD & CLEANING ===

# Purpose: read the raw training CSV into `df`, tidy its text columns,
# canonicalise the Yes/No flags, parse the two date columns and derive the
# delivery-duration / order-calendar features used by later cells.

print("Loading data...")
try:
    df = pd.read_csv('../data/train.csv')
except Exception as e:
    # Report the problem, then stop the cell. Swallowing the error here would
    # let the code below either crash with a confusing NameError or, worse,
    # silently keep working on a stale `df` left over from an earlier run.
    print(f"Error loading '../data/train.csv'. Make sure the file is in the correct path.")
    print(e)
    raise

# Header names are stripped so later cells can address columns by exact name.
df.columns = df.columns.str.strip()
print(f"Initial data shape: {df.shape}")

# Clean all string/object columns
print("Cleaning string columns...")
for col in df.select_dtypes(include='object').columns:
    # astype(str) turns missing values into the text 'nan'; the replace below
    # maps that (and blank strings) back to real NaN.
    df[col] = df[col].astype(str).str.strip()
    df[col] = df[col].replace({'': np.nan, 'nan': np.nan, 'NaN': np.nan})

# Normalize Yes/No columns
print("Normalizing Yes/No columns...")
yes_no_cols = ['CrossBorder_Shipping', 'Urgent_Shipping', 'Installation_Service',
               'Fragile_Equipment', 'Rural_Hospital']
for col in df.select_dtypes(include='object').columns:
    if col in yes_no_cols:
        # Only these exact spellings are folded; anything else is left as-is.
        df[col] = df[col].replace({
            'YES': 'Yes', 'yes': 'Yes', 'Y': 'Yes', 'y': 'Yes',
            'NO': 'No', 'no': 'No', 'N': 'No', 'n': 'No'
        })

# Convert date columns (unparseable values become NaT because of errors='coerce')
print("Converting date columns...")
df['Order_Placed_Date'] = pd.to_datetime(df['Order_Placed_Date'], errors='coerce')
df['Delivery_Date'] = pd.to_datetime(df['Delivery_Date'], errors='coerce')

# Create new features
print("Engineering Delivery_Days and date features...")
# Signed day difference; it can be negative here and is repaired in Cell 3.
df['Delivery_Days'] = (df['Delivery_Date'] - df['Order_Placed_Date']).dt.days
df['Order_Month'] = df['Order_Placed_Date'].dt.month
df['Order_Day_of_Week'] = df['Order_Placed_Date'].dt.dayofweek
# dayofweek 5 and 6 are Saturday and Sunday; rows with a missing order date
# evaluate to False here rather than to a missing value.
df['Order_Is_Weekend'] = df['Order_Day_of_Week'].isin([5, 6])

print("\n✅ Initial load and feature engineering complete.")

In [None]:
# === CELL 3: V4 ROBUST PREPROCESSING (USING ROBUSTSCALER) ===

# Purpose: turn the cleaned `df` from Cell 2 into the V4 modelling inputs
# (X, y, train/test split) and build + fit the RobustScaler-based
# ColumnTransformer `preprocessor`.

print("="*70)
print(" V4 - ROBUST PREPROCESSING (NO CLIPPING, USING ROBUSTSCALER)")
print("="*70)

# Every V4 edit is applied to a private copy of `df`. Overwriting
# df['Equipment_Value'] with its own log1p in place would make this cell
# non-idempotent: re-running it without re-running Cell 2 would log-transform
# the already-logged column a second time.
df_v4 = df.copy()

# ==============================================================================
# PART 1: PRE-SPLIT DATA CLEANING & FEATURE ENGINEERING
# ==============================================================================

print("\n[1/10] Repairing impossible negative values...")
# Negative durations / costs are treated as sign errors and flipped.
df_v4['Delivery_Days'] = df_v4['Delivery_Days'].abs()
df_v4['Transport_Cost'] = df_v4['Transport_Cost'].abs()
print("   ✓ Repaired negative costs and durations using .abs()")

# ==============================================================================
print("\n[2/10] Engineering features...")
# Note: despite its name this is the height x width product.
df_v4['Equipment_Volume'] = df_v4['Equipment_Height'] * df_v4['Equipment_Width']
print("   ✓ Created 'Equipment_Volume'")

# ==============================================================================
print("\n[3/10] Log-transforming skewed features...")
# No clipping, just log1p on the two skewed inputs.
df_v4['Equipment_Value'] = np.log1p(df_v4['Equipment_Value'])
df_v4['Equipment_Volume'] = np.log1p(df_v4['Equipment_Volume'])
print(f"   ✓ Log-transformed 'Equipment_Value' and 'Equipment_Volume'")

# ==============================================================================
print("\n[4/10] Defining target variable (y)...")
# Target is modelled in log space; predictions are mapped back with expm1.
y = np.log1p(df_v4['Transport_Cost'])
print(f"   ✓ Target (y) created using log1p(abs(Transport_Cost))")

# ==============================================================================
print("\n[5/10] Selecting features (X) for modeling...")
# The target, the raw dimensions, identifiers/free-text columns and the raw
# date columns are removed from the feature frame.
drop_cols = [
    'Transport_Cost', 'Equipment_Height', 'Equipment_Width', 'Equipment_Weight',
    'Hospital_Id', 'Supplier_Name', 'Hospital_Location',
    'Order_Placed_Date', 'Delivery_Date'
]
X = df_v4.drop(columns=drop_cols)
print(f"   ✓ Selected {X.shape[1]} features.")

# ==============================================================================
print("\n[6/10] Train-test split (80/20)...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(f"   ✓ Training set: {X_train.shape}")
print(f"   ✓ Test set:     {X_test.shape}")

# ==============================================================================
# PART 2: POST-SPLIT PIPELINES
# ==============================================================================
print("\n" + "="*70)
print(" BUILDING V4 ROBUST PIPELINES (USING ROBUSTSCALER)")
print("="*70)

print("\n[8/10] Configuring feature transformers...")

# --- Numeric Features ---
numeric_features = [
    'Supplier_Reliability', 'Equipment_Value', 'Base_Transport_Fee',
    'Delivery_Days', 'Equipment_Volume'
]
# Median imputation followed by RobustScaler (used here instead of StandardScaler).
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])
print(f"   ✓ Numeric features: median imputation + RobustScaler")

# --- Categorical Features ---
# NOTE(review): Order_Month / Order_Day_of_Week are numeric; if any order date
# failed to parse they would be imputed with the string 'Missing' and the
# column would mix numbers and text - confirm the encoder accepts that.
categorical_features = [
    'Equipment_Type', 'CrossBorder_Shipping', 'Urgent_Shipping',
    'Installation_Service', 'Transport_Method', 'Fragile_Equipment',
    'Hospital_Info', 'Rural_Hospital', 'Order_Month',
    'Order_Day_of_Week', 'Order_Is_Weekend'
]
# Missing categories become their own 'Missing' level before one-hot encoding;
# categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])
print(f"   ✓ Categorical features: imputing NaNs as 'Missing' + one-hot")

# ==============================================================================
print("\n[9/10] Assembling ColumnTransformer...")
# remainder='drop': any column not listed in the two feature groups is discarded.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop'
)
print(f"   ✓ ColumnTransformer 'preprocessor' configured")

# ==============================================================================
print("\n[10/10] Applying preprocessing...")
# Fit on the training split only, then apply the fitted transform to the test split.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)
print(f"   ✓ V4 Preprocessing complete.")
print(f"   ✓ Training set processed: {X_train_processed.shape}")
print(f"   ✓ Test set processed:     {X_test_processed.shape}")

In [None]:
from sklearn.linear_model import Ridge

# === CELL 4 (V8): RIDGE REGRESSION TUNED WITH GRIDSEARCHCV ===
# Tunes only the L2 penalty of a Ridge model that sits behind the Cell 3
# `preprocessor`, then scores the winning pipeline on the held-out split.

print("🚀 Starting V8 GridSearchCV for Ridge (Focusing on EXTREME SIMPLICITY)...")

# Shuffled 5-fold splitter with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Candidate regularisation strengths; 'ridge__' addresses the pipeline step below.
param_grid = {'ridge__alpha': [1, 10, 50, 100, 200, 500, 1000]}

# Raw feature frame in, log-space prediction out. `preprocessor` is the
# ColumnTransformer assembled in Cell 3.
ridge_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('ridge', Ridge(random_state=42)),
])

grid_search = GridSearchCV(
    ridge_pipeline,
    param_grid,
    cv=kf,
    scoring='neg_root_mean_squared_error',
    verbose=2,
    n_jobs=-1,
)

# The search sees the un-transformed training split; preprocessing happens
# inside the pipeline.
grid_search.fit(X_train, y_train)

print("\n✅ V8 (Ridge) GridSearch complete!")
best_params = grid_search.best_params_
# The scorer is a negated RMSE, so flip the sign to report a positive error.
best_cv_rmse = -grid_search.best_score_
print(f"   Best hyperparameters: {best_params}")
print(f"   Best CV RMSE (log-space): {best_cv_rmse:.4f}")

# Held-out evaluation with the best pipeline found by the search.
final_ridge_v8 = grid_search.best_estimator_
y_test_pred_log = final_ridge_v8.predict(X_test)
rmse_test_log = np.sqrt(mean_squared_error(y_test, y_test_pred_log))

# expm1 undoes the log1p applied to the target, giving errors in original units.
y_test_pred_orig = np.expm1(y_test_pred_log)
rmse_test_orig = np.sqrt(mean_squared_error(np.expm1(y_test), y_test_pred_orig))

print(f"\n   Test RMSE (log-space)      : {rmse_test_log:.4f}")
print(f"   Test RMSE (original scale) : {rmse_test_orig:.2f}")

In [None]:
# === CELL 5: DEFINE THE V4 'PREPARE_FEATURES' FUNCTION ===

# This function is now simpler: no clipping!
def prepare_features_V4(df_raw):
    """
    Build the V4 feature frame from a raw DataFrame (no clipping anywhere).

    The input is copied, never modified. On the copy this function:
      * strips whitespace from column names and from every text value,
        turning '', 'nan' and 'NaN' into real NaN;
      * folds the listed spellings of the five Yes/No flag columns to
        'Yes' / 'No';
      * parses Order_Placed_Date and Delivery_Date (bad values become NaT);
      * adds Delivery_Days (absolute day difference), Order_Month,
        Order_Day_of_Week and Order_Is_Weekend;
      * adds Equipment_Volume as log1p(height * width) and replaces
        Equipment_Value with its log1p.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Raw data containing at least the date and equipment columns named
        above (header whitespace is tolerated).

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns plus the engineered ones.
    """
    blank_tokens = {'': np.nan, 'nan': np.nan, 'NaN': np.nan}
    flag_columns = ('CrossBorder_Shipping', 'Urgent_Shipping', 'Installation_Service',
                    'Fragile_Equipment', 'Rural_Hospital')
    flag_spellings = {
        **{variant: 'Yes' for variant in ('YES', 'yes', 'Y', 'y')},
        **{variant: 'No' for variant in ('NO', 'no', 'N', 'n')},
    }

    frame = df_raw.copy()
    frame.columns = frame.columns.str.strip()

    # Text tidy-up: trim, then map blank / 'nan' placeholders to real NaN.
    for name in frame.select_dtypes(include='object').columns:
        trimmed = frame[name].astype(str).str.strip()
        frame[name] = trimmed
        frame[name] = frame[name].replace(blank_tokens)

    # Yes/No flags: only flag columns that are still text are touched.
    text_columns = frame.select_dtypes(include='object').columns
    for name in [c for c in text_columns if c in flag_columns]:
        frame[name] = frame[name].replace(flag_spellings)

    # Dates: anything unparseable is coerced to NaT.
    for name in ('Order_Placed_Date', 'Delivery_Date'):
        frame[name] = pd.to_datetime(frame[name], errors='coerce')

    # Duration and calendar features derived from the parsed dates.
    placed = frame['Order_Placed_Date']
    elapsed = frame['Delivery_Date'] - placed
    frame['Delivery_Days'] = elapsed.dt.days.abs()  # V4 fix: sign errors flipped
    frame['Order_Month'] = placed.dt.month
    frame['Order_Day_of_Week'] = placed.dt.dayofweek
    frame['Order_Is_Weekend'] = frame['Order_Day_of_Week'].isin([5, 6])

    # Size proxy (height x width) and equipment value, both on a log1p scale.
    footprint = frame['Equipment_Height'] * frame['Equipment_Width']
    frame['Equipment_Volume'] = np.log1p(footprint)
    frame['Equipment_Value'] = np.log1p(frame['Equipment_Value'])

    return frame

print("   ✓ V4 `prepare_features_V4` function created.")

In [None]:
# === CELL 6: TRAIN YOUR FINAL, BEST V8 (RIDGE) MODEL ON ALL V4 DATA ===
from sklearn.linear_model import Ridge

# Purpose: rebuild the V4 preprocessing + Ridge pipeline from scratch and fit
# it on the complete (X, y) from Cell 3, using the alpha chosen in Cell 4.

print("\nTraining final V8 (Ridge) model on all V4 data...")
print("--- THIS IS OUR 'ROBUST & SIMPLE' MODEL ---")

# --- Column groups (same lists as the Cell 3 configuration) ---
numeric_features = [
    'Supplier_Reliability',
    'Equipment_Value',
    'Base_Transport_Fee',
    'Delivery_Days',
    'Equipment_Volume',
]
categorical_features = [
    'Equipment_Type',
    'CrossBorder_Shipping',
    'Urgent_Shipping',
    'Installation_Service',
    'Transport_Method',
    'Fragile_Equipment',
    'Hospital_Info',
    'Rural_Hospital',
    'Order_Month',
    'Order_Day_of_Week',
    'Order_Is_Weekend',
]

# --- Per-group transformers ---
# Numeric: median fill, then RobustScaler.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler()),
])
# Categorical: explicit 'Missing' level, then one-hot; categories unseen at
# fit time are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# --- Fresh, unfitted ColumnTransformer; unlisted columns are dropped ---
final_preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='drop',
)

# --- Final pipeline, parameterised by the Cell 4 search result ---
# `best_params` must already exist (it is produced by the grid search cell).
print(f"   ✓ Using best params from V8 GridSearch: {best_params}")

final_ridge_v8_pipeline = Pipeline([
    ('preprocessor', final_preprocessor),
    ('ridge', Ridge(alpha=best_params['ridge__alpha'], random_state=42)),
])

# --- Fit on every available row (no hold-out at this stage) ---
print("X shape:", X.shape)
print("y shape:", y.shape)

final_ridge_v8_pipeline.fit(X, y)

print("\n✅ Final (V8 RIDGE) model trained on entire dataset.")
print("The 'final_ridge_v8_pipeline' object is ready for prediction.")

In [None]:
# === CELL 7: GENERATE YOUR FINAL V4 SUBMISSION ===

# Purpose: score the raw competition test file with the final Ridge pipeline
# trained in Cell 6 and write the submission CSV.

# 1. Load the raw test data
print("Loading new test data (test.csv)...")
df_new_test = pd.read_csv('../data/test.csv')
# Strip header whitespace first (the training load does the same) so the
# 'Hospital_Id' lookup below cannot fail on a padded column name.
df_new_test.columns = df_new_test.columns.str.strip()

# 2. Save IDs
submission_ids = df_new_test['Hospital_Id']

# 3. Apply the *V4* feature engineering
print("Applying V4 (RobustScaler) feature engineering...")
# Uses the `prepare_features_V4` function defined in Cell 5.
X_new_prepared = prepare_features_V4(df_new_test)

# 4. Get predictions from the model that Cell 6 actually trained.
# The previous code referenced `final_xgb_robust_pipeline_V4`, which is never
# defined in this notebook: on a fresh kernel that is a NameError, and with a
# stale kernel it would silently submit predictions from a different model.
print("Getting predictions from the final V8 Ridge model...")
log_predictions = final_ridge_v8_pipeline.predict(X_new_prepared)

# 5. Convert predictions back from log-scale (the target was log1p-transformed)
final_predictions = np.expm1(log_predictions)

# 6. Create the final submission file
submission_df = pd.DataFrame({
    'Hospital_Id': submission_ids,
    'Transport_Cost': final_predictions
})

# Display the first few predictions
print("\nFinal Predictions:")
display(submission_df.head())

# Save to CSV
# NOTE(review): the file name still says "XGB" although the predictions come
# from the Ridge pipeline; it is kept unchanged so existing references to the
# file keep working - consider renaming.
submission_path = 'submission_XGB_V4_RobustScaler.csv'
submission_df.to_csv(submission_path, index=False)
print(f"\n✅ Submission file '{submission_path}' created successfully.")
print("THIS IS THE ONE. UPLOAD THIS FILE TO KAGGLE!")