In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
import warnings
import xgboost as xgb
import lightgbm as lgb # <-- Import LightGBM
import re

warnings.filterwarnings('ignore')
sns.set(style="whitegrid")

print("Starting the full ML pipeline (Strategy: v22 - Replicate v16 Features + Ensemble)...")

# --- 1. Load Data ---
try:
    train_df_raw = pd.read_csv("train.csv")
    test_df_raw = pd.read_csv("test.csv")
    sample_submission = pd.read_csv("sample_submission.csv")
except FileNotFoundError as e:
    print(f"FATAL ERROR: Could not find data files. {e}")
    raise

print(f"Original train data shape: {train_df_raw.shape}")
print(f"Original test data shape: {test_df_raw.shape}")

test_hospital_ids = test_df_raw['Hospital_Id']

# --- 2. Clean Target Variable ---
target = train_df_raw['Transport_Cost'].copy()
invalid_cost_indices = target[target <= 0].index
print(f"Found {len(invalid_cost_indices)} rows with non-positive cost. Setting them to 0.")
target.loc[invalid_cost_indices] = 0
y_log = np.log1p(target) # We train on log-transformed target

# Combine train and test for consistent preprocessing
train_df_processed = train_df_raw.drop('Transport_Cost', axis=1)
train_df_processed['source'] = 'train'
test_df_raw['source'] = 'test'
df = pd.concat([train_df_processed, test_df_raw], ignore_index=True)
print(f"Combined data shape for preprocessing: {df.shape}")


# --- 3. Feature Engineering (Same as v16) ---
# We create all features, even the ones we will drop, 
# because the kept features ('Cost_per_Day') might depend on them.
missing_cols = [
    'Supplier_Reliability', 'Equipment_Height', 'Equipment_Width', 'Equipment_Weight',
    'Equipment_Type', 'Transport_Method', 'Rural_Hospital'
]

def preprocess_features(df_to_process):
    print("Starting feature engineering (v16 style)...")
    df_processed = df_to_process.copy()
    df_processed = df_processed.drop(['Supplier_Name'], axis=1)

    # Missing value indicators (will be dropped by ColTransformer)
    for col in missing_cols:
        df_processed[col + '_Is_Missing'] = df_processed[col].isnull().astype(int)

    # Date features
    df_processed['Order_Placed_Date'] = pd.to_datetime(df_processed['Order_Placed_Date'], errors='coerce')
    df_processed['Delivery_Date'] = pd.to_datetime(df_processed['Delivery_Date'], errors='coerce')
    df_processed['Delivery_Time_Days'] = (df_processed['Delivery_Date'] - df_processed['Order_Placed_Date']).dt.days
    df_processed['Delivery_Time_Days'] = df_processed['Delivery_Time_Days'].fillna(df_processed['Delivery_Time_Days'].median()).clip(lower=0)
    df_processed = df_processed.drop(['Order_Placed_Date', 'Delivery_Date'], axis=1)

    # Location features (will be dropped by ColTransformer)
    df_processed['Hospital_State'] = df_processed['Hospital_Location'].str.split(',').str[1].str.strip().str.split(' ').str[0]
    df_processed['Hospital_State'] = df_processed['Hospital_State'].fillna('Unknown')
    df_processed = df_processed.drop('Hospital_Location', axis=1)

    # Binary features
    binary_cols = ['CrossBorder_Shipping', 'Urgent_Shipping', 'Installation_Service',  
                   'Fragile_Equipment', 'Rural_Hospital','Hospital_Info']
    for col in binary_cols:
        df_processed[col] = df_processed[col].map({'Yes': 1, 'No': 0}).fillna(0).astype(int)

    # Fill numeric
    df_processed['Equipment_Height'] = df_processed['Equipment_Height'].fillna(1)
    df_processed['Equipment_Width'] = df_processed['Equipment_Width'].fillna(1)
    df_processed['Equipment_Weight'] = df_processed['Equipment_Weight'].fillna(0)
    df_processed['Equipment_Value'] = df_processed['Equipment_Value'].fillna(0)

    # Derived/interaction features (v16 style)
    df_processed['Equipment_Area'] = df_processed['Equipment_Height'] * df_processed['Equipment_Width']
    df_processed['Value_Density'] = df_processed['Equipment_Value'] / (df_processed['Equipment_Weight'] + 1e-6)
    df_processed['Value_per_Height'] = df_processed['Equipment_Value'] / (df_processed['Equipment_Height'] + 1e-6)
    df_processed['Value_per_Width'] = df_processed['Equipment_Value'] / (df_processed['Equipment_Width'] + 1e-6)
    df_processed['Weight_per_Area'] = df_processed['Equipment_Weight'] / (df_processed['Equipment_Area'] + 1e-6)
    df_processed['Cost_per_Day'] = df_processed['Base_Transport_Fee'] / (df_processed['Delivery_Time_Days'] + 1)
    df_processed['Value_per_Area'] = df_processed['Equipment_Value'] / (df_processed['Equipment_Area'] + 1e-6)

    print("Feature engineering complete.")
    return df_processed

df_featured = preprocess_features(df)

# --- 4. Preprocessing Pipeline (EXACTLY as in v16) ---
print("\n--- Building v16 'Accidentally Correct' pipeline ---")

# These are the *only* features v16 used.
numeric_features = ['Supplier_Reliability', "Cost_per_Day"]
skewed_features = [
    'Equipment_Value', 'Base_Transport_Fee', 'Value_Density', 'Equipment_Width', 'Equipment_Height',
    'Equipment_Area', "Value_per_Area", "Value_per_Height", "Value_per_Width", "Weight_per_Area"
]
categorical_features = ['Equipment_Type', 'Transport_Method']
binary_features = ['Fragile_Equipment', 'Rural_Hospital', 'Hospital_Info']

# Build the pipelines
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median', add_indicator=True)),
    ('scaler', RobustScaler())
])
skewed_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median', add_indicator=True)),
    ('log_transform', FunctionTransformer(np.log1p, validate=False, feature_names_out="one-to-one")),
    ('scaler', RobustScaler())
])
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numeric_features),
        ('skew', skewed_pipeline, skewed_features),
        ('cat', categorical_pipeline, categorical_features),
        ('binary', 'passthrough', binary_features)
    ],
    remainder='drop', # This is the "magic" part that drops all other features
    n_jobs=None
)

# --- 5. Apply Final Preprocessing ---
df_to_transform = df_featured.drop(['Hospital_Id', 'source'], axis=1)
train_mask = (df_featured['source'] == 'train').values

print("Fitting final preprocessor...")
preprocessor.fit(df_to_transform[train_mask])

print("Extracting final feature names...")
num_cols = list(preprocessor.named_transformers_['num'].get_feature_names_out()) if numeric_features else []
skew_cols = list(preprocessor.named_transformers_['skew'].get_feature_names_out()) if skewed_features else []
cat_cols = list(preprocessor.named_transformers_['cat'].get_feature_names_out()) if categorical_features else []
bin_cols = binary_features
final_feature_names = num_cols + skew_cols + cat_cols + bin_cols
final_feature_names = [re.sub(r'[^A-Za-z0-9_]+', '', col) for col in final_feature_names]
print(f"Total features for training: {len(final_feature_names)}")

df_final = preprocessor.transform(df_to_transform)
X = df_final[train_mask]
X_test = df_final[~train_mask]
X = X.astype('float32')
X_test = X_test.astype('float32')

# Convert to standard DataFrame (data is dense)
X_df = pd.DataFrame(X, columns=final_feature_names)
X_test_df = pd.DataFrame(X_test, columns=final_feature_names)

y = y_log.reset_index(drop=True) # Our target is y_log

print(f"Final shapes: X_df={X_df.shape}, y={y.shape}, X_test_df={X_test_df.shape}")

# --- 6. Model Training (10-Fold Ensemble) ---
print("\n--- Starting 10-Fold Ensemble Training (XGB + LGBM) ---")

# Use robust parameters for both models
# Use robust parameters for both models
xgb_params = {
    'learning_rate': 0.03, 'n_estimators': 1000, 'max_depth': 4,
    'subsample': 0.8, 'colsample_bytree': 0.7, 'reg_alpha': 1, 'reg_lambda': 1,
    'min_child_weight': 5, 'objective': 'reg:squarederror', 'tree_method': 'hist',
    'random_state': 42, 'n_jobs': -1,
    'early_stopping_rounds': 50  # <-- ADD THE PARAMETER HERE
}

lgb_params = {
    'learning_rate': 0.03, 'n_estimators': 1000, 'max_depth': 4,
    'subsample': 0.8, 'colsample_bytree': 0.7, 'reg_alpha': 1, 'reg_lambda': 1,
    'min_child_weight': 5, 'objective': 'regression_l2', # L2 is MSE
    'random_state': 42, 'n_jobs': -1, 'verbose': -1
}

kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Arrays to store out-of-fold predictions
oof_preds_xgb = np.zeros(X_df.shape[0])
oof_preds_lgbm = np.zeros(X_df.shape[0])
# Arrays to store test predictions
test_preds_xgb = np.zeros(X_test_df.shape[0])
test_preds_lgbm = np.zeros(X_test_df.shape[0])

for fold, (train_index, val_index) in enumerate(kf.split(X_df, y)):
    print(f"--- Fold {fold+1}/10 ---")
    X_train, X_val = X_df.iloc[train_index], X_df.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    
    # --- XGBoost ---
    # --- XGBoost ---
    xgb_model = xgb.XGBRegressor(**xgb_params)
    xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False) # <-- REMOVED FROM HERE
    oof_preds_xgb[val_index] = xgb_model.predict(X_val)
    test_preds_xgb += xgb_model.predict(X_test_df) / kf.n_splits
    
    # --- LightGBM ---
    lgb_model = lgb.LGBMRegressor(**lgb_params)
    lgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(50, verbose=False)])
    oof_preds_lgbm[val_index] = lgb_model.predict(X_val)
    test_preds_lgbm += lgb_model.predict(X_test_df) / kf.n_splits

# --- 7. Evaluate OOF Predictions ---
print("\n--- Validation Results (Converted to Actual Cost) ---")
y_actual_oof = np.expm1(y) # The true, non-log-transformed costs

# Convert OOF (log) predictions to actual cost
y_pred_actual_xgb = np.expm1(oof_preds_xgb)
y_pred_actual_lgbm = np.expm1(oof_preds_lgbm)

# Create the blended prediction
y_pred_actual_blend = 0.5 * y_pred_actual_xgb + 0.5 * y_pred_actual_lgbm

# Clip negatives
y_pred_actual_xgb[y_pred_actual_xgb < 0] = 0
y_pred_actual_lgbm[y_pred_actual_lgbm < 0] = 0
y_pred_actual_blend[y_pred_actual_blend < 0] = 0

# Calculate MSE for each model and the blend
mse_xgb = mean_squared_error(y_actual_oof, y_pred_actual_xgb)
mse_lgbm = mean_squared_error(y_actual_oof, y_pred_actual_lgbm)
mse_blend = mean_squared_error(y_actual_oof, y_pred_actual_blend)

print(f"XGBoost OOF MSE: {mse_xgb:.2f}")
print(f"LGBM OOF MSE:    {mse_lgbm:.2f}")
print(f"BLEND OOF MSE:   {mse_blend:.2f}") # <-- THIS IS THE NUMBER TO WATCH

# --- 8. Create Final Submission ---
print("Creating final submission file...")

# Convert test (log) predictions to actual cost
test_pred_actual_xgb = np.expm1(test_preds_xgb)
test_pred_actual_lgbm = np.expm1(test_preds_lgbm)

# Blend the test predictions
test_pred_actual_blend = 0.5 * test_pred_actual_xgb + 0.5 * test_pred_actual_lgbm
test_pred_actual_blend[test_pred_actual_blend < 0] = 0 # Final safety clip

submission_df = pd.DataFrame({
    'Hospital_Id': test_hospital_ids,
    'Transport_Cost': test_pred_actual_blend    # <-- This is the fix
})

sample_submission_df = pd.read_csv("sample_submission.csv")
submission_df['Transport_Cost'] = submission_df['Transport_Cost'].astype(sample_submission_df['Transport_Cost'].dtype)

submission_df.to_csv('submission_v22_ensemble.csv', index=False)

print("\n--- DONE ---")
print("Submission file 'submission_v22_ensemble.csv' created successfully.")
print(submission_df.head())

Starting the full ML pipeline (Strategy: v22 - Replicate v16 Features + Ensemble)...
Original train data shape: (5000, 20)
Original test data shape: (500, 19)
Found 493 rows with non-positive cost. Setting them to 0.
Combined data shape for preprocessing: (5500, 20)
Starting feature engineering (v16 style)...
Feature engineering complete.

--- Building v16 'Accidentally Correct' pipeline ---
Fitting final preprocessor...
Extracting final feature names...
Total features for training: 26
Final shapes: X_df=(5000, 26), y=(5000,), X_test_df=(500, 26)

--- Starting 10-Fold Ensemble Training (XGB + LGBM) ---
--- Fold 1/10 ---
--- Fold 2/10 ---
--- Fold 3/10 ---
--- Fold 4/10 ---
--- Fold 5/10 ---
--- Fold 6/10 ---
--- Fold 7/10 ---
--- Fold 8/10 ---
--- Fold 9/10 ---
--- Fold 10/10 ---

--- Validation Results (Converted to Actual Cost) ---
XGBoost OOF MSE: 58486291445.48
LGBM OOF MSE:    58262372788.69
BLEND OOF MSE:   58243011266.16
Creating final submission file...

--- DONE ---
Submission