In [None]:
# === CELL 1: IMPORTS (V25 - Bare Bones + ElasticNet) ===
# Dependencies for the V25 workflow, grouped standard library first and
# third-party second, followed by session-wide display/warning options.
print("--- Cell 1: Imports ---")

import warnings

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet  # regressor used by this version
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler  # scaler used by this version

# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')
# Never truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
print("✅ Libraries imported.")

In [None]:
# === CELL 2: DATA LOADING & INITIAL CLEANING ===
# Reads the training CSV into `df` (left as None when loading fails) and applies
# three in-place clean-ups: string normalisation, Yes/No harmonisation and
# date parsing.
print("\n--- Cell 2: Data Loading & Initial Cleaning ---")
try:
    df = pd.read_csv('../data/train.csv')
    # Headers may carry stray whitespace; trim them before any column lookup.
    df.columns = df.columns.str.strip()
    print(f"Loaded train.csv: {df.shape}")
except FileNotFoundError:
    df = None
    print("Error: ../data/train.csv not found.")
except Exception as e:
    df = None
    print(f"Error loading train.csv: {e}")

if df is not None:
    # --- Text columns: trim whitespace, turn empty / 'nan' / 'NaN' strings into real NaN ---
    print("Cleaning string columns...")
    blank_tokens = {'': np.nan, 'nan': np.nan, 'NaN': np.nan}
    for text_col in df.select_dtypes(include='object').columns:
        trimmed = df[text_col].astype(str).str.strip()
        df[text_col] = trimmed.replace(blank_tokens)

    # --- Flag columns: collapse spelling variants onto 'Yes' / 'No' ---
    print("Normalizing Yes/No columns...")
    yes_no_cols = ['CrossBorder_Shipping', 'Urgent_Shipping', 'Installation_Service',
                   'Fragile_Equipment', 'Rural_Hospital']
    yes_no_map = {'YES': 'Yes', 'yes': 'Yes', 'Y': 'Yes', 'y': 'Yes',
                  'NO': 'No', 'no': 'No', 'N': 'No', 'n': 'No'}
    # Only the flag columns that actually exist in the frame are touched.
    for flag_col in [name for name in df.columns if name in yes_no_cols]:
        df[flag_col] = df[flag_col].replace(yes_no_map)

    # --- Date columns: unparseable values become NaT instead of raising ---
    print("Converting date columns...")
    for date_col in ('Order_Placed_Date', 'Delivery_Date'):
        df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
    print("✅ Basic cleaning complete.")

In [None]:
# === CELL 3: V25 FEATURE ENGINEERING & DATA REPAIR (Minimal) ===
# Turns the cleaned `df` into modelling inputs:
#   * adds Delivery_Days and repairs negative durations/costs with abs()
#   * derives `prediction_cap_value` (99th percentile of the repaired cost)
#   * builds the log-target `y` and the feature frame `X` from rows that have a target
#   * records numeric/categorical column lists and makes an 80/20 split
if df is not None:
    print("\n--- Cell 3: V25 Feature Engineering & Repair (Minimal) ---")

    print("[Step 1/4] Repairing negative values using abs()...")
    # Delivery_Days = whole days between order and delivery; NaT in either date gives NaN.
    df['Delivery_Days'] = (df['Delivery_Date'] - df['Order_Placed_Date']).dt.days
    df['Delivery_Days'] = df['Delivery_Days'].abs() # Use abs()
    # Negative costs are treated as sign errors and flipped.
    df['Transport_Cost'] = df['Transport_Cost'].abs() # Use abs()
    # Prediction cap = 99th percentile of the repaired (absolute) cost.
    try:
        # Calculate on non-NaN costs only
        prediction_cap_value = df['Transport_Cost'].dropna().quantile(0.99)
        # quantile() of an empty series yields NaN rather than raising, so the
        # fallback has to be triggered explicitly for a non-finite result.
        if not np.isfinite(prediction_cap_value):
            raise ValueError("no non-missing Transport_Cost values")
        print(f"   ✓ Calculated Prediction Cap (99th percentile): {prediction_cap_value:,.2f}")
    except Exception as e:
        print(f"   ! Warning: Could not calculate prediction cap: {e}. Using fallback.")
        prediction_cap_value = 1000000 # Fallback high value
    print("   ✓ Repaired negative costs and durations using abs().")


    print("\n[Step 2/4] Defining target variable (y)...")
    # Rows without a target cannot be used for supervised training: a NaN in y
    # makes the downstream estimator fit fail. Keep only rows with a known cost.
    has_target = df['Transport_Cost'].notna()
    n_missing_target = int((~has_target).sum())
    if n_missing_target:
        print(f"   ! Dropping {n_missing_target} row(s) with missing Transport_Cost.")
    y = np.log1p(df.loc[has_target, 'Transport_Cost']) # Log-transform of abs(cost)
    print(f"   ✓ Target (y) created: {y.shape}")

    print("\n[Step 3/4] Selecting features (X) - Keeping originals...")
    # Keep original numeric features, drop IDs/dates/target
    # NO Volume, NO Logs (except target), NO Date Parts, NO Missing Flags (initially)
    drop_cols = [
        'Transport_Cost',
        'Hospital_Id', 'Supplier_Name', 'Hospital_Location',
        'Order_Placed_Date', 'Delivery_Date',
    ]
    # Tolerate absent columns so the drop never raises.
    drop_cols_present = [col for col in drop_cols if col in df.columns]
    # Same row filter as y so X and y stay aligned.
    X = df.loc[has_target].drop(columns=drop_cols_present)
    print(f"   ✓ Feature set 'X' created (Minimal): {X.shape}")
    # print(f"   ✓ Features: {X.columns.tolist()}") # Uncomment to verify

    print("\n[Step 4/4] Defining Feature Lists for Pipeline...")
    # Define features based on columns NOW in X
    numeric_features = X.select_dtypes(include=np.number).columns.tolist()
    categorical_features = X.select_dtypes(exclude=np.number).columns.tolist()
    print(f"   ✓ Numeric Features: {numeric_features}")
    print(f"   ✓ Categorical Features: {categorical_features}")

    # --- Split for Tuning ---
    print("\nSplitting data into Train/Validation (80/20)...")
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    print(f"   ✓ Data split complete.")

    print("✅ V25 Minimal Feature Engineering complete.")

else:
    print("   ✗ Feature engineering skipped due to data loading error.")

In [None]:
# === CELL 1: V27 PREPROCESSING (Minimal Features + RobustScaler) ===
# Self-contained V27 setup: imports, load + clean train.csv, minimal feature
# engineering, 99.5th-percentile prediction cap, target/feature construction
# (rows with a missing target are excluded), 80/20 split, and a fitted
# ColumnTransformer (`preprocessor`) used by the tuning cell.
print("--- Cell 1: V27 Preprocessing ---")
# --- Imports ---
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder # Using RobustScaler
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

# --- Load & Clean ---
print("Loading and cleaning data...")
try:
    df = pd.read_csv('../data/train.csv')
    # Trim header whitespace before any column lookup.
    df.columns = df.columns.str.strip()
except FileNotFoundError:
    print("Error: ../data/train.csv not found.")
    df = None
except Exception as e:
    print(f"Error loading train.csv: {e}")
    df = None

if df is not None:
    # Text columns: trim and convert empty / 'nan' / 'NaN' strings to real NaN.
    for col in df.select_dtypes(include='object').columns:
        df[col] = df[col].astype(str).str.strip().replace({'': np.nan, 'nan': np.nan, 'NaN': np.nan})
    # Flag columns: collapse spelling variants onto 'Yes' / 'No'.
    yes_no_cols = ['CrossBorder_Shipping', 'Urgent_Shipping', 'Installation_Service', 'Fragile_Equipment', 'Rural_Hospital']
    for col in df.columns:
        if col in yes_no_cols:
            df[col] = df[col].replace({ 'YES': 'Yes', 'yes': 'Yes', 'Y': 'Yes', 'y': 'Yes','NO': 'No', 'no': 'No', 'N': 'No', 'n': 'No' })
    # Unparseable dates become NaT instead of raising.
    df['Order_Placed_Date'] = pd.to_datetime(df['Order_Placed_Date'], errors='coerce')
    df['Delivery_Date'] = pd.to_datetime(df['Delivery_Date'], errors='coerce')
    print("   ✓ Basic cleaning complete.")

    # --- Minimal FE & Repair ---
    print("Applying minimal FE and abs() repair...")
    # Whole days between order and delivery; negative values are flipped with abs().
    df['Delivery_Days'] = (df['Delivery_Date'] - df['Order_Placed_Date']).dt.days
    df['Delivery_Days'] = df['Delivery_Days'].abs()
    df['Transport_Cost'] = df['Transport_Cost'].abs()
    # --- V27: Calculate 99.5th percentile cap ---
    try:
        prediction_cap_value_995 = df['Transport_Cost'].dropna().quantile(0.995) # 99.5th percentile
        # quantile() of an empty series yields NaN rather than raising; a NaN
        # cap would turn every clipped prediction into NaN, so force the fallback.
        if not np.isfinite(prediction_cap_value_995):
            raise ValueError("no non-missing Transport_Cost values")
        print(f"   ✓ Calculated Prediction Cap (99.5th percentile): {prediction_cap_value_995:,.2f}")
    except Exception as e:
        print(f"   ! Warning: Could not calculate 99.5th percentile cap: {e}. Using fallback.")
        prediction_cap_value_995 = 1500000 # Adjusted fallback
    print("   ✓ Negatives repaired.")

    # --- Define Target y ---
    # A NaN target makes estimator fitting fail, so only rows with a known cost
    # are used; the same mask is applied to X below to keep rows aligned.
    has_target = df['Transport_Cost'].notna()
    n_missing_target = int((~has_target).sum())
    if n_missing_target:
        print(f"   ! Dropping {n_missing_target} row(s) with missing Transport_Cost.")
    y = np.log1p(df.loc[has_target, 'Transport_Cost'])
    print(f"   ✓ Target (y) created: {y.shape}")

    # --- Define Features X (Minimal) ---
    drop_cols = ['Transport_Cost', 'Hospital_Id', 'Supplier_Name', 'Hospital_Location',
                 'Order_Placed_Date', 'Delivery_Date']
    # Tolerate absent columns so the drop never raises.
    drop_cols_present = [col for col in drop_cols if col in df.columns]
    X = df.loc[has_target].drop(columns=drop_cols_present)
    print(f"   ✓ Feature set 'X' created (Minimal): {X.shape}")

    # --- Define Feature Lists ---
    numeric_features = X.select_dtypes(include=np.number).columns.tolist()
    categorical_features = X.select_dtypes(exclude=np.number).columns.tolist()
    print(f"   ✓ Numeric Features: {numeric_features}")
    print(f"   ✓ Categorical Features: {categorical_features}")

    # --- Split for Tuning ---
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    print(f"   ✓ Data split for tuning.")

    # --- Define V27 Preprocessor (Using RobustScaler) ---
    # Numeric: median imputation then robust scaling.
    numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), ('scaler', RobustScaler())]) # CHANGED to RobustScaler
    # Categorical: missing values become the literal category 'Missing', then one-hot
    # encoding that ignores categories unseen during fit.
    categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
                                            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop=None))])
    preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features),
                                                     ('cat', categorical_transformer, categorical_features)],
                                       remainder='drop')
    print("   ✓ V27 Preprocessor defined (RobustScaler + OHE).")

    # --- Fit Preprocessor ---
    print("   Fitting preprocessor on X_train...")
    preprocessor.fit(X_train)
    print("✅ V27 Preprocessing complete and preprocessor fitted.")

else:
    print("   ✗ Preprocessing skipped.")

In [None]:
# === CELL 2: V27 HYPERPARAMETER TUNING (Refined ELASTICNET on V27 DATA) ===
# Grid-searches ElasticNet's alpha / l1_ratio with 5-fold CV on the training
# split, then reports RMSE on the held-out validation split in both log space
# and the original cost scale.
if 'X_train' not in locals() or 'preprocessor' not in locals():
    # Guard: the preprocessing cell has not produced its outputs yet.
    print("   ✗ Tuning skipped - check previous cells.")
else:
    print("\n--- Cell 2: Tuning Refined ElasticNet (V27 Data) ---")

    # Preprocessing and regressor chained so each CV fold refits both together.
    enet_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('enet', ElasticNet(random_state=42, max_iter=10000)),
    ])

    # Narrow grid centred on alpha=0.01 and l1_ratio=0.1.
    param_grid = {
        'enet__alpha': [0.005, 0.01, 0.02, 0.05],
        'enet__l1_ratio': [0.05, 0.1, 0.15, 0.2],
    }
    print(f"GridSearchCV Parameter Grid:\n{param_grid}")

    # Shuffled, seeded folds; the scorer is negated RMSE, so higher is better.
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    grid_search_enet = GridSearchCV(
        estimator=enet_pipeline,
        param_grid=param_grid,
        cv=kf,
        scoring='neg_root_mean_squared_error',
        verbose=1,
        n_jobs=-1,
    )

    print("Starting Refined GridSearchCV for ElasticNet...")
    grid_search_enet.fit(X_train, y_train)

    print("\n✅ Refined ElasticNet Tuning complete!")
    best_params = grid_search_enet.best_params_
    # Undo the sign flip of the 'neg_' scorer to get a plain RMSE.
    best_cv_rmse = -grid_search_enet.best_score_
    print(f"   Best hyperparameters: {best_params}")
    print(f"   Best CV RMSE (log-space): {best_cv_rmse:.4f}")

    # --- Hold-out evaluation ---
    best_model_tuned = grid_search_enet.best_estimator_
    y_val_pred_log = best_model_tuned.predict(X_val)
    # Back-transform log1p targets/predictions; predictions are floored at zero.
    y_val_orig = np.expm1(y_val)
    y_val_pred_orig = np.expm1(y_val_pred_log).clip(min=0)
    val_rmse_log = np.sqrt(mean_squared_error(y_val, y_val_pred_log))
    val_rmse_orig = np.sqrt(mean_squared_error(y_val_orig, y_val_pred_orig))
    print(f"--- Validation Set Performance ---")
    print(f"   Validation RMSE (log-space): {val_rmse_log:.4f}")
    print(f"   Validation RMSE (original scale): {val_rmse_orig:,.2f}")

In [None]:
# === CELL 3: TRAIN FINAL ELASTICNET MODEL (V27 DATA + BEST PARAMS) ===
# Rebuilds the preprocessing + ElasticNet pipeline with the tuned
# hyperparameters and trains it on the complete (X, y) data set, leaving the
# result in `final_pipeline`.

if not ('best_params' in locals() and 'X' in locals() and 'y' in locals() and 'ElasticNet' in globals()):
    # Guard: report which prerequisite is missing instead of raising NameError.
    print("   ✗ Final model training skipped. Required variables not found.")
    if 'best_params' not in locals():
        print("     - Best parameters not found (Run GridSearchCV - Cell 2).")
    if 'X' not in locals() or 'y' not in locals():
        print("     - Full dataset 'X' or 'y' not defined (Cell 1).")
else:
    print("\n--- Cell 3: Training Final ElasticNet Model (V27 Data + Best Params) ---")

    print(f"   ✓ Using best params from V27 ElasticNet GridSearch: {best_params}")

    # --- Fresh preprocessor, fitted on every available row ---
    print("   Re-fitting V27 preprocessor (RobustScaler) on full dataset X...")
    # Numeric branch: median imputation followed by robust scaling.
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', RobustScaler()),
    ])
    # Categorical branch: constant 'Missing' imputation followed by one-hot encoding.
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop=None)),
    ])
    final_preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features),
        ],
        remainder='drop',
    )
    # NOTE(review): final_pipeline.fit(X, y) below fits its steps again, so this
    # explicit fit looks redundant — confirm before removing.
    final_preprocessor.fit(X)
    print("   ✓ Preprocessor re-fitted on full X dataset.")

    # GridSearchCV keys carry the pipeline step prefix; strip it to get plain
    # ElasticNet keyword arguments.
    final_model_params = {
        name.replace('enet__', ''): setting
        for name, setting in best_params.items()
    }

    # Same seed and iteration budget as during tuning, plus the tuned settings.
    final_regressor = ElasticNet(random_state=42, max_iter=10000, **final_model_params)
    final_pipeline = Pipeline(steps=[
        ('preprocessor', final_preprocessor),
        ('enet', final_regressor),
    ])

    # === Fit final model on full V27 dataset ===
    print("Fitting final ElasticNet model on the full V27 dataset (X, y)...")
    print("X shape:", X.shape)
    print("y shape:", y.shape)

    final_pipeline.fit(X, y)

    print("\n✅ Final (V27 ELASTICNET) model trained.")
    print("The 'final_pipeline' object is ready for prediction.")

In [None]:
# === CELL 4: V25/V26/V27 TEST DATA PREPARATION FUNCTION ===

# Mirrors the cleaning and minimal feature engineering done on the training data.
def prepare_features_V25(df_raw, train_cols_expected):
    """Clean raw test data and shape it to the training feature layout.

    Works on a copy of ``df_raw``: trims header whitespace, normalises text and
    Yes/No columns, parses the two date columns, derives the absolute
    ``Delivery_Days``, adds any expected column that is absent (filled with
    NaN), and finally selects the columns in ``train_cols_expected`` order.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Raw test data; must contain ``Order_Placed_Date`` and ``Delivery_Date``.
    train_cols_expected : sequence of str
        Column names (and order) of the training feature frame.

    Returns
    -------
    pandas.DataFrame or None
        The prepared frame, or ``None`` if the final column selection raises
        ``KeyError``.
    """
    print("Preparing test features (V25/V26/V27 - Bare Bones)...")
    frame = df_raw.copy()
    frame.columns = frame.columns.str.strip()

    # --- Text columns: trim, then map empty / 'nan' / 'NaN' strings to real NaN ---
    null_tokens = {'': np.nan, 'nan': np.nan, 'NaN': np.nan}
    for name in frame.select_dtypes(include='object').columns:
        trimmed = frame[name].astype(str).str.strip()
        frame[name] = trimmed.replace(null_tokens)

    # --- Flag columns: collapse spelling variants onto 'Yes' / 'No' ---
    yes_no_cols = ['CrossBorder_Shipping', 'Urgent_Shipping', 'Installation_Service','Fragile_Equipment', 'Rural_Hospital']
    canonical = {'YES': 'Yes', 'yes': 'Yes', 'Y': 'Yes', 'y': 'Yes',
                 'NO': 'No', 'no': 'No', 'N': 'No', 'n': 'No'}
    for name in [c for c in frame.columns if c in yes_no_cols]:
        frame[name] = frame[name].replace(canonical)

    # --- Dates: unparseable values become NaT ---
    for name in ('Order_Placed_Date', 'Delivery_Date'):
        frame[name] = pd.to_datetime(frame[name], errors='coerce')

    # --- Only engineered feature: absolute number of days between order and delivery ---
    elapsed = frame['Delivery_Date'] - frame['Order_Placed_Date']
    frame['Delivery_Days'] = elapsed.dt.days.abs()

    # --- Align with the training layout: back-fill absent columns with NaN ---
    missing_cols = set(train_cols_expected) - set(frame.columns)
    if missing_cols:
        print(f"     ! Adding missing columns to test data: {missing_cols}")
        for absent in missing_cols:
            frame[absent] = np.nan

    # NOTE(review): since absent columns were just added, the KeyError path
    # below looks unreachable in practice — kept to preserve the contract.
    try:
        frame = frame[train_cols_expected]
    except KeyError as e:
        print(f"   ✗ Error: Columns mismatch: {e}")
        wanted, present = set(train_cols_expected), set(frame.columns)
        print(f"     Missing in Test (Required by X): {wanted - present}")
        print(f"     Extra in Test (Not in X): {present - wanted}")
        return None

    print(f"Test data prepared. Shape: {frame.shape}")
    return frame

print("\n✅ V25/V26/V27 Test preparation function defined.")

In [None]:
# === CELL 5: GENERATE SUBMISSION (V27 - ElasticNet) ===
# Loads test.csv, prepares its features with prepare_features_V25, predicts
# with `final_pipeline`, back-transforms from log space, clips to
# [0, prediction_cap_value_995] and writes 'model.csv'.

# Check if the final model, prepare function, and NEW cap value exist
if ('final_pipeline' in locals() and
    'prepare_features_V25' in globals() and # Reusing V25 function name
    'X' in locals() and
    'prediction_cap_value_995' in locals()):
    print("\n--- Cell 5: Generating Submission File (V27 ElasticNet Pipeline) ---")
    try:
        # Load the raw test data
        df_test_raw = pd.read_csv('../data/test.csv')
        # Trim header whitespace BEFORE looking up 'Hospital_Id', matching how the
        # training loader and prepare_features_V25 treat column names; otherwise
        # a padded header would raise KeyError here.
        df_test_raw.columns = df_test_raw.columns.str.strip()
        submission_ids = df_test_raw['Hospital_Id']
        print(f"Loaded test.csv: {df_test_raw.shape}")

        # Prepare test features using V25 function, passing training columns
        X_test_final_raw = prepare_features_V25(df_test_raw, X.columns) # Pass training X columns

        if X_test_final_raw is not None:
            print("Getting predictions from the final V27 ElasticNet model...")
            # The final_pipeline object applies the V27 preprocessing (RobustScaler)
            log_predictions = final_pipeline.predict(X_test_final_raw)

            print("Converting predictions from log scale...")
            # Inverse of the log1p transform applied to the training target.
            final_predictions = np.expm1(log_predictions)

            # --- V27 AGGRESSIVE Safety Clip (99.5th Percentile) ---
            # Floor at 0 and cap at the training-derived 99.5th percentile.
            print(f"Applying AGGRESSIVE safety clip to predictions at {prediction_cap_value_995:,.2f} (99.5th percentile)")
            final_predictions = np.clip(final_predictions, 0, prediction_cap_value_995) # Clip hard at 99.5th

            print("Creating submission DataFrame...")
            submission_df = pd.DataFrame({
                'Hospital_Id': submission_ids,
                'Transport_Cost': final_predictions
            })

            output_filename = 'model.csv'
            submission_df.to_csv(output_filename, index=False)
            print(f"\n✅ Submission file '{output_filename}' saved successfully.")
            print("Final Predictions Head:")
            print(submission_df.head())

        else:
            print("   ✗ Submission generation failed: Error preparing test features.")

    except FileNotFoundError:
        print("   ✗ Error: ../data/test.csv not found.")
    except Exception as e:
        # Best-effort cell: report the problem instead of aborting the notebook run.
        print(f"   ✗ An unexpected error occurred during submission generation: {e}")

else:
    # Explain which prerequisite is missing.
    print("\n--- Submission Generation Skipped ---")
    if 'final_pipeline' not in locals(): print("   Reason: Final V27 model not trained (Run Cell 3).")
    if 'prepare_features_V25' not in globals(): print("   Reason: 'prepare_features_V25' function not defined (Run Cell 4).")
    if 'X' not in locals(): print("   Reason: Training features 'X' not defined (Run Cell 1).")
    if 'prediction_cap_value_995' not in locals(): print("   Reason: 'prediction_cap_value_995' not defined (Run Cell 1).")