In [39]:
# === CELL 1: IMPORTS (V25 - Bare Bones) ===
print("--- Cell 1: Imports ---")
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split # Keep for potential later use
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder # Using StandardScaler
from sklearn.linear_model import Ridge # Using Ridge
from sklearn.metrics import mean_squared_error

# Notebook-wide configuration: ignore every warning category for the rest of
# the session and lift the pandas limit on how many columns are displayed.
warnings.simplefilter('ignore')
pd.options.display.max_columns = None
print("✅ Libraries imported.")

--- Cell 1: Imports ---
✅ Libraries imported.


In [40]:
# === CELL 2: DATA LOADING & INITIAL CLEANING ===
print("\n--- Cell 2: Data Loading & Initial Cleaning ---")

# Read the training CSV. Any failure leaves `df` as None so the cleaning steps
# below are skipped instead of raising.
try:
    df = pd.read_csv('../data/train.csv')
    df.columns = df.columns.str.strip()  # drop stray whitespace around header names
except FileNotFoundError:
    df = None
    print("Error: ../data/train.csv not found.")
except Exception as e:
    df = None
    print(f"Error loading train.csv: {e}")
else:
    print(f"Loaded train.csv: {df.shape}")

if df is not None:
    # Text columns: cast to str, trim surrounding whitespace, then turn empty
    # strings and the tokens 'nan' / 'NaN' into real missing values.
    print("Cleaning string columns...")
    blank_tokens = {'': np.nan, 'nan': np.nan, 'NaN': np.nan}
    for col in df.select_dtypes(include='object').columns:
        trimmed = df[col].astype(str).str.strip()
        df[col] = trimmed.replace(blank_tokens)

    # Flag columns: collapse the listed Yes/No spellings onto 'Yes' / 'No'.
    print("Normalizing Yes/No columns...")
    yes_no_cols = ['CrossBorder_Shipping', 'Urgent_Shipping', 'Installation_Service',
                   'Fragile_Equipment', 'Rural_Hospital']
    yes_no_spellings = dict.fromkeys(('YES', 'yes', 'Y', 'y'), 'Yes')
    yes_no_spellings.update(dict.fromkeys(('NO', 'no', 'N', 'n'), 'No'))
    for col in yes_no_cols:
        if col in df.columns:
            df[col] = df[col].replace(yes_no_spellings)

    # Date columns: parse to datetime; values that cannot be parsed become NaT.
    print("Converting date columns...")
    for date_col in ('Order_Placed_Date', 'Delivery_Date'):
        df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
    print("✅ Basic cleaning complete.")


--- Cell 2: Data Loading & Initial Cleaning ---
Loaded train.csv: (5000, 20)
Cleaning string columns...
Normalizing Yes/No columns...
Converting date columns...
✅ Basic cleaning complete.


In [41]:
# === CELL 3: V25 FEATURE ENGINEERING & DATA REPAIR (Minimal) ===
# Builds the modelling inputs from the cleaned training frame `df`:
#   * Delivery_Days   - absolute day gap between order and delivery dates
#   * y               - log1p of the absolute transport cost
#   * X               - df without the target, identifier and raw date columns
#   * numeric_features / categorical_features - column lists for the pipeline
#   * prediction_cap_value - 99th percentile of absolute cost, used later to
#     clip predictions
if df is None:
    print("   ✗ Feature engineering skipped due to data loading error.")
else:
    print("\n--- Cell 3: V25 Feature Engineering & Repair (Minimal) ---")

    print("[Step 1/4] Repairing negative values using abs()...")
    # Whole-day delivery duration with the sign discarded.
    df['Delivery_Days'] = (df['Delivery_Date'] - df['Order_Placed_Date']).dt.days.abs()
    # Negative costs are repaired by taking their magnitude.
    df['Transport_Cost'] = df['Transport_Cost'].abs()
    # Upper bound for predictions: the 99th percentile of the repaired cost,
    # with a fixed fallback if the quantile cannot be computed.
    try:
        prediction_cap_value = df['Transport_Cost'].quantile(0.99)
        print(f"   ✓ Calculated Prediction Cap (99th percentile): {prediction_cap_value:,.2f}")
    except Exception as e:
        print(f"   ! Warning: Could not calculate prediction cap: {e}. Using fallback.")
        prediction_cap_value = 1000000
    print("   ✓ Repaired negative costs and durations using abs().")

    print("\n[Step 2/4] Defining target variable (y)...")
    # The model is trained on the log1p scale of the repaired cost.
    y = np.log1p(df['Transport_Cost'])
    print(f"   ✓ Target (y) created: {y.shape}")

    print("\n[Step 3/4] Selecting features (X) - Keeping originals...")
    # Everything stays except the target, identifier-like columns and the raw
    # dates; the engineered Delivery_Days column is kept as a feature.
    drop_cols = [
        'Transport_Cost',
        'Hospital_Id', 'Supplier_Name', 'Hospital_Location',
        'Order_Placed_Date', 'Delivery_Date',
    ]
    drop_cols_present = [name for name in drop_cols if name in df.columns]
    X = df.drop(columns=drop_cols_present)
    print(f"   ✓ Feature set 'X' created (Minimal): {X.shape}")
    print(f"   ✓ Features: {X.columns.tolist()}")

    print("\n[Step 4/4] Defining Feature Lists for Pipeline...")
    # Split the columns of X by dtype: numeric versus everything else.
    numeric_features = list(X.select_dtypes(include=np.number).columns)
    categorical_features = list(X.select_dtypes(exclude=np.number).columns)
    print(f"Numeric Features ({len(numeric_features)}): {numeric_features}")
    print(f"Categorical Features ({len(categorical_features)}): {categorical_features}")

    print("✅ V25 Minimal Feature Engineering complete.")


--- Cell 3: V25 Feature Engineering & Repair (Minimal) ---
[Step 1/4] Repairing negative values using abs()...
   ✓ Calculated Prediction Cap (99th percentile): 269,745.26
   ✓ Repaired negative costs and durations using abs().

[Step 2/4] Defining target variable (y)...
   ✓ Target (y) created: (5000,)

[Step 3/4] Selecting features (X) - Keeping originals...
   ✓ Feature set 'X' created (Minimal): (5000, 15)
   ✓ Features: ['Supplier_Reliability', 'Equipment_Height', 'Equipment_Width', 'Equipment_Weight', 'Equipment_Type', 'Equipment_Value', 'Base_Transport_Fee', 'CrossBorder_Shipping', 'Urgent_Shipping', 'Installation_Service', 'Transport_Method', 'Fragile_Equipment', 'Hospital_Info', 'Rural_Hospital', 'Delivery_Days']

[Step 4/4] Defining Feature Lists for Pipeline...
Numeric Features (7): ['Supplier_Reliability', 'Equipment_Height', 'Equipment_Width', 'Equipment_Weight', 'Equipment_Value', 'Base_Transport_Fee', 'Delivery_Days']
Categorical Features (8): ['Equipment_Type', 'CrossB

In [42]:
# === CELL 4: V25 PREPROCESSING PIPELINE & FINAL MODEL TRAINING ===
# Builds the preprocessing + Ridge pipeline and fits it on the full (X, y).
# The cell is a no-op (with a message) when the feature-engineering outputs
# are not present in the namespace.
if not {'X', 'y', 'numeric_features'}.issubset(locals()):
    print("   ✗ Final model training skipped. Required variables not found.")
else:
    print("\n--- Cell 4: V25 Preprocessing Pipeline & Final Model Training ---")

    # Numeric columns: median imputation followed by standardisation.
    numeric_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
    numeric_transformer = Pipeline(steps=numeric_steps)

    # Categorical columns: fill gaps with the literal 'Missing', then one-hot
    # encode; categories unseen during fit are ignored at predict time.
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop=None)),
    ]
    categorical_transformer = Pipeline(steps=categorical_steps)
    print("   ✓ Transformers defined (Median Impute + StandardScaler + OHE).")

    # Route each column list to its transformer; any other column is dropped.
    column_routes = [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
    preprocessor = ColumnTransformer(transformers=column_routes, remainder='drop')
    print("   ✓ Preprocessor assembled.")

    # Untuned Ridge regression (alpha=10) on top of the preprocessor.
    final_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('ridge', Ridge(alpha=10, random_state=42)),
    ])
    print("   ✓ Final Pipeline defined (Ridge alpha=10).")

    # Fit on every training row; no hold-out split is made in this cell.
    print("Fitting final Ridge model on the full V25 dataset (X, y)...")
    print("X shape:", X.shape)
    print("y shape:", y.shape)

    final_pipeline.fit(X, y)

    print("\n✅ Final (V25 Bare Bones RIDGE) model trained.")
    print("The 'final_pipeline' object is ready for prediction.")


--- Cell 4: V25 Preprocessing Pipeline & Final Model Training ---
   ✓ Transformers defined (Median Impute + StandardScaler + OHE).
   ✓ Preprocessor assembled.
   ✓ Final Pipeline defined (Ridge alpha=10).
Fitting final Ridge model on the full V25 dataset (X, y)...
X shape: (5000, 15)
y shape: (5000,)

✅ Final (V25 Bare Bones RIDGE) model trained.
The 'final_pipeline' object is ready for prediction.


In [43]:
# === CELL 5: V25 TEST DATA PREPARATION FUNCTION ===

# This function applies the minimal V25 steps
def prepare_features_V25(df_raw, train_cols_expected):
    """Apply the V25 cleaning and minimal feature engineering to raw test data.

    Steps, in order: strip whitespace from header names and from every
    object-dtype column (empty strings and 'nan'/'NaN' tokens become NaN),
    normalise the Yes/No flag spellings, parse the two date columns
    (unparseable values become NaT), derive ``Delivery_Days`` as the absolute
    day difference between them, add any expected column that is absent as
    all-NaN, and finally select the expected columns in the expected order.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Raw test frame. It is copied, never modified. It must contain
        ``Order_Placed_Date`` and ``Delivery_Date``.
    train_cols_expected : iterable of str
        Column names the fitted pipeline was trained on, in order. Any
        iterable is accepted (pandas Index, list, tuple, ...).

    Returns
    -------
    pandas.DataFrame
        Exactly the expected columns, in the expected order.

    Raises
    ------
    KeyError
        If either date column is missing from ``df_raw``.
    """
    print("Preparing test features (V25 - Bare Bones)...")

    # Materialise the expected columns as a list once. Indexing a DataFrame
    # with a tuple is interpreted as ONE column label, so a tuple argument
    # used to raise KeyError at selection time even when every column existed.
    expected_cols = list(train_cols_expected)

    df_test = df_raw.copy()
    df_test.columns = df_test.columns.str.strip()

    # Basic cleaning (strings, Yes/No flags, dates)
    blank_tokens = {'': np.nan, 'nan': np.nan, 'NaN': np.nan}
    for col in df_test.select_dtypes(include='object').columns:
        df_test[col] = df_test[col].astype(str).str.strip().replace(blank_tokens)

    yes_no_cols = ['CrossBorder_Shipping', 'Urgent_Shipping', 'Installation_Service',
                   'Fragile_Equipment', 'Rural_Hospital']
    yes_no_map = {'YES': 'Yes', 'yes': 'Yes', 'Y': 'Yes', 'y': 'Yes',
                  'NO': 'No', 'no': 'No', 'N': 'No', 'n': 'No'}
    for col in yes_no_cols:
        if col in df_test.columns:
            df_test[col] = df_test[col].replace(yes_no_map)

    df_test['Order_Placed_Date'] = pd.to_datetime(df_test['Order_Placed_Date'], errors='coerce')
    df_test['Delivery_Date'] = pd.to_datetime(df_test['Delivery_Date'], errors='coerce')

    # --- V25 minimal feature engineering ---
    # Only Delivery_Days; abs() repairs rows where delivery precedes the order.
    df_test['Delivery_Days'] = (df_test['Delivery_Date'] - df_test['Order_Placed_Date']).dt.days.abs()

    # --- Ensure consistency with training columns ---
    # Expected columns absent from the test data are added as all-NaN so the
    # selection below cannot fail on a missing label.
    missing_cols = set(expected_cols) - set(df_test.columns)
    if missing_cols:
        print(f"     ! Adding missing columns to test data: {missing_cols}")
        for c in missing_cols:
            df_test[c] = np.nan

    # Keep only the expected columns, in the expected order.
    df_test = df_test[expected_cols]
    print(f"Test data prepared. Shape: {df_test.shape}")
    return df_test

print("\n✅ V25 Test preparation function defined.")


✅ V25 Test preparation function defined.


In [44]:
# === CELL 6: GENERATE SUBMISSION (V25) ===
# Scores ../data/test.csv with the fitted pipeline and writes 'model.csv'
# containing one (Hospital_Id, Transport_Cost) row per test record.

# Run only when the fitted model, the test-preparation function, the training
# feature frame and the prediction cap are all present in the namespace.
if 'final_pipeline' in locals() and 'prepare_features_V25' in globals() and 'X' in locals() and 'prediction_cap_value' in locals():
    print("\n--- Cell 6: Generating Submission File (V25 Pipeline) ---")
    try:
        # Load the raw test data
        df_test_raw = pd.read_csv('../data/test.csv')
        # Strip header whitespace BEFORE looking up the ID column. The training
        # load and prepare_features_V25 both normalise headers this way, so
        # without it a padded 'Hospital_Id' header would raise KeyError here
        # even though the feature path would succeed.
        df_test_raw.columns = df_test_raw.columns.str.strip()
        submission_ids = df_test_raw['Hospital_Id']
        print(f"Loaded test.csv: {df_test_raw.shape}")

        # Prepare test features with the same columns, in the same order, as
        # the training frame X.
        X_test_final_raw = prepare_features_V25(df_test_raw, X.columns)

        if X_test_final_raw is not None:
            print("Getting predictions from the final V25 Ridge model...")
            # final_pipeline applies its own preprocessing before the Ridge step
            log_predictions = final_pipeline.predict(X_test_final_raw)

            # The model was fitted on log1p(cost); expm1 inverts that transform.
            print("Converting predictions from log scale...")
            final_predictions = np.expm1(log_predictions)

            # --- AGGRESSIVE Safety Clip ---
            # Bound predictions to [0, prediction_cap_value]; the cap is the
            # 99th percentile of the training cost computed in Cell 3.
            print(f"Applying AGGRESSIVE safety clip to predictions at {prediction_cap_value:,.2f}")
            final_predictions = np.clip(final_predictions, 0, prediction_cap_value)

            print("Creating submission DataFrame...")
            submission_df = pd.DataFrame({
                'Hospital_Id': submission_ids,
                'Transport_Cost': final_predictions
            })

            output_filename = 'model.csv'
            submission_df.to_csv(output_filename, index=False)
            print(f"\n✅ Submission file '{output_filename}' saved successfully.")
            print("Final Predictions Head:")
            print(submission_df.head())

        else:
            print("   ✗ Submission generation failed: Error preparing test features.")

    except FileNotFoundError:
        print("   ✗ Error: ../data/test.csv not found.")
    except Exception as e:
        # Best-effort cell: report the problem instead of aborting the notebook.
        print(f"   ✗ An unexpected error occurred during submission generation: {e}")

else:
    # Tell the reader which prerequisite cell still has to be run.
    print("\n--- Submission Generation Skipped ---")
    if 'final_pipeline' not in locals(): print("   Reason: Final V25 model not trained (Run Cell 4).")
    if 'prepare_features_V25' not in globals(): print("   Reason: 'prepare_features_V25' function not defined (Run Cell 5).")
    if 'X' not in locals(): print("   Reason: Training features 'X' not defined (Run Cell 3).")
    if 'prediction_cap_value' not in locals(): print("   Reason: 'prediction_cap_value' not defined (Run Cell 3).")


--- Cell 6: Generating Submission File (V25 Pipeline) ---
Loaded test.csv: (500, 19)
Preparing test features (V25 - Bare Bones)...
Test data prepared. Shape: (500, 15)
Getting predictions from the final V25 Ridge model...
Converting predictions from log scale...
Applying AGGRESSIVE safety clip to predictions at 269,745.26
Creating submission DataFrame...

✅ Submission file 'model.csv' saved successfully.
Final Predictions Head:
            Hospital_Id  Transport_Cost
0          fffe33003400      573.860345
1  fffe3700330036003600      507.934321
2  fffe3300390038003400     6566.382011
3      fffe310030003900      217.863109
4  fffe3700330031003200     1080.194807
