In [None]:
# === CELL 1: IMPORTS (V45 - Focus 2) ===
print("--- Cell 1: Imports ---")
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures, FunctionTransformer
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

# Session-wide setup.
# The blanket 'ignore' filter hides every warning category for the rest of the
# session (this includes any warnings emitted by the estimators fitted below).
warnings.filterwarnings(action='ignore')

# Show all columns when a frame is displayed, and render floats with 2 decimals.
pd.options.display.max_columns = None
pd.options.display.float_format = '{:.2f}'.format
print("✅ Libraries imported.")

In [None]:
# === CELL 2: DATA LOADING & INITIAL CLEANING ===
# Reads the training CSV into `df`. On any failure `df` is set to None, which
# is the signal the later cells use to skip themselves.
print("\n--- Cell 2: Data Loading & Initial Cleaning ---")
try:
    df = pd.read_csv('../data/train.csv')
    # Header names may carry stray whitespace; normalise them once, up front.
    df.columns = df.columns.str.strip()
    print(f"Loaded train.csv: {df.shape}")
except Exception as e:
    # A missing file gets its own message; everything else reports the error text.
    if isinstance(e, FileNotFoundError):
        print("Error: ../data/train.csv not found.")
    else:
        print(f"Error loading train.csv: {e}")
    df = None

if df is not None:
    # Only the two date columns need parsing here; unparseable entries become NaT.
    print("Converting date columns (temporarily)...")
    for date_col in ('Order_Placed_Date', 'Delivery_Date'):
        df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
    print("✅ Basic loading complete.")

In [None]:
# === CELL 3: V45 FEATURE ENGINEERING & SELECTION (Focus 2) ===
# Builds the modelling inputs from `df`:
#   1. repairs negative delivery durations / transport costs with abs() and
#      derives the 99.5th-percentile cap that Cell 8 uses to clip predictions,
#   2. creates the log1p target `y`,
#   3. creates the feature frame `X` (Equipment_Value, Equipment_Weight only),
#   4. splits X / y 80/20 into train and validation parts.
if df is not None:
    print("\n--- Cell 3: V45 Feature Engineering & Selection ---")

    print("[Step 1/4] Repairing negative values using abs()...")
    # Delivery duration is kept only as a temporary column; it is not a model feature.
    df['Delivery_Days_temp'] = (df['Delivery_Date'] - df['Order_Placed_Date']).dt.days.abs()
    # Apply abs() to Transport_Cost (idempotent, so re-running the cell is safe).
    df['Transport_Cost'] = df['Transport_Cost'].abs()

    # Calculate the 99.5th percentile cap.
    # quantile() returns NaN (it does not raise) when there are no non-missing
    # values, so a non-finite result is routed to the fallback explicitly;
    # otherwise a NaN cap would turn every clipped prediction into NaN later.
    try:
        prediction_cap_value_995 = df['Transport_Cost'].dropna().quantile(0.995)
        if not np.isfinite(prediction_cap_value_995):
            raise ValueError("no non-missing Transport_Cost values to compute a percentile from")
        print(f"   ✓ Calculated Prediction Cap (99.5th percentile): {prediction_cap_value_995:,.2f}")
    except Exception as e:
        print(f"   ! Warning: Could not calculate prediction cap: {e}. Using fallback.")
        prediction_cap_value_995 = 1500000  # hard-coded fallback cap
    print("   ✓ Negatives repaired.")

    print("\n[Step 2/4] Defining target variable (y)...")
    # Rows without a target cannot be used for supervised training, and a NaN
    # in y makes every model fit fail, so they are excluded from both X and y.
    # When no target is missing this mask selects every row (no change).
    has_target = df['Transport_Cost'].notna()
    n_missing_target = int((~has_target).sum())
    if n_missing_target:
        print(f"   ! Warning: Dropping {n_missing_target} row(s) with missing Transport_Cost.")
    y = np.log1p(df.loc[has_target, 'Transport_Cost'])  # Log-transform of abs(cost)
    print(f"   ✓ Target (y) created: {y.shape}")

    print("\n[Step 3/4] Selecting features (X) - Focus 2...")
    # Keep ONLY the two specified features (those that actually exist in df).
    features_to_keep = ['Equipment_Value', 'Equipment_Weight']
    features_present = [col for col in features_to_keep if col in df.columns]

    if len(features_present) != len(features_to_keep):
        print(f"   ! Warning: Not all requested features found! Keeping: {features_present}")

    # Same row mask as y so X and y stay index-aligned.
    X = df.loc[has_target, features_present].copy()
    print(f"   ✓ Feature set 'X' created (Focus 2): {X.shape}")
    print(f"   ✓ Features: {X.columns.tolist()}")

    print("\n[Step 4/4] Splitting data into Train/Validation (80/20)...")
    # Fixed random_state keeps the split reproducible across runs.
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    print(f"   ✓ Data split complete.")

    print("✅ V45 Feature Engineering/Selection complete.")

else:
    print("   ✗ Feature engineering skipped due to data loading error.")

In [None]:
# === CELL 4: V45 PREPROCESSING PIPELINE DEFINITION & FITTING ===
# Defines and fits the feature preprocessor on X_train only:
# median-impute -> floor at 0 -> log1p -> standardise.
if 'X_train' in locals() and not X_train.empty:
    print("\n--- Cell 4: V45 Preprocessing Pipeline (Focus 2 + Log + StandardScaler) ---")

    def clip_non_negative(x):
        """Element-wise floor at 0 so the following log1p never receives a negative value.

        Defined as a named function (not a lambda) so that a pipeline holding
        it can be serialised with the standard pickler.
        """
        return np.maximum(x, 0)

    def log_transform_safe(x):
        """log1p of the input after flooring it at 0.

        One-call equivalent of the 'clipper' + 'log' steps below; the pipeline
        itself keeps the two separate steps.
        """
        return np.log1p(clip_non_negative(x))

    # --- Define Transformer Pipeline for the 2 features ---
    # Apply Impute -> Clip(0) -> Log1p -> Scale
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('clipper', FunctionTransformer(clip_non_negative, validate=False)),  # Clip >= 0
        ('log', FunctionTransformer(np.log1p, validate=False)),  # Apply log1p
        ('scaler', StandardScaler())  # Then scale
    ])
    print("   ✓ Transformer defined (Impute -> Clip(0) -> Log1p -> StandardScaler).")

    # --- Assemble Preprocessor ---
    # Apply the transformer to every column of X_train; anything else is dropped.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, X_train.columns.tolist())
        ],
        remainder='drop'
    )
    print("   ✓ Preprocessor assembled.")

    # --- Fit Preprocessor ---
    print("Fitting preprocessor on X_train...")
    preprocessor.fit(X_train)  # Fit only on the training part
    print("✅ V45 Preprocessor fitted.")
else:
    print("   ✗ Preprocessor fitting skipped (X_train not defined or empty).")

In [None]:
# === CELL 5: HYPERPARAMETER TUNING (POLY + LASSO - V45 DATA) ===
# Grid-searches the polynomial degree and the Lasso alpha with 5-fold CV on the
# training split, then scores the best estimator on the validation split in
# log space and on the original cost scale.
if 'X_train' in locals() and 'preprocessor' in locals() and preprocessor is not None:
    print("\n--- Cell 5: Tuning Polynomial + Lasso (V45 Data) ---")

    # --- Poly + Lasso pipeline ---
    poly_lasso_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),  # Use V45 preprocessor
        ('poly', PolynomialFeatures(include_bias=False)),  # Degree tuned below
        ('lasso', Lasso(random_state=42, max_iter=20000))  # Increased max_iter
    ])

    # --- Hyperparameter grid (as requested) ---
    param_grid = {
        'poly__degree': [2, 3, 4],
        'lasso__alpha': [0.001, 0.01, 0.1, 1.0, 10.0]  # 0.001 up to 10
    }
    print(f"GridSearchCV Parameter Grid:\n{param_grid}")

    # --- GridSearchCV setup ---
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    grid_search = GridSearchCV(
        estimator=poly_lasso_pipeline,
        param_grid=param_grid,
        scoring='neg_root_mean_squared_error',
        cv=kf,
        n_jobs=-1,  # Use all cores
        verbose=2
    )

    print("Starting GridSearchCV for Polynomial + Lasso...")
    grid_search.fit(X_train, y_train)  # Fit on the 80% training split

    print("\n✅ Poly+Lasso Tuning complete!")
    best_params = grid_search.best_params_  # Save the best params
    best_cv_rmse = -grid_search.best_score_  # scorer is negated RMSE
    print(f"   Best hyperparameters: {best_params}")
    print(f"   Best CV RMSE (log-space): {best_cv_rmse:.4f}")

    # --- Evaluate on Validation Set ---
    best_model_tuned = grid_search.best_estimator_
    y_val_pred_log = best_model_tuned.predict(X_val)
    val_rmse_log = np.sqrt(mean_squared_error(y_val, y_val_pred_log))
    y_val_orig = np.expm1(y_val)
    y_val_pred_orig = np.maximum(np.expm1(y_val_pred_log), 0)  # Clip preds >= 0
    # expm1 of a large log-space prediction overflows to inf, which
    # mean_squared_error rejects; report inf instead of aborting the cell.
    if np.all(np.isfinite(y_val_pred_orig)):
        val_rmse_orig = np.sqrt(mean_squared_error(y_val_orig, y_val_pred_orig))
    else:
        val_rmse_orig = np.inf
    print(f"--- Validation Set Performance ---")
    print(f"   Validation RMSE (log-space): {val_rmse_log:.4f}")
    print(f"   Validation RMSE (original scale): {val_rmse_orig:,.2f}")

    # The submission cell clips predictions to [0, cap]; score the validation
    # predictions the same way so this number describes what is submitted.
    # NOTE: the cap is computed in Cell 3 on all rows, validation rows included.
    if 'prediction_cap_value_995' in locals() and np.isfinite(prediction_cap_value_995):
        y_val_pred_capped = np.clip(y_val_pred_orig, 0, prediction_cap_value_995)
        val_rmse_orig_capped = np.sqrt(mean_squared_error(y_val_orig, y_val_pred_capped))
        print(f"   Validation RMSE (original scale, preds capped at {prediction_cap_value_995:,.2f}): {val_rmse_orig_capped:,.2f}")

else:
    print("   ✗ Tuning skipped - check previous cells.")

In [None]:
# === CELL 6: TRAIN FINAL MODEL (V45 - Poly+Lasso) ===
# Rebuilds the preprocessing + Poly + Lasso pipeline with the tuned
# hyperparameters and fits it on the full X / y from Cell 3.

if 'best_params' in locals() and 'X' in locals() and 'y' in locals():
    print("\n--- Cell 6: Training Final Polynomial+Lasso Model on ALL V45 Data ---")

    print(f"   ✓ Using best params from V45 GridSearch: {best_params}")

    def floor_at_zero(x):
        """Element-wise max(x, 0), applied before log1p.

        A named function rather than a lambda so that the fitted
        `final_pipeline` can be serialised with the standard pickler.
        """
        return np.maximum(x, 0)

    # --- Rebuild the preprocessor for the FULL X data ---
    print("   Re-fitting V45 preprocessor on full dataset X...")
    # Same steps as the Cell 4 preprocessor: Impute -> Clip(0) -> Log1p -> Scale
    log_transformer_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('clipper', FunctionTransformer(floor_at_zero, validate=False)),
        ('log', FunctionTransformer(np.log1p, validate=False)),
        ('scaler', StandardScaler())
    ])
    final_preprocessor = ColumnTransformer(transformers=[
        ('num', log_transformer_pipeline, X.columns.tolist())  # Apply to all columns in X
        ], remainder='drop')

    # Final pipeline: preprocessor + polynomial expansion + Lasso with tuned params.
    final_pipeline = Pipeline(steps=[
        ('preprocessor', final_preprocessor),
        ('poly', PolynomialFeatures(degree=best_params['poly__degree'], include_bias=False)),
        ('lasso', Lasso(
            alpha=best_params['lasso__alpha'],  # Use alpha from grid_search
            random_state=42,
            max_iter=20000  # Keep increased iterations
        ))
    ])

    # === Fit final model on full V45 dataset ===
    print("Fitting final Poly+Lasso model on the full V45 dataset (X, y)...")
    print("X shape:", X.shape)
    print("y shape:", y.shape)

    # Pipeline.fit fits every step, so `final_preprocessor` is fitted on the
    # full X here; a separate final_preprocessor.fit(X) beforehand would only
    # repeat the same work.
    final_pipeline.fit(X, y)
    print("   ✓ Preprocessor re-fitted on full X dataset.")

    print("\n✅ Final (V45 FOCUS 2 POLY+LASSO) model trained.")
    print("The 'final_pipeline' object is ready for prediction.")

else:
    print("   ✗ Final model training skipped. Required variables not found.")
    if 'best_params' not in locals(): print("     - Best parameters not found (Run GridSearchCV - Cell 5).")
    if 'X' not in locals() or 'y' not in locals(): print("     - Full dataset 'X' or 'y' not defined (Cell 3).")

In [None]:
# === CELL 7: V45 TEST DATA PREPARATION FUNCTION ===

def prepare_features_V45(df_raw, train_cols_expected):
    """Reduce a raw frame to exactly the columns the model was trained on.

    Works on a copy of ``df_raw`` (the input is not modified): header
    whitespace is stripped, any expected column that is absent is added as an
    all-NaN column, and the result is re-ordered to ``train_cols_expected``.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Raw test data.
    train_cols_expected : sequence of column labels
        Columns, in order, that the returned frame must have.

    Returns
    -------
    pandas.DataFrame or None
        Frame with exactly ``train_cols_expected`` as columns, or None if the
        final column alignment raised an error.
    """
    print("Preparing test features (V45 - Focus 2)...")
    frame = df_raw.copy()
    frame.columns = frame.columns.str.strip()

    # Which of the expected columns does the test data actually provide?
    features_present_in_test = [name for name in train_cols_expected if name in frame.columns]
    missing_in_test = set(train_cols_expected) - set(features_present_in_test)

    if missing_in_test:
        print(f"   ! Warning: Expected features missing in test data: {missing_in_test}. Adding as NaN.")
        # Create each absent column as NaN so the frame has every expected name.
        for name in missing_in_test:
            frame[name] = np.nan

    # Align to the expected column set and order; extras are dropped.
    try:
        aligned = frame.reindex(columns=train_cols_expected)
    except Exception as e:  # Catch potential errors during reindex
        print(f"   ✗ Error ensuring column consistency: {e}")
        return None

    print(f"Test data prepared. Shape: {aligned.shape}")
    return aligned


print("\n✅ V45 Test preparation function defined.")

In [None]:
# === CELL 8: GENERATE SUBMISSION (V45 - Focus 2 Poly+Lasso) ===
# Loads test.csv, predicts with `final_pipeline`, converts the log-space
# predictions back to the cost scale, clips them and writes 'model.csv'.

# Check if the final model, prepare function, cap value, and X exist
if ('final_pipeline' in locals() and
    'prepare_features_V45' in globals() and
    'X' in locals() and # Need original X for column names
    'prediction_cap_value_995' in locals()):
    print("\n--- Cell 8: Generating Submission File (V45 Pipeline) ---")
    try:
        # Load the raw test data
        df_test_raw = pd.read_csv('../data/test.csv')
        # Strip header whitespace before any column lookup, the same way the
        # training headers are normalised; otherwise a padded 'Hospital_Id'
        # header would raise KeyError even though the feature columns resolve.
        df_test_raw.columns = df_test_raw.columns.str.strip()
        submission_ids = df_test_raw['Hospital_Id']
        print(f"Loaded test.csv: {df_test_raw.shape}")

        # Prepare test features using V45 function, passing training columns
        X_test_final_raw = prepare_features_V45(df_test_raw, X.columns)

        if X_test_final_raw is not None:
            print("Getting predictions from the final V45 Poly+Lasso model...")
            # The final_pipeline object applies the V45 preprocessing
            log_predictions = final_pipeline.predict(X_test_final_raw)

            print("Converting predictions from log scale...")
            final_predictions = np.expm1(log_predictions)

            # --- Apply Aggressive Safety Clip (99.5th Percentile) ---
            if np.isfinite(prediction_cap_value_995):
                print(f"Applying safety clip to predictions at {prediction_cap_value_995:,.2f} (99.5th percentile)")
                final_predictions = np.clip(final_predictions, 0, prediction_cap_value_995)
            else:
                # Clipping against a NaN bound would turn every prediction into NaN.
                print("   ! Warning: prediction cap is not finite; clipping predictions at 0 only.")
                final_predictions = np.maximum(final_predictions, 0)

            print("Creating submission DataFrame...")
            submission_df = pd.DataFrame({
                'Hospital_Id': submission_ids,
                'Transport_Cost': final_predictions
            })

            output_filename = 'model.csv'
            submission_df.to_csv(output_filename, index=False)
            print(f"\n✅ Submission file '{output_filename}' saved successfully.")
            print("Final Predictions Head:")
            print(submission_df.head())

        else:
            print("   ✗ Submission generation failed: Error preparing test features.")

    except FileNotFoundError:
        print("   ✗ Error: ../data/test.csv not found.")
    except Exception as e:
        print(f"   ✗ An unexpected error occurred during submission generation: {e}")

else:
    print("\n--- Submission Generation Skipped ---")
    # Report every prerequisite that is missing, with the cell that creates it.
    if 'final_pipeline' not in locals(): print("   Reason: Final V45 model not trained (Run Cell 6).")
    if 'prepare_features_V45' not in globals(): print("   Reason: 'prepare_features_V45' function not defined (Run Cell 7).")
    if 'X' not in locals(): print("   Reason: Training features 'X' not defined (Run Cell 3).")
    if 'prediction_cap_value_995' not in locals(): print("   Reason: 'prediction_cap_value_995' not defined (Run Cell 3).")