In [35]:
# === CELL 1: IMPORTS (V49 - Lasso + QuantileTransformer) ===
print("--- Cell 1: Imports ---")
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
# --- KEY CHANGE: Import QuantileTransformer ---
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, FunctionTransformer, QuantileTransformer
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter presumably also hides scikit-learn warnings
# (e.g. Lasso convergence warnings) — confirm that is intended.
warnings.filterwarnings('ignore')
# Display settings: never truncate the column list, and render floats with two decimals.
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.2f}'.format)
print("✅ Libraries imported (incl. QuantileTransformer).")

--- Cell 1: Imports ---
✅ Libraries imported (incl. QuantileTransformer).


In [36]:
# === CELL 2: DATA LOADING & INITIAL CLEANING ===
print("\n--- Cell 2: Data Loading & Initial Cleaning ---")

# Read the complete training CSV (no outlier removal at this stage) and trim
# whitespace from the header names. Any failure leaves `df` set to None so the
# guarded block below is skipped.
try:
    df = pd.read_csv('../data/train.csv')
    df.columns = df.columns.str.strip()
    print(f"Loaded train.csv: {df.shape}")
except FileNotFoundError:
    print("Error: ../data/train.csv not found.")
    df = None
except Exception as e:
    print(f"Error loading train.csv: {e}")
    df = None

if df is not None:
    # Minimal cleaning: parse both date columns; values that cannot be parsed
    # are coerced instead of raising.
    print("Converting date columns (temporarily)...")
    for _date_col in ('Order_Placed_Date', 'Delivery_Date'):
        df[_date_col] = pd.to_datetime(df[_date_col], errors='coerce')
    print("✅ Basic loading complete.")


--- Cell 2: Data Loading & Initial Cleaning ---
Loaded train.csv: (5000, 20)
Converting date columns (temporarily)...
✅ Basic loading complete.


In [37]:
# === CELL 3: V49 FEATURE ENGINEERING & SELECTION (Focus 2) ===
# Builds the log-transformed target, the two-column feature frame and the
# 80/20 train/validation split. Skipped entirely when loading failed.
if df is None:
    print("   ✗ Feature engineering skipped due to data loading error.")
else:
    print("\n--- Cell 3: V49 Feature Engineering & Selection ---")

    print("[Step 1/4] Repairing negative values using abs()...")
    # Whole days between order and delivery, with the sign dropped.
    df['Delivery_Days_temp'] = (df['Delivery_Date'] - df['Order_Placed_Date']).dt.days.abs()
    # Drop the sign of the raw cost before anything is derived from it.
    df['Transport_Cost'] = df['Transport_Cost'].abs()

    # Prediction cap: 99.5th percentile of the non-missing costs in the FULL frame.
    try:
        prediction_cap_value_995 = df['Transport_Cost'].dropna().quantile(0.995)
        print(f"   ✓ Calculated Prediction Cap (99.5th percentile): {prediction_cap_value_995:,.2f}")
    except Exception as e:
        print(f"   ! Warning: Could not calculate prediction cap: {e}. Using fallback.")
        prediction_cap_value_995 = 1500000
    print("   ✓ Negatives repaired.")

    print("\n[Step 2/4] Defining target variable (y)...")
    # The target is log1p of the sign-stripped cost.
    y = np.log1p(df['Transport_Cost'])
    print(f"   ✓ Target (y) created: {y.shape}")

    print("\n[Step 3/4] Selecting features (X) - Focus 2...")
    features_to_keep = ['Equipment_Value', 'Equipment_Weight']
    # Strip the sign from both features before any preprocessing sees them.
    df[features_to_keep] = df[features_to_keep].abs()

    # X holds ONLY the selected columns, as an independent copy.
    X = df[features_to_keep].copy()
    print(f"   ✓ Feature set 'X' created (Focus 2): {X.shape}")
    print(f"   ✓ Features: {X.columns.tolist()}")

    print("\n[Step 4/4] Splitting data into Train/Validation (80/20)...")
    X_train, X_val, y_train, y_val = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
    )
    print("   ✓ Data split complete.")

    print("✅ V49 Feature Engineering/Selection complete.")


--- Cell 3: V49 Feature Engineering & Selection ---
[Step 1/4] Repairing negative values using abs()...
   ✓ Calculated Prediction Cap (99.5th percentile): 627,931.87
   ✓ Negatives repaired.

[Step 2/4] Defining target variable (y)...
   ✓ Target (y) created: (5000,)

[Step 3/4] Selecting features (X) - Focus 2...
   ✓ Feature set 'X' created (Focus 2): (5000, 2)
   ✓ Features: ['Equipment_Value', 'Equipment_Weight']

[Step 4/4] Splitting data into Train/Validation (80/20)...
   ✓ Data split complete.
✅ V49 Feature Engineering/Selection complete.


In [38]:
# === CELL 4: V49 PREPROCESSING PIPELINE (QuantileTransformer) ===
# Defines and fits the feature preprocessor: median imputation followed by a
# quantile transform to a normal output distribution.
if 'X_train' not in locals() or X_train.empty:
    print("   ✗ Preprocessor fitting skipped (X_train not defined or empty).")
else:
    print("\n--- Cell 4: V49 Preprocessing Pipeline (Impute -> QuantileTransform) ---")

    # Numeric branch: fill gaps with the median, then map each feature onto a
    # normal curve using 1000 quantiles.
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('quantile', QuantileTransformer(n_quantiles=1000,
                                         output_distribution='normal',
                                         random_state=42)),
    ])
    print("   ✓ Transformer defined (Impute -> QuantileTransform).")

    # Every column of X_train goes through the numeric branch; anything else
    # would be dropped.
    preprocessor = ColumnTransformer(
        transformers=[('num', numeric_transformer, list(X_train.columns))],
        remainder='drop',
    )
    print("   ✓ Preprocessor assembled.")

    print("Fitting preprocessor on X_train...")
    preprocessor.fit(X_train)
    print("✅ V49 Preprocessor fitted.")


--- Cell 4: V49 Preprocessing Pipeline (Impute -> QuantileTransform) ---
   ✓ Transformer defined (Impute -> QuantileTransform).
   ✓ Preprocessor assembled.
Fitting preprocessor on X_train...
✅ V49 Preprocessor fitted.


In [39]:
# === CELL 5: HYPERPARAMETER TUNING (POLY + LASSO - V49 DATA) ===
# Grid-searches the polynomial degree and the Lasso alpha with 5-fold CV on
# the training split, then scores the best estimator on the validation split,
# both in log space and on the original cost scale.
if 'X_train' in locals() and 'preprocessor' in locals() and preprocessor is not None:
    print("\n--- Cell 5: Tuning Polynomial + Lasso (V49 Data) ---")

    # --- Poly + Lasso pipeline ---
    poly_lasso_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor), # Use V49 (Quantile) preprocessor
        ('poly', PolynomialFeatures(include_bias=False)),
        ('lasso', Lasso(random_state=42, max_iter=20000))
    ])

    # --- Hyperparameter grid (Expanded) ---
    param_grid = {
        'poly__degree': [3, 4, 5],
        'lasso__alpha': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1]
    }
    # Single backslash so the grid is printed on its own line; the previous
    # doubled backslash emitted a literal "\n" into the log instead.
    print(f"GridSearchCV Parameter Grid:\n{param_grid}")

    # --- GridSearchCV setup ---
    # Shuffled, seeded 5-fold split; the scorer is negated RMSE, so the sign is
    # flipped back when reporting below.
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    grid_search = GridSearchCV(
        estimator=poly_lasso_pipeline,
        param_grid=param_grid,
        scoring='neg_root_mean_squared_error',
        cv=kf,
        n_jobs=-1,
        verbose=2
    )

    print("Starting GridSearchCV for Polynomial + Lasso...")
    grid_search.fit(X_train, y_train) # Fit on the 80% training split

    print("\n✅ Poly+Lasso Tuning complete!")
    best_params = grid_search.best_params_
    best_cv_rmse = -grid_search.best_score_
    print(f"   Best hyperparameters: {best_params}")
    print(f"   Best CV RMSE (log-space): {best_cv_rmse:.4f}")

    # --- Evaluate on Validation Set ---
    best_model_tuned = grid_search.best_estimator_
    y_val_pred_log = best_model_tuned.predict(X_val)
    val_rmse_log = np.sqrt(mean_squared_error(y_val, y_val_pred_log))
    # Map targets and predictions back with expm1; back-transformed predictions
    # are floored at 0 before the original-scale RMSE is computed.
    y_val_orig = np.expm1(y_val)
    y_val_pred_orig = np.maximum(np.expm1(y_val_pred_log), 0)
    val_rmse_orig = np.sqrt(mean_squared_error(y_val_orig, y_val_pred_orig))
    print("--- Validation Set Performance ---")
    print(f"   Validation RMSE (log-space): {val_rmse_log:.4f}")
    print(f"   Validation RMSE (original scale): {val_rmse_orig:,.2f}")

else:
    print("   ✗ Tuning skipped - check previous cells.")


--- Cell 5: Tuning Polynomial + Lasso (V49 Data) ---
GridSearchCV Parameter Grid:\n{'poly__degree': [3, 4, 5], 'lasso__alpha': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1]}
Starting GridSearchCV for Polynomial + Lasso...
Fitting 5 folds for each of 21 candidates, totalling 105 fits

✅ Poly+Lasso Tuning complete!
   Best hyperparameters: {'lasso__alpha': 0.01, 'poly__degree': 4}
   Best CV RMSE (log-space): 0.6943
--- Validation Set Performance ---
   Validation RMSE (log-space): 0.6690
   Validation RMSE (original scale): 37,641.57


In [40]:
# === CELL 6: TRAIN FINAL MODEL (V49 - Poly+Lasso) ===
# Rebuilds the quantile preprocessor and the Poly+Lasso pipeline with the
# tuned hyperparameters, then fits them on the full X / y.

if not ('best_params' in locals() and 'X' in locals() and 'y' in locals()):
    print("   ✗ Final model training skipped. Required variables not found.")
    if 'best_params' not in locals():
        print("     - Best parameters not found (Run GridSearchCV - Cell 5).")
    if 'X' not in locals() or 'y' not in locals():
        print("     - Full dataset 'X' or 'y' not defined (Cell 3).")
else:
    print("\n--- Cell 6: Training Final Polynomial+Lasso Model on ALL V49 Data --")

    print(f"   ✓ Using best params from V49 GridSearch: {best_params}")

    print("   Re-fitting V49 (Quantile) preprocessor on full dataset X...")
    # Fresh copy of the Cell 4 definition: median imputation, then a
    # 1000-quantile transform to a normal output distribution.
    quantile_transformer_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('quantile', QuantileTransformer(n_quantiles=1000,
                                         output_distribution='normal',
                                         random_state=42)),
    ])
    final_preprocessor = ColumnTransformer(
        transformers=[('num', quantile_transformer_pipeline, list(X.columns))],
        remainder='drop',
    )

    # NOTE(review): final_pipeline.fit(X, y) below appears to fit this
    # preprocessor again as its first step, so this standalone fit looks
    # redundant — confirm before removing.
    final_preprocessor.fit(X)
    print("   ✓ Preprocessor re-fitted on full X dataset.")

    # Final estimator chain using the tuned degree and alpha.
    final_pipeline = Pipeline(steps=[
        ('preprocessor', final_preprocessor),
        ('poly', PolynomialFeatures(include_bias=False,
                                    degree=best_params['poly__degree'])),
        ('lasso', Lasso(max_iter=20000,
                        random_state=42,
                        alpha=best_params['lasso__alpha'])),
    ])

    print("Fitting final Poly+Lasso model on the full V49 dataset (X, y)...")
    print("X shape:", X.shape)
    print("y shape:", y.shape)

    # Train on every available row.
    final_pipeline.fit(X, y)

    print("\n✅ Final (V49 QUANTILE LASSO) model trained.")
    print("The 'final_pipeline' object is ready for prediction.")


--- Cell 6: Training Final Polynomial+Lasso Model on ALL V49 Data --
   ✓ Using best params from V49 GridSearch: {'lasso__alpha': 0.01, 'poly__degree': 4}
   Re-fitting V49 (Quantile) preprocessor on full dataset X...
   ✓ Preprocessor re-fitted on full X dataset.
Fitting final Poly+Lasso model on the full V49 dataset (X, y)...
X shape: (5000, 2)
y shape: (5000,)

✅ Final (V49 QUANTILE LASSO) model trained.
The 'final_pipeline' object is ready for prediction.


In [41]:
# === CELL 7: V49 TEST DATA PREPARATION FUNCTION ===

def prepare_features_V49(df_raw, train_cols_expected):
    """Build the model input frame for unseen data.

    Works on a copy of ``df_raw``: trims whitespace from the column names,
    adds any expected column that is absent as NaN, takes the absolute value
    of 'Equipment_Value' and 'Equipment_Weight' when present, and finally
    reindexes to exactly ``train_cols_expected`` (same columns, same order).

    Args:
        df_raw: Raw input DataFrame; it is not modified.
        train_cols_expected: Column labels the trained model expects.

    Returns:
        The prepared DataFrame, or None if the final reindex step fails.
    """
    print("Preparing test features (V49 - Focus 2)...")
    df_test = df_raw.copy()
    df_test.columns = df_test.columns.str.strip()

    # Expected columns that the test data does not provide.
    missing_in_test = {
        col for col in train_cols_expected if col not in df_test.columns
    }
    if missing_in_test:
        print(f"   ! Warning: Expected features missing in test data: {missing_in_test}. Adding as NaN.")
        for col in missing_in_test:
            df_test[col] = np.nan

    # Mirror the sign repair applied to the training features.
    for feature in ('Equipment_Value', 'Equipment_Weight'):
        if feature in df_test.columns:
            df_test[feature] = df_test[feature].abs()

    try:
        # Keep only the expected columns, in the training order.
        df_test_final = df_test.reindex(columns=train_cols_expected)
        print(f"Test data prepared. Shape: {df_test_final.shape}")
        return df_test_final
    except Exception as e:
        print(f"   ✗ Error ensuring column consistency: {e}")
        return None


print("\n✅ V49 Test preparation function defined.")


✅ V49 Test preparation function defined.


In [42]:
# === CELL 8: GENERATE SUBMISSION (V49 - Quantile Lasso Model) ===
# Loads test.csv, prepares its features, predicts with the final pipeline,
# maps predictions back from log space, clips them to [0, cap] and writes
# 'model.csv'. Runs only when every prerequisite from earlier cells exists.

if ('final_pipeline' in locals() and
    'prepare_features_V49' in globals() and
    'X' in locals() and
    'prediction_cap_value_995' in locals()):
    print("\n--- Cell 8: Generating Submission File (V49 Pipeline) ---")
    try:
        # Load the raw test data
        df_test_raw = pd.read_csv('../data/test.csv')
        # Trim header whitespace before any column lookup, the same way the
        # training headers are trimmed in Cell 2; otherwise a padded
        # 'Hospital_Id' header would raise KeyError on the next line.
        df_test_raw.columns = df_test_raw.columns.str.strip()
        submission_ids = df_test_raw['Hospital_Id']
        print(f"Loaded test.csv: {df_test_raw.shape}")

        # Prepare test features using V49 function
        X_test_final_raw = prepare_features_V49(df_test_raw, X.columns)

        if X_test_final_raw is not None:
            print("Getting predictions from the final V49 model...")
            log_predictions = final_pipeline.predict(X_test_final_raw)

            # The model predicts in log space; expm1 maps back to the cost scale.
            print("Converting predictions from log scale...")
            final_predictions = np.expm1(log_predictions)

            # --- Apply Aggressive Safety Clip (from full data) ---
            # Bounds every prediction to the range [0, prediction_cap_value_995].
            print(f"Applying safety clip to predictions at {prediction_cap_value_995:,.2f} (99.5th percentile)")
            final_predictions = np.clip(final_predictions, 0, prediction_cap_value_995)

            print("Creating submission DataFrame...")
            submission_df = pd.DataFrame({
                'Hospital_Id': submission_ids,
                'Transport_Cost': final_predictions
            })

            output_filename = 'model.csv'
            submission_df.to_csv(output_filename, index=False)
            print(f"\n✅ Submission file '{output_filename}' saved successfully.")
            print("Final Predictions Head:")
            print(submission_df.head())

        else:
            print("   ✗ Submission generation failed: Error preparing test features.")

    except FileNotFoundError:
        print("   ✗ Error: ../data/test.csv not found.")
    except Exception as e:
        print(f"   ✗ An unexpected error occurred during submission generation: {e}")

else:
    # Report each missing prerequisite separately so the user knows which cell to re-run.
    print("\n--- Submission Generation Skipped ---")
    if 'final_pipeline' not in locals(): print("   Reason: Final V49 model not trained (Run Cell 6).")
    if 'prepare_features_V49' not in globals(): print("   Reason: 'prepare_features_V49' function not defined (Run Cell 7).")
    if 'X' not in locals(): print("   Reason: Training features 'X' not defined (Run Cell 3).")
    if 'prediction_cap_value_995' not in locals(): print("   Reason: 'prediction_cap_value_995' not defined (Run Cell 3).")


--- Cell 8: Generating Submission File (V49 Pipeline) ---
Loaded test.csv: (500, 19)
Preparing test features (V49 - Focus 2)...
Test data prepared. Shape: (500, 2)
Getting predictions from the final V49 model...
Converting predictions from log scale...
Applying safety clip to predictions at 627,931.87 (99.5th percentile)
Creating submission DataFrame...

✅ Submission file 'model.csv' saved successfully.
Final Predictions Head:
            Hospital_Id  Transport_Cost
0          fffe33003400          225.80
1  fffe3700330036003600          289.40
2  fffe3300390038003400         2858.75
3      fffe310030003900          218.29
4  fffe3700330031003200          740.17
