In [None]:
#import all libraries
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PowerTransformer, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor # A powerful gradient boosting model
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
np.set_printoptions(threshold=np.inf)

In [None]:
# Global plotting defaults: seaborn "whitegrid" theme and a wide default canvas.
sns.set(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})


# 1️⃣ Load Data
# Read the training CSV, then trim stray whitespace from every header so that
# later lookups by exact column name succeed.
print("Loading data...")
df = pd.read_csv('../data/train.csv').rename(columns=str.strip)
display(df.head())
print(f"Initial data shape: {df.shape}")

# 2️⃣ Clean all string/object columns
# Every object column is cast to str and trimmed. The cast turns real missing
# values into the text 'nan', so that token (plus '' and 'NaN') is mapped back
# to a true NaN afterwards.
print("Cleaning string columns...")
blank_tokens = dict.fromkeys(['', 'nan', 'NaN'], np.nan)
for col in df.select_dtypes(include='object').columns:
    trimmed = df[col].astype(str).str.strip()
    df[col] = trimmed.replace(blank_tokens)

# 3️⃣ Normalize Yes/No columns to consistent "Yes"/"No"
# Only the exact spellings listed below are rewritten; any other value is left
# as it is.
print("Normalizing Yes/No columns...")
yes_no_cols = ['CrossBorder_Shipping', 'Urgent_Shipping', 'Installation_Service',
               'Fragile_Equipment', 'Rural_Hospital']
yes_no_map = {
    **dict.fromkeys(['YES', 'yes', 'Y', 'y'], 'Yes'),
    **dict.fromkeys(['NO', 'no', 'N', 'n'], 'No'),
}
for col in (name for name in yes_no_cols if name in df.columns):
    df[col] = df[col].replace(yes_no_map)

# 4️⃣ Convert date columns to datetime
# errors='coerce' turns unparseable dates into NaT instead of raising.
print("Converting date columns...")
for date_col in ('Order_Placed_Date', 'Delivery_Date'):
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

# 5️⃣ Create new feature: Delivery_Days (whole days between order and delivery)
print("Engineering Delivery_Days feature...")
elapsed = df['Delivery_Date'] - df['Order_Placed_Date']
df['Delivery_Days'] = pd.to_numeric(elapsed.dt.days, errors='coerce')

# Calendar features derived from the order date.
print("Engineering more date features...")
order_dates = df['Order_Placed_Date']
df['Order_Month'] = order_dates.dt.month
df['Order_Day_of_Week'] = order_dates.dt.dayofweek  # Monday=0, Sunday=6
# NOTE(review): isin() yields False for a missing day-of-week, so rows whose
# order date could not be parsed are recorded as "not weekend", not as missing.
df['Order_Is_Weekend'] = df['Order_Day_of_Week'].isin([5, 6])

# 6️⃣ (Original) delete initial date rows -- intentionally left disabled
# df = df.dropna(subset=['Order_Placed_Date', 'Delivery_Date'])

# 7️⃣ Drop exact duplicate rows and report how many were removed
print("Dropping duplicates...")
before = len(df)
df = df.drop_duplicates()
after = len(df)
print(f"Dropped {before - after} duplicate rows.")

# 8️⃣ Quick check after cleaning
banner = "="*30
print("\n" + banner)
print(" CLEANING & FEATURE ENGINEERING COMPLETE ")
print(banner)
print(f"After basic cleaning shape: {df.shape}")

# Missing values, first as raw counts ...
missing_counts = df.isna().sum()
print("\nMissing values (raw count):")
print(missing_counts)

# ... then as a percentage of rows, largest first, showing only affected columns.
print("\nMissing values (percentage):")
missing_pct = (missing_counts / len(df) * 100).sort_values(ascending=False)
print(missing_pct[missing_pct > 0])

print("\nDataFrame head:")
display(df.head())
# print(df['Delivery_Days'])

In [None]:

# ==============================================================================
# 📊 START OF EXPLORATORY DATA ANALYSIS (EDA)
# ==============================================================================

print("\n" + "="*30)
print(" STARTING EDA ")
print("="*30)

# 🔹 Define column lists
# Numeric predictors: float64/int64 columns minus the target and the
# date-derived numerics, which are analysed separately further down.
date_num_features = ['Order_Month', 'Order_Day_of_Week', 'Delivery_Days']
excluded_numeric = {'Transport_Cost', *date_num_features}
num_cols = [
    name
    for name in df.select_dtypes(include=['float64', 'int64']).columns.tolist()
    if name not in excluded_numeric
]

# Categorical predictors: every object column, plus the boolean weekend flag
# when it exists.
cat_cols = df.select_dtypes(include='object').columns.tolist()
if 'Order_Is_Weekend' in df.columns:
    cat_cols.append('Order_Is_Weekend')

print(f"Numeric features identified: {num_cols}")
print(f"Categorical features identified: {cat_cols}")
print(f"Date-derived features identified: {date_num_features}")


# 1. Target Variable Analysis (Transport_Cost)
# Side-by-side histograms of the raw target and its log1p transform, followed
# by the skewness of each.
print("\n===== 1. TARGET VARIABLE ANALYSIS: Transport_Cost =====")
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: original distribution
sns.histplot(df['Transport_Cost'], kde=True, bins=40, ax=ax_raw)
ax_raw.set_title('Distribution of Transport_Cost (Original)')
ax_raw.set_xlabel('Transport_Cost')

# Right panel: log1p distribution (the +1 keeps zero costs finite)
log_target = np.log1p(df['Transport_Cost'])
sns.histplot(log_target, kde=True, bins=40, color='green', ax=ax_log)
ax_log.set_title('Distribution of log(Transport_Cost + 1)')
ax_log.set_xlabel('log(Transport_Cost + 1)')

fig.suptitle('Target Variable Distribution Analysis', fontsize=16)
fig.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

print(f"Skewness of Transport_Cost: {df['Transport_Cost'].skew():.4f}")
print(f"Skewness of log(Transport_Cost + 1): {log_target.skew():.4f}")


print("\n===== 2. NUMERIC FEATURE ANALYSIS =====")
print("===== BASIC NUMERIC STATISTICS =====")
if num_cols:
    display(df[num_cols].describe().T)

    print("\n===== SKEWNESS =====")
    display(df[num_cols].skew())
else:
    print("No numeric columns found to describe (excluding target/dates).")

# 🔹 Numeric distributions + boxplots
# Delivery_Days and the target were excluded from num_cols for the summary
# tables; both are added back here so they are plotted too.
print("\nGenerating numeric distribution plots...")
analysis_num_cols = num_cols + ['Delivery_Days']
if 'Transport_Cost' not in analysis_num_cols:
    analysis_num_cols.append('Transport_Cost')

for col in analysis_num_cols:
    if col not in df.columns:
        print(f"Warning: Column '{col}' not found for plotting.")
        continue

    # One figure per column: histogram with KDE on the left, boxplot on the right.
    fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(12, 4))

    sns.histplot(df[col], kde=True, bins=30, ax=ax_hist)
    ax_hist.set_title(f'{col} distribution')

    sns.boxplot(x=df[col], ax=ax_box)
    ax_box.set_title(f'{col} boxplot')

    fig.tight_layout()
    plt.show()


print("\n===== 3. CORRELATION ANALYSIS =====")
# 🔹 Correlation heatmap over the numeric predictors, the target and Delivery_Days
heatmap_cols = num_cols + ['Transport_Cost', 'Delivery_Days']
corr = df[heatmap_cols].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()


print("\n===== 4. CATEGORICAL FEATURE ANALYSIS =====")
# 🔹 Categorical distributions
# For every categorical column: print the level frequencies (NaN included) and,
# unless the column has more than 20 distinct levels, draw a count plot.
print("\nGenerating categorical distribution plots...")
high_cardinality_cols = []
for col in cat_cols:
    print(f"\n===== Column: {col} =====")
    print(df[col].value_counts(dropna=False))

    nunique = df[col].nunique()
    if nunique > 20:
        high_cardinality_cols.append(col)
        print(f"SKIPPING countplot for {col} (High Cardinality: {nunique} unique values)")
        continue

    plt.figure(figsize=(8,4))
    sns.countplot(y=col, data=df, order=df[col].value_counts().index)
    plt.title(f'Count of {col}')
    plt.tight_layout()
    plt.show()

# 4a. High-Cardinality Column Summary
print("\n===== 4a. HIGH-CARDINALITY CATEGORICAL SUMMARY =====")
if high_cardinality_cols:
    print(f"High-cardinality features detected: {high_cardinality_cols}")
    for col in high_cardinality_cols:
        print(f"\n--- Top 10 values for: {col} ---")
        top_values = df[col].value_counts(dropna=False).head(10)
        print(top_values)
        # nunique() ignores NaN, but the top-10 table (dropna=False) may contain
        # a NaN row. Subtract only the real levels that were actually shown so
        # the remainder is not undercounted when NaN is among the top 10.
        shown_levels = int(top_values.index.notna().sum())
        print(f"...and {df[col].nunique() - shown_levels} other unique values.")
else:
    print("No high-cardinality categorical features detected (threshold > 20).")


print("\n===== 5. BIVARIATE ANALYSIS (FEATURES vs. TARGET) =====")
# 🔹 Numeric features vs target: one scatter plot per numeric predictor
print("\nGenerating numeric features vs. Transport_Cost...")
for col in num_cols + ['Delivery_Days']:
    if col in df.columns:
        plt.figure(figsize=(6,4))
        sns.scatterplot(x=df[col], y=df['Transport_Cost'])
        plt.title(f'{col} vs Transport_Cost')
        plt.tight_layout()
        plt.show()

# 🔹 Categorical features vs target (low-cardinality)
# Section 4 treats a column as high-cardinality only when it has MORE than 20
# levels, so "low-cardinality" here must include exactly 20 levels (<= 20).
# The previous `< 20` silently skipped such a column.
print("\nGenerating categorical features vs. Transport_Cost...")
for col in cat_cols:
    if col in df.columns and df[col].nunique() <= 20:
        plt.figure(figsize=(10,4))
        sns.boxplot(x=col, y='Transport_Cost', data=df)
        plt.title(f'{col} vs Transport_Cost')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

# 5a. Date-Derived Features vs. Target
print("\nGenerating date-derived features vs. Transport_Cost...")
date_features_to_plot = ['Order_Month', 'Order_Day_of_Week', 'Order_Is_Weekend']
day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
for col in date_features_to_plot:
    if col in df.columns:
        plt.figure(figsize=(10, 4))
        sns.boxplot(x=col, y='Transport_Cost', data=df)
        plt.title(f'{col} vs Transport_Cost')
        if col == 'Order_Day_of_Week':
            # The boxplot draws one box per weekday that actually occurs, in
            # ascending order. Label exactly those boxes; a fixed seven-label
            # axis would shift the names whenever a weekday is absent.
            present_days = sorted(df[col].dropna().unique())
            plt.xticks(ticks=range(len(present_days)),
                       labels=[day_names[int(day)] for day in present_days])
        plt.tight_layout()
        plt.show()


print("\n===== 6. OUTLIER DETECTION =====")
# 🔹 Outlier detection (Z-score)
# Count, per column, the values whose |z| exceeds 3. nan_policy='omit' makes
# zscore ignore missing entries when computing each column's mean and std.
outlier_cols = num_cols + ['Transport_Cost', 'Delivery_Days']
try:
    z_scores = df[outlier_cols].apply(lambda column: zscore(column, nan_policy='omit'))
    outliers = z_scores.abs().gt(3).sum()
    print("\n===== NUMBER OF OUTLIERS PER COLUMN (Z-score > 3) =====")
    print(outliers[outliers > 0].sort_values(ascending=False))
except ValueError as e:
    print(f"Could not calculate Z-scores, likely due to all-NaN column. Error: {e}")


print("\n===== 7. MISSING VALUE VISUALIZATION =====")
# 🔹 Missing value visualization
# Each entry: (progress message, missingno plotting function, figure title).
missing_views = [
    ("\nGenerating missing value matrix...", msno.matrix, 'Missing Value Matrix'),
    ("\nGenerating missing value bar chart...", msno.bar, 'Missing Value Bar Chart'),
]
for message, draw_missing, title in missing_views:
    print(message)
    draw_missing(df)
    plt.title(title)
    plt.show()


print("\n" + "="*30)
print(" EDA COMPLETE ")
print("="*30)

In [49]:
print("Preprocessing script started...")

# ==============================================================================
# PART 1: PRE-SPLIT (Data Cleaning & Feature Engineering)
# These actions are applied to the whole dataset before splitting.
# ==============================================================================

# 1. Filter Bad Data
# EDA Finding: We found impossible values like Transport_Cost < 0 and Delivery_Days < 0.
# Action: Remove these rows entirely.
# NaN compares False against `>= 0`, so rows with a missing cost or missing
# Delivery_Days are removed by the same mask; they are reported separately so
# the log does not present them as negative values.
initial_rows = len(df)
cost_ok = df['Transport_Cost'] >= 0
days_ok = df['Delivery_Days'] >= 0
missing_rows = int((df['Transport_Cost'].isna() | df['Delivery_Days'].isna()).sum())
df = df.loc[cost_ok & days_ok]
print(f"Filtered {initial_rows - len(df)} rows with bad data (negative cost or delivery days).")
if missing_rows:
    print(f"  ({missing_rows} of the removed rows had a missing cost or delivery-days value.)")

# All feature engineering below happens on a private copy. Writing the log
# transforms back into `df` made this cell non-idempotent (every re-run logged
# Equipment_Value again) and changed the frame that later cells read.
df_model = df.copy()

# 2. Feature Engineering
# EDA Finding: Equipment_Height & Equipment_Width were highly correlated (0.77).
# Action: Combine them into a single 'Equipment_Volume' feature.
# (The product of height and width is strictly an area; the column name is
# kept because later code refers to it.)
df_model['Equipment_Volume'] = df_model['Equipment_Height'] * df_model['Equipment_Width']

# 3. Log-Transform Skewed Features
# EDA Finding: Equipment_Value (skew=24) and our new Equipment_Volume
# (derived from skewed features) are extremely right-skewed.
# Action: Apply np.log1p to normalize them.
df_model['Equipment_Value'] = np.log1p(df_model['Equipment_Value'])
df_model['Equipment_Volume'] = np.log1p(df_model['Equipment_Volume'])

# 4. Define Target (y) and Features (X)
# EDA Finding: Target 'Transport_Cost' is extremely skewed (skew=30).
# Action: Use np.log1p on the target. We will predict the log, then convert back.
y = np.log1p(df_model['Transport_Cost'])

# Action: Define X by dropping the target, original engineered columns,
# and high-cardinality/redundant/ID columns identified in the EDA.
X = df_model.drop(columns=[
    # Target
    'Transport_Cost',

    # Replaced by Equipment_Volume
    'Equipment_Height',
    'Equipment_Width',

    # Redundant (corr 0.90 with Value)
    'Equipment_Weight',

    # High-Cardinality IDs / Unused
    'Hospital_Id',
    'Supplier_Name',
    'Hospital_Location',

    # Replaced by date features
    'Order_Placed_Date',
    'Delivery_Date'
])

print(f"Features for modeling: {X.columns.tolist()}")

# 5. Train-Test Split
# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
train_mean = y_train.mean()


def _rmse(actual, predicted):
    """Return the root-mean-squared error between two aligned sequences."""
    return np.sqrt(mean_squared_error(actual, predicted))


# Naive baseline: always predict the training-set mean of the log target.

# 1️⃣ Baseline RMSE on the training set (log-space)
y_train_pred_baseline = np.full_like(y_train, train_mean)
baseline_rmse_train = _rmse(y_train, y_train_pred_baseline)
print(f"Baseline RMSE (train, log-space): {baseline_rmse_train:.4f}")

# 2️⃣ Baseline RMSE on the test set (still predicting the TRAIN mean), log-space
y_test_pred_baseline = np.full_like(y_test, train_mean)
baseline_rmse_test_log = _rmse(y_test, y_test_pred_baseline)
print(f"Baseline RMSE (test, log-space): {baseline_rmse_test_log:.4f}")

# 3️⃣ Same test baseline with both sides mapped back through expm1 to the
# original Transport_Cost scale
y_test_actual_orig = np.expm1(y_test)
y_test_baseline_pred_orig = np.expm1(y_test_pred_baseline)
baseline_rmse_test_orig = _rmse(y_test_actual_orig, y_test_baseline_pred_orig)
print(f"Baseline RMSE (test, original-scale): {baseline_rmse_test_orig:.4f}")

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")


# ==============================================================================
# PART 2: POST-SPLIT (Pipelines & ColumnTransformer)
# Everything here is FIT on X_train only and then applied to both splits, so no
# statistic learned from the test rows leaks into preprocessing.
# ==============================================================================

# 1. Feature lists: which final columns go through which pipeline.
numeric_features = [
    'Supplier_Reliability',
    'Equipment_Value',      # Already log-transformed
    'Base_Transport_Fee',
    'Delivery_Days',
    'Equipment_Volume',     # Already log-transformed
]

# The date-derived columns are listed as categorical, so they are one-hot
# encoded rather than scaled.
categorical_features = [
    'Equipment_Type',
    'CrossBorder_Shipping',
    'Urgent_Shipping',
    'Installation_Service',
    'Transport_Method',
    'Fragile_Equipment',
    'Hospital_Info',
    'Rural_Hospital',
    'Order_Month',
    'Order_Day_of_Week',
    'Order_Is_Weekend',
]

# 2. Numeric pipeline: fill gaps with the column median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(numeric_steps)

# 3. Categorical pipeline: fill gaps with the most frequent level, then one-hot
# encode. handle_unknown='ignore' encodes a level unseen during fit as all
# zeros instead of raising.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
categorical_transformer = Pipeline(categorical_steps)

# 4. Full preprocessor: route each column list to its pipeline and drop every
# column that is not listed explicitly.
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(column_routes, remainder='drop')

# 5. Apply: fit on the training split only, then transform both splits.
print("\nFitting preprocessor on X_train...")
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

print("Preprocessing complete.")
print(f"Processed X_train shape: {X_train_processed.shape}")
print(f"Processed X_test shape: {X_test_processed.shape}")


# ==============================================================================
# K-Fold Validation Setup (Bonus)
# ==============================================================================
# You mentioned K-Fold. The *best* way to use this `preprocessor` is to
# put it in a pipeline with your model. This way, the K-Fold cross-validation
# will correctly re-fit the preprocessor on each fold, preventing all leakage.

# Example (don't run, just for info):
# from sklearn.linear_model import LinearRegression
# from sklearn.model_selection import cross_val_score

# 1. Choose a model
# model = LinearRegression()

# 2. Create the full ML pipeline
# full_pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('regressor', model)
# ])

# 3. Run cross-validation on the *original* X_train and y_train
# This is the correct, leak-proof way to do it.
# scores = cross_val_score(full_pipeline, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')
# print(f"K-Fold Scores: {scores}")
# print(f"Mean RMSE: {np.mean(scores)}")

# 4. Fit the final model
# full_pipeline.fit(X_train, y_train)
# print("Final model pipeline is fitted and ready.")

Preprocessing script started...
Filtered 0 rows with bad data (negative cost or delivery days).
Features for modeling: ['Supplier_Reliability', 'Equipment_Type', 'Equipment_Value', 'Base_Transport_Fee', 'CrossBorder_Shipping', 'Urgent_Shipping', 'Installation_Service', 'Transport_Method', 'Fragile_Equipment', 'Hospital_Info', 'Rural_Hospital', 'Delivery_Days', 'Order_Month', 'Order_Day_of_Week', 'Order_Is_Weekend', 'Equipment_Volume']
Baseline RMSE (train, log-space): 1.6738
Baseline RMSE (test, log-space): 1.7725
Baseline RMSE (test, original-scale): 625946.6068
Training set shape: (2186, 16)
Test set shape: (547, 16)

Fitting preprocessor on X_train...
Preprocessing complete.
Processed X_train shape: (2186, 48)
Processed X_test shape: (547, 48)


In [None]:

# --- 1. Define Feature Groups and Drop Useless Columns ---
# NOTE(review): this cell rebuilds df_clean from the notebook-level `df`, so its
# result depends on which earlier cells have already modified `df`.
features_to_drop = [
    'Hospital_Id',
    'Supplier_Name',
    'Hospital_Location',
    'Order_Placed_Date',
    'Delivery_Date'
]
numeric_features = [
    'Supplier_Reliability',
    'Equipment_Height',
    'Equipment_Width',
    'Equipment_Weight',
    'Equipment_Value',
    'Base_Transport_Fee',
    'Delivery_Days'
]
categorical_features = [
    'Equipment_Type',
    'CrossBorder_Shipping',
    'Urgent_Shipping',
    'Installation_Service',
    'Transport_Method',
    'Fragile_Equipment',
    'Hospital_Info',
    'Rural_Hospital'
]
target_variable = 'Transport_Cost'

# Apply the drop
df_clean = df.drop(columns=features_to_drop)

# Drop any rows where the target is *initially* missing
df_clean = df_clean.dropna(subset=[target_variable])

# --- 2. Handle the Extreme Skew of the Target Variable ---
# log1p is only finite for values strictly greater than -1: it returns NaN
# below -1 and -inf at exactly -1. The -inf case is NOT removed by dropna(), so
# invalid rows are filtered out *before* the transform instead of afterwards.
# This also avoids the RuntimeWarning numpy emits for those inputs.
initial_rows = len(df_clean)
loggable = df_clean[target_variable] > -1
df_clean = df_clean.loc[loggable].copy()  # copy: we assign into it just below
df_clean[target_variable] = np.log1p(df_clean[target_variable])
final_rows = len(df_clean)

if initial_rows > final_rows:
    print(f"Dropped {initial_rows - final_rows} rows due to invalid log transform (e.g., Transport_Cost <= -1).")


# --- 2b. Inspect the Target Variable Distribution ---
# At this point df_clean[target_variable] already holds log1p(Transport_Cost),
# so describe() and the skewness below are statistics of the LOGGED target.
print("\n===== TARGET VARIABLE (Transport_Cost) SUMMARY =====")
print(df_clean[target_variable].describe())
print(f"\nSkewness: {df_clean[target_variable].skew():.4f}")

mean_logged = df_clean[target_variable].mean()
print(f"\nMean (after log1p): {mean_logged:.4f}")

# expm1(mean(log1p(y))) is a geometric-type mean, not the arithmetic mean. For
# a strongly right-skewed target the two can differ by a large factor, so the
# value is labelled for what it is and the true arithmetic mean is printed too.
mean_original = np.expm1(mean_logged)
print(f"Back-transformed mean of log1p(Transport_Cost) (geometric-type mean, original scale): {mean_original:.2f}")

arithmetic_mean_original = np.expm1(df_clean[target_variable]).mean()
print(f"Arithmetic mean Transport_Cost (original scale): {arithmetic_mean_original:.2f}")


# --- 3. Create the Train-Test Split ---
# Features are everything in df_clean except the (already logged) target.
y = df_clean[target_variable]
X = df_clean.drop(columns=target_variable)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- 4. Build the Preprocessing Pipelines ---
# Numeric: median imputation followed by a Yeo-Johnson power transform.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('transformer', PowerTransformer(method='yeo-johnson')),
]
numeric_transformer = Pipeline(numeric_steps)

# Categorical: most-frequent imputation followed by dense one-hot encoding.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
categorical_transformer = Pipeline(categorical_steps)

# --- 5. Create the Master Preprocessor ---
# NOTE(review): remainder='passthrough' forwards every column of X that is in
# neither list (for example the date-derived Order_* columns) without any
# imputation or encoding -- confirm that is intended before feeding the output
# to estimators that reject NaN.
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(column_routes, remainder='passthrough')

# --- 6. Apply Preprocessing (for inspection) ---
# (This part is just for your own inspection, the model pipeline uses the raw X_train)
# Reset first so the final report below can never show stale arrays left over
# from an earlier cell if this attempt fails.
X_train_processed = None
X_test_processed = None
try:
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    # --- 7. Get Feature Names After Transformation (Optional, but helpful) ---
    # Ask the fitted ColumnTransformer for its output names rather than
    # rebuilding them by hand: the hand-built list omitted the columns forwarded
    # by remainder='passthrough', so the DataFrame construction failed on a
    # column-count mismatch. Names come back as '<transformer>__<column>'; the
    # transformer prefix is stripped to keep the plain column names.
    feature_names = [
        name.split('__', 1)[-1] for name in preprocessor.get_feature_names_out()
    ]

    X_train_processed = pd.DataFrame(X_train_processed, columns=feature_names, index=X_train.index)
    X_test_processed = pd.DataFrame(X_test_processed, columns=feature_names, index=X_test.index)

    print("\n--- Training Data Head (Processed) ---")
    display(X_train_processed.head())

except ValueError as e:
    # Report the actual cause instead of guessing at one; anything that is not
    # a ValueError is unexpected and is allowed to surface.
    print(f"Could not build the processed feature tables: {e}")
    print("Proceeding with NumPy arrays if possible.")


# --- 8. Final Check ---
print("--- Preprocessing Complete ---")
print(f"Original X_train shape: {X_train.shape}")
if X_train_processed is not None:
    print(f"Processed X_train shape: {X_train_processed.shape}")
print(f"\nOriginal X_test shape: {X_test.shape}")
if X_test_processed is not None:
    print(f"Processed X_test shape: {X_test_processed.shape}")
print(f"\ny_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

In [50]:


# --- 1. Define Models to Test ---
# Candidate regressors keyed by the label printed in the report.
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    'LightGBM': LGBMRegressor(random_state=42, n_jobs=-1),
}

# --- 2. Set up K-Fold ---
# Shuffled 5-fold split with a fixed seed, applied to the training data only.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# --- 3. Loop, Validate, and Get Scores ---
# NOTE(review): `preprocessor`, `X_train` and `y_train` are notebook globals
# that more than one earlier cell defines; the scores depend on which of those
# cells ran last.
print("--- Running 5-Fold Cross-Validation ---")

for model_name, model in models.items():

    # Preprocessing and estimator are wrapped together so that each fold
    # re-fits the preprocessor on that fold's training part only.
    full_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model),
    ])

    # The scorer returns NEGATIVE RMSE per fold (higher is better).
    cv_scores = cross_val_score(
        full_pipeline,
        X_train,
        y_train,
        cv=kf,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
    )

    # Take magnitudes to get positive RMSE values, then average over the folds.
    avg_rmse = np.mean(np.abs(cv_scores))

    print(f"\nModel: {model_name}")
    print(f"  Avg. Log-Scale RMSE: {avg_rmse:.4f} (from 5 folds)")

print("\n--- Validation Complete ---")
print("Lower RMSE is better. Choose the best model for final training.")

--- Running 5-Fold Cross-Validation ---

Model: Linear Regression
  Avg. Log-Scale RMSE: 0.7662 (from 5 folds)

Model: Random Forest
  Avg. Log-Scale RMSE: 0.3931 (from 5 folds)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001920 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 957
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002057 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Number of data points in the train set: 1749, number of used features: 48
[LightGBM] [Info] Total Bins 957
[LightGBM] [Info] Number of data points in the train set: 1749, number of used features: 48
[LightGBM] [Info] Start training from score 6.551130
[LightGBM] [Info] Start training from score 6.538262
[LightGBM] [In




Model: LightGBM
  Avg. Log-Scale RMSE: 0.3768 (from 5 folds)

--- Validation Complete ---
Lower RMSE is better. Choose the best model for final training.


