In [1]:
#PCA + KNN (GridSearchCV)
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

warnings.filterwarnings("ignore")

#Load Data
try:
    train_df_raw = pd.read_csv("medical_cost_prediction/train.csv")
    test_df_raw = pd.read_csv("medical_cost_prediction/test.csv")
except FileNotFoundError:
    try:
        train_df_raw = pd.read_csv("trainvad.csv")
        test_df_raw = pd.read_csv("test.csv")
    except Exception as e:
        raise FileNotFoundError("Could not find train/test CSV files. Put them in expected paths.") from e

print(f"Loaded train: {train_df_raw.shape}, test: {test_df_raw.shape}")

#Prepare target (log transform)
train_df_raw = train_df_raw[train_df_raw["Transport_Cost"] > 0].copy()
target_log = np.log1p(train_df_raw["Transport_Cost"]).reset_index(drop=True)
train_df_raw = train_df_raw.drop(columns=["Transport_Cost"])

#mark source and combine for uniform feature engineering
train_df_raw["source"] = "train"
test_df_raw["source"] = "test"
df = pd.concat([train_df_raw, test_df_raw], ignore_index=True)
print(f"Combined df shape: {df.shape}, target length: {len(target_log)}")

#Feature engineering function(keeps your logic)
missing_cols = [
    'Supplier_Reliability', 'Equipment_Height', 'Equipment_Width',
    'Equipment_Weight', 'Equipment_Type', 'Transport_Method', 'Rural_Hospital'
]

def preprocess_features(df_in):
    df_out = df_in.copy()

    if "Supplier_Name" in df_out.columns:
        df_out = df_out.drop(columns=["Supplier_Name"])

    # Missing indicators
    for col in missing_cols:
        if col in df_out.columns:
            df_out[col + "_Is_Missing"] = df_out[col].isnull().astype(int)

    # Date handling
    if "Order_Placed_Date" in df_out.columns and "Delivery_Date" in df_out.columns:
        df_out["Order_Placed_Date"] = pd.to_datetime(df_out["Order_Placed_Date"], errors="coerce")
        df_out["Delivery_Date"] = pd.to_datetime(df_out["Delivery_Date"], errors="coerce")
        df_out["Delivery_Time_Days"] = (df_out["Delivery_Date"] - df_out["Order_Placed_Date"]).dt.days.clip(lower=0).fillna(0).astype(int)
        df_out["Order_Year"] = df_out["Order_Placed_Date"].dt.year.fillna(0).astype(int)
        df_out["Order_Month"] = df_out["Order_Placed_Date"].dt.month.fillna(0).astype(int)
        df_out["Order_DayOfWeek"] = df_out["Order_Placed_Date"].dt.dayofweek.fillna(0).astype(int)
        df_out = df_out.drop(columns=["Order_Placed_Date", "Delivery_Date"], errors="ignore")

    # Binary mapping
    binary_cols = ['CrossBorder_Shipping', 'Urgent_Shipping', 'Installation_Service',
                   'Fragile_Equipment', 'Rural_Hospital', 'Hospital_Info']
    for c in binary_cols:
        if c in df_out.columns:
            df_out[c] = df_out[c].map({"Yes": 1, "No": 0}).fillna(0).astype(int)

    # Fill numeric-ish columns
    if "Equipment_Height" in df_out.columns:
        df_out["Equipment_Height"] = df_out["Equipment_Height"].fillna(1)
    if "Equipment_Width" in df_out.columns:
        df_out["Equipment_Width"] = df_out["Equipment_Width"].fillna(1)
    if "Equipment_Weight" in df_out.columns:
        df_out["Equipment_Weight"] = df_out["Equipment_Weight"].fillna(0)
    if "Equipment_Value" in df_out.columns:
        df_out["Equipment_Value"] = df_out["Equipment_Value"].fillna(0)

    # Derived features
    if {"Equipment_Height", "Equipment_Width"}.issubset(df_out.columns):
        df_out["Equipment_Area"] = df_out["Equipment_Height"] * df_out["Equipment_Width"]
    else:
        df_out["Equipment_Area"] = 0

    if {"Equipment_Value", "Equipment_Weight"}.issubset(df_out.columns):
        df_out["Value_Density"] = df_out["Equipment_Value"] / (df_out["Equipment_Weight"] + 1e-6)
    else:
        df_out["Value_Density"] = 0

    df_out["Value_per_Height"] = df_out.get("Equipment_Value", 0) / (df_out.get("Equipment_Height", 1) + 1e-6)
    df_out["Value_per_Width"] = df_out.get("Equipment_Value", 0) / (df_out.get("Equipment_Width", 1) + 1e-6)
    df_out["Weight_per_Area"] = df_out.get("Equipment_Weight", 0) / (df_out.get("Equipment_Area", 1) + 1e-6)
    df_out["ValuePerArea"] = df_out.get("Equipment_Value", 0) / (df_out.get("Equipment_Area", 1) + 1e-6)

    # OPTIONAL: additional interactions that often help (safe defaults)
    # only add if columns exist
    if {"Equipment_Height", "Equipment_Width", "Equipment_Weight"}.issubset(df_out.columns):
        df_out["Volume"] = df_out["Equipment_Height"] * df_out["Equipment_Width"] * (df_out["Equipment_Weight"] + 1e-6)
    else:
        df_out["Volume"] = 0

    return df_out

df_featured = preprocess_features(df)
print("Feature engineering complete.")

#Build raw X(keep original columns; pipeline will transform)
#Drop ID and source from model inputs (we keep Hospital_Id separately for submission)
X_all_raw = df_featured.drop(columns=['Hospital_Id', 'source'], errors='ignore')
# Split raw train vs raw test by source
train_mask = df_featured['source'] == 'train'
X_raw = X_all_raw.loc[train_mask.values].reset_index(drop=True)
X_test_raw = X_all_raw.loc[~train_mask.values].reset_index(drop=True)
y = target_log.reset_index(drop=True)

print(f"Prepared raw X: {X_raw.shape}, raw X_test: {X_test_raw.shape}, y: {y.shape}")

#Identify feature groups
numeric_features = ['Supplier_Reliability', "Value_per_Height", "Value_per_Width",
                    "Weight_per_Area", 'Delivery_Time_Days', "ValuePerArea", "Volume"]
skewed_features = ['Equipment_Weight', 'Base_Transport_Fee', 'Value_Density',
                   'Equipment_Area', "Equipment_Height", "Equipment_Width", "Equipment_Value"]
categorical_features = ['Transport_Method', 'Order_DayOfWeek', 'Equipment_Type']
binary_features = ['Urgent_Shipping', 'Installation_Service', 'Hospital_Info']

def filter_existing(cols, df_cols):
    return [c for c in cols if c in df_cols]

numeric_features = filter_existing(numeric_features, X_all_raw.columns)
skewed_features = filter_existing(skewed_features, X_all_raw.columns)
categorical_features = filter_existing(categorical_features, X_all_raw.columns)
binary_features = filter_existing(binary_features, X_all_raw.columns)

print("Using feature groups sizes -> numeric:", len(numeric_features),
      "skewed:", len(skewed_features), "categorical:", len(categorical_features), "binary:", len(binary_features))

#Build ColumnTransformer (preprocessing)
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())   # scale numeric before PCA
])

skewed_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('log_transform', FunctionTransformer(np.log1p, validate=False)),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numeric_features),
        ('skew', skewed_pipeline, skewed_features),
        ('cat', categorical_pipeline, categorical_features),
        ('binary', 'passthrough', binary_features)
    ],
    remainder='drop'  # drop any other columns
)

#Train/Validation split on raw data
X_train_raw, X_val_raw, y_train, y_val = train_test_split(X_raw, y, test_size=0.2, random_state=42)
print("Train/validation split done. Train:", X_train_raw.shape, "Val:", X_val_raw.shape)

#Full pipeline: preprocessor
full_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=30, random_state=42)),
    ('post_scaler', StandardScaler()),   # scale PCA outputs before KNN
    ('knn', KNeighborsRegressor(n_jobs=-1))
])

#Grid search parameters (tune PCA n_components and KNN params)
param_grid = {
    'pca__n_components': [20, 30, 40],             
    'knn__n_neighbors': [5, 10, 15, 20],            
    'knn__p': [1, 2],                              
    'knn__weights': ['distance', 'uniform']
}

grid_search = GridSearchCV(
    estimator=full_pipeline,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=2
)

print("\nStarting GridSearchCV (this may take a while)...")
grid_search.fit(X_train_raw, y_train)
print("Grid search done.")
print("Best params:", grid_search.best_params_)
print("Best CV RMSE (log-scale):", -grid_search.best_score_)

#Validation evaluation
best_model = grid_search.best_estimator_
y_val_pred_log = best_model.predict(X_val_raw)

# back-transform to original cost scale
y_val_actual = np.expm1(y_val)
y_val_pred_actual = np.maximum(0, np.expm1(y_val_pred_log))

rmse = np.sqrt(mean_squared_error(y_val_actual, y_val_pred_actual))
r2 = r2_score(y_val_actual, y_val_pred_actual)
mae = mean_absolute_error(y_val_actual, y_val_pred_actual)

print("\nKNN + PCA Validation Results (on actual cost scale):")
print(f"Validation RMSE: ${rmse:,.2f}")
print(f"Validation R-squared: {r2:.4f}")
print(f"Validation MAE: ${mae:,.2f}")

#Predict on test set and save submission

# Use best_model
test_pred_log = best_model.predict(X_test_raw)
test_pred = np.maximum(0, np.expm1(test_pred_log))

# Ensure Hospital_Id exists in original test_df_raw
if 'Hospital_Id' not in test_df_raw.columns:
    # if not present, create an index-based id
    submission_ids = np.arange(len(test_df_raw))
else:
    submission_ids = test_df_raw['Hospital_Id'].values

submission_df = pd.DataFrame({
    'Hospital_Id': submission_ids,
    'Transport_Cost': test_pred
})

submission_df.to_csv('submission_knn_pca.csv', index=False)
print("\nSaved submission_knn_pca.csv")
print(submission_df.head())

#explaination of variance by PCA
try:
    pca_obj = best_model.named_steps['pca']
    explained = np.cumsum(pca_obj.explained_variance_ratio_)*100
    print("\nPCA cumulative explained variance (%):", explained[:min(len(explained), 10)])
except Exception:
    pass

# === End of script ===


Loaded train: (5000, 20), test: (500, 19)
Combined df shape: (5007, 20), target length: 4507
Feature engineering complete.
Prepared raw X: (4507, 33), raw X_test: (500, 33), y: (4507,)
Using feature groups sizes -> numeric: 7 skewed: 7 categorical: 3 binary: 3
Train/validation split done. Train: (3605, 33) Val: (902, 33)

Starting GridSearchCV (this may take a while)...
Fitting 3 folds for each of 48 candidates, totalling 144 fits
Grid search done.
Best params: {'knn__n_neighbors': 10, 'knn__p': 1, 'knn__weights': 'distance', 'pca__n_components': 20}
Best CV RMSE (log-scale): 0.7951077855594888

KNN + PCA Validation Results (on actual cost scale):
Validation RMSE: $130,328.69
Validation R-squared: 0.1777
Validation MAE: $13,341.20

Saved submission_knn_pca.csv
            Hospital_Id  Transport_Cost
0          fffe33003400      231.466655
1  fffe3700330036003600      239.829060
2  fffe3300390038003400     1430.538048
3      fffe310030003900      337.033489
4  fffe3700330031003200     1

In [1]:
#Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import RobustScaler
from sklearn.neighbors import KNeighborsRegressor
import warnings

warnings.filterwarnings('ignore')
sns.set(style="whitegrid")

#Load Data
try:
    train_df_raw = pd.read_csv("medical_cost_prediction/train.csv")
    test_df_raw = pd.read_csv("medical_cost_prediction/test.csv")
except FileNotFoundError:
    print("Trying alternative path...")
    try:
        train_df_raw = pd.read_csv("trainvad.csv")
        test_df_raw = pd.read_csv("test.csv")
    except Exception as e:
        print(f"FATAL ERROR: Could not find train.csv or test.csv. {e}")
        raise

print(f"Original train data shape: {train_df_raw.shape}")
print(f"Original test data shape: {test_df_raw.shape}")

test_hospital_ids = test_df_raw['Hospital_Id']

# --- 3. Clean Target Variable & Log Transform ---
target = train_df_raw['Transport_Cost'].copy()
invalid_cost_indices = target[target <= 0].index
print(f"Found {len(invalid_cost_indices)} rows with non-positive cost. Setting them to 0.")
target.loc[invalid_cost_indices] = 0
target_log = np.log1p(target)

train_df_processed = train_df_raw.drop('Transport_Cost', axis=1)
train_df_processed['source'] = 'train'
test_df_raw['source'] = 'test'
df = pd.concat([train_df_processed, test_df_raw], ignore_index=True)
print(f"Combined data shape for preprocessing: {df.shape}")

#Feature Engineering
missing_cols = [
    'Supplier_Reliability',
    'Equipment_Height',
    'Equipment_Width',
    'Equipment_Weight',
    'Equipment_Type',
    'Transport_Method',
    'Rural_Hospital'
]

def preprocess_features(df_to_process):
    print("Starting feature engineering...")
    df_processed = df_to_process.copy()
    df_processed = df_processed.drop(['Supplier_Name'], axis=1)

    # Missing value indicators
    for col in missing_cols:
        df_processed[col + '_Is_Missing'] = df_processed[col].isnull().astype(int)
    print("Created 'Is_Missing' flags for missing columns.")

    # Date features
    df_processed['Order_Placed_Date'] = pd.to_datetime(df_processed['Order_Placed_Date'])
    df_processed['Delivery_Date'] = pd.to_datetime(df_processed['Delivery_Date'])
    df_processed['Delivery_Time_Days'] = (df_processed['Delivery_Date'] - df_processed['Order_Placed_Date']).dt.days.clip(lower=0)
    df_processed = df_processed.drop(['Order_Placed_Date', 'Delivery_Date'], axis=1)

    # Location features
    df_processed['Hospital_State'] = df_processed['Hospital_Location'].str.split(',').str[1].str.strip().str.split(' ').str[0]
    df_processed['Hospital_State'] = df_processed['Hospital_State'].fillna('Unknown')
    df_processed = df_processed.drop('Hospital_Location', axis=1)

    # Binary features
    binary_cols = ['CrossBorder_Shipping', 'Urgent_Shipping', 'Installation_Service', 
                   'Fragile_Equipment', 'Rural_Hospital', 'Hospital_Info']
    for col in binary_cols:
        df_processed[col] = df_processed[col].map({'Yes': 1, 'No': 0}).fillna(0).astype(int)

    # Fill numeric
    df_processed['Equipment_Height'] = df_processed['Equipment_Height'].fillna(1)
    df_processed['Equipment_Width'] = df_processed['Equipment_Width'].fillna(1)
    df_processed['Equipment_Weight'] = df_processed['Equipment_Weight'].fillna(0)
    df_processed['Equipment_Value'] = df_processed['Equipment_Value'].fillna(0)

    # Derived/interaction features
    df_processed['Equipment_Area'] = df_processed['Equipment_Height'] * df_processed['Equipment_Width']
    df_processed['Value_Density'] = df_processed['Equipment_Value'] / (df_processed['Equipment_Weight'] + 1e-6)
    df_processed['Value_per_Height'] = df_processed['Equipment_Value'] / (df_processed['Equipment_Height'] + 1e-6)
    df_processed['Value_per_Width'] = df_processed['Equipment_Value'] / (df_processed['Equipment_Width'] + 1e-6)
    df_processed['Weight_per_Area'] = df_processed['Equipment_Weight'] / (df_processed['Equipment_Area'] + 1e-6)
    df_processed['Cost_per_Day'] = df_processed['Base_Transport_Fee'] / (df_processed['Delivery_Time_Days'] + 1)
    df_processed['Value_per_Area'] = df_processed['Equipment_Value'] / (df_processed['Equipment_Area'] + 1e-6)

    print("Feature engineering complete.")
    return df_processed

df_featured = preprocess_features(df)

#Preprocessing Pipeline
print("Building preprocessing pipeline...")

numeric_features = ['Supplier_Reliability', "Cost_per_Day"]
skewed_features = [
    'Equipment_Value', 'Base_Transport_Fee', 'Value_Density', 'Equipment_Width', 'Equipment_Height',
    'Equipment_Area', "Value_per_Area", "Value_per_Height", "Value_per_Width", "Weight_per_Area"
]
categorical_features = ['Equipment_Type', 'Transport_Method']
binary_features = ['Fragile_Equipment', 'Rural_Hospital', 'Hospital_Info']

numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

skewed_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('log_transform', FunctionTransformer(np.log1p, validate=False)),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numeric_features),
        ('skew', skewed_pipeline, skewed_features),
        ('cat', categorical_pipeline, categorical_features),
        ('binary', 'passthrough', binary_features)
    ],
    remainder='drop',
    n_jobs=1
)

#Apply Preprocessing
df_to_transform = df_featured.drop(['Hospital_Id', 'source'], axis=1)
train_mask = df_featured['source'] == 'train'
preprocessor.fit(df_to_transform[train_mask])
df_final = preprocessor.transform(df_to_transform)

train_mask_numpy = train_mask.values
X = df_final[train_mask_numpy]
X_test = df_final[~train_mask_numpy]
y = target_log.reset_index(drop=True)

print(f"Final shapes: X={X.shape}, y={y.shape}, X_test={X_test.shape}")

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

#KNN Model
print("\n--- Starting Model Tuning for KNN ---")

knn = KNeighborsRegressor()

param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11, 15, 20],
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
    'leaf_size': [20, 30, 40, 50]
}

search = RandomizedSearchCV(
    estimator=knn,
    param_distributions=param_grid,
    n_iter=20,
    cv=5,
    scoring='neg_root_mean_squared_error',
    random_state=42,
    verbose=2,
    n_jobs=-1
)

search.fit(X_train, y_train)
best_knn = search.best_estimator_
print(f"Best parameters: {search.best_params_}")

#Evaluate on Validation Set
y_pred_log = best_knn.predict(X_val)
y_pred_actual = np.expm1(y_pred_log)
y_val_actual = np.expm1(y_val)
rmse = np.sqrt(mean_squared_error(y_val_actual, y_pred_actual))
mse = mean_squared_error(y_val_actual, y_pred_actual)
print(f"\nKNN RMSE (on validation): {rmse:.2f}")
print(f"KNN MSE (on validation): {mse:.2f}")

#Retrain on Full Data
print("Retraining KNN on full training data...")
best_knn.fit(X, y)

#Predict on Test Data
test_pred_log = best_knn.predict(X_test)
test_pred_actual = np.expm1(test_pred_log)
test_pred_actual[test_pred_actual < 0] = 0  # Safety clip

submission_df = pd.DataFrame({
    'Hospital_Id': test_hospital_ids,
    'Transport_Cost': test_pred_actual
})

submission_df.to_csv('submission_KNN.csv', index=False)

print("\n--- DONE ---")
print("Submission file 'submission_KNN.csv' created successfully.")
print(submission_df.head())


Starting the full ML pipeline (Strategy: KNN ONLY)...
Trying alternative path...
Original train data shape: (5000, 20)
Original test data shape: (500, 19)
Found 493 rows with non-positive cost. Setting them to 0.
Combined data shape for preprocessing: (5500, 20)
Starting feature engineering...
Created 'Is_Missing' flags for missing columns.
Feature engineering complete.
Building preprocessing pipeline...
Final shapes: X=(5000, 25), y=(5000,), X_test=(500, 25)

--- Starting Model Tuning for KNN ---
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best parameters: {'weights': 'uniform', 'p': 2, 'n_neighbors': 20, 'leaf_size': 20}

KNN RMSE (on validation): 44278.05
KNN MSE (on validation): 1960545274.93
Retraining KNN on full training data...

--- DONE ---
Submission file 'submission_KNN_ONLY_model.csv' created successfully.
            Hospital_Id  Transport_Cost
0          fffe33003400      263.338849
1  fffe3700330036003600      182.750656
2  fffe3300390038003400     1405

In [None]:
#Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsRegressor
import warnings

warnings.filterwarnings('ignore')
sns.set(style="whitegrid")

#Load Data
try:
    train_df_raw = pd.read_csv("medical_cost_prediction/train.csv")
    test_df_raw = pd.read_csv("medical_cost_prediction/test.csv")
except FileNotFoundError:
    print("Trying alternative path...")
    try:
        train_df_raw = pd.read_csv("trainvad.csv")
        test_df_raw = pd.read_csv("test.csv")
    except Exception as e:
        print(f"FATAL ERROR: Could not find train.csv or test.csv. {e}")
        raise

print(f"Original train data shape: {train_df_raw.shape}")
print(f"Original test data shape: {test_df_raw.shape}")

test_hospital_ids = test_df_raw['Hospital_Id']

#Clean Target Variable and Log Transform
target = train_df_raw['Transport_Cost'].copy()
invalid_cost_indices = target[target <= 0].index
print(f"Found {len(invalid_cost_indices)} rows with non-positive cost. Setting them to 0.")
target.loc[invalid_cost_indices] = 0
target_log = np.log1p(target)

train_df_processed = train_df_raw.drop('Transport_Cost', axis=1)
train_df_processed['source'] = 'train'
test_df_raw['source'] = 'test'
df = pd.concat([train_df_processed, test_df_raw], ignore_index=True)
print(f"Combined data shape for preprocessing: {df.shape}")

#Feature Engineering
missing_cols = [
    'Supplier_Reliability',
    'Equipment_Height',
    'Equipment_Width',
    'Equipment_Weight',
    'Equipment_Type',
    'Transport_Method',
    'Rural_Hospital'
]

def preprocess_features(df_to_process):
    print("Starting feature engineering...")
    df_processed = df_to_process.copy()
    df_processed = df_processed.drop(['Supplier_Name'], axis=1)

    for col in missing_cols:
        df_processed[col + '_Is_Missing'] = df_processed[col].isnull().astype(int)
    print("Created 'Is_Missing' flags for missing columns.")

    df_processed['Order_Placed_Date'] = pd.to_datetime(df_processed['Order_Placed_Date'])
    df_processed['Delivery_Date'] = pd.to_datetime(df_processed['Delivery_Date'])
    df_processed['Delivery_Time_Days'] = (df_processed['Delivery_Date'] - df_processed['Order_Placed_Date']).dt.days.clip(lower=0)
    df_processed = df_processed.drop(['Order_Placed_Date', 'Delivery_Date'], axis=1)

    df_processed['Hospital_State'] = df_processed['Hospital_Location'].str.split(',').str[1].str.strip().str.split(' ').str[0]
    df_processed['Hospital_State'] = df_processed['Hospital_State'].fillna('Unknown')
    df_processed = df_processed.drop('Hospital_Location', axis=1)

    binary_cols = ['CrossBorder_Shipping', 'Urgent_Shipping', 'Installation_Service', 
                   'Fragile_Equipment', 'Rural_Hospital','Hospital_Info']
    for col in binary_cols:
        df_processed[col] = df_processed[col].map({'Yes': 1, 'No': 0}).fillna(0).astype(int)

    df_processed['Equipment_Height'] = df_processed['Equipment_Height'].fillna(1)
    df_processed['Equipment_Width'] = df_processed['Equipment_Width'].fillna(1)
    df_processed['Equipment_Weight'] = df_processed['Equipment_Weight'].fillna(0)
    df_processed['Equipment_Value'] = df_processed['Equipment_Value'].fillna(0)

    df_processed['Equipment_Area'] = df_processed['Equipment_Height'] * df_processed['Equipment_Width']
    df_processed['Value_Density'] = df_processed['Equipment_Value'] / (df_processed['Equipment_Weight'] + 1e-6)
    df_processed['Value_per_Height'] = df_processed['Equipment_Value'] / (df_processed['Equipment_Height'] + 1e-6)
    df_processed['Value_per_Width'] = df_processed['Equipment_Value'] / (df_processed['Equipment_Width'] + 1e-6)
    df_processed['Weight_per_Area'] = df_processed['Equipment_Weight'] / (df_processed['Equipment_Area'] + 1e-6)
    df_processed['Cost_per_Day'] = df_processed['Base_Transport_Fee'] / (df_processed['Delivery_Time_Days'] + 1)
    df_processed['Value_per_Area'] = df_processed['Equipment_Value'] / (df_processed['Equipment_Area'] + 1e-6)

    print("Feature engineering complete.")
    return df_processed

df_featured = preprocess_features(df)

#Preprocessing Pipeline
print("Building preprocessing + PCA pipeline...")

numeric_features = ['Supplier_Reliability', "Cost_per_Day"]
skewed_features = [
    'Equipment_Value', 'Base_Transport_Fee', 'Value_Density', 'Equipment_Width', 'Equipment_Height',
    'Equipment_Area', "Value_per_Area", "Value_per_Height", "Value_per_Width", "Weight_per_Area"
]
categorical_features = ['Equipment_Type', 'Transport_Method']
binary_features = ['Fragile_Equipment', 'Rural_Hospital', 'Hospital_Info']

numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

skewed_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('log_transform', FunctionTransformer(np.log1p, validate=False)),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numeric_features),
        ('skew', skewed_pipeline, skewed_features),
        ('cat', categorical_pipeline, categorical_features),
        ('binary', 'passthrough', binary_features)
    ],
    remainder='drop',
    n_jobs=1
)

#PCA + Model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.95, random_state=42)),  # Retain 95% variance
    ('knn', KNeighborsRegressor())
])

# --- 7. Prepare Data ---
df_to_transform = df_featured.drop(['Hospital_Id', 'source'], axis=1)
train_mask = df_featured['source'] == 'train'
X = df_to_transform[train_mask]
X_test = df_to_transform[~train_mask]
y = target_log.reset_index(drop=True)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

#Model Tuning
print("\n--- Starting Model Tuning for KNN + PCA ---")

param_grid = {
    'pca__n_components': [0.90, 0.95, 0.98],
    'knn__n_neighbors': [3, 5, 7, 9, 11],
    'knn__weights': ['uniform', 'distance'],
    'knn__p': [1, 2]
}

search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_grid,
    n_iter=20,
    cv=5,
    scoring='neg_root_mean_squared_error',
    random_state=42,
    verbose=2,
    n_jobs=-1
)

search.fit(X_train, y_train)
best_model = search.best_estimator_
print(f"Best parameters: {search.best_params_}")

#Evaluate on Validation Set
y_pred_log = best_model.predict(X_val)
y_pred_actual = np.expm1(y_pred_log)
y_val_actual = np.expm1(y_val)
rmse = np.sqrt(mean_squared_error(y_val_actual, y_pred_actual))
print(f"\nKNN + PCA RMSE (on validation): {rmse:.2f}")

#Retrain on Full Data
print("Retraining best KNN + PCA model on full training data...")
best_model.fit(X, y)

#Predict on Test Data
test_pred_log = best_model.predict(X_test)
test_pred_actual = np.expm1(test_pred_log)
test_pred_actual[test_pred_actual < 0] = 0

submission_df = pd.DataFrame({
    'Hospital_Id': test_hospital_ids,
    'Transport_Cost': test_pred_actual
})

submission_df.to_csv('submission_KNN_PCA_model.csv', index=False)

print("\n--- DONE ---")
print("Submission file 'submission_KNN_PCA_model.csv' created successfully.")
print(submission_df.head())

Trying alternative path...
Original train data shape: (5000, 20)
Original test data shape: (500, 19)
Found 493 rows with non-positive cost. Setting them to 0.
Combined data shape for preprocessing: (5500, 20)
Starting feature engineering...
Created 'Is_Missing' flags for missing columns.
Feature engineering complete.
Building preprocessing + PCA pipeline...

--- Starting Model Tuning for KNN + PCA ---
Fitting 5 folds for each of 20 candidates, totalling 100 fits
