In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import warnings
from sklearn.preprocessing import RobustScaler

# Silence every library warning so the notebook output stays readable.
# NOTE(review): this also hides deprecation and data-quality warnings.
warnings.filterwarnings('ignore')
sns.set(style="whitegrid")

#Load Data
# (train, test) CSV locations, tried in order. The first pair for which BOTH
# files can be read is used.
data_path_candidates = [
    ("medical_cost_prediction/train.csv", "medical_cost_prediction/test.csv"),
    ("trainvad.csv", "test.csv"),
]

last_error = None
for attempt, (train_path, test_path) in enumerate(data_path_candidates):
    if attempt > 0:
        print("Trying alternative path...")
    try:
        train_df_raw = pd.read_csv(train_path)
        test_df_raw = pd.read_csv(test_path)
    except FileNotFoundError as exc:
        # Remember why this pair failed and move on to the next candidate.
        last_error = exc
    else:
        break
else:
    # No candidate pair could be read: report the paths that were actually
    # tried instead of a hard-coded file name.
    tried = ", ".join(f"{tr!r} + {te!r}" for tr, te in data_path_candidates)
    print(f"FATAL ERROR: Could not find the train/test CSV files. Tried: {tried}. {last_error}")
    raise FileNotFoundError(
        f"No readable train/test CSV pair among: {tried}"
    ) from last_error

print(f"Original train data shape: {train_df_raw.shape}")
print(f"Original test data shape: {test_df_raw.shape}")

# Keep the test identifiers aside; they become the id column of the
# submission file written at the end of the script.
test_hospital_ids = test_df_raw['Hospital_Id']

#Clean Target Variable & Log Transform
# Work on a copy so the raw training frame keeps its original costs.
target = train_df_raw['Transport_Cost'].copy()
non_positive_mask = target <= 0
invalid_cost_indices = target.index[non_positive_mask.to_numpy()]
print(f"Found {len(invalid_cost_indices)} rows with non-positive cost. Setting them to 0.")
target.loc[non_positive_mask] = 0
# log1p maps the floored costs (>= 0) to a log scale; 0 stays 0.
target_log = np.log1p(target)

# Stack train (without its target) on top of test so both go through the
# same feature engineering; the 'source' tag lets them be separated again.
train_df_processed = train_df_raw.drop(columns='Transport_Cost')
train_df_processed['source'] = 'train'
test_df_raw['source'] = 'test'
df = pd.concat([train_df_processed, test_df_raw], ignore_index=True)
print(f"Combined data shape for preprocessing: {df.shape}")

#Feature Engineering
# Column names grouped by kind: supplier score, equipment dimensions,
# categoricals, and the rural flag.
# NOTE(review): presumably the columns known to contain missing values; the
# list is not referenced anywhere else in the visible code - confirm whether
# it is still needed.
missing_cols = [
    "Supplier_Reliability",
    "Equipment_Height", "Equipment_Width", "Equipment_Weight",
    "Equipment_Type", "Transport_Method",
    "Rural_Hospital",
]

def preprocess_features(df_to_process):
    """Derive model features from the raw frame and return them as a new frame.

    The input frame is not modified; all work happens on a copy. Steps, in
    order:

    * drop ``Supplier_Name``;
    * replace ``Order_Placed_Date`` / ``Delivery_Date`` with
      ``Delivery_Time_Days`` (whole days from order to delivery, negative
      spans clipped to 0);
    * replace ``Hospital_Location`` with ``Hospital_State``: the first
      space-separated token of the second comma-separated field, or
      ``'Unknown'`` when that cannot be extracted;
    * encode the Yes/No columns as 1/0 - every other value, including a
      missing one, becomes 0;
    * fill missing equipment height/width with 1 and weight/value with 0,
      then append area and ratio features.

    Args:
        df_to_process: DataFrame containing the raw columns named above plus
            ``Base_Transport_Fee``.

    Returns:
        A new DataFrame with the consumed raw columns removed and the
        engineered columns appended.
    """
    print("Starting feature engineering...")
    features = df_to_process.copy()
    features = features.drop(columns=['Supplier_Name'])

    #Date Features
    order_date = pd.to_datetime(features['Order_Placed_Date'])
    delivery_date = pd.to_datetime(features['Delivery_Date'])
    elapsed_days = (delivery_date - order_date).dt.days
    # A delivery dated before the order would give a negative span; floor at 0.
    features['Delivery_Time_Days'] = elapsed_days.clip(lower=0)
    features = features.drop(columns=['Order_Placed_Date', 'Delivery_Date'])

    #Location Features
    second_field = features['Hospital_Location'].str.split(',').str[1]
    state_token = second_field.str.strip().str.split(' ').str[0]
    # Rows without a comma (or without a location at all) yield NaN here.
    features['Hospital_State'] = state_token.fillna('Unknown')
    features = features.drop(columns='Hospital_Location')

    #Binary Features
    # NOTE(review): assumes all six columns hold 'Yes'/'No' strings; any other
    # value is silently encoded as 0 - verify for 'Hospital_Info'.
    yes_no_codes = {'Yes': 1, 'No': 0}
    flag_columns = ('CrossBorder_Shipping', 'Urgent_Shipping', 'Installation_Service',
                    'Fragile_Equipment', 'Rural_Hospital', 'Hospital_Info')
    for flag_col in flag_columns:
        encoded = features[flag_col].map(yes_no_codes)
        features[flag_col] = encoded.fillna(0).astype(int)

    #Interaction Features
    # Neutral defaults: 1 for the dimensions multiplied into the area, 0 for
    # the quantities used as numerators or offset denominators.
    fill_defaults = {
        'Equipment_Height': 1,
        'Equipment_Width': 1,
        'Equipment_Weight': 0,
        'Equipment_Value': 0,
    }
    for col_name, default in fill_defaults.items():
        features[col_name] = features[col_name].fillna(default)

    features['Equipment_Area'] = features['Equipment_Height'] * features['Equipment_Width']

    # (new column, numerator, denominator, offset added to the denominator).
    # The offset guards against division by zero.
    eps = 1e-6
    ratio_specs = [
        ('Value_Density', 'Equipment_Value', 'Equipment_Weight', eps),
        ("Value_per_Height", "Equipment_Value", "Equipment_Height", eps),
        ("Value_per_Width", "Equipment_Value", "Equipment_Width", eps),
        ("Weight_per_Area", "Equipment_Weight", "Equipment_Area", eps),
        ("Cost_per_Day", "Base_Transport_Fee", "Delivery_Time_Days", 1),
        ("Value_per_Area", "Equipment_Value", "Equipment_Area", eps),
    ]
    for new_col, numerator, denominator, offset in ratio_specs:
        features[new_col] = features[numerator] / (features[denominator] + offset)

    print("Feature engineering complete.")
    return features

df_featured = preprocess_features(df)

#Preprocessing Pipeline
print("Building preprocessing pipeline...")

# Column groups routed to the different sub-pipelines. Any column of the
# transformed frame that is not named in one of these lists is discarded
# (remainder='drop' below).
binary_features = ['Fragile_Equipment', 'Rural_Hospital', 'Hospital_Info']
categorical_features = ['Equipment_Type', 'Transport_Method']
numeric_features = ['Supplier_Reliability', "Cost_per_Day"]
skewed_features = [
    'Equipment_Value', 'Base_Transport_Fee', 'Value_Density',
    'Equipment_Width', 'Equipment_Height', 'Equipment_Area',
    "Value_per_Area", "Value_per_Height", "Value_per_Width", "Weight_per_Area",
]

# Plain numerics: median imputation plus a missing-value indicator column.
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median', add_indicator=True)),
])

# Skewed numerics: same imputation, then log1p to compress the range.
# NOTE(review): log1p returns NaN for inputs below -1; presumably these
# features are non-negative - confirm.
skewed_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median', add_indicator=True)),
    ('log_transform', FunctionTransformer(np.log1p, validate=False)),
])

# Categoricals: mode imputation, then one-hot encoding; categories unseen
# during fit are ignored at transform time.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True)),
])

column_routes = [
    ('num', numeric_pipeline, numeric_features),
    ('skew', skewed_pipeline, skewed_features),
    ('cat', categorical_pipeline, categorical_features),
    ('binary', 'passthrough', binary_features),
]
preprocessor = ColumnTransformer(
    transformers=column_routes,
    remainder='drop',
    n_jobs=1,
)

#Apply Preprocessing
# Identifier and bookkeeping columns are not model inputs.
df_to_transform = df_featured.drop(columns=['Hospital_Id', 'source'])
train_mask = df_featured['source'].eq('train')
# Learn imputation values and categories from the training rows only, then
# transform train and test together.
preprocessor.fit(df_to_transform.loc[train_mask])
df_final = preprocessor.transform(df_to_transform)

train_mask_numpy = train_mask.to_numpy()
X, X_test = df_final[train_mask_numpy], df_final[~train_mask_numpy]
# Positional index so the target lines up row-for-row with X.
y = target_log.reset_index(drop=True)

print(f"Final shapes: X={X.shape}, y={y.shape}, X_test={X_test.shape}")

# Hold out 20% of the training rows for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

#Random Forest Model
print("\n--- Starting Model Tuning for Random Forest ---")

# The randomized search below already runs candidate fits in parallel
# (n_jobs=-1). Keeping the forest single-threaded during tuning avoids nesting
# two "use every core" levels, which oversubscribes the CPU. The fitted forest
# does not depend on n_jobs for a fixed random_state.
rf_model = RandomForestRegressor(
    random_state=42,
    n_jobs=1
)

# Hyper-parameter values sampled by the randomized search.
param_grid = {
    'n_estimators': [200, 400, 600, 800],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False]
}

# 20 sampled configurations x 5 folds. The score is RMSE on the
# log1p-transformed target, since y_train is in log space.
search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_grid,
    n_iter=20,
    cv=5,
    scoring='neg_root_mean_squared_error',
    random_state=42,
    verbose=2,
    n_jobs=-1
)

search.fit(X_train, y_train)
best_model = search.best_estimator_
# Outside the search there is no outer parallel loop any more, so let any
# further fit/predict on the selected model use all cores again.
best_model.set_params(n_jobs=-1)
print(f"Best parameters: {search.best_params_}")

#Evaluate on Validation Set
# Targets and predictions are in log1p space; expm1 brings both back to the
# original cost scale before the errors are measured.
y_pred_log = best_model.predict(X_val)
y_pred_actual = np.expm1(y_pred_log)
y_val_actual = np.expm1(y_val)
mse = mean_squared_error(y_val_actual, y_pred_actual)
rmse = np.sqrt(mse)
print(f"\nRandom Forest RMSE (on validation): {rmse:.2f}")
print(f"Random Forest MSE (on validation): {mse:.2f}")

#Retrain on Full Data
print("Retraining Random Forest on full training data...")
# Refit the selected model on every training row (train + validation split).
best_model.fit(X, y)

#Predict on Test Data
test_pred_log = best_model.predict(X_test)
# Undo the log1p applied to the target, then floor any negative value at 0.
test_pred_actual = np.expm1(test_pred_log)
test_pred_actual = np.where(test_pred_actual < 0, 0, test_pred_actual)

submission_columns = {
    'Hospital_Id': test_hospital_ids,
    'Transport_Cost': test_pred_actual,
}
submission_df = pd.DataFrame(submission_columns)

submission_path = 'submission_RANDOM_FOREST.csv'
submission_df.to_csv(submission_path, index=False)

print("\n--- DONE ---")
print("Submission file 'submission_RANDOM_FOREST.csv' created successfully.")
print(submission_df.head())

Trying alternative path...
Original train data shape: (5000, 20)
Original test data shape: (500, 19)
Found 493 rows with non-positive cost. Setting them to 0.
Combined data shape for preprocessing: (5500, 20)
Starting feature engineering...
Feature engineering complete.
Building preprocessing pipeline...
Final shapes: X=(5000, 26), y=(5000,), X_test=(500, 26)

--- Starting Model Tuning for Random Forest ---
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best parameters: {'n_estimators': 800, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 10, 'bootstrap': True}

Random Forest RMSE (on validation): 41714.88
Random Forest MSE (on validation): 1740130982.78
Retraining Random Forest on full training data...

--- DONE ---
Submission file 'submission_RANDOM_FOREST.csv' created successfully.
            Hospital_Id  Transport_Cost
0          fffe33003400      141.927769
1  fffe3700330036003600      125.636713
2  fffe3300390038003400      817.