In [1]:
#imports
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

#helper function
#Calculates and prints the RMSE and R2 score for a model's predictions.
def print_model_performance(y_true, y_pred, model_name):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    print(f"--- Performance for {model_name} ---")
    print(f"RMSE: {rmse:.4f}")
    print(f"R2 Score: {r2:.4f}")
    print("---------------------------------")

#load data
try:
    #Attempt to read the CSV files into pandas DataFrames
    train_df = pd.read_csv("train.csv")
    test_df = pd.read_csv("test.csv")
    sample_submission_df = pd.read_csv("sample_submission.csv")
except FileNotFoundError as e:
    #Handle the case where a file is not found
    print(f"Error loading file: {e}")
    exit() #Exit the script if data can't be loaded
except Exception as e:
    #Handle other potential errors during file loading
    print(f"An error occurred: {e}")
    exit() #Exit the script
print("Data loaded successfully.")

#set index
try:
    #Set the 'Hospital_Id' column as the index for both train and test DataFrames
    #This is useful for aligning data and submissions
    train_df = train_df.set_index('Hospital_Id')
    test_df = test_df.set_index('Hospital_Id')
    print("Index set to Hospital_Id.")
except KeyError:
    #Handle the case where 'Hospital_Id' column doesn't exist
    print("Hospital_Id column not found. Please check your CSV files.")
    exit()

#target variable preparation
#clip(lower=0) to ensure there are no negatives cost values
y_original = train_df['Transport_Cost'].clip(lower=0)

#apply log transform
#Apply a log1p transformation (log(1 + x)) to the target variable
#This helps normalize the distribution of the target, which is often skewed
#It's particularly useful for models sensitive to outliers
y_log = np.log1p(y_original)

#Drop the original target variable from the training DataFrame
train_df = train_df.drop('Transport_Cost', axis=1)
print("Target variable 'y' prepared (clipped at 0 + log1p transform).")

#combine data
#Add a 'source' column to distinguish between train and test data
train_df['source'] = 'train'
test_df['source'] = 'test'
#Concatenate the train and test DataFrames into a single DataFrame
#This simplifies preprocessing, as all transformations can be applied at once
combined_df = pd.concat([train_df, test_df], axis=0)
print(f"Combined DataFrame shape: {combined_df.shape}")

#feature engineering(dates)
try:
    combined_df['Order_Placed_Date'] = pd.to_datetime(combined_df['Order_Placed_Date'], format='%m/%d/%y')
    combined_df['Delivery_Date'] = pd.to_datetime(combined_df['Delivery_Date'], format='%m/%d/%y')
    
    #calculate days, which might be negative
    combined_df['Delivery_Time_Days'] = (combined_df['Delivery_Date'] - combined_df['Order_Placed_Date']).dt.days
    #nOW, clip at 0
    combined_df['Delivery_Time_Days_Corrected'] = (combined_df['Delivery_Time_Days'] <0).astype(int)
    combined_df['Delivery_Time_Days'] = combined_df['Delivery_Time_Days'].clip(lower=0)
    
    #continue with other date features
    combined_df['Order_Year'] = combined_df['Order_Placed_Date'].dt.year
    combined_df = combined_df.drop(['Order_Placed_Date', 'Delivery_Date'], axis=1)
    print("Date features engineered (Delivery_Time_Days clipped at 0).")
    
except Exception as e:
    print(f"Error during date feature engineering: {e}")

#feature engineering(location)
combined_df = combined_df.drop('Hospital_Location', axis=1)
#print("Location features engineered.")

#handle missing values
#Define numerical columns to impute
num_cols_impute = ['Supplier_Reliability', 'Equipment_Height', 'Equipment_Width', 'Equipment_Weight']
for col in num_cols_impute:
    median_val = combined_df[col].median()
    combined_df[col] = combined_df[col].fillna(median_val)
print("Numerical missing values imputed.")

#Define categorical columns to impute
cat_cols_impute = ['Equipment_Type', 'Transport_Method']
for col in cat_cols_impute:
    if not combined_df[col].mode().empty:
        mode_val = combined_df[col].mode()[0]
        combined_df[col] = combined_df[col].fillna(mode_val)
    else:
        combined_df[col] = combined_df[col].fillna('Unknown')
combined_df['Rural_Hospital'] = combined_df['Rural_Hospital'].fillna('No')
print("Categorical missing values imputed.")

#categorical encoding
combined_df = combined_df.drop('Urgent_Shipping', axis=1)
combined_df = combined_df.drop('CrossBorder_Shipping', axis=1)
combined_df = combined_df.drop('Installation_Service', axis=1)
binary_cols = ['Fragile_Equipment', 'Rural_Hospital']
for col in binary_cols:
    combined_df[col] = combined_df[col].map({'Yes': 1, 'No': 0})
print("Binary 'Yes'/'No' columns encoded.")

combined_df = combined_df.drop('Supplier_Name', axis=1)
print("Dropped 'Supplier_Name' column.")

ohe_cols = ['Equipment_Type', 'Transport_Method', 'Hospital_Info']
combined_df = pd.get_dummies(combined_df, columns=ohe_cols, drop_first=True, dummy_na=True)
print("One-Hot Encoding complete.")

#split and scale
#Split the combined DataFrame back into training and test sets using the 'source' column
X = combined_df[combined_df['source'] == 'train'].drop('source', axis=1)
X_test = combined_df[combined_df['source'] == 'test'].drop('source', axis=1)
y = y_log

train_cols = set(X.columns)
test_cols = set(X_test.columns)

missing_in_test = list(train_cols - test_cols)
for col in missing_in_test:
    X_test[col] = 0
missing_in_train = list(test_cols - train_cols)
for col in missing_in_train:
    X[col] = 0

X_test = X_test[X.columns]
print("Column alignment complete.")

numerical_cols = X.select_dtypes(include=np.number).columns
ohe_generated_cols = [col for col in X.columns if col.startswith(tuple(ohe_cols))]
binary_cols_to_exclude = set(binary_cols + ['Order_Year'])
cols_to_exclude = set.union(binary_cols_to_exclude, set(ohe_generated_cols))
cols_to_scale = [col for col in numerical_cols if col not in cols_to_exclude and col not in ohe_generated_cols]
print(f"Columns to scale: {cols_to_scale}")

if cols_to_scale:
    scaler = StandardScaler()
    X[cols_to_scale] = scaler.fit_transform(X[cols_to_scale])
    X_test[cols_to_scale] = scaler.transform(X_test[cols_to_scale])
    print("Scaling complete.")
else:
    print("No columns identified for scaling.")

#Create Train/Validation Split
#Split the full training data (X, y) into a new training set and a validation set
#'test_size=0.2' holds out 20% of the data for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"X (full train): {X.shape}")
print(f"y (full train): {y.shape}")
print(f"X_train (split): {X_train.shape}")
print(f"y_train (split): {y_train.shape}")
print(f"X_val (split): {X_val.shape}")
print(f"y_val (split): {y_val.shape}")
print(f"X_test (full test): {X_test.shape}")
print("--- All variables are now defined. ---")


#adaboost regressor

print("\nAdaBoost Regressor")

#basic model
base_est = DecisionTreeRegressor(max_depth=5)
ada_basic = AdaBoostRegressor(estimator=base_est, n_estimators=50, learning_rate=0.1, random_state=42)

print("Training basic AdaBoost model")
ada_basic.fit(X_train, y_train)

preds_basic_ada = ada_basic.predict(X_val)
print_model_performance(y_val, preds_basic_ada, "AdaBoost (Basic)")


#Detailed Hyperparameter Tuning (GridSearchCV)
print("\nStarting AdaBoost Hyperparameter Tuning")

param_grid_ada = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'estimator__max_depth': [3, 5, 7, 10]
}

#Changed n_jobs=-1 to n_jobs=1 to prevent MemoryError.
grid_search_ada = GridSearchCV(
    estimator=AdaBoostRegressor(estimator=DecisionTreeRegressor(random_state=42), random_state=42),
    param_grid=param_grid_ada,
    cv=3,
    scoring='neg_mean_squared_error',
    verbose=2,
    n_jobs=1  # <-- CHANGED FROM -1 to 1
)

print("Running GridSearchCV for AdaBoost... (This will be slower but should not crash)")
grid_search_ada.fit(X_train, y_train)

print(f"\nBest parameters found for AdaBoost: {grid_search_ada.best_params_}")

#Check Tuned Model Performance
best_ada_model = grid_search_ada.best_estimator_
preds_tuned_ada = best_ada_model.predict(X_val)
preds_actual= np.expm1(preds_tuned_ada)
y_actuall = np.expm1(y_val)
print_model_performance(y_val, preds_tuned_ada, "AdaBoost (Tuned)")
print_model_performance(y_actuall, preds_actual, "AdaBoost (Actual)")

#Feature Importance
print("\nAdaBoost Feature Importance ")
importances_ada = best_ada_model.feature_importances_
feature_names = X.columns
feat_imp_ada = pd.Series(importances_ada, index=feature_names).sort_values(ascending=False)

plt.figure(figsize=(10, 8))
feat_imp_ada.head(20).plot(kind='barh')
plt.title('Top 20 Features (AdaBoost)')
plt.savefig('adaboost_features_clipped.png')
print("Saved 'adaboost_features_clipped.png'")
plt.clf()

#Final Submission (AdaBoost)
print("\nTraining final AdaBoost model on ALL training data...")
final_ada_model = grid_search_ada.best_estimator_
final_ada_model.fit(X, y)

print("Making predictions on test.csv...")
test_preds_log_ada = final_ada_model.predict(X_test)

test_preds_ada = np.expm1(test_preds_log_ada)

#Ensure no negative costs
test_preds_ada[test_preds_ada < 0] = 0

#Create submission file with a new name
submission_ada = pd.DataFrame({
    'Hospital_Id': X_test.index,
    'Transport_Cost': test_preds_ada
})

submission_ada.to_csv('submission_adaboost_clipped.csv', index=False)
print("\n'submission_adaboost_clipped.csv' created successfully!")
print(submission_ada.head())
print(f"File shape: {submission_ada.shape} (Should be 500, 2)")

Error loading file: [Errno 2] No such file or directory: 'sample_submission.csv'
Data loaded successfully.
Index set to Hospital_Id.
Target variable 'y' prepared (clipped at 0 + log1p transform).
Combined DataFrame shape: (5500, 19)
Date features engineered (Delivery_Time_Days clipped at 0).
Numerical missing values imputed.
Categorical missing values imputed.
Binary 'Yes'/'No' columns encoded.
Dropped 'Supplier_Name' column.
One-Hot Encoding complete.
Column alignment complete.
Columns to scale: ['Supplier_Reliability', 'Equipment_Height', 'Equipment_Width', 'Equipment_Weight', 'Equipment_Value', 'Base_Transport_Fee', 'Delivery_Time_Days', 'Delivery_Time_Days_Corrected']
Scaling complete.
X (full train): (5000, 23)
y (full train): (5000,)
X_train (split): (4000, 23)
y_train (split): (4000,)
X_val (split): (1000, 23)
y_val (split): (1000,)
X_test (full test): (500, 23)
--- All variables are now defined. ---

AdaBoost Regressor
Training basic AdaBoost model
--- Performance for AdaBoost 

<Figure size 1000x800 with 0 Axes>