In [8]:
# ============================================
# CELL 1: LOAD DATA
# ============================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

# Read the training and test CSVs (paths are relative to the notebook's folder)
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

# One-shot status report with the shape of each frame
print(
    "📂 Data Loaded Successfully!",
    f"Training data shape: {train_df.shape}",
    f"Test data shape: {test_df.shape}",
    sep="\n",
)

# Rich preview of the leading training rows
print("\n📊 Training Data Preview:")
display(train_df.head())

📂 Data Loaded Successfully!
Training data shape: (5000, 20)
Test data shape: (500, 19)

📊 Training Data Preview:


Unnamed: 0,Hospital_Id,Supplier_Name,Supplier_Reliability,Equipment_Height,Equipment_Width,Equipment_Weight,Equipment_Type,Equipment_Value,Base_Transport_Fee,CrossBorder_Shipping,Urgent_Shipping,Installation_Service,Transport_Method,Fragile_Equipment,Hospital_Info,Rural_Hospital,Order_Placed_Date,Delivery_Date,Hospital_Location,Transport_Cost
0,fffe3200360030003700,Jo Valencia,0.44,21.0,6.0,,,3.62,17.13,No,No,No,Roadways,No,Working Class,No,10/20/17,10/20/17,APO AA 33776,179.5
1,fffe3400380037003400,Wanda Warren,0.58,29.0,20.0,1210684.0,Marble,9703.37,35.42,No,Yes,Yes,Roadways,No,Working Class,No,02/22/16,02/24/16,"South Kevin, VT 84493",627732.45
2,fffe3200350036003700,Robert Ackies,0.97,39.0,15.0,3305.0,Aluminium,40.21,18.54,No,No,No,Roadways,No,Working Class,No,01/11/18,01/10/18,"Kevinshire, NE 31279",1565.92
3,fffe3800320034003400,Charlotte Membreno,0.7,8.0,5.0,606.0,Brass,4.55,17.48,No,No,No,Roadways,No,Working Class,No,08/06/16,08/06/16,DPO AP 61572,257.71
4,fffe3600340033003000,Nena Silva,0.66,27.0,13.0,,Marble,2726.8,30.23,Yes,No,No,Roadways,No,Working Class,,12/15/16,12/17/16,"Joshuamouth, AK 01550",8553.52


In [9]:
# ============================================
# CELL 2: PREPARE FEATURES
# ============================================

# Columns excluded from the model inputs
columns_to_drop = ['Hospital_Id', 'Order_Placed_Date', 'Delivery_Date']

# Target vector, then the feature matrix (excluded columns and the target removed)
y = train_df['Transport_Cost']
X = train_df.drop(columns=[*columns_to_drop, 'Transport_Cost'])

# The test file carries no target column; keep its IDs alongside the features
test_ids = test_df['Hospital_Id']
X_submission = test_df.drop(columns=columns_to_drop)

# Summary of the resulting feature matrix
print(
    "✅ Features Prepared!",
    f"Feature columns: {X.shape[1]}",
    f"Training samples: {X.shape[0]}",
    "\nFeature names:",
    sep="\n",
)
print(list(X.columns))

✅ Features Prepared!
Feature columns: 16
Training samples: 5000

Feature names:
['Supplier_Name', 'Supplier_Reliability', 'Equipment_Height', 'Equipment_Width', 'Equipment_Weight', 'Equipment_Type', 'Equipment_Value', 'Base_Transport_Fee', 'CrossBorder_Shipping', 'Urgent_Shipping', 'Installation_Service', 'Transport_Method', 'Fragile_Equipment', 'Hospital_Info', 'Rural_Hospital', 'Hospital_Location']


In [10]:
# ============================================
# CELL 3: TRAIN-TEST SPLIT
# ============================================

# Hold out 20% of the rows; the fixed seed makes the shuffled split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    shuffle=True,
    random_state=42,
    test_size=0.2,
)

# Report partition sizes and the target distribution in each partition
print(
    "📊 Train-Test Split Complete!",
    f"Training set: {len(X_train)} samples ({len(X_train) / len(X) * 100:.1f}%)",
    f"Test set: {len(X_test)} samples ({len(X_test) / len(X) * 100:.1f}%)",
    "\nTarget variable statistics:",
    f"  Train mean: ${y_train.mean():,.2f}",
    f"  Train std: ${y_train.std():,.2f}",
    f"  Test mean: ${y_test.mean():,.2f}",
    f"  Test std: ${y_test.std():,.2f}",
    sep="\n",
)

📊 Train-Test Split Complete!
Training set: 4000 samples (80.0%)
Test set: 1000 samples (20.0%)

Target variable statistics:
  Train mean: $20,866.69
  Train std: $284,531.95
  Test mean: $6,023.55
  Test std: $47,122.21


In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from scipy import stats

# ============================================
# POST-SPLIT PREPROCESSING PIPELINE
# ============================================

def preprocess_data(X_train, X_test, y_train=None):
    """
    Preprocess train/test feature frames for transport cost prediction.

    Imputation values, log shifts and scaler parameters are computed on
    ``X_train`` only and then applied to ``X_test``. The one exception is the
    label-encoder vocabulary, which is built from both frames so that
    categories appearing only in ``X_test`` can still be encoded.

    Steps:
    1. Impute missing numerical values (median for skewed columns, mean otherwise)
    2. Impute missing categorical values with the training mode
    3. Apply log1p to the skewed numerical columns
    4. Label-encode categorical columns
    5. Standardize every column with StandardScaler
    6. Optionally apply log1p to a highly skewed target

    Parameters:
    - X_train: Training features (DataFrame). Not modified; a copy is processed.
    - X_test: Test features (DataFrame). Not modified; a copy is processed.
    - y_train: Optional training target. When given and its skewness exceeds 1
      in absolute value, it is log1p-transformed, unless it contains values
      <= -1, for which log1p is undefined; in that case it is returned
      untransformed.

    Returns:
    - dict with keys:
        'X_train', 'X_test'  : processed DataFrames (original index/columns)
        'y_train'            : processed target, or None if y_train was None
        'scaler'             : the fitted StandardScaler
        'label_encoders'     : {column name: fitted LabelEncoder}
        'y_log_transformed'  : True only if log1p was applied to the target
    """

    # Work on copies so the caller's frames are never modified
    X_train = X_train.copy()
    X_test = X_test.copy()

    # ============================================
    # 1. SEPARATE FEATURES BY TYPE
    # ============================================

    # Numerical features (columns absent from the frame are skipped below)
    numerical_features = [
        'Supplier_Reliability', 'Equipment_Height', 'Equipment_Width',
        'Equipment_Weight', 'Equipment_Value', 'Base_Transport_Fee',
        'Delivery_Days'
    ]

    # Categorical features
    categorical_features = [
        'Supplier_Name', 'Equipment_Type', 'Transport_Method',
        'CrossBorder_Shipping', 'Urgent_Shipping', 'Installation_Service',
        'Fragile_Equipment', 'Hospital_Info', 'Rural_Hospital', 'Hospital_Location'
    ]

    # Skewed columns: imputed with the median and log-transformed afterwards
    skewed_features = ['Equipment_Weight', 'Equipment_Value', 'Base_Transport_Fee']

    # ============================================
    # 2. HANDLE MISSING VALUES - NUMERICAL
    # ============================================

    print("📊 Handling Missing Values...")

    for col in numerical_features:
        if col not in X_train.columns:
            continue
        # Median for skewed columns, mean otherwise; always from training data
        if col in skewed_features:
            fill_value = X_train[col].median()
        else:
            fill_value = X_train[col].mean()
        # Assign the result back instead of `X[col].fillna(..., inplace=True)`:
        # the chained in-place form is deprecated and does not update the
        # frame when pandas Copy-on-Write is enabled.
        X_train[col] = X_train[col].fillna(fill_value)
        X_test[col] = X_test[col].fillna(fill_value)

    # ============================================
    # 3. HANDLE MISSING VALUES - CATEGORICAL
    # ============================================

    for col in categorical_features:
        if col not in X_train.columns:
            continue
        # Most frequent training value; 'Unknown' if the column has no values
        modes = X_train[col].mode()
        fill_value = modes[0] if not modes.empty else 'Unknown'
        X_train[col] = X_train[col].fillna(fill_value)
        X_test[col] = X_test[col].fillna(fill_value)

    # ============================================
    # 4. HANDLE SKEWNESS - LOG TRANSFORMATION
    # ============================================

    print("📈 Handling Skewness...")

    for col in skewed_features:
        if col not in X_train.columns:
            continue

        if (X_train[col] > 0).all():
            # Strictly positive training column: plain log(1 + x)
            shift = 0
        else:
            min_val = X_train[col].min()
            if min_val <= 0:
                # Shift so the smallest training value maps to 1 before log1p
                shift = abs(min_val) + 1
            else:
                # Only reachable when NaNs remain in the column (e.g. it was
                # entirely missing); leave it untouched.
                continue

        X_train[col] = np.log1p(X_train[col] + shift)
        # The shift comes from training data only, so a test value may fall
        # below the training range. Clamp the shifted value at 0 so log1p
        # never receives an argument <= -1 (which would give NaN / -inf).
        X_test[col] = np.log1p((X_test[col] + shift).clip(lower=0))

    # ============================================
    # 5. ENCODE CATEGORICAL VARIABLES
    # ============================================

    print("🔤 Encoding Categorical Variables...")

    label_encoders = {}

    for col in categorical_features:
        if col not in X_train.columns:
            continue

        X_train[col] = X_train[col].astype(str)
        X_test[col] = X_test[col].astype(str)

        # The vocabulary is taken from train AND test so that test-only
        # categories do not make `transform` raise on unseen labels.
        le = LabelEncoder()
        le.fit(pd.concat([X_train[col], X_test[col]]).unique())

        X_train[col] = le.transform(X_train[col])
        X_test[col] = le.transform(X_test[col])

        label_encoders[col] = le

    # ============================================
    # 6. SCALE FEATURES - STANDARDIZATION
    # ============================================

    print("⚖️ Scaling Features...")

    # Every column is scaled, including the label-encoded categoricals.
    # The scaler is fitted on training data only.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Back to DataFrames, preserving the original column names and index
    X_train_processed = pd.DataFrame(
        X_train_scaled,
        columns=X_train.columns,
        index=X_train.index
    )

    X_test_processed = pd.DataFrame(
        X_test_scaled,
        columns=X_test.columns,
        index=X_test.index
    )

    # ============================================
    # 7. HANDLE TARGET VARIABLE (if provided)
    # ============================================

    y_train_processed = None
    y_log_transformed = False
    if y_train is not None:
        y_train_processed = y_train.copy()

        skewness = stats.skew(y_train_processed)

        if abs(skewness) > 1:  # If highly skewed
            print(f"⚠️ Target variable is skewed (skewness: {skewness:.2f})")
            if (y_train_processed <= -1).any():
                # log1p is undefined for values <= -1 and would silently
                # inject NaN / -inf into the target.
                print("   Target contains values <= -1; skipping log transformation")
            else:
                print("   Applying log transformation to target...")
                y_train_processed = np.log1p(y_train_processed)
                y_log_transformed = True

    print("✅ Preprocessing Complete!")
    print(f"   Training shape: {X_train_processed.shape}")
    print(f"   Test shape: {X_test_processed.shape}")

    # Processed data plus the fitted objects for later reuse
    results = {
        'X_train': X_train_processed,
        'X_test': X_test_processed,
        'y_train': y_train_processed,
        'scaler': scaler,
        'label_encoders': label_encoders,
        'y_log_transformed': y_log_transformed
    }

    return results


# ============================================
# USAGE EXAMPLE
# ============================================

# Assuming you have already split your data:
# X_train, X_test, y_train, y_test = train_test_split(X, y, ...)

# Apply preprocessing
# results = preprocess_data(X_train, X_test, y_train)

# Extract processed data
# X_train_processed = results['X_train']
# X_test_processed = results['X_test']
# y_train_processed = results['y_train']

# For making predictions on new data later:
# scaler = results['scaler']
# label_encoders = results['label_encoders']

In [18]:
# ============================================
# CELL 4: APPLY PREPROCESSING PIPELINE (SIMPLIFIED)
# ============================================

print("🔄 Starting Preprocessing Pipeline...\n")

# The target is deliberately withheld (y_train=None) and handled separately below
results = preprocess_data(X_train, X_test, y_train=None)

# Pull the processed frames and the fitted transformers out of the result dict
X_train_processed, X_test_processed = results['X_train'], results['X_test']
scaler, label_encoders = results['scaler'], results['label_encoders']

# Train against the ORIGINAL target values (no transformation)
y_train_processed = y_train.copy()

# Shape and missing-value summary
print(
    "\n📦 Preprocessed Data Summary:",
    f"X_train shape: {X_train_processed.shape}",
    f"X_test shape: {X_test_processed.shape}",
    f"y_train shape: {y_train_processed.shape}",
    f"y_train NaN count: {y_train_processed.isna().sum()}",
    sep="\n",
)

print("\nSample of preprocessed features:")
display(X_train_processed.head())

🔄 Starting Preprocessing Pipeline...

📊 Handling Missing Values...
📈 Handling Skewness...
🔤 Encoding Categorical Variables...
⚖️ Scaling Features...
✅ Preprocessing Complete!
   Training shape: (4000, 16)
   Test shape: (1000, 16)

📦 Preprocessed Data Summary:
X_train shape: (4000, 16)
X_test shape: (1000, 16)
y_train shape: (4000,)
y_train NaN count: 0

Sample of preprocessed features:


Unnamed: 0,Supplier_Name,Supplier_Reliability,Equipment_Height,Equipment_Width,Equipment_Weight,Equipment_Type,Equipment_Value,Base_Transport_Fee,CrossBorder_Shipping,Urgent_Shipping,Installation_Service,Transport_Method,Fragile_Equipment,Hospital_Info,Rural_Hospital,Hospital_Location
4227,1.020988,-1.312871,0.527965,0.071885,-0.837113,1.596695,-0.774738,-1.05493,1.411302,1.424082,1.23695,-1.354274,-0.43603,-1.675082,2.171241,-1.322553
4676,-0.903351,-1.832738,1.639633,2.182058,2.389144,0.64868,2.224475,1.648837,-0.708566,1.424082,-0.80844,0.123788,2.293421,-1.675082,-0.460566,-0.023474
800,0.177778,-0.9929528,-1.524347,-1.079119,-1.347175,-1.247351,-0.67897,-0.483329,-0.708566,-0.702207,1.23695,-1.354274,-0.43603,0.596986,-0.460566,-1.654602
3671,-0.530379,1.886309,-1.524347,-1.079119,-1.502645,-1.247351,-0.629338,-0.304239,-0.708566,-0.702207,-0.80844,-1.354274,-0.43603,0.596986,-0.460566,0.133192
4193,-1.684983,-2.219877e-16,0.698991,1.222889,0.587493,-0.299336,0.456636,0.242517,-0.708566,-0.702207,1.23695,0.123788,-0.43603,0.596986,-0.460566,1.218069


In [19]:
# ============================================
# CELL 5: BASELINE MODEL - LINEAR REGRESSION (SIMPLE)
# ============================================

from sklearn.linear_model import LinearRegression

print("🤖 Training Baseline Model: Linear Regression\n")

# Remove any rows where target is NaN. `.loc` with the boolean Series makes
# the index-aligned row selection explicit.
valid_indices = ~y_train_processed.isna()
X_train_clean = X_train_processed.loc[valid_indices]
y_train_clean = y_train_processed.loc[valid_indices]

print(f"Training samples after removing NaN: {len(y_train_clean)}")

# Initialize and train the model
baseline_model = LinearRegression()
baseline_model.fit(X_train_clean, y_train_clean)

print("✅ Model trained successfully!\n")

# Predictions on the fitted data and on the hold-out set
y_train_pred = baseline_model.predict(X_train_clean)
y_test_pred = baseline_model.predict(X_test_processed)

# Error metrics (RMSE is the square root of sklearn's MSE)
train_rmse = np.sqrt(mean_squared_error(y_train_clean, y_train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

train_mae = mean_absolute_error(y_train_clean, y_train_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)

train_r2 = r2_score(y_train_clean, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

# Display results
print("=" * 60)
print("📈 BASELINE MODEL PERFORMANCE - LINEAR REGRESSION")
print("=" * 60)
print(f"\n🎯 TRAINING SET:")
print(f"   RMSE: ${train_rmse:,.2f}")
print(f"   MAE:  ${train_mae:,.2f}")
print(f"   R²:   {train_r2:.4f}")

print(f"\n🎯 TEST SET:")
print(f"   RMSE: ${test_rmse:,.2f}")
print(f"   MAE:  ${test_mae:,.2f}")
print(f"   R²:   {test_r2:.4f}")

print(f"\n📊 Overfitting Check:")
print(f"   RMSE Difference: ${abs(train_rmse - test_rmse):,.2f}")
print(f"   R² Difference: {abs(train_r2 - test_r2):.4f}")

# A constant model that always predicts the mean scores R² = 0, so a negative
# test R² means the model is worse than that trivial baseline. Report it
# explicitly instead of leaving it to be read off the gap alone.
if test_r2 < 0:
    print("   ❌ Test R² is negative: the model is worse than predicting the mean")

# Overfitting means doing better on the fitted data than on unseen data, so
# the verdict uses the SIGNED gap. With abs(), a model that scores higher on
# the test set would be reported as overfitting.
r2_gap = train_r2 - test_r2

if r2_gap < 0.05:
    print("   ✅ Model generalizes well!")
elif r2_gap < 0.1:
    print("   ⚠️ Slight overfitting detected")
else:
    print("   ❌ Significant overfitting detected")

print("=" * 60)

🤖 Training Baseline Model: Linear Regression

Training samples after removing NaN: 4000
✅ Model trained successfully!

📈 BASELINE MODEL PERFORMANCE - LINEAR REGRESSION

🎯 TRAINING SET:
   RMSE: $273,100.45
   MAE:  $63,258.27
   R²:   0.0785

🎯 TEST SET:
   RMSE: $75,414.20
   MAE:  $52,528.90
   R²:   -1.5638

📊 Overfitting Check:
   RMSE Difference: $197,686.24
   R² Difference: 1.6423
   ❌ Significant overfitting detected
