In [87]:
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from tqdm import tqdm

In [95]:
# Define dataset paths and their corresponding passenger counts
dataset_info = {
    #"dataset_128_a.csv": 128,
    #"dataset_128_b.csv": 128,
    #"dataset_250_a.csv": 250,
    #"dataset_250_b.csv": 250,
    "luggage_dataset1_300.csv": 300,
    "luggage_dataset2_300.csv": 300,
    #"dataset_350_a.csv": 350,
    #"dataset_350_b.csv": 350,
}

# Load and combine all datasets with a new column
dfs = []
for file_name, passenger_count in dataset_info.items():
    df_temp = pd.read_csv(f"C:/Users/sarah/OneDrive - King Suad University/Desktop/GPCode/{file_name}")
    df_temp["Passenger_Count"] = passenger_count
    dfs.append(df_temp)

df = pd.concat(dfs, ignore_index=True)

# Drop irrelevant or sensitive columns
df = df.drop(columns=[
    'Bag_ID', 'Phone_Number', 'Password', 'Claim_Option', 
    'Flight_Number', 'Delivery_Address'
], errors='ignore')

# Keep only rows with required time columns
df = df.dropna(subset=[
    'Reservation_ID', 'Flight_Arrival_Time', 
    'Estimated_Arrival_Time', 'Actual_Arrival_Time', 'Cluster_Size'
])

# Parse time columns using known time format (e.g., 12-hour format with AM/PM)
time_format = "%H:%M:%S"  # Adjust this if your format is different

df['Estimated_Arrival_Time'] = pd.to_datetime(df['Estimated_Arrival_Time'], format=time_format)
df['Actual_Arrival_Time'] = pd.to_datetime(df['Actual_Arrival_Time'], format=time_format)
df['Flight_Arrival_Time'] = pd.to_datetime(df['Flight_Arrival_Time'], format=time_format)


print(df[['Estimated_Arrival_Time', 'Actual_Arrival_Time', 'Flight_Arrival_Time']].head())

# Take latest bag per reservation (i.e., when can the passenger actually claim their luggage?)
df = df.sort_values('Actual_Arrival_Time').groupby('Reservation_ID').last().reset_index()

# Compute error in seconds
df["Error_Seconds"] = (df["Actual_Arrival_Time"] - df["Estimated_Arrival_Time"]).dt.total_seconds()

# One-hot encode pickup gate and luggage status
df = pd.get_dummies(df, columns=['Pickup_Gate', 'Luggage_Status'], drop_first=True)

# Define features and target
y = df["Error_Seconds"].values
X = df.drop(columns=['Reservation_ID', 'Estimated_Arrival_Time', 'Actual_Arrival_Time', 'Error_Seconds'])

ValueError: time data "03:24" doesn't match format "%H:%M:%S", at position 0. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

In [83]:
# Scale input features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Define models
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Support Vector Regressor": SVR(),
    "K-Nearest Neighbors": KNeighborsRegressor(n_neighbors=5)
}

# Cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
model_results = {}

for name, model in tqdm(models.items(), desc="Training Models", unit="model"):
    scores = cross_val_score(model, X_scaled, y, cv=kf, scoring="r2")
    model_results[name] = np.mean(scores)
    print(f"{name}: Average R² Score = {model_results[name]:.4f}")


ValueError: could not convert string to float: 'Malone'

In [77]:
# Train best model on full data
best_model_name = max(model_results, key=model_results.get)
best_model = models[best_model_name]
best_model.fit(X_scaled, y)

# Predict error in seconds
y_pred = best_model.predict(X_scaled)

# Evaluate MAE
mae = mean_absolute_error(y, y_pred)
mae_minutes, mae_seconds = divmod(mae, 60)
mae_hours, mae_minutes = divmod(mae_minutes, 60)
print(f"\n✅ Best Model: {best_model_name} with R² = {model_results[best_model_name]:.4f}")
print(f"🕓 Final MAE: {int(mae_hours)} hours, {int(mae_minutes)} minutes, {int(mae_seconds)} seconds")



✅ Best Model: Random Forest with R² = 0.1711
🕓 Final MAE: 0 hours, 12 minutes, 56 seconds


In [None]:
# Plot predicted vs actual error (in seconds)
plt.figure(figsize=(8, 5))
sns.scatterplot(x=y, y=y_pred, alpha=0.6)
plt.plot([min(y), max(y)], [min(y), max(y)], 'r--')
plt.xlabel("Actual Error (seconds)")
plt.ylabel("Predicted Error (seconds)")
plt.title("Actual vs Predicted Error")
plt.show()

# Plot residuals
residuals = y - y_pred
plt.figure(figsize=(8, 5))
sns.histplot(residuals, bins=30, kde=True)
plt.axvline(0, color='red', linestyle='dashed')
plt.title("Residuals Distribution")
plt.xlabel("Prediction Error (seconds)")
plt.show()
