# In [None]:

import pandas as pd
import numpy as np
import re
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Load dataset
file_path = "/Train_delay_Prediction.csv"  # Change path if needed
df = pd.read_csv(file_path).dropna()  # Rows with any missing value are discarded

# Convert 'Started On' to datetime (vectorized).
# Ordinal suffixes ("1st", "22nd", ...) are stripped first because the
# strptime-style format below has no directive for them.
started_on_text = df['Started On'].astype(str).str.replace(r'(\d+)(st|nd|rd|th)', r'\1', regex=True)
df['Started On'] = pd.to_datetime(started_on_text, format="%d, %b, %Y at %I:%M %p", errors='coerce')

# errors='coerce' turns unparseable timestamps into NaT. Drop those rows so the
# time features derived below never contain NaN when they reach the model.
df = df.dropna(subset=['Started On'])

# Convert 'Delay' to total minutes (vectorized).
# A missing "Hr" or "Min" part yields NaN from extract(); to_numeric keeps that
# as a numeric NaN, which is then counted as 0.
delay_hours = pd.to_numeric(df['Delay'].str.extract(r'(\d+)\s*Hr', expand=False)).fillna(0).astype(int)
delay_minutes = pd.to_numeric(df['Delay'].str.extract(r'(\d+)\s*Min', expand=False)).fillna(0).astype(int)
df['Delay (mins)'] = delay_hours * 60 + delay_minutes

# Encode 'Status' as a 0/1 flag (1 only when the value is exactly 'Late').
# NOTE(review): 'Status' presumably reflects the same delay that is being
# predicted, which would make 'Late' a leaky feature -- confirm it is known at
# prediction time.
df['Late'] = (df['Status'] == 'Late').astype(int)

# Extract time-based features (vectorized); dayofweek numbers Monday as 0.
df['Hour'] = df['Started On'].dt.hour
df['DayOfWeek'] = df['Started On'].dt.dayofweek
df['Month'] = df['Started On'].dt.month

# Drop the raw columns that have been replaced by the engineered ones
df = df.drop(columns=['Delay', 'Status', 'Reach Time', 'Started On'])

# Separate the target (delay in minutes) from the feature columns
y = df['Delay (mins)']
X = df.drop(columns='Delay (mins)')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest on the training portion (fit() returns the estimator)
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out rows
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error: {mae:.2f} minutes")
print(f"R² Score: {r2:.2f}")

# Example prediction: a late train departing at 5 PM on a Wednesday in January.
# The input is built by feature name and then ordered like X.columns, so it
# lines up with the columns the model was fitted on instead of relying on
# positional order in a bare array (which also makes scikit-learn warn about
# missing feature names). A KeyError here means X has columns not listed below.
# pandas' dt.dayofweek numbers Monday as 0, so Wednesday is 2 (3 is Thursday).
example_features = {'Late': 1, 'Hour': 17, 'DayOfWeek': 2, 'Month': 1}
example_input = pd.DataFrame([example_features])[list(X.columns)]
predicted_delay = model.predict(example_input)
print(f"Predicted Delay: {predicted_delay[0]:.2f} minutes")