In [None]:
#scikit-learn modules
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import cross_val_score,KFold
from sklearn.preprocessing import LabelEncoder

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Read the service records from the CSV file and preview the first rows
csv_path = 'services.csv'
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Convert the clock-time columns to minutes since midnight, turn the date into
# an integer timestamp and drop the client identifier.
def _clock_to_minutes(column, with_seconds=False):
    """Convert a column of 'HH:MM:SS' strings to minutes since midnight.

    The column is parsed once. With with_seconds=False the result is
    hour * 60 + minute (seconds are discarded); with with_seconds=True the
    seconds are added as a fraction of a minute.
    """
    parsed = pd.to_datetime(column, format='%H:%M:%S').dt
    minutes = parsed.hour * 60 + parsed.minute
    if with_seconds:
        minutes = minutes + parsed.second / 60
    return minutes


# arrival_time and waiting_time keep whole minutes only, while start_time and
# end_time keep the seconds as fractional minutes.
# NOTE(review): arrival_time is later compared against start_time, so the two
# precisions get mixed -- confirm that dropping the seconds here is intended.
data['arrival_time'] = _clock_to_minutes(data['arrival_time'])
data['start_time'] = _clock_to_minutes(data['start_time'], with_seconds=True)
data['end_time'] = _clock_to_minutes(data['end_time'], with_seconds=True)
data['waiting_time'] = _clock_to_minutes(data['waiting_time'])

# Use an explicit 64-bit integer: plain `int` maps to the platform default
# integer, which can be too narrow to hold a datetime's integer value.
data['date'] = pd.to_datetime(data['date'], format='%d/%m/%Y').astype('int64')

# The client identifier is removed from the frame used in the rest of the notebook.
data = data.drop(columns=['client_id'])


In [None]:
# Integer-encode the categorical columns. Each column gets its own fitted
# LabelEncoder, stored in `encoders`, so the integer codes of any column can be
# mapped back to the original labels with encoders[col].inverse_transform(...).
# Refitting a single encoder for every column would keep only the last mapping.
categorical_columns = ['priority', 'entity', 'service_type', 'status']
encoders = {}
for col in categorical_columns:
    encoders[col] = LabelEncoder()
    data[col] = encoders[col].fit_transform(data[col])

# `encoder` remains bound to the encoder fitted on 'status', the last column.
encoder = encoders['status']
data.head()

In [None]:
# Row-wise helper used to derive the queue_length feature
def calculate_queue_length(row, data):
    """Count the rows of `data` that belong to the same queue as `row` and arrived before it started.

    A row is counted when its date, queue_id and branch_id equal those of
    `row` and its arrival_time is strictly less than row['start_time'].
    `row` itself is counted too whenever its own arrival_time is below its
    start_time.

    NOTE(review): the count depends on start_time -- confirm that start_time
    is known at prediction time if this value is used as a model feature.
    """
    same_day = data['date'] == row['date']
    same_queue = data['queue_id'] == row['queue_id']
    same_branch = data['branch_id'] == row['branch_id']
    arrived_earlier = data['arrival_time'] < row['start_time']

    selected = data.loc[same_day & same_queue & same_branch & arrived_earlier, 'arrival_time']
    return selected.count()

# Build the queue_length column. The values equal what applying
# calculate_queue_length to every row would give, but each
# (date, queue_id, branch_id) group is handled in one pass instead of
# rescanning the whole frame for every row (which is quadratic in row count).
def _queue_lengths(frame):
    """Return an int64 array with the queue length of every row of `frame`.

    For each row: the number of rows with the same date, queue_id and
    branch_id whose arrival_time is strictly less than the row's start_time.
    Rows with a missing key or a missing start_time get 0, and rows with a
    missing arrival_time are never counted, matching the row-wise comparison.
    """
    keys = ['date', 'queue_id', 'branch_id']
    # Positional index so group.index can be used to write into the result array
    cols = frame[keys + ['arrival_time', 'start_time']].reset_index(drop=True)
    lengths = np.zeros(len(cols), dtype='int64')

    # groupby drops rows whose key contains NaN; they keep the initial 0
    for _, group in cols.groupby(keys, sort=False):
        arrivals = np.sort(group['arrival_time'].dropna().to_numpy(dtype='float64'))
        starts = group['start_time'].to_numpy(dtype='float64')
        # side='left' gives the number of arrivals strictly below each start
        counts = np.searchsorted(arrivals, starts, side='left')
        # searchsorted places NaN after every value; a NaN start matches nothing
        counts[np.isnan(starts)] = 0
        lengths[group.index.to_numpy()] = counts
    return lengths


data['queue_length'] = _queue_lengths(data)
data.head()

In [None]:
# Model inputs (x) and prediction target (y).
# A wider candidate feature set that is currently disabled:
#   branch_id, date, queue_id, sequence, status, entity, priority,
#   arrival_time, cashier, service_type, queue_length
feature_columns = ['cashier', 'queue_length', 'date', 'arrival_time', 'queue_id']
x = data[feature_columns]
y = data['waiting_time']

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
holdout_split = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_test, y_train, y_test = holdout_split

In [None]:
# Fit a 100-tree random forest on the training split and predict the hold-out rows.
# random_state is fixed (same seed as the train/test split) so the reported
# metrics are reproducible from one run to the next.
# Alternative configuration that is currently disabled:
#   n_estimators=200, max_depth=30, min_samples_split=2, min_samples_leaf=1
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(x_train, y_train)
y_pred = rf_model.predict(x_test)

In [None]:
# Error metrics for the predictions currently held in y_pred against y_test
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # root of the MSE computed above
r2 = r2_score(y_test, y_pred)

for label, value in (("MAE", mae), ("MSE", mse), ("RMSE", rmse), ("r2", r2)):
    print(label, value)

In [None]:
# Line plot of actual vs predicted waiting time for a slice of the test set.
# NOTE(review): the slice starts at 1, so the first element is not plotted --
# confirm this is intended.
actual = y_test[1:100]
predicted = y_pred[1:100]
sample_axis = range(len(actual))

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(sample_axis, actual, color='r', label='Actual')
ax.plot(sample_axis, predicted, color='b', label='Predicted')
ax.set_xlabel('Sample')
ax.set_ylabel('Waiting Time (minutes)')
ax.set_title('Actual vs Predicted Waiting Time')
ax.legend()
plt.show()

In [None]:
# Number of folds (adjust as needed)
n_folds = 5

# Shuffled K-fold splitter; the fixed seed makes the folds reproducible
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# cross_val_score fits a fresh clone of rf_model on every fold and returns the
# r2 score of each one. It does not refit rf_model itself and does not rebind
# x_train / x_test / y_train / y_test / y_pred, so cells that run after this
# one still see the hold-out split with features and targets aligned.
r2_per_fold = cross_val_score(rf_model, x, y, cv=kf, scoring='r2').tolist()

# Display r2 score for each fold
print("r2 score for Each Fold:")
for i, r2_fold in enumerate(r2_per_fold, start=1):
    print(f"Fold {i}: {r2_fold}")

# Calculate and print the average r2 score (r2 is a goodness-of-fit score, not an error)
average_r2 = sum(r2_per_fold) / len(r2_per_fold)
print(f"\nAverage r2 score: {average_r2}")

In [None]:
# Rebuild the hold-out split inside this cell (same parameters and seed as the
# original split) so that x_train / y_train and x_test / y_test are guaranteed
# to refer to the same rows, whatever earlier cells left bound to those names.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Fit a 100-tree extra-trees regressor and predict the hold-out rows.
# random_state is fixed so the reported metrics are reproducible.
et_model = ExtraTreesRegressor(n_estimators=100, random_state=42)
et_model.fit(x_train, y_train)
y_pred = et_model.predict(x_test)

In [None]:
# Error metrics for the predictions currently held in y_pred against y_test
squared_error = mean_squared_error(y_test, y_pred)
report = [
    ("MAE", mean_absolute_error(y_test, y_pred)),
    ("MSE", squared_error),
    ("RMSE", np.sqrt(squared_error)),  # root of the MSE above
    ("r2", r2_score(y_test, y_pred)),
]
for metric_name, metric_value in report:
    print(metric_name, metric_value)

In [None]:
# Line plot of actual vs predicted waiting time for a slice of the test set.
# NOTE(review): the slice starts at 1, so the first element is not plotted --
# confirm this is intended.
window = slice(1, 100)
curves = (
    ('Actual', 'r', y_test[window]),
    ('Predicted', 'b', y_pred[window]),
)
positions = range(len(y_test[window]))

plt.figure(figsize=(10, 6))
for curve_label, curve_color, curve_values in curves:
    plt.plot(positions, curve_values, color=curve_color, label=curve_label)
plt.xlabel('Sample')
plt.ylabel('Waiting Time (minutes)')
plt.title('Actual vs Predicted Waiting Time')
plt.legend()
plt.show()