## Linear Regression


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load your dataset (assuming it's in a CSV file)
data = pd.read_csv('queue_data.csv')

# Preprocess the data
# Remove any unnecessary columns
data = data[['date', 'checkin_time', 'checkout_time', 'wait_time','scheduled_appointment', 'first_timer', 'reason_visit', 'specialty',
             'gender',  'age', 'number_waiting']]

# Encode categorical variables (e.g., gender, specialty, reason_visit)
label_encoders = {}
categorical_columns = ['gender', 'specialty', 'reason_visit', 'scheduled_appointment', 'first_timer']
for col in categorical_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Split the data into features and target
X = data.drop(columns=['wait_time'])
y = data['wait_time']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize/Standardize numerical features (e.g., age)
scaler = StandardScaler()
X_train[['age', 'checkin_time', 'checkout_time', 'date', 'number_waiting']] = scaler.fit_transform(X_train[['age', 'checkin_time', 'checkout_time', 'date', 'number_waiting']])
X_test[['age', 'checkin_time', 'checkout_time', 'date', 'number_waiting']] = scaler.transform(X_test[['age', 'checkin_time', 'checkout_time', 'date', 'number_waiting']])

# Build a linear regression model
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(1)  # Linear regression output layer
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))

# Make predictions
y_pred = model.predict(X_test)

# Calculate evaluation metrics
linear_mae = mean_absolute_error(y_test, y_pred)
linear_mse = mean_squared_error(y_test, y_pred)
linear_r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error:", linear_mae)
print("Mean Squared Error:", linear_mse)
print("R-squared:", linear_r2)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Mean Absolute Error: 54.29348543018103
Mean Squared Error: 4563.917652174455
R-squared: -1.6012633865312624


## LightGBM (Gradient Boosted Tree)

In [None]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import LabelEncoder

# Load your data (replace 'your_data.csv' with the actual file path)
data = pd.read_csv('queue_data.csv')

# Preprocessing
label_encoders = {}
categorical_features = ['gender', 'specialty', 'scheduled_appointment',
                        'first_timer', 'reason_visit']
for feature in categorical_features:
    le = LabelEncoder()
    data[feature] = le.fit_transform(data[feature])
    label_encoders[feature] = le

features = ['date', 'checkin_time', 'checkout_time', 'wait_time', 'gender', 'specialty',
            'scheduled_appointment', 'first_timer', 'reason_visit', 'age', 'number_waiting']
X = data[features]
y = data['wait_time']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a LightGBM dataset
train_data = lgb.Dataset(X_train, label=y_train)

# Define LightGBM parameters
params = {
    'objective': 'regression',  # Regression task
    'metric': 'l1',  # Mean Absolute Error (MAE) as the evaluation metric
    'num_leaves': 31,  # Maximum number of leaves in one tree
    'learning_rate': 0.05,  # Learning rate
    'feature_fraction': 0.9,  # Fraction of features used in each iteration
}

# Train the LightGBM model
num_round = 100  # Number of boosting rounds (you can tune this)
bst = lgb.train(params, train_data, num_round)

# Make predictions
y_pred = bst.predict(X_test)

# Calculate metrics
gbm_mae = mean_absolute_error(y_test, y_pred)
gbm_mse = mean_squared_error(y_test, y_pred)
gbm_r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error:", gbm_mae)
print("Mean Squared Error:", gbm_mse)
print("R-squared:", gbm_r2)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 266
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 10
[LightGBM] [Info] Start training from score 62.576250
Mean Absolute Error: 1.1210474820801526
Mean Squared Error: 19.87588887288816
R-squared: 0.9886714822790077


## Random Forest

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load your dataset (assuming it's in a CSV file)
data = pd.read_csv('queue_data.csv')

# Preprocess the data
# Remove any unnecessary columns
data = data[['date', 'checkin_time', 'checkout_time', 'wait_time','scheduled_appointment', 'first_timer', 'reason_visit', 'specialty',
             'gender',  'age', 'number_waiting']]

# Encode categorical variables (e.g., gender, specialty, reason_visit)
label_encoders = {}
categorical_columns = ['gender', 'specialty', 'reason_visit', 'scheduled_appointment', 'first_timer']
for col in categorical_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Split the data into features and target
X = data.drop(columns=['wait_time'])
y = data['wait_time']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize/Standardize numerical features (e.g., age)
scaler = StandardScaler()
X_train[['age', 'checkin_time', 'checkout_time', 'date', 'number_waiting']] = scaler.fit_transform(X_train[['age', 'checkin_time', 'checkout_time', 'date', 'number_waiting']])
X_test[['age', 'checkin_time', 'checkout_time', 'date', 'number_waiting']] = scaler.transform(X_test[['age', 'checkin_time', 'checkout_time', 'date', 'number_waiting']])

# Build a Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the Random Forest model
rf_model.fit(X_train, y_train)

# Make predictions
y_pred = rf_model.predict(X_test)

# Calculate evaluation metrics
rf_mae = mean_absolute_error(y_test, y_pred)
rf_mse = mean_squared_error(y_test, y_pred)
rf_r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error:", rf_mae)
print("Mean Squared Error:", rf_mse)
print("R-squared:", rf_r2)


Mean Absolute Error: 0.6479500000000005
Mean Squared Error: 7.702548500000001
R-squared: 0.9956098337167664


# Evaluation Results

In [None]:
import pandas as pd

# Create a DataFrame to display the evaluation metrics results
results = pd.DataFrame({
    'Model': ['Linear Regression', 'LightGBM', 'Random Forest'],
    '(MAE)': [linear_mae, gbm_mae, rf_mae],
    '(MSE)': [linear_mse, gbm_mse, rf_mse],
    '(R²)': [linear_r2, gbm_r2, rf_r2],
})

# Sort the results DataFrame by the '(R²)' column in ascending order
results_sorted = results.sort_values(by='(R²)')

# Display the sorted results DataFrame
print(results_sorted)



               Model      (MAE)        (MSE)      (R²)
0  Linear Regression  54.293485  4563.917652 -1.601263
1           LightGBM   1.121047    19.875889  0.988671
2      Random Forest   0.647950     7.702549  0.995610


## Predicted Wait Time from trained Random Forest Model

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder

# Load your data (replace 'your_data.csv' with the actual file path)
data = pd.read_csv('queue_data.csv')

# Assuming you have a DataFrame 'data' with columns:
# date, checkin_time, checkout_time, wait_time, gender, specialty, scheduled_appointment,
# first_timer, reason_of_visit, age

# Preprocessing
# Convert categorical variables to numerical using LabelEncoder
label_encoders = {}
categorical_features = ['gender', 'specialty', 'scheduled_appointment',
                        'first_timer', 'reason_visit']
for feature in categorical_features:
    le = LabelEncoder()
    data[feature] = le.fit_transform(data[feature])
    label_encoders[feature] = le

# Assuming you want to predict 'wait_time'
features = ['date', 'checkin_time', 'checkout_time', 'gender', 'specialty',
            'scheduled_appointment', 'first_timer', 'reason_visit', 'age', 'number_waiting']
X = data[features]
y = data['wait_time']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build and train a Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Show the results (predicted wait time alongside actual wait time)
results = pd.DataFrame({'Actual Wait Time': y_test, 'Predicted Wait Time': y_pred})
print(results.head())  # Display the first few rows of results

     Actual Wait Time  Predicted Wait Time
521                33                33.05
737                40                39.96
740                50                50.02
660                28                27.98
411                53                52.87


## Average Predicted Wait Time according to Number Waiting

In [None]:
import pandas as pd

# Assuming you have already made predictions and have a DataFrame 'results' with 'Actual Wait Time' and 'Predicted Wait Time'
# Also, 'X_test' should have a 'number_waiting' feature

# Create a DataFrame combining the actual and predicted wait times with 'number_waiting'
results_with_number_waiting = pd.concat([results, X_test['number_waiting']], axis=1)

# Group by 'number_waiting' and calculate the mean predicted wait time for each group
average_predicted_wait_time = results_with_number_waiting.groupby('number_waiting')['Predicted Wait Time'].mean().reset_index()

# Rename the columns for clarity
average_predicted_wait_time.columns = ['number_waiting', 'Average Predicted Wait Time']

# Display the average predicted wait time for each group as a table
print(average_predicted_wait_time)



    number_waiting  Average Predicted Wait Time
0                1                    12.907143
1                2                    20.778276
2                3                    28.317143
3                4                    34.040625
4                5                    39.486923
5                6                    44.223333
6                7                    50.628421
7                9                    89.020000
8               11                   110.010000
9               12                   112.192500
10              13                   126.444444
11              14                   131.295714
12              15                   139.836667
13              18                   155.290000
14              20                   209.600000


### Linear Regression model for Average Wait Time Prediction of Number Waiting from 21 to 30

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Assuming you already have the 'average_predicted_wait_time' DataFrame

# Extract relevant data
X = average_predicted_wait_time[['number_waiting']]
y = average_predicted_wait_time['Average Predicted Wait Time']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict average wait times for 'number_waiting' values in the range 21 to 30
number_waiting_range = list(range(21, 31))
predicted_wait_times = model.predict(pd.DataFrame(number_waiting_range, columns=['number_waiting']))

# Create a DataFrame to display the predictions in a table
prediction_table = pd.DataFrame({'number_waiting': number_waiting_range, 'Average Predicted Wait Time': predicted_wait_times})

# Display the prediction table
print(prediction_table)



   number_waiting  Average Predicted Wait Time
0              21                   203.283406
1              22                   213.295817
2              23                   223.308227
3              24                   233.320637
4              25                   243.333048
5              26                   253.345458
6              27                   263.357869
7              28                   273.370279
8              29                   283.382689
9              30                   293.395100


In [None]:
# Assuming you already have prediction tables for two scenarios: prediction_table1 and prediction_table2

# Combine the two prediction tables into one
combined_prediction_table = pd.concat([average_predicted_wait_time, prediction_table])

# Round the 'Predicted Wait Time' column to two decimal places
combined_prediction_table['Average Predicted Wait Time'] = combined_prediction_table['Average Predicted Wait Time'].round(2)

# Display the combined and rounded prediction table
print(combined_prediction_table)



    number_waiting  Average Predicted Wait Time
0                1                        12.91
1                2                        20.78
2                3                        28.32
3                4                        34.04
4                5                        39.49
5                6                        44.22
6                7                        50.63
7                9                        89.02
8               11                       110.01
9               12                       112.19
10              13                       126.44
11              14                       131.30
12              15                       139.84
13              18                       155.29
14              20                       209.60
0               21                       203.28
1               22                       213.30
2               23                       223.31
3               24                       233.32
4               25                      

In [None]:
import pandas as pd

# Assuming you have the combined_table DataFrame

# Specify the file path for the CSV file
csv_file_path = 'predicted_table.csv'

# Export the DataFrame to a CSV file
combined_prediction_table.to_csv(csv_file_path, index=False)

print(f"Combined table has been exported to {csv_file_path}")


Combined table has been exported to predicted_table.csv
