In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.arima.model import ARIMA
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from keras.models import Sequential
from keras.layers import LSTM, Dense, Input
import numpy as np

2024-10-05 10:22:57.526919: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-05 10:22:57.540973: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-05 10:22:57.545207: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-05 10:22:57.556738: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
# Load the CSV file
file_path = '/workspaces/codespaces-jupyter/Medical Inventory Optimization Dataset - Cleaned.csv'
data = pd.read_csv(file_path)

In [5]:
# Convert Dateofbill to datetime
data['Dateofbill'] = pd.to_datetime(data['Dateofbill'])

# Encode 'Dept' and 'Specialisation' as integers
label_encoder_dept = LabelEncoder()
label_encoder_specialisation = LabelEncoder()
data['Dept'] = label_encoder_dept.fit_transform(data['Dept'])
data['Specialisation'] = label_encoder_specialisation.fit_transform(data['Specialisation'])


In [6]:
# Aggregate Quantity and Final_Sales by DrugName, Dateofbill, Specialisation, and Dept
aggregated_data = data.groupby(['DrugName', 'Dateofbill', 'Specialisation', 'Dept']).agg({
    'Quantity': 'sum',
    'Final_Sales': 'sum'
}).reset_index()

# Extract date features
aggregated_data['Month'] = aggregated_data['Dateofbill'].dt.month
aggregated_data['Day'] = aggregated_data['Dateofbill'].dt.day
aggregated_data['Year'] = aggregated_data['Dateofbill'].dt.year

# Calculate rolling averages for Quantity and Final_Sales
aggregated_data['Quantity_MA'] = aggregated_data.groupby('DrugName')['Quantity'].transform(lambda x: x.rolling(window=7, min_periods=1).mean())
aggregated_data['Final_Sales_MA'] = aggregated_data.groupby('DrugName')['Final_Sales'].transform(lambda x: x.rolling(window=7, min_periods=1).mean())


In [7]:

# Prepare data for modeling
X = aggregated_data[['Month', 'Day', 'Year', 'Quantity_MA', 'Final_Sales_MA', 'Specialisation', 'Dept']]
y = aggregated_data['Final_Sales']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:
# Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_predictions = lr_model.predict(X_test)
lr_rmse = mean_squared_error(y_test, lr_predictions, squared=False)




In [9]:
# ARIMA
y_train.index = pd.date_range(start='2020-01-01', periods=len(y_train), freq='D')
arima_model = ARIMA(y_train, order=(5, 1, 0))
arima_model_fit = arima_model.fit()
arima_predictions = arima_model_fit.forecast(steps=len(y_test))
arima_rmse = mean_squared_error(y_test, arima_predictions, squared=False)




In [10]:
# Random Forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)
rf_rmse = mean_squared_error(y_test, rf_predictions, squared=False)




In [11]:
# LSTM
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X_train_scaled, X_test_scaled = train_test_split(X_scaled, test_size=0.2, random_state=42)

X_train_lstm = X_train_scaled.reshape((X_train_scaled.shape[0], 1, X_train_scaled.shape[1]))
X_test_lstm = X_test_scaled.reshape((X_test_scaled.shape[0], 1, X_test_scaled.shape[1]))

lstm_model = Sequential()
lstm_model.add(Input(shape=(1, X_train_scaled.shape[1])))
lstm_model.add(LSTM(50, return_sequences=True))
lstm_model.add(LSTM(50))
lstm_model.add(Dense(1))
lstm_model.compile(optimizer='adam', loss='mean_squared_error')

lstm_model.fit(X_train_lstm, y_train, epochs=10, batch_size=32)
lstm_predictions = lstm_model.predict(X_test_lstm)
lstm_rmse = mean_squared_error(y_test, lstm_predictions, squared=False)


Epoch 1/10
[1m303/303[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 651974.7500
Epoch 2/10
[1m303/303[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 783578.0000
Epoch 3/10
[1m303/303[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 725352.6875
Epoch 4/10
[1m303/303[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 831542.1875
Epoch 5/10
[1m303/303[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 626252.8125
Epoch 6/10
[1m303/303[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 506905.5312
Epoch 7/10
[1m303/303[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 579367.8750
Epoch 8/10
[1m303/303[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 595712.1875
Epoch 9/10
[1m303/303[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 562686.5625
Epoch 10/10
[1m303/303[0m [32m━━━━━━━━━━━━━



In [12]:
# Print RMSE for each model
print(f'Linear Regression RMSE: {lr_rmse}')
print(f'ARIMA RMSE: {arima_rmse}')
print(f'Random Forest RMSE: {rf_rmse}')
print(f'LSTM RMSE: {lstm_rmse}')


Linear Regression RMSE: 287.79125877825635
ARIMA RMSE: 575.586428964685
Random Forest RMSE: 332.06248832435256
LSTM RMSE: 580.0171051094078


In [13]:
# Predict future sales
future_dates = pd.date_range(start=aggregated_data['Dateofbill'].max(), periods=30, freq='D')
future_data = pd.DataFrame({
    'Dateofbill': future_dates,
    'Month': future_dates.month,
    'Day': future_dates.day,
    'Year': future_dates.year
})


In [14]:
# Calculate moving averages for future dates based on historical data
last_known_date = aggregated_data['Dateofbill'].max()
last_known_quantity_ma = aggregated_data.loc[aggregated_data['Dateofbill'] == last_known_date, 'Quantity_MA'].values[0]
last_known_final_sales_ma = aggregated_data.loc[aggregated_data['Dateofbill'] == last_known_date, 'Final_Sales_MA'].values[0]

future_data['Quantity_MA'] = last_known_quantity_ma
future_data['Final_Sales_MA'] = last_known_final_sales_ma

# Add encoded 'Specialisation' and 'Dept' columns to future_data
# Assuming the future data has the same 'Specialisation' and 'Dept' as the last known data
last_known_specialisation = aggregated_data.loc[aggregated_data['Dateofbill'] == last_known_date, 'Specialisation'].values[0]
last_known_dept = aggregated_data.loc[aggregated_data['Dateofbill'] == last_known_date, 'Dept'].values[0]

future_data['Specialisation'] = last_known_specialisation
future_data['Dept'] = last_known_dept


In [15]:
# Use the best model (e.g., LSTM) to predict future sales
future_X = future_data[['Month', 'Day', 'Year', 'Quantity_MA', 'Final_Sales_MA', 'Specialisation', 'Dept']]
future_X_scaled = scaler.transform(future_X)
future_X_lstm = future_X_scaled.reshape((future_X_scaled.shape[0], 1, future_X_scaled.shape[1]))

# Use the appropriate input dimensions for each model
future_predictions_lr = lr_model.predict(future_X_scaled)
future_predictions_lstm = lstm_model.predict(future_X_lstm)

# Optimize stock levels based on predictions
optimal_stock_levels_lr = future_predictions_lr.flatten() * 1.1  # Adding 10% buffer
optimal_stock_levels_lstm = future_predictions_lstm.flatten() * 1.1  # Adding 10% buffer

# Display future predictions and optimal stock levels for both models
future_data['Predicted_Sales_LR'] = future_predictions_lr
future_data['Optimal_Stock_Level_LR'] = optimal_stock_levels_lr
future_data['Predicted_Sales_LSTM'] = future_predictions_lstm
future_data['Optimal_Stock_Level_LSTM'] = optimal_stock_levels_lstm

print(future_data[['Dateofbill', 'Predicted_Sales_LR', 'Optimal_Stock_Level_LR', 'Predicted_Sales_LSTM', 'Optimal_Stock_Level_LSTM']])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
   Dateofbill  Predicted_Sales_LR  Optimal_Stock_Level_LR  \
0  2022-12-31          -12.885684              -14.174253   
1  2023-01-01          -13.184063              -14.502469   
2  2023-01-02          -13.186500              -14.505150   
3  2023-01-03          -13.188938              -14.507832   
4  2023-01-04          -13.191376              -14.510513   
5  2023-01-05          -13.193814              -14.513195   
6  2023-01-06          -13.196251              -14.515877   
7  2023-01-07          -13.198689              -14.518558   
8  2023-01-08          -13.201127              -14.521240   
9  2023-01-09          -13.203565              -14.523921   
10 2023-01-10          -13.206003              -14.526603   
11 2023-01-11          -13.208440              -14.529284   
12 2023-01-12          -13.210878              -14.531966   
13 2023-01-13          -13.213316              -14.534648   
14 2023-01-14

