<a href="https://colab.research.google.com/github/hany69x/YAZAKI/blob/main/Scikit_Learn_%2B_PLOTLY%2BCompare.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
pip install dash

Collecting dash
  Downloading dash-2.17.1-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
Collecting dash-html-components==2.0.0 (from dash)
  Downloading dash_html_components-2.0.0-py3-none-any.whl (4.1 kB)
Collecting dash-core-components==2.0.0 (from dash)
  Downloading dash_core_components-2.0.0-py3-none-any.whl (3.8 kB)
Collecting dash-table==5.0.0 (from dash)
  Downloading dash_table-5.0.0-py3-none-any.whl (3.9 kB)
Collecting retrying (from dash)
  Downloading retrying-1.3.4-py3-none-any.whl (11 kB)
Installing collected packages: dash-table, dash-html-components, dash-core-components, retrying, dash
Successfully installed dash-2.17.1 dash-core-components-2.0.0 dash-html-components-2.0.0 dash-table-5.0.0 retrying-1.3.4


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
import plotly.graph_objects as go
import plotly.express as px

# Load the dataset
path = "/content/drive/MyDrive/DataSet/YazakiDSET.csv"
df = pd.read_csv(path)

# Remove the 'Sum of ' prefix from the column names
df.columns = [col.replace('Sum of ', '') for col in df.columns]

# Ensure 'Row Labels' is of string type
df['Row Labels'] = df['Row Labels'].astype(str)

# Verify the column names after replacement
print(df.columns)

# Melt the dataframe to long format
df_long = df.melt(id_vars=['Row Labels'], var_name='week_year', value_name='value')

# Extract year and week from the 'week_year' column
df_long['year'] = df_long['week_year'].str[:4].astype(int)
df_long['week'] = df_long['week_year'].str[4:].astype(int)

# Convert year and week to a datetime
df_long['ds'] = pd.to_datetime(df_long['year'].astype(str) + df_long['week'].astype(str) + '0', format='%Y%W%w')

# Filter out non-numeric rows (like 'grand total') if necessary
df_long = df_long[pd.to_numeric(df_long['value'], errors='coerce').notnull()]

# Optionally, filter out rows that are not relevant (e.g., 'grand total')
df_long = df_long[~df_long['Row Labels'].str.contains('grand total', case=False, na=False)]

# Handle missing values (example: forward fill)
df_long['value'] = df_long['value'].fillna(method='ffill')

# Feature engineering for time series
df_long['day_of_week'] = df_long['ds'].dt.dayofweek
df_long['day_of_month'] = df_long['ds'].dt.day
df_long['day_of_year'] = df_long['ds'].dt.dayofyear
df_long['week_of_year'] = df_long['ds'].dt.isocalendar().week
df_long['month'] = df_long['ds'].dt.month
df_long['quarter'] = df_long['ds'].dt.quarter
df_long['year'] = df_long['ds'].dt.year

# Lag features (e.g., previous week's value)
df_long['lag_1'] = df_long['value'].shift(1)
df_long['lag_2'] = df_long['value'].shift(2)
df_long['lag_3'] = df_long['value'].shift(3)

# Drop rows with NaN values (after creating lag features)
df_long = df_long.dropna()

# Print the dataframe to check the preprocessing
print(df_long.head())

# Print the date range in the dataset
print(f"Date range in dataset: {df_long['ds'].min()} to {df_long['ds'].max()}")

# Prepare features (X) and target (y)
X = df_long[['day_of_week', 'day_of_month', 'day_of_year', 'week_of_year', 'month', 'quarter', 'year', 'lag_1', 'lag_2', 'lag_3']]
y = df_long['value']

# Print the shape of the feature matrix and target vector
print(f"X shape: {X.shape}, y shape: {y.shape}")

# Choose an appropriate split date within the dataset date range
split_date = '2023-10-01'  # Adjusted split date within the dataset range

# Verify the split date
if not (df_long['ds'].min() <= pd.to_datetime(split_date) <= df_long['ds'].max()):
    raise ValueError(f"Split date {split_date} is outside the range of the dataset dates.")

# Split the data into training and testing sets
train = df_long[df_long['ds'] < split_date]
test = df_long[df_long['ds'] >= split_date]

X_train = train[['day_of_week', 'day_of_month', 'day_of_year', 'week_of_year', 'month', 'quarter', 'year', 'lag_1', 'lag_2', 'lag_3']]
y_train = train['value']
X_test = test[['day_of_week', 'day_of_month', 'day_of_year', 'week_of_year', 'month', 'quarter', 'year', 'lag_1', 'lag_2', 'lag_3']]
y_test = test['value']

# Print the shapes of the training and testing sets to verify they are not empty
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

# Check if the training set is empty
if X_train.shape[0] == 0 or y_train.shape[0] == 0:
    raise ValueError("Training set is empty. Check the split date and ensure there is data before this date.")

# Initialize and train the Random Forest Regressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Plotly: Actual vs Predicted Values
fig1 = go.Figure()
fig1.add_trace(go.Scatter(x=test['ds'], y=y_test, mode='lines', name='Actual', line=dict(color='blue')))
fig1.add_trace(go.Scatter(x=test['ds'], y=y_pred, mode='lines', name='Predicted', line=dict(color='red', dash='dash')))
fig1.update_layout(title='Actual vs Predicted Values', xaxis_title='Date', yaxis_title='Value')
fig1.show()

# Plotly: Feature Importances
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]
features = X.columns

fig2 = px.bar(x=importances[indices], y=[features[i] for i in indices], orientation='h')
fig2.update_layout(title='Feature Importances', xaxis_title='Importance', yaxis_title='Feature')
fig2.show()

# Plotly: Residuals Plot
residuals = y_test - y_pred
fig3 = go.Figure()
fig3.add_trace(go.Scatter(x=test['ds'], y=residuals, mode='markers', marker=dict(color='purple', opacity=0.6)))
fig3.add_trace(go.Scatter(x=test['ds'], y=[0]*len(test['ds']), mode='lines', line=dict(color='black', dash='dash')))
fig3.update_layout(title='Residuals Plot', xaxis_title='Date', yaxis_title='Residual')
fig3.show()

# Cross-validation
tscv = TimeSeriesSplit(n_splits=5)
cv_scores = cross_val_score(model, X, y, cv=tscv, scoring='neg_mean_squared_error')
cv_rmse = np.sqrt(-cv_scores)
print(f'Cross-validated RMSE: {cv_rmse.mean():.2f} ± {cv_rmse.std():.2f}')

# Plotly: Cross-Validation RMSE Scores
fig4 = px.box(x=cv_rmse, points="all")
fig4.update_layout(title='Cross-Validation RMSE Scores', xaxis_title='RMSE')
fig4.show()

# Plotly: Prediction vs Actual
fig5 = go.Figure()
fig5.add_trace(go.Scatter(x=y_test, y=y_pred, mode='markers', marker=dict(color='orange', opacity=0.6)))
fig5.add_trace(go.Scatter(x=[y_test.min(), y_test.max()], y=[y_test.min(), y_test.max()], mode='lines', line=dict(color='black', dash='dash')))
fig5.update_layout(title='Prediction vs Actual', xaxis_title='Actual Values', yaxis_title='Predicted Values')
fig5.show()

# Save the forecast
test['predicted'] = y_pred
test[['ds', 'value', 'predicted']].to_csv('forecast.csv', index=False)

# Save cross-validation metrics
cv_metrics = pd.DataFrame({'rmse': cv_rmse})
cv_metrics.to_csv('performance_metrics.csv', index=False)


Index(['Row Labels', '202312', '202313', '202314', '202315', '202317',
       '202319', '202320', '202321', '202322', '202318', '202316', '202323',
       '202324', '202338', '202339', '202340', '202344', '202346', '202350',
       '202348', '202347', '202345', '202343', '202342', '202341', '202325',
       '202326', '202327', '202328', '202329', '202330', '202331', '202332',
       '202333', '202334', '202335', '202336', '202337', '202349', '202351',
       '202352', '202401', '202402', '202403', '202404', '202405', '202407',
       '202406', '202408', '202409', '202410', '202411'],
      dtype='object')
  Row Labels week_year    value  year  week         ds  day_of_week  \
3     Line 3    202312  20188.0  2023    12 2023-03-26            6   
4     Line 4    202312  12186.0  2023    12 2023-03-26            6   
5     Line 5    202312   7470.0  2023    12 2023-03-26            6   
6     Line 6    202312   6203.0  2023    12 2023-03-26            6   
7     Line 7    202312    990.0 

Cross-validated RMSE: 9139.34 ± 1974.88




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff

# Load the dataset
path = "/content/drive/MyDrive/DataSet/YazakiDSET.csv"
df = pd.read_csv(path)

# Remove the 'Sum of ' prefix from the column names
df.columns = [col.replace('Sum of ', '') for col in df.columns]

# Ensure 'Row Labels' is of string type
df['Row Labels'] = df['Row Labels'].astype(str)

# Verify the column names after replacement
print(df.columns)

# Melt the dataframe to long format
df_long = df.melt(id_vars=['Row Labels'], var_name='week_year', value_name='value')

# Extract year and week from the 'week_year' column
df_long['year'] = df_long['week_year'].str[:4].astype(int)
df_long['week'] = df_long['week_year'].str[4:].astype(int)

# Convert year and week to a datetime
df_long['ds'] = pd.to_datetime(df_long['year'].astype(str) + df_long['week'].astype(str) + '0', format='%Y%W%w')

# Filter out non-numeric rows (like 'grand total') if necessary
df_long = df_long[pd.to_numeric(df_long['value'], errors='coerce').notnull()]

# Optionally, filter out rows that are not relevant (e.g., 'grand total')
df_long = df_long[~df_long['Row Labels'].str.contains('grand total', case=False, na=False)]

# Handle missing values (example: forward fill)
df_long['value'] = df_long['value'].fillna(method='ffill')

# Feature engineering for time series
df_long['day_of_week'] = df_long['ds'].dt.dayofweek
df_long['day_of_month'] = df_long['ds'].dt.day
df_long['day_of_year'] = df_long['ds'].dt.dayofyear
df_long['week_of_year'] = df_long['ds'].dt.isocalendar().week
df_long['month'] = df_long['ds'].dt.month
df_long['quarter'] = df_long['ds'].dt.quarter
df_long['year'] = df_long['ds'].dt.year
df_long['is_month_start'] = df_long['ds'].dt.is_month_start.astype(int)
df_long['is_month_end'] = df_long['ds'].dt.is_month_end.astype(int)

# Lag features (e.g., previous week's value)
for lag in range(1, 5):
    df_long[f'lag_{lag}'] = df_long['value'].shift(lag)

# Drop rows with NaN values (after creating lag features)
df_long = df_long.dropna()

# Print the dataframe to check the preprocessing
print(df_long.head())

# Print the date range in the dataset
print(f"Date range in dataset: {df_long['ds'].min()} to {df_long['ds'].max()}")

# Prepare features (X) and target (y)
feature_columns = ['day_of_week', 'day_of_month', 'day_of_year', 'week_of_year', 'month', 'quarter', 'year',
                   'is_month_start', 'is_month_end', 'lag_1', 'lag_2', 'lag_3', 'lag_4']
X = df_long[feature_columns]
y = df_long['value']

# Print the shape of the feature matrix and target vector
print(f"X shape: {X.shape}, y shape: {y.shape}")

# Choose an appropriate split date within the dataset date range
split_date = '2023-10-01'  # Adjusted split date within the dataset range

# Verify the split date
if not (df_long['ds'].min() <= pd.to_datetime(split_date) <= df_long['ds'].max()):
    raise ValueError(f"Split date {split_date} is outside the range of the dataset dates.")

# Split the data into training and testing sets
train = df_long[df_long['ds'] < split_date]
test = df_long[df_long['ds'] >= split_date]

X_train = train[feature_columns]
y_train = train['value']
X_test = test[feature_columns]
y_test = test['value']

# Print the shapes of the training and testing sets to verify they are not empty
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

# Check if the training set is empty
if X_train.shape[0] == 0 or y_train.shape[0] == 0:
    raise ValueError("Training set is empty. Check the split date and ensure there is data before this date.")

# Initialize and train the Random Forest Regressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Model Evaluation
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'MSE: {mse:.2f}, MAE: {mae:.2f}, R2: {r2:.2f}')

# Plotly: Actual vs Predicted Values
fig1 = go.Figure()
fig1.add_trace(go.Scatter(x=test['ds'], y=y_test, mode='lines', name='Actual', line=dict(color='blue')))
fig1.add_trace(go.Scatter(x=test['ds'], y=y_pred, mode='lines', name='Predicted', line=dict(color='red', dash='dash')))
fig1.update_layout(title='Actual vs Predicted Values', xaxis_title='Date', yaxis_title='Value')
fig1.show()

# Plotly: Feature Importances
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]
features = X.columns

fig2 = px.bar(x=importances[indices], y=[features[i] for i in indices], orientation='h')
fig2.update_layout(title='Feature Importances', xaxis_title='Importance', yaxis_title='Feature')
fig2.show()

# Plotly: Residuals Plot
residuals = y_test - y_pred
fig3 = go.Figure()
fig3.add_trace(go.Scatter(x=test['ds'], y=residuals, mode='markers', marker=dict(color='purple', opacity=0.6)))
fig3.add_trace(go.Scatter(x=test['ds'], y=[0]*len(test['ds']), mode='lines', line=dict(color='black', dash='dash')))
fig3.update_layout(title='Residuals Plot', xaxis_title='Date', yaxis_title='Residual')
fig3.show()

# Cross-validation
tscv = TimeSeriesSplit(n_splits=5)
cv_scores = cross_val_score(model, X, y, cv=tscv, scoring='neg_mean_squared_error')
cv_rmse = np.sqrt(-cv_scores)
print(f'Cross-validated RMSE: {cv_rmse.mean():.2f} ± {cv_rmse.std():.2f}')

# Plotly: Cross-Validation RMSE Scores
fig4 = px.box(x=cv_rmse, points="all")
fig4.update_layout(title='Cross-Validation RMSE Scores', xaxis_title='RMSE')
fig4.show()

# Plotly: Prediction vs Actual
fig5 = go.Figure()
fig5.add_trace(go.Scatter(x=y_test, y=y_pred, mode='markers', marker=dict(color='orange', opacity=0.6)))
fig5.add_trace(go.Scatter(x=[y_test.min(), y_test.max()], y=[y_test.min(), y_test.max()], mode='lines', line=dict(color='black', dash='dash')))
fig5.update_layout(title='Prediction vs Actual', xaxis_title='Actual Values', yaxis_title='Predicted Values')
fig5.show()

# Save the forecast
test['predicted'] = y_pred
test[['ds', 'value', 'predicted']].to_csv('forecast.csv', index=False)

# Save cross-validation metrics
cv_metrics = pd.DataFrame({'rmse': cv_rmse})
cv_metrics.to_csv('performance_metrics.csv', index=False)

# Dashboard with Plotly Dash
import dash
from dash import dcc, html
from dash.dependencies import Input, Output

app = dash.Dash(__name__)

app.layout = html.Div([
    html.H1("Time Series Forecasting Dashboard"),
    dcc.Tabs([
        dcc.Tab(label='Actual vs Predicted', children=[
            dcc.Graph(figure=fig1)
        ]),
        dcc.Tab(label='Feature Importances', children=[
            dcc.Graph(figure=fig2)
        ]),
        dcc.Tab(label='Residuals Plot', children=[
            dcc.Graph(figure=fig3)
        ]),
        dcc.Tab(label='Cross-Validation RMSE', children=[
            dcc.Graph(figure=fig4)
        ]),
        dcc.Tab(label='Prediction vs Actual', children=[
            dcc.Graph(figure=fig5)
        ]),
    ])
])

if __name__ == '__main__':
    app.run_server(debug=True)


Index(['Row Labels', '202312', '202313', '202314', '202315', '202317',
       '202319', '202320', '202321', '202322', '202318', '202316', '202323',
       '202324', '202338', '202339', '202340', '202344', '202346', '202350',
       '202348', '202347', '202345', '202343', '202342', '202341', '202325',
       '202326', '202327', '202328', '202329', '202330', '202331', '202332',
       '202333', '202334', '202335', '202336', '202337', '202349', '202351',
       '202352', '202401', '202402', '202403', '202404', '202405', '202407',
       '202406', '202408', '202409', '202410', '202411'],
      dtype='object')
  Row Labels week_year    value  year  week         ds  day_of_week  \
4     Line 4    202312  12186.0  2023    12 2023-03-26            6   
5     Line 5    202312   7470.0  2023    12 2023-03-26            6   
6     Line 6    202312   6203.0  2023    12 2023-03-26            6   
7     Line 7    202312    990.0  2023    12 2023-03-26            6   
8        SGB    202312   5670.0 

Cross-validated RMSE: 8844.50 ± 2147.02


<IPython.core.display.Javascript object>

In [4]:
# Load the dataset
path = "/content/drive/MyDrive/DataSet/YazakiDSET.csv"
df = pd.read_csv(path)

In [9]:
import pandas as pd
import os
import glob

def inspect_file(file_path):
    # Load the dataset
    df = pd.read_excel(file_path)
    # Display the first few rows and columns
    print(f"Inspecting file: {file_path}")
    print(df.head())
    print(df.columns)

# Directory containing the datasets
directory_path = '/content/drive/MyDrive/DATASETS'

# Get a list of all files in the directory
file_paths = glob.glob(os.path.join(directory_path, '*.xlsx'))

# Inspect the first file
if file_paths:
    inspect_file(file_paths[0])
else:
    print("No files found in the directory.")


Inspecting file: /content/drive/MyDrive/DATASETS/CW012023.xlsx
   Unnamed: 0     Unnamed: 1     Unnamed: 2     Unnamed: 3     Unnamed: 4  \
0   CW01.2023            NaN            NaN            NaN            NaN   
1  Row Labels  Sum of 202301  Sum of 202302  Sum of 202303  Sum of 202304   
2      Line 1           5245          25490          28696          26660   
3    Line 1-2           1034           1760           1782            770   
4      Line 2            346          19183          25728          28086   

      Unnamed: 5     Unnamed: 6     Unnamed: 7     Unnamed: 8     Unnamed: 9  \
0            NaN            NaN            NaN            NaN            NaN   
1  Sum of 202306  Sum of 202305  Sum of 202307  Sum of 202308  Sum of 202310   
2          32720          25050          30445          28835          27935   
3           1430           2002           2068           1386           2420   
4          24000          27986          26422          27854          252

In [11]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import glob
import os

def preprocess_and_train(file_path):
    try:
        # Load the dataset, skipping the first two rows
        df = pd.read_excel(file_path, skiprows=1)

        # Set the correct column names
        df.columns = df.iloc[0]
        df = df[1:]

        # Ensure all column names are of string type
        df.columns = df.columns.map(str)

        # Remove the 'Sum of ' prefix from the column names
        df.columns = df.columns.map(lambda x: x.replace('Sum of ', '') if isinstance(x, str) else x)

        # Ensure 'Row Labels' is of string type and handle NaN values
        df['Row Labels'] = df['Row Labels'].astype(str).fillna('Unknown')

        # Melt the dataframe to long format
        df_long = df.melt(id_vars=['Row Labels'], var_name='week_year', value_name='value')

        # Extract year and week from the 'week_year' column
        df_long['year'] = df_long['week_year'].str[:4].astype(int)
        df_long['week'] = df_long['week_year'].str[4:].astype(int)

        # Convert year and week to a datetime
        df_long['ds'] = pd.to_datetime(df_long['year'].astype(str) + df_long['week'].astype(str) + '0', format='%Y%W%w')

        # Filter out non-numeric rows (like 'grand total') if necessary
        df_long = df_long[pd.to_numeric(df_long['value'], errors='coerce').notnull()]

        # Optionally, filter out rows that are not relevant (e.g., 'grand total')
        df_long = df_long[~df_long['Row Labels'].str.contains('grand total', case=False, na=False)]

        # Handle missing values (example: forward fill)
        df_long['value'] = df_long['value'].fillna(method='ffill')

        # Feature engineering for time series
        df_long['day_of_week'] = df_long['ds'].dt.dayofweek
        df_long['day_of_month'] = df_long['ds'].dt.day
        df_long['day_of_year'] = df_long['ds'].dt.dayofyear
        df_long['week_of_year'] = df_long['ds'].dt.isocalendar().week
        df_long['month'] = df_long['ds'].dt.month
        df_long['quarter'] = df_long['ds'].dt.quarter
        df_long['year'] = df_long['ds'].dt.year
        df_long['is_month_start'] = df_long['ds'].dt.is_month_start.astype(int)
        df_long['is_month_end'] = df_long['ds'].dt.is_month_end.astype(int)

        # Lag features (e.g., previous week's value)
        for lag in range(1, 5):
            df_long[f'lag_{lag}'] = df_long['value'].shift(lag)

        # Drop rows with NaN values (after creating lag features)
        df_long = df_long.dropna()

        # Prepare features (X) and target (y)
        feature_columns = ['day_of_week', 'day_of_month', 'day_of_year', 'week_of_year', 'month', 'quarter', 'year',
                           'is_month_start', 'is_month_end', 'lag_1', 'lag_2', 'lag_3', 'lag_4']
        X = df_long[feature_columns]
        y = df_long['value']

        # Choose an appropriate split date within the dataset date range
        split_date = '2023-10-01'  # Adjusted split date within the dataset range

        # Split the data into training and testing sets
        train = df_long[df_long['ds'] < split_date]
        test = df_long[df_long['ds'] >= split_date]

        X_train = train[feature_columns]
        y_train = train['value']
        X_test = test[feature_columns]
        y_test = test['value']

        # Initialize and train the Random Forest Regressor
        model = RandomForestRegressor(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)

        # Make predictions
        y_pred = model.predict(X_test)

        # Model Evaluation
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        print(f'MSE: {mse:.2f}, MAE: {mae:.2f}, R2: {r2:.2f}')

        return {
            'mse': mse,
            'mae': mae,
            'r2': r2,
            'model': model,
            'y_test': y_test,
            'y_pred': y_pred,
            'test_ds': test['ds']
        }
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
        return None

# Directory containing the datasets
directory_path = '/content/drive/MyDrive/DATASETS'

# Get a list of all files in the directory
file_paths = glob.glob(os.path.join(directory_path, '*.xlsx'))

# Load and process each dataset
results = {}
for file_path in file_paths:
    print(f"Processing file: {file_path}")
    result = preprocess_and_train(file_path)
    if result:
        results[os.path.basename(file_path)] = result

# Compare results
for file_name, result in results.items():
    print(f"Results for {file_name}: MSE={result['mse']:.2f}, MAE={result['mae']:.2f}, R2={result['r2']:.2f}")

# You can add more comparisons or visualizations here if needed


Processing file: /content/drive/MyDrive/DATASETS/CW012023.xlsx
MSE: 137483106.58, MAE: 8714.92, R2: -0.05
Processing file: /content/drive/MyDrive/DATASETS/CW022023.xlsx
MSE: 257867259.92, MAE: 13355.12, R2: -0.96
Processing file: /content/drive/MyDrive/DATASETS/CW042023.xlsx
Error processing file /content/drive/MyDrive/DATASETS/CW042023.xlsx: 'Row Labels'
Processing file: /content/drive/MyDrive/DATASETS/CW062023.xlsx
Error processing file /content/drive/MyDrive/DATASETS/CW062023.xlsx: 'Row Labels'
Processing file: /content/drive/MyDrive/DATASETS/CW052023.xlsx
Error processing file /content/drive/MyDrive/DATASETS/CW052023.xlsx: 'Row Labels'
Processing file: /content/drive/MyDrive/DATASETS/CW072023.xlsx
MSE: 97485927.92, MAE: 7114.73, R2: 0.27
Processing file: /content/drive/MyDrive/DATASETS/CW0102023.xlsx
MSE: 105981052.14, MAE: 7463.49, R2: 0.09
Processing file: /content/drive/MyDrive/DATASETS/CW082023.xlsx
MSE: 132762829.32, MAE: 8523.58, R2: -0.12
Processing file: /content/drive/MyDr

In [13]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import glob
import os

def preprocess_and_train(file_path):
    try:
        # Load the dataset, skipping the first two rows
        df = pd.read_excel(file_path, skiprows=1)

        # Set the correct column names
        df.columns = df.iloc[0]
        df = df[1:]

        # Ensure all column names are of string type
        df.columns = df.columns.map(str)

        # Check if 'Row Labels' column exists
        if 'Row Labels' not in df.columns:
            raise ValueError("'Row Labels' column is missing")

        # Remove the 'Sum of ' prefix from the column names
        df.columns = df.columns.map(lambda x: x.replace('Sum of ', '') if isinstance(x, str) else x)

        # Ensure 'Row Labels' is of string type and handle NaN values
        df['Row Labels'] = df['Row Labels'].astype(str).fillna('Unknown')

        # Melt the dataframe to long format
        df_long = df.melt(id_vars=['Row Labels'], var_name='week_year', value_name='value')

        # Extract year and week from the 'week_year' column
        df_long['year'] = df_long['week_year'].str[:4].astype(int)
        df_long['week'] = df_long['week_year'].str[4:].astype(int)

        # Convert year and week to a datetime
        df_long['ds'] = pd.to_datetime(df_long['year'].astype(str) + df_long['week'].astype(str) + '0', format='%Y%W%w')

        # Filter out non-numeric rows (like 'grand total') if necessary
        df_long = df_long[pd.to_numeric(df_long['value'], errors='coerce').notnull()]

        # Optionally, filter out rows that are not relevant (e.g., 'grand total')
        df_long = df_long[~df_long['Row Labels'].str.contains('grand total', case=False, na=False)]

        # Handle missing values (example: forward fill)
        df_long['value'] = df_long['value'].fillna(method='ffill')

        # Feature engineering for time series
        df_long['day_of_week'] = df_long['ds'].dt.dayofweek
        df_long['day_of_month'] = df_long['ds'].dt.day
        df_long['day_of_year'] = df_long['ds'].dt.dayofyear
        df_long['week_of_year'] = df_long['ds'].dt.isocalendar().week
        df_long['month'] = df_long['ds'].dt.month
        df_long['quarter'] = df_long['ds'].dt.quarter
        df_long['year'] = df_long['ds'].dt.year
        df_long['is_month_start'] = df_long['ds'].dt.is_month_start.astype(int)
        df_long['is_month_end'] = df_long['ds'].dt.is_month_end.astype(int)

        # Lag features (e.g., previous week's value)
        for lag in range(1, 5):
            df_long[f'lag_{lag}'] = df_long['value'].shift(lag)

        # Drop rows with NaN values (after creating lag features)
        df_long = df_long.dropna()

        # Prepare features (X) and target (y)
        feature_columns = ['day_of_week', 'day_of_month', 'day_of_year', 'week_of_year', 'month', 'quarter', 'year',
                           'is_month_start', 'is_month_end', 'lag_1', 'lag_2', 'lag_3', 'lag_4']
        X = df_long[feature_columns]
        y = df_long['value']

        # Choose an appropriate split date within the dataset date range
        split_date = '2023-10-01'  # Adjusted split date within the dataset range

        # Split the data into training and testing sets
        train = df_long[df_long['ds'] < split_date]
        test = df_long[df_long['ds'] >= split_date]

        X_train = train[feature_columns]
        y_train = train['value']
        X_test = test[feature_columns]
        y_test = test['value']

        # Initialize and train the Random Forest Regressor
        model = RandomForestRegressor(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)

        # Make predictions
        y_pred = model.predict(X_test)

        # Model Evaluation
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        print(f'MSE: {mse:.2f}, MAE: {mae:.2f}, R2: {r2:.2f}')

        return {
            'mse': mse,
            'mae': mae,
            'r2': r2,
            'model': model,
            'y_test': y_test,
            'y_pred': y_pred,
            'test_ds': test['ds']
        }
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
        return None

# Directory containing the datasets
directory_path = '/content/drive/MyDrive/DATASETS'

# Get a list of all files in the directory
file_paths = glob.glob(os.path.join(directory_path, '*.xlsx'))

# Load and process each dataset
results = {}
for file_path in file_paths:
    print(f"Processing file: {file_path}")
    result = preprocess_and_train(file_path)
    if result:
        results[os.path.basename(file_path)] = result

# Compare results
for file_name, result in results.items():
    print(f"Results for {file_name}: MSE={result['mse']:.2f}, MAE={result['mae']:.2f}, R2={result['r2']:.2f}")

# Additional comparison and visualization code can be added here
# For example, you can create a DataFrame to store and compare metrics:
comparison_df = pd.DataFrame({
    'Dataset': results.keys(),
    'MSE': [result['mse'] for result in results.values()],
    'MAE': [result['mae'] for result in results.values()],
    'R2': [result['r2'] for result in results.values()]
})

print(comparison_df)

# Save the comparison results to a CSV file if needed
comparison_df.to_csv('/content/drive/MyDrive/DATASETS/comparison_results.csv', index=False)


Processing file: /content/drive/MyDrive/DATASETS/CW012023.xlsx
MSE: 137483106.58, MAE: 8714.92, R2: -0.05
Processing file: /content/drive/MyDrive/DATASETS/CW022023.xlsx
MSE: 257867259.92, MAE: 13355.12, R2: -0.96
Processing file: /content/drive/MyDrive/DATASETS/CW042023.xlsx
Error processing file /content/drive/MyDrive/DATASETS/CW042023.xlsx: 'Row Labels' column is missing
Processing file: /content/drive/MyDrive/DATASETS/CW062023.xlsx
Error processing file /content/drive/MyDrive/DATASETS/CW062023.xlsx: 'Row Labels' column is missing
Processing file: /content/drive/MyDrive/DATASETS/CW052023.xlsx
Error processing file /content/drive/MyDrive/DATASETS/CW052023.xlsx: 'Row Labels' column is missing
Processing file: /content/drive/MyDrive/DATASETS/CW072023.xlsx
MSE: 97485927.92, MAE: 7114.73, R2: 0.27
Processing file: /content/drive/MyDrive/DATASETS/CW0102023.xlsx
MSE: 105981052.14, MAE: 7463.49, R2: 0.09
Processing file: /content/drive/MyDrive/DATASETS/CW082023.xlsx
MSE: 132762829.32, MAE: 8

In [15]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import glob
import os
import plotly.graph_objects as go

def preprocess_and_train(file_path, split_date='2023-10-01'):
    try:
        # Load the dataset, skipping the first two rows
        df = pd.read_excel(file_path, skiprows=1)

        # Set the correct column names
        df.columns = df.iloc[0]
        df = df[1:]

        # Ensure all column names are of string type
        df.columns = df.columns.map(str)

        # Remove the 'Sum of ' prefix from the column names
        df.columns = df.columns.map(lambda x: x.replace('Sum of ', '') if isinstance(x, str) else x)

        # Ensure 'Row Labels' is of string type and handle NaN values
        df['Row Labels'] = df['Row Labels'].astype(str).fillna('Unknown')

        # Melt the dataframe to long format
        df_long = df.melt(id_vars=['Row Labels'], var_name='week_year', value_name='value')

        # Extract year and week from the 'week_year' column
        df_long['year'] = df_long['week_year'].str[:4].astype(int)
        df_long['week'] = df_long['week_year'].str[4:].astype(int)

        # Convert year and week to a datetime
        df_long['ds'] = pd.to_datetime(df_long['year'].astype(str) + df_long['week'].astype(str) + '0', format='%Y%W%w')

        # Filter out non-numeric rows (like 'grand total') if necessary
        df_long = df_long[pd.to_numeric(df_long['value'], errors='coerce').notnull()]

        # Handle missing values (example: forward fill)
        df_long['value'] = df_long['value'].fillna(method='ffill')

        # Feature engineering for time series
        df_long['day_of_week'] = df_long['ds'].dt.dayofweek
        df_long['day_of_month'] = df_long['ds'].dt.day
        df_long['day_of_year'] = df_long['ds'].dt.dayofyear
        df_long['week_of_year'] = df_long['ds'].dt.isocalendar().week
        df_long['month'] = df_long['ds'].dt.month
        df_long['quarter'] = df_long['ds'].dt.quarter
        df_long['year'] = df_long['ds'].dt.year
        df_long['is_month_start'] = df_long['ds'].dt.is_month_start.astype(int)
        df_long['is_month_end'] = df_long['ds'].dt.is_month_end.astype(int)

        # Lag features (e.g., previous week's value)
        for lag in range(1, 5):
            df_long[f'lag_{lag}'] = df_long['value'].shift(lag)

        # Drop rows with NaN values (after creating lag features)
        df_long = df_long.dropna()

        # Prepare features (X) and target (y)
        feature_columns = ['day_of_week', 'day_of_month', 'day_of_year', 'week_of_year', 'month', 'quarter', 'year',
                           'is_month_start', 'is_month_end', 'lag_1', 'lag_2', 'lag_3', 'lag_4']
        X = df_long[feature_columns]
        y = df_long['value']

        # Split the data into training and testing sets
        train = df_long[df_long['ds'] < split_date]
        test = df_long[df_long['ds'] >= split_date]

        X_train = train[feature_columns]
        y_train = train['value']
        X_test = test[feature_columns]
        y_test = test['value']

        # Initialize and train the Random Forest Regressor
        model = RandomForestRegressor(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)

        # Make predictions
        y_pred = model.predict(X_test)

        # Model Evaluation
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        print(f'MSE: {mse:.2f}, MAE: {mae:.2f}, R2: {r2:.2f}')

        # Store the predictions
        test['predicted'] = y_pred

        return {
            'mse': mse,
            'mae': mae,
            'r2': r2,
            'y_test': y_test,
            'y_pred': y_pred,
            'test_ds': test['ds'],
            'test': test[['ds', 'value', 'predicted']]
        }
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
        return None

# Directory containing the datasets
directory_path = '/content/drive/MyDrive/DATASETS'

# Get a list of all files in the directory
file_paths = glob.glob(os.path.join(directory_path, '*.xlsx'))

# Load and process each dataset
results = {}
for file_path in file_paths:
    print(f"Processing file: {file_path}")
    result = preprocess_and_train(file_path)
    if result:
        results[os.path.basename(file_path)] = result

# Combine results for comparison
combined_results = pd.DataFrame()

for file_name, result in results.items():
    result['test']['file'] = file_name
    combined_results = pd.concat([combined_results, result['test']], axis=0)

# Plotly: Comparison of predictions across datasets
fig = go.Figure()

for file_name in combined_results['file'].unique():
    df = combined_results[combined_results['file'] == file_name]
    fig.add_trace(go.Scatter(x=df['ds'], y=df['predicted'], mode='lines', name=f'Predicted {file_name}'))

fig.update_layout(title='Predicted Values Comparison Across Datasets',
                  xaxis_title='Date',
                  yaxis_title='Predicted Value',
                  legend_title='Dataset')
fig.show()

# Plotly: Actual vs Predicted for the latest dataset
latest_file = max(results, key=lambda x: x.split('.')[0])
latest_result = results[latest_file]
fig = go.Figure()

fig.add_trace(go.Scatter(x=latest_result['test_ds'], y=latest_result['y_test'], mode='lines', name='Actual'))
fig.add_trace(go.Scatter(x=latest_result['test_ds'], y=latest_result['y_pred'], mode='lines', name='Predicted', line=dict(dash='dash')))

fig.update_layout(title=f'Actual vs Predicted Values for {latest_file}',
                  xaxis_title='Date',
                  yaxis_title='Value')
fig.show()


Processing file: /content/drive/MyDrive/DATASETS/CW012023.xlsx
MSE: 597212081.29, MAE: 14321.05, R2: 0.60
Processing file: /content/drive/MyDrive/DATASETS/CW022023.xlsx
MSE: 1157684176.42, MAE: 22151.04, R2: 0.22
Processing file: /content/drive/MyDrive/DATASETS/CW042023.xlsx
Error processing file /content/drive/MyDrive/DATASETS/CW042023.xlsx: 'Row Labels'
Processing file: /content/drive/MyDrive/DATASETS/CW062023.xlsx
Error processing file /content/drive/MyDrive/DATASETS/CW062023.xlsx: 'Row Labels'
Processing file: /content/drive/MyDrive/DATASETS/CW052023.xlsx
Error processing file /content/drive/MyDrive/DATASETS/CW052023.xlsx: 'Row Labels'
Processing file: /content/drive/MyDrive/DATASETS/CW072023.xlsx
MSE: 97485927.92, MAE: 7114.73, R2: 0.27
Processing file: /content/drive/MyDrive/DATASETS/CW0102023.xlsx
MSE: 105981052.14, MAE: 7463.49, R2: 0.09
Processing file: /content/drive/MyDrive/DATASETS/CW082023.xlsx
MSE: 1570106369.52, MAE: 25564.79, R2: 0.05
Processing file: /content/drive/MyD

In [16]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import glob
import os
import plotly.graph_objects as go
import plotly.express as px

def preprocess_and_train(file_path, split_date='2023-10-01'):
    try:
        # Load the dataset, skipping the first two rows
        df = pd.read_excel(file_path, skiprows=1)

        # Set the correct column names
        df.columns = df.iloc[0]
        df = df[1:]

        # Ensure all column names are of string type
        df.columns = df.columns.map(str)

        # Remove the 'Sum of ' prefix from the column names
        df.columns = df.columns.map(lambda x: x.replace('Sum of ', '') if isinstance(x, str) else x)

        # Ensure 'Row Labels' is of string type and handle NaN values
        df['Row Labels'] = df['Row Labels'].astype(str).fillna('Unknown')

        # Melt the dataframe to long format
        df_long = df.melt(id_vars=['Row Labels'], var_name='week_year', value_name='value')

        # Extract year and week from the 'week_year' column
        df_long['year'] = df_long['week_year'].str[:4].astype(int)
        df_long['week'] = df_long['week_year'].str[4:].astype(int)

        # Convert year and week to a datetime
        df_long['ds'] = pd.to_datetime(df_long['year'].astype(str) + df_long['week'].astype(str) + '0', format='%Y%W%w')

        # Filter out non-numeric rows (like 'grand total') if necessary
        df_long = df_long[pd.to_numeric(df_long['value'], errors='coerce').notnull()]

        # Handle missing values (example: forward fill)
        df_long['value'] = df_long['value'].fillna(method='ffill')

        # Feature engineering for time series
        df_long['day_of_week'] = df_long['ds'].dt.dayofweek
        df_long['day_of_month'] = df_long['ds'].dt.day
        df_long['day_of_year'] = df_long['ds'].dt.dayofyear
        df_long['week_of_year'] = df_long['ds'].dt.isocalendar().week
        df_long['month'] = df_long['ds'].dt.month
        df_long['quarter'] = df_long['ds'].dt.quarter
        df_long['year'] = df_long['ds'].dt.year
        df_long['is_month_start'] = df_long['ds'].dt.is_month_start.astype(int)
        df_long['is_month_end'] = df_long['ds'].dt.is_month_end.astype(int)

        # Lag features (e.g., previous week's value)
        for lag in range(1, 5):
            df_long[f'lag_{lag}'] = df_long['value'].shift(lag)

        # Drop rows with NaN values (after creating lag features)
        df_long = df_long.dropna()

        # Prepare features (X) and target (y)
        feature_columns = ['day_of_week', 'day_of_month', 'day_of_year', 'week_of_year', 'month', 'quarter', 'year',
                           'is_month_start', 'is_month_end', 'lag_1', 'lag_2', 'lag_3', 'lag_4']
        X = df_long[feature_columns]
        y = df_long['value']

        # Split the data into training and testing sets
        train = df_long[df_long['ds'] < split_date]
        test = df_long[df_long['ds'] >= split_date]

        X_train = train[feature_columns]
        y_train = train['value']
        X_test = test[feature_columns]
        y_test = test['value']

        # Initialize and train the Random Forest Regressor
        model = RandomForestRegressor(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)

        # Make predictions
        y_pred = model.predict(X_test)

        # Model Evaluation
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        print(f'MSE: {mse:.2f}, MAE: {mae:.2f}, R2: {r2:.2f}')

        # Store the predictions
        test['predicted'] = y_pred

        return {
            'mse': mse,
            'mae': mae,
            'r2': r2,
            'y_test': y_test,
            'y_pred': y_pred,
            'test_ds': test['ds'],
            'test': test[['ds', 'value', 'predicted']]
        }
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
        return None

# Directory containing the datasets
directory_path = '/content/drive/MyDrive/DATASETS'

# Get a list of all files in the directory
file_paths = glob.glob(os.path.join(directory_path, '*.xlsx'))

# Load and process each dataset
results = {}
for file_path in file_paths:
    print(f"Processing file: {file_path}")
    result = preprocess_and_train(file_path)
    if result:
        results[os.path.basename(file_path)] = result

# Combine results for comparison
combined_results = pd.DataFrame()

for file_name, result in results.items():
    result['test']['file'] = file_name
    combined_results = pd.concat([combined_results, result['test']], axis=0)

# Plotly: Comparison of predictions across datasets
fig = go.Figure()

for file_name in combined_results['file'].unique():
    df = combined_results[combined_results['file'] == file_name]
    fig.add_trace(go.Scatter(x=df['ds'], y=df['predicted'], mode='lines', name=f'Predicted {file_name}'))

fig.update_layout(title='Predicted Values Comparison Across Datasets',
                  xaxis_title='Date',
                  yaxis_title='Predicted Value',
                  legend_title='Dataset')
fig.show()

# Plotly: Actual vs Predicted for the latest dataset
latest_file = max(results, key=lambda x: x.split('.')[0])
latest_result = results[latest_file]
fig = go.Figure()

fig.add_trace(go.Scatter(x=latest_result['test_ds'], y=latest_result['y_test'], mode='lines', name='Actual'))
fig.add_trace(go.Scatter(x=latest_result['test_ds'], y=latest_result['y_pred'], mode='lines', name='Predicted', line=dict(dash='dash')))

fig.update_layout(title=f'Actual vs Predicted Values for {latest_file}',
                  xaxis_title='Date',
                  yaxis_title='Value')
fig.show()

# Plotly: Bar chart for weekly demand changes (actual vs predicted)
fig = go.Figure()

fig.add_trace(go.Bar(x=latest_result['test_ds'], y=latest_result['y_test'], name='Actual'))
fig.add_trace(go.Bar(x=latest_result['test_ds'], y=latest_result['y_pred'], name='Predicted'))

fig.update_layout(title=f'Weekly Demand Changes (Actual vs Predicted) for {latest_file}',
                  xaxis_title='Date',
                  yaxis_title='Value',
                  barmode='group')
fig.show()

# Plotly: Error metrics over time
error_metrics = pd.DataFrame({
    'file': list(results.keys()),
    'MSE': [results[file]['mse'] for file in results],
    'MAE': [results[file]['mae'] for file in results],
    'R2': [results[file]['r2'] for file in results]
})

fig = go.Figure()

fig.add_trace(go.Scatter(x=error_metrics['file'], y=error_metrics['MSE'], mode='lines+markers', name='MSE'))
fig.add_trace(go.Scatter(x=error_metrics['file'], y=error_metrics['MAE'], mode='lines+markers', name='MAE'))
fig.add_trace(go.Scatter(x=error_metrics['file'], y=error_metrics['R2'], mode='lines+markers', name='R2'))

fig.update_layout(title='Error Metrics Over Time',
                  xaxis_title='Dataset',
                  yaxis_title='Error Metric',
                  legend_title='Metric')
fig.show()


Processing file: /content/drive/MyDrive/DATASETS/CW012023.xlsx
MSE: 597212081.29, MAE: 14321.05, R2: 0.60
Processing file: /content/drive/MyDrive/DATASETS/CW022023.xlsx
MSE: 1157684176.42, MAE: 22151.04, R2: 0.22
Processing file: /content/drive/MyDrive/DATASETS/CW042023.xlsx
Error processing file /content/drive/MyDrive/DATASETS/CW042023.xlsx: 'Row Labels'
Processing file: /content/drive/MyDrive/DATASETS/CW062023.xlsx
Error processing file /content/drive/MyDrive/DATASETS/CW062023.xlsx: 'Row Labels'
Processing file: /content/drive/MyDrive/DATASETS/CW052023.xlsx
Error processing file /content/drive/MyDrive/DATASETS/CW052023.xlsx: 'Row Labels'
Processing file: /content/drive/MyDrive/DATASETS/CW072023.xlsx
MSE: 97485927.92, MAE: 7114.73, R2: 0.27
Processing file: /content/drive/MyDrive/DATASETS/CW0102023.xlsx
MSE: 105981052.14, MAE: 7463.49, R2: 0.09
Processing file: /content/drive/MyDrive/DATASETS/CW082023.xlsx
MSE: 1570106369.52, MAE: 25564.79, R2: 0.05
Processing file: /content/drive/MyD

In [17]:
import re

# Function to convert dataset filenames to readable format
def convert_filename(filename):
    match = re.match(r'CW(\d{3})(\d{4})\.xlsx', filename)
    if match:
        week = int(match.group(1))
        year = match.group(2)
        return f"Week {week} of {year}"
    return filename

# Combine results for comparison
combined_results = pd.DataFrame()

for file_name, result in results.items():
    result['test']['file'] = convert_filename(file_name)
    combined_results = pd.concat([combined_results, result['test']], axis=0)

# Plotly: Comparison of predictions across datasets
fig = go.Figure()

for file_name in combined_results['file'].unique():
    df = combined_results[combined_results['file'] == file_name]
    fig.add_trace(go.Scatter(x=df['ds'], y=df['predicted'], mode='lines', name=f'Predicted {file_name}'))

fig.update_layout(title='Predicted Values Comparison Across Datasets',
                  xaxis_title='Date',
                  yaxis_title='Predicted Value',
                  legend_title='Dataset')
fig.show()

# Plotly: Actual vs Predicted for the latest dataset
latest_file = max(results, key=lambda x: x.split('.')[0])
latest_file_converted = convert_filename(latest_file)
latest_result = results[latest_file]
fig = go.Figure()

fig.add_trace(go.Scatter(x=latest_result['test_ds'], y=latest_result['y_test'], mode='lines', name='Actual'))
fig.add_trace(go.Scatter(x=latest_result['test_ds'], y=latest_result['y_pred'], mode='lines', name='Predicted', line=dict(dash='dash')))

fig.update_layout(title=f'Actual vs Predicted Values for {latest_file_converted}',
                  xaxis_title='Date',
                  yaxis_title='Value')
fig.show()

# Plotly: Bar chart for weekly demand changes (actual vs predicted)
fig = go.Figure()

fig.add_trace(go.Bar(x=latest_result['test_ds'], y=latest_result['y_test'], name='Actual'))
fig.add_trace(go.Bar(x=latest_result['test_ds'], y=latest_result['y_pred'], name='Predicted'))

fig.update_layout(title=f'Weekly Demand Changes (Actual vs Predicted) for {latest_file_converted}',
                  xaxis_title='Date',
                  yaxis_title='Value',
                  barmode='group')
fig.show()

# Plotly: Error metrics over time
error_metrics = pd.DataFrame({
    'file': [convert_filename(file) for file in results.keys()],
    'MSE': [results[file]['mse'] for file in results],
    'MAE': [results[file]['mae'] for file in results],
    'R2': [results[file]['r2'] for file in results]
})

fig = go.Figure()

fig.add_trace(go.Scatter(x=error_metrics['file'], y=error_metrics['MSE'], mode='lines+markers', name='MSE'))
fig.add_trace(go.Scatter(x=error_metrics['file'], y=error_metrics['MAE'], mode='lines+markers', name='MAE'))
fig.add_trace(go.Scatter(x=error_metrics['file'], y=error_metrics['R2'], mode='lines+markers', name='R2'))

fig.update_layout(title='Error Metrics Over Time',
                  xaxis_title='Dataset',
                  yaxis_title='Error Metric',
                  legend_title='Metric')
fig.show()
