# AutoML: Train "the best" Time-Series Forecasting Model for a Retail Dataset

# 1. Connect to Azure ML Workspace

In [None]:
import warnings
import logging

# Silence noisy OpenTelemetry warnings; each entry is matched against the
# start of the warning message.
for _noisy_message in ("Overriding of current", "Attempting to instrument"):
    warnings.filterwarnings("ignore", message=_noisy_message)

# Raise the log threshold of the chatty Azure SDK / telemetry loggers.
for _logger_name, _threshold in (
    ("azure.core.pipeline.policies.http_logging_policy", logging.WARNING),
    ("azure.identity", logging.WARNING),
    ("opentelemetry", logging.ERROR),
):
    logging.getLogger(_logger_name).setLevel(_threshold)

In [None]:
# Import required libraries
from azure.ai.ml import MLClient

from azure.ai.ml.constants import AssetTypes
from azure.ai.ml import automl
from azure.ai.ml import Input

In [None]:
from azure.identity import AzureCliCredential

# Authenticate with the identity of the current Azure CLI session.
credential = AzureCliCredential()

# Coordinates of the Azure ML workspace this notebook works against.
subscription_id = "57123c17-af1a-4ec2-9494-a214fb148bf4"
resource_group = "admin-rg"
workspace = "ml-demo-wksp-wus-01"

ml_client = None
try:
    ml_client = MLClient(credential, subscription_id, resource_group, workspace)
except Exception as ex:
    # Report the problem, then re-raise: continuing with ml_client = None
    # would only fail later with an unrelated AttributeError.
    print("Ex:", ex)
    raise

In [None]:
# Sanity check: look up the workspace the client is bound to and report
# its name and location.
ws = ml_client.workspaces.get(name=ml_client.workspace_name)
connected_name, connected_location = ws.name, ws.location
print(f"Connected to: {connected_name} ({connected_location})")

# 2. Data Preparation

This notebook uses the [Retail Data Analytics](https://www.kaggle.com/datasets/manjeetsingh/retaildataset) dataset from Kaggle, which contains weekly sales by store and department.

## 2.1 Load Datasets


In [None]:
import pandas as pd

# Read the three raw CSV files of the retail dataset
stores_df = pd.read_csv('../dataset/stores data-set.csv')
features_df = pd.read_csv('../dataset/Features data set.csv')
sales_df = pd.read_csv('../dataset/sales data-set.csv')

# Label -> frame, in the order the overview below is printed
_raw_frames = {"Stores": stores_df, "Features": features_df, "Sales": sales_df}

# Shapes first ...
for _label, _frame in _raw_frames.items():
    print(f"{_label}: {_frame.shape}")

# ... then the first rows of every frame
for _label, _frame in _raw_frames.items():
    print(f"\n--- {_label} Data ---")
    display(_frame.head())


## 2.2 Merge Datasets


In [None]:
# Build the modelling table in one chain:
#   1. attach the store attributes to every sales row (key: Store)
#   2. attach the features recorded for that store and date (keys: Store, Date)
#   3. drop the IsHoliday copy that came from the features file; it is the
#      one that received the '_feat' suffix
merged_df = (
    sales_df
    .merge(stores_df, how='left', on='Store')
    .merge(features_df, how='left', on=['Store', 'Date'], suffixes=('', '_feat'))
    .drop(columns=['IsHoliday_feat'])
)

merged_columns = merged_df.columns.tolist()
print(f"Merged dataset shape: {merged_df.shape}")
print(f"\nColumns: {merged_columns}")
display(merged_df.head())


## 2.3 Feature Engineering


In [None]:
# Parse the date strings; the day comes first (dd/mm/yyyy)
merged_df['Date'] = pd.to_datetime(merged_df['Date'], dayfirst=True)

# Calendar features derived from the parsed date
date_parts = merged_df['Date'].dt
merged_df['Year'] = date_parts.year
merged_df['Month'] = date_parts.month
merged_df['Week'] = date_parts.isocalendar().week
merged_df['DayOfWeek'] = date_parts.dayofweek

# MarkDown1..MarkDown5 are only available after Nov 2011: treat a missing
# markdown as "no markdown" (0)
markdown_cols = [f'MarkDown{i}' for i in range(1, 6)]
merged_df[markdown_cols] = merged_df[markdown_cols].fillna(0)

# One-hot encode the store type (A, B, C) when the column is present
has_store_type = 'Type' in merged_df.columns
if has_store_type:
    merged_df = pd.get_dummies(merged_df, columns=['Type'], prefix='StoreType')

print(f"Feature engineered dataset: {merged_df.shape}")
display(merged_df.head())


## 2.4 Train/Validation Split


In [None]:
# Order the rows per series (store, department) and chronologically within it
merged_df = merged_df.sort_values(['Store', 'Dept', 'Date'])

# Time-based split: everything before 2012 trains, 2012 onwards validates
row_year = merged_df['Year']
train_df = merged_df.loc[row_year < 2012].copy()
validation_df = merged_df.loc[row_year >= 2012].copy()

print(f"Training set: {train_df.shape}")
print(f"Validation set: {validation_df.shape}")

train_dates = train_df['Date']
validation_dates = validation_df['Date']
print(f"\nTrain date range: {train_dates.min()} to {train_dates.max()}")
print(f"Validation date range: {validation_dates.min()} to {validation_dates.max()}")

# Share of rows that ended up in each part
n_train = len(train_df)
n_validation = len(validation_df)
n_total = n_train + n_validation
print(f"\nTrain/Validation split ratio: {n_train/n_total*100:.1f}% / {n_validation/n_total*100:.1f}%")


## 2.5 Prepare MLTable


In [None]:
import os

# Column names used by the AutoML job: target -> 'demand', time -> 'timeStamp'
automl_column_names = {'Weekly_Sales': 'demand', 'Date': 'timeStamp'}
train_df = train_df.rename(columns=automl_column_names)
validation_df = validation_df.rename(columns=automl_column_names)

# One folder per MLTable; each folder receives the CSV its MLTable references
train_csv_path = './data/training-mltable-folder/train.csv'
validation_csv_path = './data/validation-mltable-folder/validation.csv'
csv_exports = ((train_df, train_csv_path), (validation_df, validation_csv_path))

# Create both output folders first ...
for _, csv_path in csv_exports:
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)

# ... then write the data without the pandas index
for frame, csv_path in csv_exports:
    frame.to_csv(csv_path, index=False)

print(f"Training data saved to: {train_csv_path}")
print(f"Validation data saved to: {validation_csv_path}")
print("\nColumns in final datasets:")
print(train_df.columns.tolist())


In [None]:
# Create the MLTable definition files for Azure ML.
# Both definitions are identical except for the CSV file they point to, so
# they are rendered from one template.
mltable_template = """paths:
  - file: ./{csv_name}
transformations:
  - read_delimited:
      delimiter: ','
      header: all_files_same_headers
"""

mltable_train = mltable_template.format(csv_name="train.csv")
mltable_val = mltable_template.format(csv_name="validation.csv")

# (destination file, YAML content) for every MLTable folder
mltable_targets = (
    ('./data/training-mltable-folder/MLTable', mltable_train),
    ('./data/validation-mltable-folder/MLTable', mltable_val),
)

for mltable_path, mltable_yaml in mltable_targets:
    with open(mltable_path, 'w') as f:
        f.write(mltable_yaml)

print("MLTable files created:")
for mltable_path, _ in mltable_targets:
    print(f"  - {mltable_path}")


## 2.6 Upload to Azure Blob Storage


In [None]:
import subprocess

# Upload target in Azure Storage: the blob container that receives the data
# and the storage account passed to the Azure CLI as --account-name.
CONTAINER = "azureml-blobstore-cff56e3a-d016-4526-aa58-71c460675066"
STORAGE_ACCOUNT = "mldemowkspwus02609576373"

def upload_to_blob(source_folder, destination_path):
    """Upload a local folder to Azure Blob Storage with the Azure CLI.

    Runs ``az storage blob upload-batch`` with ``--auth-mode login`` (the
    identity of the current CLI session) and ``--overwrite``, targeting
    ``CONTAINER/destination_path`` in ``STORAGE_ACCOUNT``.

    Args:
        source_folder: Local folder whose content is uploaded.
        destination_path: Path inside the container that receives the files.

    Returns:
        True when the CLI exits with return code 0; False when it exits with
        any other code or when the ``az`` executable cannot be found.
    """
    import shutil  # local import keeps this function self-contained

    # Resolve the executable up front: on Windows the CLI is installed as
    # ``az.cmd``, which subprocess cannot start from the bare name "az"
    # without a shell, and a missing CLI would otherwise surface as a
    # FileNotFoundError instead of the boolean result callers expect.
    az_executable = shutil.which("az")
    if az_executable is None:
        print("✗ Upload failed: Azure CLI executable 'az' was not found on PATH")
        return False

    cmd = [
        az_executable, "storage", "blob", "upload-batch",
        "--account-name", STORAGE_ACCOUNT,
        "--destination", f"{CONTAINER}/{destination_path}",
        "--source", source_folder,
        "--auth-mode", "login",
        "--overwrite"
    ]
    print(f"Uploading {source_folder} to {destination_path}...")
    # Argument list + no shell: nothing in the paths is interpreted by a shell
    result = subprocess.run(cmd, capture_output=True, text=True)
    succeeded = result.returncode == 0
    if succeeded:
        print(f"✓ Successfully uploaded to {destination_path}")
    else:
        print(f"✗ Upload failed: {result.stderr}")
    return succeeded

# Upload the training and the validation MLTable folder.
# Local folder -> destination path inside the blob container.
uploads = {
    "./data/training-mltable-folder": "retail-training-data",
    "./data/validation-mltable-folder": "retail-validation-data",
}

# Attempt every upload, remembering the folders whose upload reported failure
failed_uploads = [
    source_folder
    for source_folder, destination_path in uploads.items()
    if not upload_to_blob(source_folder, destination_path)
]

# Stop here on failure: the following cells reference the uploaded data, so
# reporting success after a failed upload would only postpone the error.
if failed_uploads:
    raise RuntimeError(f"Upload failed for: {', '.join(failed_uploads)}")

print("\nData upload complete!")


In [None]:
# Point the job at the MLTable folders that were uploaded to blob storage
# with the Azure CLI. The URIs go through the 'workspaceblobstore_identity'
# datastore (identity-based, no SAS tokens required).
training_data_uri = "azureml://datastores/workspaceblobstore_identity/paths/retail-training-data"
validation_data_uri = "azureml://datastores/workspaceblobstore_identity/paths/retail-validation-data"

my_training_data_input = Input(type=AssetTypes.MLTABLE, path=training_data_uri)
my_validation_data_input = Input(type=AssetTypes.MLTABLE, path=validation_data_uri)

print("Using pre-uploaded data from identity-based datastore:")
print(f"  - Training: {training_data_uri}")
print(f"  - Validation: {validation_data_uri}")

# 3. Configure and Run AutoML Forecasting Job

## 3.1 Job Configuration

In [None]:
# General job parameters

# Experiment the job is filed under
exp_name = "sales-forecasting-experiment"

# Compute cluster that runs the job (must exist in your Azure ML workspace)
compute_name = "teslat4-gpu-wus"

# Upper bound for the number of trials, passed to the job limits
max_trials = 5

In [None]:
# Build the AutoML forecasting job with the automl.forecasting factory function.
forecasting_job = automl.forecasting(
    compute=compute_name,
    experiment_name=exp_name,
    training_data=my_training_data_input,
    validation_data=my_validation_data_input,
    target_column_name="demand",
    primary_metric="NormalizedRootMeanSquaredError",
    enable_model_explainability=True,
    tags={"retail": "forecasting"},
)

# Limits (all optional): overall and per-trial timeout, number of trials,
# early termination
limit_settings = {
    "timeout_minutes": 600,
    "trial_timeout_minutes": 20,
    "max_trials": max_trials,
    "enable_early_termination": True,
}
forecasting_job.set_limits(**limit_settings)

# Settings specific to time-series forecasting
forecast_settings = {
    "time_column_name": "timeStamp",
    # 12 weeks forecast (reduced for data consistency)
    "forecast_horizon": 12,
    # Weekly frequency anchored to Friday (retail week ending)
    "frequency": "W-FRI",
    # Lag features for 1, 2, and 4 weeks back
    "target_lags": [1, 2, 4],
    "target_rolling_window_size": 4,
    # One series per (store, department) pair
    "time_series_id_column_names": ["Store", "Dept"],
    # Handle series with irregular/missing data
    "short_series_handling_config": "auto",
}
forecasting_job.set_forecast_settings(**forecast_settings)

# Exclude ExtremeRandomTrees from the algorithms that may be trained
forecasting_job.set_training(blocked_training_algorithms=["ExtremeRandomTrees"])

## 3.2 Submit Job

In [None]:
# Submit the AutoML job.
# `returned_job` is reused by the later cells (streaming the job, downloading
# its best model, registering that model), so keep the variable around.
returned_job = ml_client.jobs.create_or_update(forecasting_job)
print(f"Created job: {returned_job}")

In [None]:
# Stream the submitted job in the notebook output; the job is identified by
# the name of the job object returned at submission.
ml_client.jobs.stream(returned_job.name)

# 4. Get Predictions


## 4.1 Download Model


In [None]:
# Confirm the job has completed, download its best model and make sure the
# model ends up in ./outputs/best_model, where the next cells load it from.
import shutil
from pathlib import Path

from azure.ai.ml.entities import Model

# Fetch the current job state. This call does not wait for a running job, so
# stop with a clear message when the job is not finished yet.
completed_job = ml_client.jobs.get(returned_job.name)
print(f"Job status: {completed_job.status}")
if completed_job.status != "Completed":
    raise RuntimeError(
        f"Job {returned_job.name} has status '{completed_job.status}'; "
        "the best model can only be downloaded once the job has completed."
    )

# Download the best model artifacts
download_root = "./outputs"
model_download_path = "./outputs/best_model"
os.makedirs(model_download_path, exist_ok=True)

ml_client.jobs.download(
    name=returned_job.name,
    download_path=download_root,
    output_name="best_model"
)

# NOTE(review): jobs.download() places a named output under
# <download_path>/named-outputs/<output_name>, not <download_path>/<output_name>
# (confirm against the installed SDK version). Copy the model into
# model_download_path so that folder really contains the MLflow model; both
# layouts are handled.
sdk_output_dir = Path(download_root) / "named-outputs" / "best_model"
if sdk_output_dir.is_dir():
    mlmodel_files = list(sdk_output_dir.rglob("MLmodel"))
    if not mlmodel_files:
        raise FileNotFoundError(f"No MLmodel file found under {sdk_output_dir}")
    # The folder holding the MLmodel file closest to the output root is the
    # root of the MLflow model.
    model_root = min(mlmodel_files, key=lambda p: len(p.parts)).parent
    shutil.copytree(model_root, model_download_path, dirs_exist_ok=True)
elif not (Path(model_download_path) / "MLmodel").is_file():
    raise FileNotFoundError(
        f"Download finished but no MLflow model was found in {model_download_path}"
    )

print(f"Best model downloaded to: {model_download_path}")


## 4.2 Load Model and Predict


In [None]:
import mlflow

# Load the downloaded model through the generic MLflow pyfunc interface
model_path = "./outputs/best_model"
loaded_model = mlflow.pyfunc.load_model(model_uri=model_path)

print(f"Model loaded successfully from: {model_path}")

# Flavors recorded in the model's metadata
model_flavors = loaded_model.metadata.flavors
print(f"Model flavor: {model_flavors}")


In [None]:
# Prepare validation data for prediction: remove the target column and, when
# this cell is re-run, the prediction column added by the previous run (it
# must not be passed to the model as a feature).
stale_columns = [c for c in ['predicted_demand'] if c in validation_df.columns]
prediction_input = validation_df.drop(columns=['demand'] + stale_columns).copy()

# Generate predictions
predictions = loaded_model.predict(prediction_input)

# Reduce the result to plain positional values. Assigning a pandas object
# directly would align on its index, which need not match the index of
# validation_df (a filtered slice of the merged data).
if isinstance(predictions, pd.DataFrame):
    if predictions.shape[1] != 1:
        raise ValueError(
            f"Expected a single prediction column, got: {predictions.columns.tolist()}"
        )
    predicted_values = predictions.iloc[:, 0].to_numpy()
elif isinstance(predictions, pd.Series):
    predicted_values = predictions.to_numpy()
else:
    predicted_values = predictions

# One prediction per validation row is required for the comparison below
if len(predicted_values) != len(validation_df):
    raise ValueError(
        f"Model returned {len(predicted_values)} predictions "
        f"for {len(validation_df)} validation rows"
    )

# Add predictions to validation dataframe
validation_df['predicted_demand'] = predicted_values

print(f"Generated {len(predictions)} predictions")
display(validation_df[['Store', 'Dept', 'timeStamp', 'demand', 'predicted_demand']].head(20))


## 4.3 Evaluate Predictions


In [None]:
import torch

# Convert to PyTorch tensors. float64 is used on purpose: the metrics sum
# squared errors over every validation row, which loses precision in float32.
actual_tensor = torch.tensor(validation_df['demand'].to_numpy(dtype='float64'), dtype=torch.float64)
predicted_tensor = torch.tensor(validation_df['predicted_demand'].to_numpy(dtype='float64'), dtype=torch.float64)

# A single NaN/inf would turn every metric into NaN, so evaluate only the rows
# where both values are finite and report how many rows were skipped.
valid_mask = torch.isfinite(actual_tensor) & torch.isfinite(predicted_tensor)
skipped_rows = int((~valid_mask).sum().item())
if skipped_rows:
    print(f"Skipping {skipped_rows} rows with missing or non-finite values")
actual_tensor = actual_tensor[valid_mask]
predicted_tensor = predicted_tensor[valid_mask]
if actual_tensor.numel() == 0:
    raise ValueError("No rows with finite actual and predicted values to evaluate")

# Calculate evaluation metrics using PyTorch
errors = actual_tensor - predicted_tensor

# Mean Absolute Error (MAE)
mae = torch.mean(torch.abs(errors)).item()

# Root Mean Squared Error (RMSE)
mse = torch.mean(errors ** 2)
rmse = torch.sqrt(mse).item()

# R² Score (undefined when the actual values do not vary)
ss_res = torch.sum(errors ** 2)
ss_tot = torch.sum((actual_tensor - torch.mean(actual_tensor)) ** 2)
r2 = (1 - ss_res / ss_tot).item() if ss_tot.item() > 0 else float('nan')

# Mean Absolute Percentage Error (MAPE) - rows with a zero actual value are
# excluded; undefined when no other rows remain
non_zero_mask = actual_tensor != 0
if non_zero_mask.any():
    mape = torch.mean(torch.abs(errors[non_zero_mask] / actual_tensor[non_zero_mask])).item() * 100
else:
    mape = float('nan')

print("=" * 50)
print("MODEL EVALUATION METRICS (PyTorch)")
print("=" * 50)
print(f"Mean Absolute Error (MAE):        ${mae:,.2f}")
print(f"Root Mean Squared Error (RMSE):   ${rmse:,.2f}")
print(f"R² Score:                          {r2:.4f}")
print(f"Mean Absolute % Error (MAPE):      {mape:.2f}%")
print("=" * 50)


## 4.4 Save Predictions and Register Model


In [None]:
# Write the predictions, together with their errors, to a CSV file
output_file = "./outputs/predictions.csv"
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Keep the series keys, the timestamp, the actual and the predicted value
report_columns = ['Store', 'Dept', 'timeStamp', 'demand', 'predicted_demand']
prediction_output = validation_df[report_columns].copy()

# Signed error (actual - predicted) and its magnitude
prediction_output['error'] = prediction_output['demand'] - prediction_output['predicted_demand']
prediction_output['absolute_error'] = prediction_output['error'].abs()

prediction_output.to_csv(output_file, index=False)
print(f"Predictions saved to: {output_file}")
print(f"Total predictions: {len(prediction_output)}")

# Per-store summary: total actual sales, total predicted sales and mean
# absolute error, rounded to two decimals
print("\n--- Prediction Summary by Store (Top 10) ---")
summary_labels = {
    'demand': 'Actual Sales',
    'predicted_demand': 'Predicted Sales',
    'absolute_error': 'Avg Absolute Error',
}
store_summary = (
    prediction_output
    .groupby('Store')
    .agg({'demand': 'sum', 'predicted_demand': 'sum', 'absolute_error': 'mean'})
    .round(2)
    .rename(columns=summary_labels)
)
display(store_summary.head(10))


In [None]:
# Register the job's best model in the Azure ML Model Registry.
# The model is referenced directly from the 'best_model' output of the job.
best_model_uri = f"azureml://jobs/{returned_job.name}/outputs/best_model"

model = Model(
    name="retail-sales-forecasting-model",
    path=best_model_uri,
    type="mlflow_model",
    description="AutoML time-series forecasting model for retail weekly sales",
)

registered_model = ml_client.models.create_or_update(model)
registered_name, registered_version = registered_model.name, registered_model.version
print(f"Registered model: {registered_name}, version: {registered_version}")
