# Azure ML Workspace Setup

This notebook sets up the Azure Machine Learning workspace and configures the environment for LSTM time series forecasting.

In [None]:
import os
import sys

from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential
from dotenv import find_dotenv, load_dotenv

# Read settings from the .env file located by find_dotenv into the process environment
load_dotenv(dotenv_path=find_dotenv(filename=".env"))

print("‚úÖ Imports successful")

## 1. Configure Azure ML Workspace

In [None]:
# Azure ML workspace configuration, read from environment variables
# (os.getenv returns None for anything that is not set)
subscription_id = os.getenv("AZURE_SUBSCRIPTION_ID")
resource_group = os.getenv("AZURE_RESOURCE_GROUP")
workspace_name = os.getenv("AZURE_ML_WORKSPACE")

print(f"Subscription ID: {subscription_id}")
print(f"Resource Group: {resource_group}")
print(f"Workspace Name: {workspace_name}")

# Validate configuration: unset and empty values both count as missing.
# The names of the offending variables are reported so the .env file can be
# fixed without guessing which entry is absent.
missing_settings = [
    env_var
    for env_var, env_value in (
        ("AZURE_SUBSCRIPTION_ID", subscription_id),
        ("AZURE_RESOURCE_GROUP", resource_group),
        ("AZURE_ML_WORKSPACE", workspace_name),
    )
    if not env_value
]
if missing_settings:
    print("‚ùå Missing required environment variables. Please check your .env file.")
    print(f"Missing: {', '.join(missing_settings)}")
else:
    print("‚úÖ Configuration loaded successfully")

In [None]:
# Build the workspace client, then fetch the workspace once as a connection test
try:
    credential = DefaultAzureCredential()
    ml_client = MLClient(
        credential=credential,
        subscription_id=subscription_id,
        resource_group_name=resource_group,
        workspace_name=workspace_name,
    )

    # Test connection
    workspace = ml_client.workspaces.get(workspace_name)
    print(f"‚úÖ Successfully connected to workspace: {workspace.name}")
    print(f"Location: {workspace.location}")

except Exception as exc:
    # Failure is reported rather than raised, so the notebook keeps running;
    # later cells that use ml_client will fail if this branch was taken.
    print(f"‚ùå Error connecting to workspace: {exc!s}")
    print("Please ensure you're authenticated and have access to the workspace.")

## 2. Setup Compute Resources

In [None]:
# Add parent directory to path for module imports.
# A notebook has no __file__, so the project root is taken to be the parent of
# the current working directory. (The previous abspath('__file__') form resolved
# the literal file name "__file__" against the CWD and produced the same path.)
parent_dir = os.path.dirname(os.getcwd())
modules_dir = os.path.join(parent_dir, 'src')

# Append each import root once, so re-running the cell does not grow sys.path
for import_root in (modules_dir, parent_dir):
    if import_root not in sys.path:
        sys.path.append(import_root)

print(f"Parent directory: {parent_dir}")
print(f"Modules directory: {modules_dir}")

In [None]:
# Import and initialize configuration utilities
# (presumably resolved through the sys.path entries appended in the previous cell — TODO confirm)
from utils.azure_ml_config import AzureMLConfig

# Initialize configuration
# NOTE(review): the name `config` is rebound to an LSTMConfig in the
# "Test Model Creation" section further down, so this object is no longer
# reachable under that name once that cell has run.
config = AzureMLConfig()
# Any return value is discarded here; presumably this raises or reports on
# invalid settings — confirm against AzureMLConfig.
config.validate_config()

In [None]:
# Setup compute cluster
from mlops.compute.setup_compute import ComputeManager

# Constructed without arguments; presumably it picks up workspace settings on
# its own — confirm in ComputeManager.
compute_manager = ComputeManager()

# Create CPU compute cluster
# The name "cpu-cluster" is also hard-coded in the job definition and in the
# job submission cell below; keep the three in sync when changing it.
# NOTE(review): whether re-running this cell reuses an existing cluster or
# fails depends on create_compute_cluster — confirm.
cpu_cluster = compute_manager.create_compute_cluster(
    cluster_name="cpu-cluster",
    vm_size="Standard_D32ds_v5",
    max_instances=4
)

print(f"‚úÖ CPU cluster created: {cpu_cluster.name}")

In [None]:
# List all compute resources
# The result must support len(); only the count is shown here.
compute_resources = compute_manager.list_compute_resources()
print(f"Total compute resources: {len(compute_resources)}")

## 3. Test Data Loading and Preprocessing

In [None]:
# Import preprocessing utilities
import matplotlib.pyplot as plt

from data_processing.preprocessor import TimeSeriesPreprocessor, load_sample_data

# Load sample data
# The code below uses .shape, .columns.tolist() and a 'date' column, so `data`
# is presumably a pandas DataFrame — confirm against load_sample_data.
data = load_sample_data()
print(f"Loaded data shape: {data.shape}")
print(f"Data columns: {data.columns.tolist()}")
print(f"Date range: {data['date'].min()} to {data['date'].max()}")

# Display first few rows (last expression of the cell, so the notebook renders it)
data.head()

In [None]:
# Visualize the time series data
# The explicit Figure/Axes interface is used so the plot does not depend on
# pyplot's implicit "current figure" left behind by other cells.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(data['date'], data['value'])
ax.set_title('Sample Time Series Data')
ax.set_xlabel('Date')
ax.set_ylabel('Value')
# Rotate the x tick labels so date labels do not overlap
ax.tick_params(axis='x', labelrotation=45)
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Test preprocessing
# sequence_length=60 is passed straight to the project preprocessor; presumably
# it is the window length used by create_sequences — TODO confirm.
preprocessor = TimeSeriesPreprocessor(sequence_length=60)

# Fit and transform data
# The result must expose .shape, .min() and .max() (array-like); the printed
# range is a quick sanity check of the scaling.
scaled_data = preprocessor.fit_transform(data)
print(f"Scaled data shape: {scaled_data.shape}")
print(f"Scaled data range: {scaled_data.min():.3f} to {scaled_data.max():.3f}")

# Create sequences
# Returns a pair; both elements expose .shape.
sequences, targets = preprocessor.create_sequences(scaled_data)
print(f"Sequences shape: {sequences.shape}")
print(f"Targets shape: {targets.shape}")

## 4. Test Model Creation

In [None]:
# Import model
import torch

from models.lstm_model import LSTMConfig, LSTMTimeSeriesModel

# Instantiate the default LSTM settings and print every attribute they carry.
# Note: this rebinds `config`, which held the AzureMLConfig created earlier.
config = LSTMConfig()
print("Model configuration:")
for key, value in vars(config).items():
    print(f"  {key}: {value}")

In [None]:
# Build the model from the values held in the LSTM configuration
model = LSTMTimeSeriesModel(
    input_size=config.input_size,
    hidden_size=config.hidden_size,
    num_layers=config.num_layers,
    output_size=config.output_size,
    dropout=config.dropout,
)

print("Model created successfully")

# Element counts over all parameters, and over those with requires_grad set
total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {total_params:,}")
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable parameters: {trainable_params:,}")

In [None]:
# Test forward pass
# This is a shape-only smoke test on random input: one batch element of
# config.sequence_length steps with config.input_size features per step.
sample_input = torch.randn(1, config.sequence_length, config.input_size)
# No gradients are needed for a shape check, so autograd tracking is disabled
with torch.no_grad():
    output = model(sample_input)
print(f"Input shape: {sample_input.shape}")
print(f"Output shape: {output.shape}")
print("‚úÖ Forward pass successful")

## 5. Setup MLflow Tracking

In [None]:
import os

import mlflow

from pathlib import Path

# Configure MLflow tracking - using local file system for reliability
# This avoids the Azure ML MLflow integration issues while still being functional.
# Path.as_uri() produces a well-formed file:// URI: it percent-encodes spaces
# and handles Windows drive letters, which plain string concatenation with
# os.getcwd() does not.
local_tracking_uri = (Path.cwd() / "mlruns").as_uri()
mlflow.set_tracking_uri(local_tracking_uri)

# Set experiment
experiment_name = "lstm-time-series-forecasting-1105"
experiment = mlflow.set_experiment(experiment_name)

print(f"‚úÖ MLflow experiment set: {experiment_name}")
print(f"Tracking URI: {mlflow.get_tracking_uri()}")
print(f"Experiment ID: {experiment.experiment_id}")

# Note: If you need Azure ML MLflow integration later, you can configure it
# by ensuring proper authentication and using the workspace's MLflow tracking URI

## 6. Azure ML Environment Setup

In [None]:
# Check available environments
environments = list(ml_client.environments.list())
print(f"Found {len(environments)} environments in workspace")

# Look for PyTorch environments.
# Entries without a name are skipped so that .lower() is never called on None.
pytorch_envs = [
    env for env in environments
    if env.name and 'pytorch' in env.name.lower()
]
if pytorch_envs:
    print("\nAvailable PyTorch environments:")
    for env in pytorch_envs[:5]:  # Show first 5
        print(f"  - {env.name}:{env.version}")

    # Use the first available PyTorch environment, always at its latest version
    recommended_env = f"{pytorch_envs[0].name}@latest"
    print(f"\n‚úÖ Recommended environment: {recommended_env}")
else:
    print("\n‚ö†Ô∏è No PyTorch environments found. Using curated environment.")
    # Alternative curated image: AzureML-pytorch-1.9-ubuntu18.04-py37-cpu@latest
    recommended_env = "AzureML-pytorch-1.13-ubuntu20.04-py38-cpu@latest"

environment_name = recommended_env

## 7. Prepare Training Script for Azure ML Remote Execution

This section prepares a training job and submits it to run remotely on an Azure ML compute cluster.

In [None]:
# Check Available Environments
print("üîç Checking available Azure ML curated environments...")

try:
    # List available environments
    environments = ml_client.environments.list()

    # Find PyTorch environments. The bare name is kept next to each display
    # label: a job references an environment as "name:version" or "name@label",
    # so "@latest" must be attached to the bare name, never to a versioned label.
    pytorch_env_names = {}
    for env in environments:
        if env.name and "pytorch" in env.name.lower():
            env_label = f"{env.name}:{env.version}" if env.version else env.name
            pytorch_env_names[env_label] = env.name
    pytorch_envs = list(pytorch_env_names)

    print(f"\nüìã Found {len(pytorch_envs)} PyTorch environments:")
    for env_label in sorted(pytorch_envs)[:10]:  # Show first 10
        print(f"   - {env_label}")

    # Recommend a working environment
    if pytorch_envs:
        # Prefer the first CPU entry whose label mentions 2.0, 1.13 or latest;
        # otherwise fall back to the first PyTorch environment found.
        preferred_label = next(
            (
                candidate
                for candidate in pytorch_envs
                if "cpu" in candidate.lower()
                and any(marker in candidate for marker in ("2.0", "1.13", "latest"))
            ),
            pytorch_envs[0],
        )
        recommended_env = pytorch_env_names[preferred_label]

        print(f"\n‚úÖ Recommended environment: {recommended_env}")
        environment_name = f"{recommended_env}@latest"

    else:
        # Fallback to a generic ML environment
        print("\n‚ö†Ô∏è No PyTorch environments found, using generic ML environment")
        environment_name = "AzureML-pytorch-1.13-ubuntu20.04-py38-cpu@latest"

except Exception as e:
    print(f"‚ùå Error checking environments: {str(e)}")
    print("\nüîÑ Using alternative approach - creating custom environment")

    # Create a simple custom environment as fallback
    from azure.ai.ml.entities import Environment

    custom_env = Environment(
        name="pytorch-lstm-cpu",
        description="Custom PyTorch environment for LSTM training",
        conda_file="../src/azure_ml_training/environment.yml",
        image="mcr.microsoft.com/azureml/base:openmpi4.1.0-ubuntu20.04"
    )

    try:
        ml_client.environments.create_or_update(custom_env)
        environment_name = "pytorch-lstm-cpu@latest"
        print(f"‚úÖ Created custom environment: {environment_name}")
    except Exception as create_error:
        # Last resort: a curated base image that needs no registration step
        print(f"‚ùå Failed to create custom environment: {str(create_error)}")
        print("\nüí° Using minimal base environment")
        environment_name = "AzureML-minimal-ubuntu20.04-py38-cpu@latest"

print(f"\nüéØ Final environment selection: {environment_name}")

## 8. Test Training Job Submission

In [None]:
# Prepare and submit training job
from azure.ai.ml import command
from azure.ai.ml.entities import Environment

print("üîÑ Preparing training job submission...")
training_script_dir = "../src/azure_ml_training"

# Stop here if the entry-point script is absent, before any job is defined
script_path = os.path.join(training_script_dir, "train_lstm.py")
if not os.path.exists(script_path):
    raise FileNotFoundError(f"Training script not found: {script_path}")

print(f"‚úÖ Training script verified: {script_path}")
print(f"Script size: {os.stat(script_path).st_size} bytes")

In [None]:
# Create the training job
# Experiment and compute names are defined once so the summary printed below
# always matches what the job is actually configured with.
training_experiment_name = "lstm-time-series-forecasting-test"
training_compute_name = "cpu-cluster"

training_job = command(
    code=training_script_dir,
    command="python train_lstm.py --epochs 30 --batch_size 32 --learning_rate 0.001",
    environment=environment_name,
    compute=training_compute_name,
    experiment_name=training_experiment_name,
    display_name="LSTM Training Job",
    description="LSTM model training for time series forecasting",
    tags={
        "model_type": "LSTM",
        "framework": "PyTorch",
        "task": "time_series_forecasting",
        "script": "train_lstm",
    },
)

print("‚úÖ Training job configured")
print(f"Environment: {environment_name}")
print(f"Compute: {training_compute_name}")
print(f"Experiment: {training_experiment_name}")

In [None]:
# Submit the training job
print("üöÄ Submitting the training job...")

try:
    # Refuse to submit unless the target cluster reports a finished provisioning
    cluster_check = ml_client.compute.get("cpu-cluster")
    if cluster_check.provisioning_state != "Succeeded":
        raise Exception(f"Cluster not ready. State: {cluster_check.provisioning_state}")

    print(f"‚úÖ Cluster ready: {cluster_check.name} (State: {cluster_check.provisioning_state})")

    # Hand the job definition to the service
    submitted_job = ml_client.jobs.create_or_update(training_job)

    print("\n‚úÖ Job submitted successfully!")
    print("üìã Job Details:")
    print(f"   Name: {submitted_job.name}")
    print(f"   Status: {submitted_job.status}")
    print(f"   Experiment: {submitted_job.experiment_name}")
    print(f"   Compute: {submitted_job.compute}")
    print(f"   Environment: {environment_name}")

    print("\nüîó Monitoring Links:")
    print(f"   Studio URL: {submitted_job.studio_url}")

    # Remembered for the monitoring cell, which looks this name up in globals()
    submitted_job_name = submitted_job.name
    print(f"\nüí° Job '{submitted_job_name}' is now running!")
    print("   Use the Studio URL above to monitor progress")

except Exception as exc:
    # Covers both the readiness check above and any service-side failure
    print(f"\n‚ùå Job submission failed: {exc!s}")
    print("\nüîß Troubleshooting tips:")
    for tip in (
        "1. Ensure the compute cluster is in 'Succeeded' state",
        "2. Check that the environment is available",
        "3. Verify the training script exists and is valid",
        "4. Check Azure ML workspace permissions",
    ):
        print(tip)

In [None]:
# üìä MONITOR FIXED JOB (Check if tracking_uri error is resolved)
from itertools import islice

print("üìä Monitoring Fixed Job Status")
print("=" * 60)

try:
    # Check if we have a fixed job to monitor
    # (the submission cell sets submitted_job_name only when submission succeeded)
    if 'submitted_job_name' in globals() and submitted_job_name:
        print(f"üîç Monitoring job: {submitted_job_name}")

        # Get current job status
        current_submitted_job = ml_client.jobs.get(submitted_job_name)

        print("\nüìã Job Status:")
        print(f"   Name: {current_submitted_job.name}")
        print(f"   Status: {current_submitted_job.status}")
        print(f"   Created: {current_submitted_job.creation_context.created_at}")

        # Optional attributes: printed only when present and non-empty
        job_started = getattr(current_submitted_job, 'start_time', None)
        if job_started:
            print(f"   Started: {job_started}")

        job_ended = getattr(current_submitted_job, 'end_time', None)
        if job_ended:
            print(f"   Ended: {job_ended}")

        # Show studio URL for monitoring
        job_studio_url = getattr(current_submitted_job, 'studio_url', None)
        if job_studio_url:
            print("\nüîó Monitor in Azure ML Studio:")
            print(f"   {job_studio_url}")

        # Provide status-specific guidance
        status = current_submitted_job.status

        if status == "Completed":
            print("\nüéâ Job completed successfully!")
            print("   ‚úÖ MLflow tracking_uri error has been resolved!")
            print("   ‚úÖ Model training completed without MLflow issues")
            print("   üìÅ Check outputs in Azure ML Studio")
            print("   üìä Training metrics should be logged properly")

        elif status == "Failed":
            print("\n‚ùå Job failed!")
            print("   üîç Check Azure ML Studio for detailed error logs")
            print("   üìã Common issues to check:")
            print("      - Compute cluster problems")
            print("      - Environment setup issues")
            print("      - Storage permission errors")
            print("      - Network connectivity")

        elif status in ["Running", "Preparing"]:
            print(f"\n‚è≥ Job is {status.lower()}...")
            if status == "Preparing":
                print("   üîß Setting up compute environment")
                print("   üì¶ Installing dependencies")
                print("   ‚è±Ô∏è This typically takes 3-5 minutes")
            else:
                print("   üèÉ‚Äç‚ôÇÔ∏è Training script is executing")
                print("   üìä MLflow compatibility layer is active")
                print("   ‚úÖ Should handle tracking_uri errors gracefully")

        elif status == "Queued":
            print("\n‚è∞ Job is queued...")
            print("   ‚è≥ Waiting for compute resources")
            print("   üîß Compute cluster is starting up")

        elif status == "Canceled":
            print("\nüõë Job was canceled")
            print("   üîÑ You can restart with the same configuration")

        # Additional diagnostic info
        print("\nüîç Troubleshooting Info:")
        print("   Job Type: Command Job")
        print("   Environment: Curated PyTorch (should avoid MLflow conflicts)")
        print("   Script: Fixed version with error handling")
        print("   Expected Duration: 5-15 minutes")

        # Check recent jobs if this one isn't running
        if status in ["Failed", "Canceled", "Completed"]:
            print("\nüìã Recent Job History:")
            # `max_results` is not part of the documented jobs.list() signature,
            # so the cap is applied here: islice stops consuming the listing
            # after three jobs instead of materialising every job.
            recent_jobs = list(islice(ml_client.jobs.list(), 3))
            for job in recent_jobs:
                print(f"   - {job.name}: {job.status} ({job.creation_context.created_at})")

    else:
        print("‚ÑπÔ∏è No fixed job to monitor yet.")
        print("   Run the previous cell to submit the fixed job first")

        # Show regular job monitoring (same client-side cap, five jobs)
        print("\nüìã All Recent Jobs:")
        recent_jobs = list(islice(ml_client.jobs.list(), 5))

        if recent_jobs:
            for job in recent_jobs:
                status_emoji = "‚úÖ" if job.status == "Completed" else "‚ùå" if job.status == "Failed" else "‚è≥"
                print(f"   {status_emoji} {job.name}: {job.status}")

                # Check if any recent job had the tracking_uri error
                if job.status == "Failed":
                    print("      üîç Check this job for tracking_uri errors in Azure ML Studio")
        else:
            print("   No recent jobs found")

except Exception as e:
    # Monitoring is best-effort: report the problem and leave the notebook running
    print(f"‚ùå Error monitoring job: {str(e)}")
    print("\nüîß Try:")
    print("   - Refresh your connection to Azure ML")
    print("   - Check job status directly in Azure ML Studio")
    print("   - Verify the job name is correct")

In [None]:
# Utility functions for job management
def list_recent_jobs(ml_client, limit=5):
    """List recent training jobs.

    Prints name, status, creation time, experiment and Studio URL for at most
    ``limit`` jobs, in the order ``ml_client.jobs.list()`` yields them.

    Args:
        ml_client: Client object exposing ``jobs.list()``.
        limit: Maximum number of jobs to print.

    Returns:
        None. Errors raised while listing are printed, not propagated.
    """
    from itertools import islice

    print(f"üìã Recent training jobs (last {limit}):")
    print("-" * 80)

    try:
        # `max_results` is not part of the documented jobs.list() signature, so
        # the limit is enforced here: islice stops pulling from the listing
        # once `limit` jobs have been printed.
        for job in islice(ml_client.jobs.list(), limit):
            print(f"Name: {job.name}")
            print(f"Status: {job.status}")
            print(f"Created: {job.creation_context.created_at}")
            print(f"Experiment: {job.experiment_name}")
            print(f"Studio: {job.studio_url}")
            print("-" * 40)
    except Exception as e:
        print(f"‚ùå Error listing jobs: {str(e)}")

def get_job_logs(ml_client, job_name):
    """Print a short status report for one job and return the job object.

    Args:
        ml_client: Client object exposing ``jobs.get(name)``.
        job_name: Name of the job to look up.

    Returns:
        The object returned by ``ml_client.jobs.get``, or ``None`` when the
        lookup or the report raised (the error is printed).
    """
    # Extra line shown for the two terminal states; other states print nothing more
    outcome_notes = {
        "Completed": "‚úÖ Job completed successfully!",
        "Failed": "‚ùå Job failed. Check the Studio URL for detailed logs.",
    }
    try:
        job = ml_client.jobs.get(job_name)
        print(f"üìÑ Job: {job_name}")
        print(f"Status: {job.status}")
        print(f"Studio URL: {job.studio_url}")

        # Outputs of a completed job could be downloaded here if needed
        note = outcome_notes.get(job.status)
        if note is not None:
            print(note)
    except Exception as exc:
        print(f"‚ùå Error getting job info: {exc!s}")
        return None
    return job

def cancel_job(ml_client, job_name):
    """Cancel a running job.

    Requests cancellation and returns without waiting for it to finish.

    Args:
        ml_client: Client object whose ``jobs`` attribute exposes
            ``begin_cancel(name)`` (current azure-ai-ml) or ``cancel(name)``.
        job_name: Name of the job to cancel.

    Returns:
        None. Errors are printed, not propagated.
    """
    try:
        job_ops = ml_client.jobs
        # Current SDK releases name the operation begin_cancel; fall back to
        # cancel only for clients that do not provide it.
        request_cancel = getattr(job_ops, "begin_cancel", None)
        if request_cancel is None:
            request_cancel = job_ops.cancel
        request_cancel(job_name)
        print(f"üõë Job {job_name} cancellation requested")
    except Exception as e:
        print(f"‚ùå Error canceling job: {str(e)}")

# Show available functions
# (monitor_job is not listed: no function of that name is defined in this notebook)
print("üõ†Ô∏è Available job management functions:")
print("  - list_recent_jobs(ml_client, limit=5)")
print("  - get_job_logs(ml_client, job_name)")
print("  - cancel_job(ml_client, job_name)")
print()
print("Example usage:")
print("  list_recent_jobs(ml_client)")
# The submission cell stores the job name as `submitted_job_name`; looking it up
# through globals() keeps this cell runnable when no job has been submitted.
example_job_name = globals().get("submitted_job_name")
if example_job_name:
    print(f"  get_job_logs(ml_client, '{example_job_name}')")
    print(f"  cancel_job(ml_client, '{example_job_name}')")

In [None]:
# Print the most recent jobs with the helper defined in the previous cell (default limit of 5)
list_recent_jobs(ml_client)

## 9. Summary

In [None]:
print("üéâ Azure ML Workspace Setup Complete!")
print("\n‚úÖ What's been set up:")
print("1. Azure ML workspace connection and authentication")
print("2. Compute cluster for training")
print("3. MLflow experiment tracking")
print("4. Sample data preprocessing pipeline")
print("5. LSTM model architecture")
print("6. Azure ML training script and environment")
print("7. Remote training job submission capabilities")

print("\nüìÅ Created files:")
print("- ../src/azure_ml_training/train_lstm.py (Training script)")
print("- ../src/azure_ml_training/environment.yml (Conda environment)")
print("- ../src/azure_ml_training/requirements.txt (Pip requirements)")

print("\nüöÄ Next steps:")
print("1. Review the generated training script in src/azure_ml_training/")
print("2. Customize the model hyperparameters as needed")
print("3. Run the cells above to submit training jobs to Azure ML")
print("4. Monitor training progress in Azure ML Studio")
print("5. Deploy the trained model using Azure ML endpoints")

print("\nüí° Available options:")
print("- Local training: python src/training/train_lstm.py")
print("- Azure ML training: Submit job using the cells above")
print("- Hybrid approach: Develop locally, train remotely")

# The submission cell records the job as `submitted_job` / `submitted_job_name`
# (there is no `job_name` variable), so that is the name to look for here.
# globals().get() keeps the cell runnable when nothing was submitted.
active_job_name = globals().get("submitted_job_name")
if active_job_name:
    print(f"\nüîó Current job: {active_job_name}")
    print(f"Monitor at: {submitted_job.studio_url}")
else:
    print("\n‚ö†Ô∏è No active training job. Run the submission cells above to start training.")