# System Pipeline: Load Preprocessed Data, Importance Scores, and Redundancy

This notebook demonstrates the complete pipeline for loading:
1. Preprocessed data for Building 1
2. Importance metrics from saved scores
3. Redundancy data

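This is a streamlined version: it loads pre-computed data instead of re-running the expensive computations, then uses that data for QUBO-based AP selection, model training, evaluation, and saving the results (Steps 5–10).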

## Setup: Import Required Libraries

In [1]:
import sys
import subprocess

# Show which interpreter backs this kernel, so it is clear where pip installs to.
print(f"Current Python: {sys.executable}")

# Install the solver packages with the kernel's own interpreter (quiet mode).
# check_call raises CalledProcessError if pip exits with a non-zero status.
pip_install_cmd = [sys.executable, "-m", "pip", "install", "openjij", "dwave-ocean-sdk", "-q"]
subprocess.check_call(pip_install_cmd)

# Tell the reader what to do once the install has finished.
for followup_line in (
    "\n✓ Packages installed!",
    "Now restart your kernel: Kernel → Restart Kernel",
    "Then re-run all cells",
):
    print(followup_line)

Current Python: C:\Users\Mohamed Khalil\Desktop\Quantum-Optimization-In-AP-Selection\venv\Scripts\python.exe

✓ Packages installed!
Now restart your kernel: Kernel → Restart Kernel
Then re-run all cells


In [1]:
# Make the project's packages importable from this notebook
import sys
from pathlib import Path

# The project root is taken to be two directory levels above the current
# working directory (i.e. the kernel is expected to start in the notebook's folder).
notebook_parent_dir = Path.cwd().parent
project_root = notebook_parent_dir.parent

# Prepend, so modules under the project root take precedence on import.
sys.path.insert(0, f"{project_root}")

print(f"✓ Added project root to Python path: {project_root}")

✓ Added project root to Python path: c:\Users\Mohamed Khalil\Desktop\Quantum-Optimization-In-AP-Selection


In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import MinMaxScaler
import warnings

# Import custom data loading functions
from scripts.data.data_loaders import (
    load_preprocessed_data,
    load_all_precomputed_data,
    load_importance_dict_from_csv,
    load_redundancy_matrix_from_csv
)

# Import QUBO optimization functions
from scripts.optimization.QUBO import (
    formulate_qubo,
    solve_qubo_with_openjij,
    solve_qubo_with_SA
)

# Import ML training functions
from scripts.ml.ML_post_processing import train_regressor

# Import evaluation functions
from scripts.evaluation.Analysis import calculate_comprehensive_metrics

# Suppress all Python warnings from this point on to keep the cell outputs compact.
# NOTE(review): this is a blanket filter — it also hides deprecation and other
# library warnings; consider narrowing it by category or module.
warnings.filterwarnings('ignore')

print("✓ Libraries imported successfully")

ModuleNotFoundError: No module named 'openjij'

## Step 1: Load Preprocessed Data for Building 1

This loads the preprocessed RSSI data, coordinates, and AP columns from saved files.

In [None]:
# Building whose preprocessed data set is loaded
building_id = 1

# Fetch the train/validation RSSI and coordinate splits plus the AP column list.
# use_pickle=True selects the fast pickle files; False reads the slower Excel files.
rssi_train, coords_train, rssi_val, coords_val, ap_columns = load_preprocessed_data(
    building_id=building_id,
    use_pickle=True,
)

# Create the coordinate scaler and fit it on the training coordinates only
scaler_coords = MinMaxScaler()
scaler_coords.fit(coords_train)

# Print a summary of what was loaded
summary_rule = "=" * 60
print("\n" + summary_rule)
print("PREPROCESSED DATA SUMMARY")
print(summary_rule)
print(f"Building ID: {building_id}")
print(f"Training samples: {rssi_train.shape[0]}")
print(f"Validation samples: {rssi_val.shape[0]}")
print(f"Number of APs: {len(ap_columns)}")
print(f"\nRSSI Training shape: {rssi_train.shape}")
print(f"Coordinates Training shape: {coords_train.shape}")
print(f"RSSI Validation shape: {rssi_val.shape}")
print(f"Coordinates Validation shape: {coords_val.shape}")
print(summary_rule)

## Step 2: Load Importance Scores

This loads all pre-computed importance metrics from saved CSV files.

In [None]:
# A single call returns the importance dictionaries together with a second
# value that this step does not need, so it is discarded here.
importance_dicts, _ = load_all_precomputed_data()

# Pull each importance method out into its own name for the later steps
importance_entropy = importance_dicts['entropy']
importance_average = importance_dicts['average']
importance_median = importance_dicts['median']
importance_max = importance_dicts['max']
importance_variance = importance_dicts['variance']
importance_mutual_info = importance_dicts['mutual_info']

# Report how many APs each method scored
print("\n" + "="*60)
print("IMPORTANCE SCORES SUMMARY")
print("="*60)
for method_title, method_scores in (
    ("Entropy", importance_entropy),
    ("Average", importance_average),
    ("Median", importance_median),
    ("Max", importance_max),
    ("Variance", importance_variance),
    ("Mutual Info", importance_mutual_info),
):
    print(f"{method_title} importance: {len(method_scores)} APs")
print("="*60)

# List the five highest-scoring APs for the entropy and mutual-information methods
for heading, method_scores in (
    ("\nTop 5 APs by Entropy Importance:", importance_entropy),
    ("\nTop 5 APs by Mutual Information:", importance_mutual_info),
):
    print(heading)
    ranked = sorted(method_scores.items(), key=lambda pair: pair[1], reverse=True)
    for ap, score in ranked[:5]:
        print(f"  {ap}: {score:.4f}")

## Step 3: Load Redundancy Matrix

This loads the pre-computed redundancy matrix from saved files.

In [None]:
# The redundancy matrix is the second value returned by load_all_precomputed_data;
# the first value is ignored in this step.
_, redundancy_matrix = load_all_precomputed_data()

section_rule = "=" * 60
print("\n" + section_rule)
print("REDUNDANCY MATRIX SUMMARY")
print(section_rule)
print(f"Matrix shape: {redundancy_matrix.shape}")
print(f"Matrix type: {type(redundancy_matrix)}")
print("\nRedundancy statistics:")

# Work on the underlying array once instead of re-reading .values for every statistic
redundancy_values = redundancy_matrix.values
print(f"  Mean redundancy: {redundancy_values.mean():.4f}")
print(f"  Min redundancy: {redundancy_values.min():.4f}")
print(f"  Max redundancy: {redundancy_values.max():.4f}")
print(f"  Median redundancy: {np.median(redundancy_values):.4f}")
print(section_rule)

# Preview the top-left 5x5 corner of the matrix
print("\nSample of redundancy matrix (first 5x5):")
print(redundancy_matrix.iloc[:5, :5])

## Step 4: Verification and Summary

Verify that all data has been loaded correctly and is ready for use in the optimization pipeline.

In [None]:
# Confirm that every input the later steps rely on is present and non-empty
print("\n" + "="*60)
print("PIPELINE DATA VERIFICATION")
print("="*60)

# (description, passed?) pairs; each condition is evaluated when the list is built
checks = [
    ("Preprocessed training data", rssi_train is not None and len(rssi_train) > 0),
    ("Preprocessed validation data", rssi_val is not None and len(rssi_val) > 0),
    ("Training coordinates", coords_train is not None and len(coords_train) > 0),
    ("Validation coordinates", coords_val is not None and len(coords_val) > 0),
    ("AP columns", ap_columns is not None and len(ap_columns) > 0),
    ("Entropy importance", len(importance_entropy) > 0),
    ("Average importance", len(importance_average) > 0),
    ("Median importance", len(importance_median) > 0),
    ("Max importance", len(importance_max) > 0),
    ("Variance importance", len(importance_variance) > 0),
    ("Mutual info importance", len(importance_mutual_info) > 0),
    ("Redundancy matrix", redundancy_matrix is not None and redundancy_matrix.shape[0] > 0),
]

# One status line per check
for check_name, result in checks:
    print(f"{'✓' if result else '✗'} {check_name}")

# The pipeline is ready only if every single check passed
all_passed = all(result for _name, result in checks)

print("="*60)
if not all_passed:
    print("\n✗ SOME DATA FAILED TO LOAD. Please check the above errors.")
else:
    print("\n✓ ALL DATA LOADED SUCCESSFULLY!")
    print("\nThe pipeline is ready. You can now:")
    print("  1. Run QUBO optimization with different importance metrics")
    print("  2. Train ML models on selected AP subsets")
    print("  3. Evaluate positioning accuracy")
print("="*60)

## Step 5: Load System Parameters

Load normalization parameters and configure QUBO settings.

In [None]:
# Read the system parameter table (resolved relative to the working directory)
system_params_path = Path('../../data', 'system_input', 'system_parameters.csv')
system_params_df = pd.read_csv(system_params_path)

# Build a {parameter name: value} lookup from the 'Parameter' and 'Value' columns
parameter_names = system_params_df['Parameter']
parameter_values = system_params_df['Value']
system_params_dict = dict(zip(parameter_names, parameter_values))

# Parameters that are later passed to the evaluation step
LON_MIN = system_params_dict['LON_MIN']
LON_MAX = system_params_dict['LON_MAX']
LAT_MIN = system_params_dict['LAT_MIN']
LAT_MAX = system_params_dict['LAT_MAX']
FLOOR_HEIGHT = system_params_dict['FLOOR_HEIGHT']

# QUBO settings
k = 20         # number of APs to select
alpha = 0.9    # importance vs redundancy trade-off (higher = more importance weight)
penalty = 2.0  # penalty for violating the k constraint

print("✓ System parameters loaded from CSV:")
for param_label, param_value in (
    ("LON_MIN", LON_MIN),
    ("LON_MAX", LON_MAX),
    ("LAT_MIN", LAT_MIN),
    ("LAT_MAX", LAT_MAX),
    ("FLOOR_HEIGHT", FLOOR_HEIGHT),
):
    print(f"  {param_label}: {param_value}")

print("\nQUBO parameters:")
print(f"  k (num APs to select): {k}")
print(f"  alpha (importance weight): {alpha}")
print(f"  penalty: {penalty}")

## Step 6: Run QUBO Optimization with Different Importance Metrics

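This section runs QUBO optimization for the mutual-information, entropy, average, max, and variance importance metrics (the median metric is loaded but not optimized here) and selects a subset of APs for each, targeting k APs through the penalty term.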

In [None]:
# Per-method outcomes, keyed by importance-method label; filled in by this and later steps
results = {}

# Importance metrics to run, in execution order
importance_methods = {
    'mutual_info': importance_mutual_info,
    'entropy': importance_entropy,
    'average': importance_average,
    'max': importance_max,
    'variance': importance_variance
}

qubo_rule = "=" * 60
print(qubo_rule)
print("RUNNING QUBO OPTIMIZATION FOR EACH IMPORTANCE METRIC")
print(qubo_rule)

# One formulate → solve → record pass per importance metric
for label, imp_dict in importance_methods.items():
    print(f"\n[{label.upper()}]")

    # Skip metrics that have no strictly positive score at all
    if not any(score > 0 for score in imp_dict.values()):
        print("  ✗ Skipped: all importance scores are zero or negative.")
        continue

    # 1. Build the QUBO from the importance scores and the redundancy matrix
    print(f"  → Formulating QUBO (k={k}, alpha={alpha}, penalty={penalty})...")
    Q, relevant_aps, offset = formulate_qubo(imp_dict, redundancy_matrix, k, alpha, penalty)

    if not len(relevant_aps):
        print("  ✗ Skipped: no relevant APs selected after QUBO formulation.")
        continue

    # 2. Solve it; the solver returns the chosen indices and the solve duration
    print("  → Solving QUBO with OpenJij SQA...")
    selected_indices, duration = solve_qubo_with_openjij(Q)

    if not len(selected_indices):
        print("  ✗ Skipped: QUBO solver did not select any APs.")
        continue

    # Translate solver indices back into AP names
    selected_aps = [relevant_aps[ap_idx] for ap_idx in selected_indices]
    num_selected = len(selected_aps)

    # Record the preliminary outcome for this metric
    results[label] = {
        'selected_aps': selected_aps,
        'num_aps': num_selected,
        'qubo_duration': duration,
        'Q_matrix': Q,
        'relevant_aps': relevant_aps
    }

    print(f"  ✓ Selected {num_selected} APs in {duration:.2f}s")
    # Show at most the first five AP names, with an ellipsis when more were selected
    preview = ', '.join(selected_aps[:5])
    suffix = '...' if num_selected > 5 else ''
    print(f"    APs: {preview}{suffix}")

print("\n" + qubo_rule)
print(f"✓ QUBO optimization completed for {len(results)} methods")
print(qubo_rule)

## Step 7: Train ML Models on Selected AP Subsets

For each importance method, train a Random Forest regressor using the selected APs.

In [None]:
print("="*60)
print("TRAINING ML MODELS")
print("="*60)

# Train a model for every importance method that produced an AP selection.
# Only the nested per-method dicts are modified, so iterating `results` is safe.
for label in results.keys():
    print(f"\n[{label.upper()}]")

    selected_aps = results[label]['selected_aps']

    # Train on the training split restricted to the selected APs; the helper
    # returns the fitted models and a dict of predictions.
    print(f"  → Training Random Forest with {len(selected_aps)} APs...")
    models, predictions = train_regressor(
        rssi_train, coords_train,
        rssi_val, coords_val,
        selected_aps
    )

    # Validation-set predictions, stored by the helper under the 'rf_val' key
    preds = predictions['rf_val']

    # Keep the models and predictions so the evaluation step can reuse them
    results[label]['models'] = models
    results[label]['predictions'] = predictions
    results[label]['preds_val'] = preds

    print("  ✓ Model trained successfully")
    # The previous version also computed an `oob_score` here that was never used;
    # that dead lookup (models['rf'].estimators_[0]) could only fail, so it was removed.
    if 'rf' in models:
        print(f"    Validation predictions shape: {preds.shape}")

print("\n" + "="*60)
print(f"✓ ML models trained for {len(results)} methods")
print("="*60)

## Step 8: Evaluate Positioning Accuracy

Calculate comprehensive metrics for each method including 3D positioning error and floor accuracy.

In [None]:
eval_rule = "=" * 60
print(eval_rule)
print("EVALUATING POSITIONING ACCURACY")
print(eval_rule)

# Score the stored validation predictions of every method
for label, method_result in results.items():
    print(f"\n[{label.upper()}]")

    # Compare the predictions with the validation coordinates; only the third
    # return value (the metrics dict) is used here.
    print("  → Calculating positioning metrics...")
    _, _, metrics = calculate_comprehensive_metrics(
        coords_val, method_result['preds_val'],
        LON_MIN, LON_MAX,
        LAT_MIN, LAT_MAX,
        FLOOR_HEIGHT
    )

    # Copy the headline metrics into the per-method record, then keep the full dict
    for result_key, metric_key in (
        ('mean_3d_error', 'real_mean_m'),
        ('median_3d_error', 'real_median_m'),
        ('min_error', 'real_min_m'),
        ('max_error', 'real_max_m'),
        ('floor_accuracy', 'floor_accuracy'),
    ):
        method_result[result_key] = metrics[metric_key]
    method_result['all_metrics'] = metrics

    print("  ✓ Metrics calculated:")
    print(f"    Mean 3D Error: {metrics['real_mean_m']:.2f} m")
    print(f"    Median 3D Error: {metrics['real_median_m']:.2f} m")
    print(f"    Floor Accuracy: {metrics['floor_accuracy']:.2%}")

print("\n" + eval_rule)
print(f"✓ Evaluation completed for {len(results)} methods")
print(eval_rule)

## Step 9: Compare Results Across All Methods

Create a comparison table of all methods, sorted by mean 3D error, and report the best-performing method.

In [None]:
# Column order of the comparison table. Declaring it explicitly keeps these
# columns on the frame even when no method produced results, so the sort below
# cannot fail with a KeyError on an empty frame.
results_columns = [
    'Importance_Method', 'Num_APs', 'Selected_APs',
    'Mean_3D_Error_m', 'Median_3D_Error_m', 'Min_Error_m', 'Max_Error_m',
    'Floor_Accuracy', 'QUBO_Duration_s',
]

# One record per importance method, built from the per-method results
results_data = [
    {
        'Importance_Method': label.upper(),
        'Num_APs': data['num_aps'],
        'Selected_APs': ', '.join(data['selected_aps']),
        'Mean_3D_Error_m': data['mean_3d_error'],
        'Median_3D_Error_m': data['median_3d_error'],
        'Min_Error_m': data['min_error'],
        'Max_Error_m': data['max_error'],
        'Floor_Accuracy': data['floor_accuracy'],
        'QUBO_Duration_s': data['qubo_duration'],
    }
    for label, data in results.items()
]

results_df = pd.DataFrame(results_data, columns=results_columns)

# Sort by mean 3D error (best first)
results_df = results_df.sort_values('Mean_3D_Error_m', ascending=True)

print("="*80)
print("RESULTS COMPARISON - ALL IMPORTANCE METHODS")
print("="*80)

if results_df.empty:
    # Every method was skipped upstream: there is no table to show and no
    # first row to report as the best method.
    print("\nNo importance method produced results; nothing to compare.")
else:
    print("\nSummary Table (sorted by Mean 3D Error):")
    print(results_df[['Importance_Method', 'Num_APs', 'Mean_3D_Error_m',
                       'Median_3D_Error_m', 'Floor_Accuracy', 'QUBO_Duration_s']].to_string(index=False))

    print("\n" + "="*80)
    print("BEST PERFORMING METHOD")
    print("="*80)
    # After the ascending sort, the first row has the lowest mean 3D error
    best_method = results_df.iloc[0]
    print(f"Method: {best_method['Importance_Method']}")
    print(f"Mean 3D Error: {best_method['Mean_3D_Error_m']:.2f} m")
    print(f"Median 3D Error: {best_method['Median_3D_Error_m']:.2f} m")
    print(f"Floor Accuracy: {best_method['Floor_Accuracy']:.2%}")
    print(f"Number of APs: {best_method['Num_APs']}")
    print("="*80)

## Step 10: Save Results

Save the results to Excel and CSV files for further analysis.

In [None]:
# Make sure the results folder exists (resolved relative to the working directory)
output_dir = Path('../../data', 'results')
output_dir.mkdir(parents=True, exist_ok=True)

# Summary table as an Excel workbook
excel_path = output_dir / 'pipeline_experiment_results.xlsx'
results_df.to_excel(excel_path, index=False)
print(f"✓ Results saved to Excel: {excel_path}")

# Summary table as CSV
csv_path = output_dir / 'pipeline_experiment_results.csv'
results_df.to_csv(csv_path, index=False)
print(f"✓ Results saved to CSV: {csv_path}")

# Detailed workbook: the summary plus one sheet per method listing its selected APs
detailed_path = output_dir / 'pipeline_experiment_detailed.xlsx'
with pd.ExcelWriter(detailed_path, engine='openpyxl') as writer:
    results_df.to_excel(writer, sheet_name='Summary', index=False)

    for label, data in results.items():
        method_aps = data['selected_aps']
        ap_df = pd.DataFrame({
            'AP_Name': method_aps,
            'Index': range(len(method_aps))
        })
        # Excel limits sheet names to 31 characters, hence the truncation
        sheet_name = f"{label.upper()}_APs"[:31]
        ap_df.to_excel(writer, sheet_name=sheet_name, index=False)

print(f"✓ Detailed results saved to: {detailed_path}")

closing_rule = "=" * 60
print("\n" + closing_rule)
print("✓ ALL RESULTS SAVED SUCCESSFULLY!")
print(closing_rule)