In [1]:
!pip install openjij
!pip install dwave-ocean-sdk dimod




[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [16]:
import openjij as oj
import time
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
import dimod
from dwave.samplers import SimulatedAnnealingSampler
import itertools
import warnings
from datetime import datetime
import os
from pathlib import Path

warnings.filterwarnings('ignore')

In [17]:
# Import custom modules from organized scripts subpackages
from scripts.data.pre_processing import load_and_preprocess_data
from scripts.data import pre_processing

from scripts.optimization.Importance import (
    entropy_importance, 
    average_importance, 
    median_importance, 
    max_importance, 
    variance_importance,
    mutual_information_importance
)

from scripts.optimization.Redundancy import calculate_redundancy_matrix

from scripts.optimization.QUBO import (
    formulate_qubo, 
    solve_qubo_with_openjij, 
    solve_qubo_with_SA
)

from scripts.ml.ML_post_processing import train_regressor

from scripts.evaluation.Analysis import calculate_comprehensive_metrics

from scripts.utils.helpers import save_to_excel, remove_selected_aps

from scripts.data.data_loaders import (
    save_all_importance_dicts,
    load_all_precomputed_data,
    load_importance_dict_from_csv,
    load_redundancy_matrix_from_csv,
    save_preprocessed_data,
    load_preprocessed_data
)

# Notebook-level coordinate scaler. It is created unfitted here; later cells
# re-create `scaler_coords` and fit it on `coords_train`.
scaler_coords = MinMaxScaler()

# 1. Preprocessing

In [18]:
# Locate the raw training/validation CSV files (paths are relative to the
# current working directory) and read both into DataFrames.

data_dir = Path('../../data', 'input_data')
df_train_path = data_dir.joinpath('TrainingData.csv')
df_validation_path = data_dir.joinpath('ValidationData.csv')

train_df, val_df = (
    pd.read_csv(csv_path) for csv_path in (df_train_path, df_validation_path)
)


In [19]:
# Run the project's preprocessing for one building and fit the coordinate scaler.

building_id = 1

# The helper is given the CSV *paths* (not the DataFrames read earlier) and
# returns the RSSI/coordinate splits plus the AP column names.
(
    rssi_train,
    coords_train,
    rssi_val,
    coords_val,
    ap_columns,
) = load_and_preprocess_data(df_train_path, df_validation_path, building_id)

# Start from a fresh scaler so re-running this cell never reuses an old fit.
scaler_coords = MinMaxScaler()
scaler_coords.fit(coords_train)

Loading and preprocessing UJIIndoorLoc training and validation datasets...
Training samples: 19937
Validation samples: 1111
Training coordinate ranges:
  Longitude: [-7578.461972, -7404.491683]
  Latitude: [4864810.030100, 4864959.505251]
  Floors: [0, 1, 2, 3]
  Floor height: 3.0 meters


In [21]:
# Persist the preprocessed splits and AP column names for the current building
# so that later sessions can reload them instead of re-running preprocessing.
save_preprocessed_data(
    rssi_train,
    coords_train,
    rssi_val,
    coords_val,
    ap_columns,
    building_id=building_id,
)

# Human-readable summary. The folder shown is a fixed message; it is not read
# back from the helper.
summary_lines = (
    f"\n✓ Preprocessed data saved successfully!",
    f"   Files saved to: data/output_data/preprocessed_data/",
    f"   - Pickle file (fast loading): preprocessed_building_{building_id}.pkl",
    f"   - Excel file (human-readable): preprocessed_building_{building_id}.xlsx",
)
for summary_line in summary_lines:
    print(summary_line)

✓ Saved preprocessed data to pickle: data\output_data\preprocessed_data\preprocessed_building_1.pkl
✓ Saved preprocessed data to Excel: data\output_data\preprocessed_data\preprocessed_building_1.xlsx

✓ Preprocessed data saved successfully!
   Files saved to: data/output_data/preprocessed_data/
   - Pickle file (fast loading): preprocessed_building_1.pkl
   - Excel file (human-readable): preprocessed_building_1.xlsx


In [20]:
# Write the normalization bounds, floor height and building id to a CSV file
# so later stages can denormalize predictions without re-running preprocessing.
system_params_dir = Path('../../data') / 'system_input'
system_params_dir.mkdir(parents=True, exist_ok=True)

param_names = ['LON_MIN', 'LON_MAX', 'LAT_MIN', 'LAT_MAX', 'FLOOR_HEIGHT', 'BUILDING_ID']

# NOTE(review): the LON_*/LAT_* values are module-level attributes of
# `pre_processing`; presumably they are set by `load_and_preprocess_data`, so
# this cell should run after the preprocessing cell — confirm.
param_values = [
    pre_processing.LON_MIN,
    pre_processing.LON_MAX,
    pre_processing.LAT_MIN,
    pre_processing.LAT_MAX,
    3.0,  # FLOOR_HEIGHT
    building_id,
]

param_descriptions = [
    'Minimum longitude value for denormalization',
    'Maximum longitude value for denormalization',
    'Minimum latitude value for denormalization',
    'Maximum latitude value for denormalization',
    'Height of each floor in meters',
    'Building ID used for preprocessing',
]

system_params = pd.DataFrame({
    'Parameter': param_names,
    'Value': param_values,
    'Description': param_descriptions,
})

system_params_path = system_params_dir / 'system_parameters.csv'
system_params.to_csv(system_params_path, index=False)

# Echo what was written so the saved values are visible in the notebook.
print(f"\n✓ System parameters saved to: {system_params_path}")
print("\nParameters:")
print(system_params.to_string(index=False))


✓ System parameters saved to: data\system_input\system_parameters.csv

Parameters:
   Parameter         Value                                 Description
     LON_MIN -7.578462e+03 Minimum longitude value for denormalization
     LON_MAX -7.404492e+03 Maximum longitude value for denormalization
     LAT_MIN  4.864809e+06  Minimum latitude value for denormalization
     LAT_MAX  4.864960e+06  Maximum latitude value for denormalization
FLOOR_HEIGHT  3.000000e+00              Height of each floor in meters
 BUILDING_ID  1.000000e+00          Building ID used for preprocessing


---
**OPTIONAL: Load Preprocessed Data from Files**

If you have already run preprocessing once and saved the data, you can skip cell 5 (the cell that calls `load_and_preprocess_data`) and load the saved files with the cell below instead. This is much faster than re-running preprocessing.

**First-time:** Run cell 5 to preprocess and save.  
**Subsequent runs:** Skip cell 5 and run the cell below to load from files.

---

In [4]:
# Fast path: reload previously saved preprocessed data instead of running the
# preprocessing cell again. Comment/uncomment this cell to switch between the two.
# NOTE(review): with use_pickle=True the data comes from a pickle file — only
# load files that this project wrote itself.

building_id = 1

(
    rssi_train,
    coords_train,
    rssi_val,
    coords_val,
    ap_columns,
) = load_preprocessed_data(
    building_id=building_id,
    use_pickle=True,  # True = fast (pickle), False = slower (Excel)
)

# Re-create and fit the coordinate scaler on the reloaded training coordinates.
scaler_coords = MinMaxScaler()
scaler_coords.fit(coords_train)

✓ Loaded preprocessed data from pickle: data\output_data\preprocessed_data\preprocessed_building_1.pkl
  Training samples: 5196
  Validation samples: 307
  Number of APs: 520


# 2. Different Importance Metrics

In [5]:
# Calculate feature importance using different methods.
# Each helper's result is unpacked into an `*_array` and a `*_dict` variable;
# the dicts are the ones saved to disk by the next cell.

importance_entropy_array, importance_entropy_dict = entropy_importance(rssi_train, ap_columns)

importance_average_array, importance_average_dict = average_importance(rssi_train, ap_columns)

importance_median_array, importance_median_dict = median_importance(rssi_train, ap_columns)

importance_max_array, importance_max_dict = max_importance(rssi_train, ap_columns)

importance_variance_array, importance_variance_dict = variance_importance(rssi_train, ap_columns)

# Unlike the helpers above, this one is passed the training coordinates
# instead of `ap_columns`.
importance_mutual_info_array, importance_mutual_info_dict = mutual_information_importance(rssi_train, coords_train)

Calculating weighted mutual information importance scores...
APs with non-zero importance: 207/520
Done


In [6]:
# Persist every importance score: one ranked Excel sheet per method, one
# combined sheet, and CSV dictionaries for reloading in later cells.

def _ranked_scores(scores, column):
    """Return a one-column DataFrame of `scores`, sorted highest score first."""
    frame = pd.DataFrame.from_dict(scores, orient='index', columns=[column])
    return frame.sort_values(column, ascending=False)


entropy_df = _ranked_scores(importance_entropy_dict, 'Entropy_Score')
average_df = _ranked_scores(importance_average_dict, 'Average_Score')
median_df = _ranked_scores(importance_median_dict, 'Median_Score')
max_df = _ranked_scores(importance_max_dict, 'Max_Score')
variance_df = _ranked_scores(importance_variance_dict, 'Variance_Score')
mutual_info_df = _ranked_scores(importance_mutual_info_dict, 'MutualInfo_Score')

# Destination folder for all importance outputs
output_dir = Path('../../data') / 'output_data' / 'importance_scores'

# One workbook per method, written in a fixed order
per_method_exports = (
    (entropy_df, 'entropy_importance'),
    (average_df, 'average_importance'),
    (median_df, 'median_importance'),
    (max_df, 'max_importance'),
    (variance_df, 'variance_importance'),
    (mutual_info_df, 'mutual_info_importance'),
)
for ranked_frame, file_stem in per_method_exports:
    save_to_excel(ranked_frame, output_dir, file_stem)

# Side-by-side view of all methods, ordered by the entropy score
combined_df = pd.DataFrame({
    'Entropy_Score': importance_entropy_dict,
    'Average_Score': importance_average_dict,
    'Median_Score': importance_median_dict,
    'Max_Score': importance_max_dict,
    'Variance_Score': importance_variance_dict,
    'MutualInfo_Score': importance_mutual_info_dict
}).sort_values('Entropy_Score', ascending=False)

save_to_excel(combined_df, output_dir, 'all_importance_scores')

# IMPORTANT: the CSV dictionaries are what later cells reload
importance_dicts = {
    'entropy': importance_entropy_dict,
    'average': importance_average_dict,
    'median': importance_median_dict,
    'max': importance_max_dict,
    'variance': importance_variance_dict,
    'mutual_info': importance_mutual_info_dict
}
save_all_importance_dicts(importance_dicts, output_dir)

for status_line in (
    "\n✓ All importance scores saved successfully!",
    f"   - Excel files saved for analysis",
    f"   - CSV dictionaries saved for QUBO formulation",
    f"   - Location: {output_dir}",
):
    print(status_line)

Sheet 'Sheet1' updated in data\output_data\importance_scores\entropy_importance.xlsx
Sheet 'Sheet1' updated in data\output_data\importance_scores\average_importance.xlsx
Sheet 'Sheet1' updated in data\output_data\importance_scores\median_importance.xlsx
Sheet 'Sheet1' updated in data\output_data\importance_scores\max_importance.xlsx
Sheet 'Sheet1' updated in data\output_data\importance_scores\variance_importance.xlsx
Sheet 'Sheet1' updated in data\output_data\importance_scores\mutual_info_importance.xlsx
Sheet 'Sheet1' updated in data\output_data\importance_scores\all_importance_scores.xlsx
✓ Saved importance dictionary to data\output_data\importance_scores\entropy_importance_dict.csv
✓ Saved importance dictionary to data\output_data\importance_scores\average_importance_dict.csv
✓ Saved importance dictionary to data\output_data\importance_scores\median_importance_dict.csv
✓ Saved importance dictionary to data\output_data\importance_scores\max_importance_dict.csv
✓ Saved importance dict

# Redundancy Metric

In [7]:
# Calculate redundancy matrix using the training RSSI data.
# NOTE(review): the redundancy measure itself is defined in
# scripts.optimization.Redundancy and is not visible from this notebook.
redundancy_matrix = calculate_redundancy_matrix(rssi_train)

# Sanity check: the matrix should be square, one row/column per AP.
print(f"Redundancy matrix shape: {redundancy_matrix.shape}")  # Will be (n_aps, n_aps)


Calculating redundancy matrix...
Done
Redundancy matrix shape: (520, 520)


In [8]:
# Write the redundancy matrix to disk in two formats: Excel for inspection and
# CSV for reloading.
# Note: this rebinds `output_dir`, which the importance cell also uses.

output_dir = Path('../../data') / 'output_data' / 'redundancy_scores'

# Excel copy via the shared project helper
save_to_excel(df=redundancy_matrix, folder_path=output_dir, filename='redundancy_matrix')

# CSV copy, keeping the index so row labels survive a round trip
redundancy_csv_path = output_dir / 'redundancy_matrix.csv'
redundancy_matrix.to_csv(redundancy_csv_path, index=True)

print("✓ Redundancy matrix saved successfully!")

Sheet 'Sheet1' updated in data\output_data\redundancy_scores\redundancy_matrix.xlsx
✓ Redundancy matrix saved successfully!


# QUBO Formulation

In [None]:
# Reload the saved system parameters and define the QUBO hyper-parameters
# used by the selection loop further down.
system_params_path = Path('../../data') / 'system_input' / 'system_parameters.csv'
system_params_df = pd.read_csv(system_params_path)

# Parameter name -> value lookup
system_params_dict = dict(zip(system_params_df['Parameter'], system_params_df['Value']))

# Pull out the denormalization bounds and floor height as notebook constants
LON_MIN, LON_MAX, LAT_MIN, LAT_MAX, FLOOR_HEIGHT = (
    system_params_dict[param_name]
    for param_name in ('LON_MIN', 'LON_MAX', 'LAT_MIN', 'LAT_MAX', 'FLOOR_HEIGHT')
)

# QUBO parameters (passed to `formulate_qubo` as k, alpha, penalty)
results = {}
k, alpha, penalty = 20, 0.9, 2.0

# Report the loaded values and chosen settings
print("✓ System parameters loaded from CSV:")
print(f"  LON_MIN: {LON_MIN}")
print(f"  LON_MAX: {LON_MAX}")
print(f"  LAT_MIN: {LAT_MIN}")
print(f"  LAT_MAX: {LAT_MAX}")
print(f"  FLOOR_HEIGHT: {FLOOR_HEIGHT}")
print(f"\nQUBO parameters:")
print(f"  k (num APs): {k}")
print(f"  alpha: {alpha}")
print(f"  penalty: {penalty}")

In [12]:
# EASY METHOD: Load all pre-computed data at once.
# Returns the importance dictionaries (keyed by method name) and the
# redundancy matrix that earlier cells saved to disk.
importance_dicts_loaded, redundancy_matrix_loaded = load_all_precomputed_data()

# Access individual importance methods like this:
# importance_dicts_loaded['entropy']
# importance_dicts_loaded['average']
# importance_dicts_loaded['max']
# importance_dicts_loaded['variance']
# importance_dicts_loaded['median']
# importance_dicts_loaded['mutual_info']

Loading pre-computed importance scores and redundancy matrix

Loading importance scores...
✓ Loaded 520 APs for entropy importance
✓ Loaded 520 APs for average importance
✓ Loaded 520 APs for median importance
✓ Loaded 520 APs for max importance
✓ Loaded 520 APs for variance importance
✓ Loaded 520 APs for mutual_info importance

Loading redundancy matrix...
✓ Loaded redundancy matrix with shape: (520, 520)

✓ All data loaded successfully!


---
**NOTE:** The cell above (`load_all_precomputed_data`) loads the pre-computed importance scores and redundancy matrix from saved files, and the cell below uses that loaded data.

**First-time setup:** Run cells 8-14 once to compute and save the data.

**Subsequent runs:** Skip cells 8-14 and run the loading cell above to load from files directly.

This approach saves significant computation time by avoiding re-calculation of importance scores and redundancy matrix.

---

In [14]:
# For each importance method: build a QUBO from the LOADED scores and
# redundancy matrix, solve it, train a regressor on the selected APs and
# record the localisation metrics.
# NOTE(review): 'median' scores are loaded but not evaluated here — confirm
# this is intentional.
results = {}

for label in ('mutual_info', 'entropy', 'average', 'max', 'variance'):

    imp_dict = importance_dicts_loaded[label]

    # Nothing to optimise if no AP has a positive score
    if not any(score > 0 for score in imp_dict.values()):
        print(f"Skipped {label}: all importance scores are zero or negative.")
        continue

    # 1. Formulate QUBO using LOADED data
    Q, relevant_aps, offset = formulate_qubo(imp_dict, redundancy_matrix_loaded, k, alpha, penalty)
    if len(relevant_aps) == 0:
        print(f"Skipped {label}: no relevant APs selected after QUBO formulation.")
        continue

    # 2. Solve QUBO; the solver returns positions into `relevant_aps`
    selected_indices, duration = solve_qubo_with_openjij(Q)
    if len(selected_indices) == 0:
        print(f"Skipped {label}: QUBO solver did not select any APs.")
        continue

    selected_aps = [relevant_aps[position] for position in selected_indices]

    # 3. Train on the selected APs and take the 'rf_val' predictions
    models, predictions = train_regressor(rssi_train, coords_train, rssi_val, coords_val, selected_aps)
    preds = predictions['rf_val']

    # 4. Evaluate; only the third returned item (the metrics dict) is used
    _, _, metrics = calculate_comprehensive_metrics(
        coords_val, preds,
        LON_MIN, LON_MAX,
        LAT_MIN, LAT_MAX,
        FLOOR_HEIGHT
    )

    mean_3d_error, median_3d_error, real_min_m, real_max_m, floor_acc = (
        metrics[metric_key]
        for metric_key in (
            'real_mean_m',
            'real_median_m',
            'real_min_m',
            'real_max_m',
            'floor_accuracy',
        )
    )

    # 5. Store and print results
    results[label] = {
        'selected_aps': selected_aps,
        'mean_3d_error': mean_3d_error,
        'median_3d_error': median_3d_error,
        'real_min_m': real_min_m,
        'real_max_m': real_max_m,
        'floor_accuracy': floor_acc,
        'duration': duration
    }
    print(f"{label}: {len(selected_aps)} APs, Mean 3D Error: {mean_3d_error:.2f} m, Floor Accuracy: {floor_acc:.2%}, QUBO time: {duration:.2f}s")


Formulating enhanced QUBO for k=20 APs selection...
Done

Solving QUBO with OpenJij Simulated Quantum Annealing (SQA)...
OpenJij completed in 51.9720 seconds
Training random forest regressor...
✓ Enhanced Random Forest trained
   Average OOB Score: 0.9003
mutual_info: 20 APs, Mean 3D Error: 19.65 m, Floor Accuracy: 62.87%, QUBO time: 51.97s
Formulating enhanced QUBO for k=20 APs selection...
Done

Solving QUBO with OpenJij Simulated Quantum Annealing (SQA)...
OpenJij completed in 54.2011 seconds
Training random forest regressor...
✓ Enhanced Random Forest trained
   Average OOB Score: 0.9162
entropy: 20 APs, Mean 3D Error: 15.99 m, Floor Accuracy: 58.96%, QUBO time: 54.20s
Formulating enhanced QUBO for k=20 APs selection...
Done

Solving QUBO with OpenJij Simulated Quantum Annealing (SQA)...
OpenJij completed in 56.6464 seconds
Training random forest regressor...
✓ Enhanced Random Forest trained
   Average OOB Score: 0.9298
average: 20 APs, Mean 3D Error: 15.14 m, Floor Accuracy: 66.12

In [15]:
# Flatten `results` (one entry per importance method) into a table with one
# row per method, then save it as Excel and CSV and print a summary.
results_data = [
    {
        'Importance_Method': label,
        'Num_APs': len(data['selected_aps']),
        'Selected_APs': ', '.join(data['selected_aps']),
        'Mean_3D_Error_m': data['mean_3d_error'],
        'Median_3D_Error_m': data['median_3d_error'],
        'Min_Error_m': data['real_min_m'],
        'Max_Error_m': data['real_max_m'],
        'Floor_Accuracy': data['floor_accuracy'],
        'QUBO_Duration_s': data['duration']
    }
    for label, data in results.items()
]

results_df = pd.DataFrame(results_data)

# Use the same '../../data' root as the other cells in this notebook, and make
# sure the folder exists: to_excel/to_csv do not create missing directories.
results_dir = Path('../../data') / 'results'
results_dir.mkdir(parents=True, exist_ok=True)

# Save as Excel
excel_path = results_dir / 'qubo_results.xlsx'
results_df.to_excel(excel_path, index=False)
print(f"Results saved to {excel_path}")

# Save as CSV
csv_path = results_dir / 'qubo_results.csv'
results_df.to_csv(csv_path, index=False)
print(f"Results saved to {csv_path}")

# Display the results
print("\nResults Summary:")
print(results_df.to_string(index=False))

Results saved to data/results/qubo_results.xlsx
Results saved to data/results/qubo_results.csv

Results Summary:
Importance_Method  Num_APs                                                                                                                                                   Selected_APs  Mean_3D_Error_m  Median_3D_Error_m  Min_Error_m  Max_Error_m  Floor_Accuracy  QUBO_Duration_s
      mutual_info       20 WAP015, WAP103, WAP107, WAP115, WAP119, WAP135, WAP140, WAP148, WAP167, WAP172, WAP177, WAP178, WAP179, WAP222, WAP288, WAP329, WAP344, WAP351, WAP446, WAP478        19.650160          13.634130     0.326116    83.371049        0.628664        51.971995
          entropy       20 WAP016, WAP037, WAP088, WAP091, WAP112, WAP113, WAP119, WAP120, WAP126, WAP130, WAP140, WAP141, WAP166, WAP172, WAP182, WAP189, WAP248, WAP260, WAP262, WAP334        15.988761          11.762935     0.977198    82.903321        0.589577        54.201057
          average       20 WAP016, WAP027, W