In [1]:
"""
SQQNN for Communities and Crime Dataset
This script evaluates a quantum-inspired neural network on the Communities and Crime dataset
from UCI repository, testing different numbers of neurons using k-fold cross validation.
Results are printed to console instead of being saved to files.
"""

import numpy as np
import matplotlib.pyplot as plt
from QNetwork import *
from sklearn import metrics
import time
import pandas as pd
from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import KFold

# Configuration parameters
NUM_RUNS = 1                      # Number of network executions
KFOLDS = 10                       # Number of folds for cross-validation
MAX_ITERATIONS = 1000000          # Maximum training iterations
LEARNING_RATE = 0.1               # Learning rate
GRAD_UPDATE = 0.1                 # Gradient update parameter
TRAIN_RATIO = 0.8                 # Training data ratio
VAL_RATIO = 0.1                   # Validation data ratio
TEST_RATIO = 0.1                  # Test data ratio
RANDOM_SEED = 0                   # Random seed for reproducibility

# Load Communities and Crime dataset from UCI
print("\nLoading Communities and Crime dataset from UCI repository...")
crime_data = fetch_ucirepo(id=183)

# Extract features and targets
X = crime_data.data.features
y = crime_data.data.targets

print(f"\nDataset dimensions - Features: {X.shape}, Targets: {y.shape}")

# Data preprocessing
print("\nPreprocessing data...")
# Select only numeric columns and fill missing values
X = X.select_dtypes(include=[np.number])
X = X.fillna(X.mean())
y = y.fillna(y.mean())

# Convert to numpy arrays
X = X.astype(float).to_numpy()
y = y.astype(float).to_numpy()

# Normalize features to [-1, 1] range
x_min = np.min(X)
x_max = np.max(X)
X = 2 * (X - x_min) / (x_max - x_min) - 1

# Normalize targets to [0, 1] range
y_min = np.min(y)
y_max = np.max(y)
y = (y - y_min) / (y_max - y_min)

# Initialize results tracking
best_overall_mse = float('inf')
best_run = None

# Print header for results
print("\n" + "="*50)
print("Final Summary of All Executions")
print("="*50 + "\n")

for run in range(1, NUM_RUNS + 1):
    print(f"\nRun {run}/{NUM_RUNS} in progress...")
    min_mse = float('inf')
    mse_train_values, mse_val_values, mse_test_values = [], [], []
    r2_adj_values = []

    # Print run header
    print("\n" + "-"*40)
    print(f"Results Summary - Run {run}")
    print(f"Number of neurons: {run}")
    print(f"Total samples: {len(y)}")
    print(f"K-Fold Cross Validation: {KFOLDS} folds")
    print("-"*40 + "\n")
    
    np.random.seed(RANDOM_SEED)
    kf = KFold(n_splits=KFOLDS, shuffle=True, random_state=RANDOM_SEED)

    for fold, (train_val_idx, test_idx) in enumerate(kf.split(X), 1):
        print(f"\nProcessing K-Fold {fold}/{KFOLDS} for Run {run}")
        print("-"*30)

        # Split data
        X_train_val, X_test = X[train_val_idx], X[test_idx]
        y_train_val, y_test = y[train_val_idx], y[test_idx]
        
        # Further split into train/validation
        train_size = int(len(X_train_val) * TRAIN_RATIO)
        val_size = int(len(X_train_val) * VAL_RATIO)
        
        X_train, y_train = X_train_val[:train_size], y_train_val[:train_size]
        X_val, y_val = X_train_val[train_size:train_size + val_size], y_train_val[train_size:train_size + val_size]
        
        # Initialize quantum neural network
        qrn = QNetwork(X_train, y_train, X_val, y_val, run)
        activation_types = np.array([ATIVATION_TYPE.LINEAR]*5)

        # Train network
        start_time = time.time()
        final_cost = qrn.fit_gradient_descent_MEAN_SQUARED(
            MAX_ITERATIONS, 1, LEARNING_RATE, GRAD_UPDATE, False)
        training_time = time.time() - start_time

        # Make predictions
        y_train_pred = qrn.predict(X_train)
        y_val_pred = qrn.predict(X_val)
        y_test_pred = qrn.predict(X_test)
        
        # Calculate metrics
        mse_train = metrics.mean_squared_error(y_train, y_train_pred)
        mse_val = metrics.mean_squared_error(y_val, y_val_pred)
        mse_test = metrics.mean_squared_error(y_test, y_test_pred)
        
        # Calculate adjusted R-squared
        r2 = metrics.r2_score(y_train, y_train_pred)
        n = X_train_val.shape[0]
        p = X_train_val.shape[1]
        adj_r2 = 1 - ((1 - r2) * (n - 1) / (n - p * run))

        # Store metrics
        mse_train_values.append(mse_train)
        mse_val_values.append(mse_val)
        mse_test_values.append(mse_test)
        r2_adj_values.append(adj_r2)

        # Update minimum MSE
        if mse_test < min_mse:
            min_mse = mse_test

        # Print fold results
        print(f"\nFold {fold} Results:")
        print(f"Training set size: {X_train.shape[0]} samples")
        print(f"Validation set size: {X_val.shape[0]} samples")
        print(f"Test set size: {X_test.shape[0]} samples")
        print(f"Training time: {training_time:.2f} seconds")
        print(f"Final cost: {final_cost}")
        print(f"Training MSE: {mse_train:.4f}")
        print(f"Validation MSE: {mse_val:.4f}")
        print(f"Test MSE: {mse_test:.4f}")
        print(f"Adjusted R2: {adj_r2:.4f}")

    # Calculate mean and std of metrics across folds
    mse_train_mean = np.mean(mse_train_values)
    mse_val_mean = np.mean(mse_val_values)
    mse_test_mean = np.mean(mse_test_values)
    mse_train_std = np.std(mse_train_values)
    mse_val_std = np.std(mse_val_values)
    mse_test_std = np.std(mse_test_values)
    r2_mean = np.mean(r2_adj_values)
    r2_std = np.std(r2_adj_values)

    # Print final summary for this run
    print("\n" + "="*30)
    print(f"Final Summary - Run {run}")
    print("="*30)
    print(f"Mean Training MSE: {mse_train_mean:.4f} ± {mse_train_std:.4f}")
    print(f"Mean Validation MSE: {mse_val_mean:.4f} ± {mse_val_std:.4f}")
    print(f"Mean Test MSE: {mse_test_mean:.4f} ± {mse_test_std:.4f}")
    print(f"Mean Adjusted R2: {r2_mean:.4f} ± {r2_std:.4f}")
    print(f"Minimum Test MSE: {min_mse:.4f}")

    # Update best overall performance
    if min_mse < best_overall_mse:
        best_overall_mse = min_mse
        best_run = run

# Print best performance
print("\n" + "="*50)
print("Best Overall Performance")
print("="*50)
print(f"Best Run: Run {best_run} with Lowest Test MSE: {best_overall_mse:.4f}")

print("\nAll runs completed. Results printed above.")


Loading Communities and Crime dataset from UCI repository...

Dataset dimensions - Features: (1994, 127), Targets: (1994, 1)

Preprocessing data...

Final Summary of All Executions


Run 1/1 in progress...

----------------------------------------
Results Summary - Run 1
Number of neurons: 1
Total samples: 1994
K-Fold Cross Validation: 10 folds
----------------------------------------


Processing K-Fold 1/10 for Run 1
------------------------------
1% 
Fold 1 Results:
Training set size: 1435 samples
Validation set size: 179 samples
Test set size: 200 samples
Training time: 23.33 seconds
Final cost: (np.float64(0.03542981761267352), 10314)
Training MSE: 0.0354
Validation MSE: 0.0344
Test MSE: 0.0408
Adjusted R2: 0.3225

Processing K-Fold 2/10 for Run 1
------------------------------
1% 
Fold 2 Results:
Training set size: 1435 samples
Validation set size: 179 samples
Test set size: 200 samples
Training time: 24.43 seconds
Final cost: (np.float64(0.040299327460696234), 10851)
Training M