<a href="https://colab.research.google.com/github/jonanew/noise_detection/blob/main/KMeans.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install rdflib for KG
!pip install rdflib

# Install noise package for noise generation
!pip install noise



Collecting rdflib
  Downloading rdflib-7.1.4-py3-none-any.whl.metadata (11 kB)
Downloading rdflib-7.1.4-py3-none-any.whl (565 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/565.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m565.1/565.1 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdflib
Successfully installed rdflib-7.1.4
Collecting noise
  Downloading noise-1.2.2.zip (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.0/132.0 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: noise
  Building wheel for noise (setup.py) ... [?25l[?25hdone
  Created wheel for noise: filename=noise-1.2.2-cp311-cp311-linux_x86_64.whl size=56276 sha256=0641a3b08898df23a8e95a325fa170d51a4f67f46964d916329e43b910bc7c10
  Stored in directory: /root/.cache/pip/wheels/39/25/2e

In [2]:
# Mount Google Drive at /content/drive; the data and knowledge-graph paths
# used further down in this notebook point below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
from rdflib import Graph
from sklearn.preprocessing import RobustScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# Paths to sensor reading csv file data and knowledge graph
data_path = "/content/drive/My Drive/ColabNotebooks/motion_sensor_readings.csv"
kg_path = "/content/drive/My Drive/ColabNotebooks/motion_sense_ssn_kg.ttl"

# Failure handling: every fatal condition below raises an exception.
# Inside an IPython/Jupyter kernel, `exit()` only requests a kernel shutdown
# and does not stop the cell that is currently running, so execution would
# continue past the error and fail later in a confusing place.

# Load knowledge graph
g = Graph()
try:
    g.parse(kg_path, format='turtle')
except Exception as e:
    raise RuntimeError(f"Error parsing knowledge graph: {e}") from e

# Query for participant characteristics
query = """
PREFIX ms: <http://example.org/motion-sense#>
SELECT ?p ?code ?age ?gender ?height ?weight
WHERE {
    ?p a ms:Participant ;
       ms:hasCode ?code ;
       ms:hasAge ?age ;
       ms:hasGender ?gender ;
       ms:hasHeightCm ?height ;
       ms:hasWeightKg ?weight .
}
"""
results = g.query(query)
# One record per participant; the code is normalised (stripped, upper-cased)
# so it can be joined against the participant parsed from sensor ids.
participants = []
for row in results:
    participants.append({
        'participant': str(row.code).strip().upper(),
        'age': int(row.age),
        'gender': int(row.gender),
        'height': float(row.height),
        'weight': float(row.weight)
    })
participants_df = pd.DataFrame(participants)

if participants_df.empty:
    raise ValueError("Error: No participants found in knowledge graph.")

# Query for activity plausible ranges
query_ranges = """
PREFIX activity: <http://example.org/activity-recognition#>
SELECT ?activityCode ?min ?max
WHERE {
    ?act a activity:Activity ;
         activity:hasActivityCode ?activityCode ;
         activity:hasMinAcceleration ?min ;
         activity:hasMaxAcceleration ?max .
}
"""
results_ranges = g.query(query_ranges)
# Maps lower-cased activity code -> {'min': ..., 'max': ...} acceleration bounds.
activity_ranges = {}
for row in results_ranges:
    activity_code = str(row.activityCode).strip().lower()
    activity_ranges[activity_code] = {
        'min': float(row.min),
        'max': float(row.max)
    }

if not activity_ranges:
    raise ValueError("Error: No activity ranges found in knowledge graph.")

# Load sensor readings csv file data
try:
    df = pd.read_csv(data_path)
except Exception as e:
    raise RuntimeError(f"Error loading sensor data: {e}") from e

# Parse sensor_id with validation
def parse_sensor_id(sid):
    """Split a sensor id into ``(participant, activity, sensor_type)``.

    The id is split on ``'_'``: field 0 is returned stripped and upper-cased
    as the participant code, fields 3 and 4 are returned unchanged as the
    activity and the sensor type.

    Returns ``(None, None, None)`` when ``sid`` is not a string (for example
    NaN read from an empty CSV cell) or contains fewer than five fields.
    """
    # Missing values arrive as float NaN / None and have no .split().
    if not isinstance(sid, str):
        return None, None, None
    parts = sid.split('_')
    if len(parts) >= 5:
        return parts[0].strip().upper(), parts[3], parts[4]
    return None, None, None

# Split each sensor_id into participant / activity / sensor_type columns.
# Rows whose id could not be parsed carry None in all three and are dropped.
df['participant'], df['activity'], df['sensor_type'] = zip(*df['sensor_id'].apply(parse_sensor_id))
df = df.dropna(subset=['participant'])

# Filter for accelerometer data
# .copy() detaches the filtered frame, so the 'measurement' assignment below
# is a plain column write rather than a write into a slice of the full frame.
acceleration_columns = ['UserAccelerationX', 'UserAccelerationY', 'UserAccelerationZ']
df = df[(df['sensor_type'] == 'Accelerometer') &
        (df['property'].isin(acceleration_columns))].copy()

# Convert measurement to numeric, drop invalid entries
df['measurement'] = pd.to_numeric(df['measurement'], errors='coerce')
df = df.dropna(subset=['measurement'])

# Fatal conditions raise: inside an IPython/Jupyter kernel `exit()` only
# requests a kernel shutdown and does not stop the currently running cell.
if df.empty:
    raise ValueError("Error: No valid accelerometer data after cleaning.")

# Pivot to get X, Y, Z
acc_df = df.pivot_table(index=['timestamp', 'sensor_id', 'participant', 'activity'],
                        columns='property', values='measurement', aggfunc='first').reset_index()

# Drop rows with missing acceleration components. The values are already
# numeric here ('measurement' was coerced before the pivot), so no second
# conversion pass is needed.
acc_df = acc_df.dropna(subset=acceleration_columns)

if acc_df.empty:
    raise ValueError("Error: No valid data after pivot and cleaning.")

# Compute magnitude
acc_df['magnitude'] = np.sqrt(acc_df['UserAccelerationX']**2 +
                              acc_df['UserAccelerationY']**2 +
                              acc_df['UserAccelerationZ']**2)

# Compute global mean and std for magnitude (for traditional Z-score)
global_magnitude_mean = acc_df['magnitude'].mean()
global_magnitude_std = acc_df['magnitude'].std()
print(f"Global magnitude stats: Mean = {global_magnitude_mean:.6f}, Std = {global_magnitude_std:.6f}")

# Compute per-activity stats for reference on cleaned data
activity_stats_df = acc_df.groupby('activity')['magnitude'].agg(['mean', 'std', 'count']).reset_index()
activity_stats_df.columns = ['activity', 'actual_mean', 'actual_std', 'sample_count']

# Ensure sufficient samples and smooth std
# Activities with fewer than 100 samples are excluded; each remaining std is
# floored at 10% of the mean per-activity std. .copy() keeps the assignment
# below from writing into a slice of the unfiltered stats frame.
activity_stats_df = activity_stats_df[activity_stats_df['sample_count'] >= 100].copy()
global_activity_std = activity_stats_df['actual_std'].mean()
activity_stats_df['actual_std'] = activity_stats_df['actual_std'].apply(lambda x: max(x, global_activity_std * 0.1))

# Debug: Print per-activity stats
print("Per-activity magnitude stats:\n", activity_stats_df[['activity', 'actual_mean', 'actual_std', 'sample_count']])

# Merge with participant characteristics for clustering
# Inner join: readings whose participant code is absent from the KG are dropped.
merged_df = pd.merge(acc_df, participants_df, on='participant', how='inner')

if merged_df.empty:
    raise ValueError("Error: No matching participants found after merging.")

# Debug: Print original data size
print("Original data size:", len(merged_df))

# Initialize 5-fold cross-validation
# Rows are shuffled with a fixed seed, so the same split is produced on every run.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# Collects one metrics dict per (fold, noise fraction, method); filled later in the loop.
results_list = []

# Iterate over folds
for fold, (train_index, test_index) in enumerate(kf.split(merged_df), 1):
    print(f"Processing Fold {fold}")
    # Copies, so adding the 'cluster' column below does not write into merged_df.
    train_df = merged_df.iloc[train_index].copy()
    test_df = merged_df.iloc[test_index].copy()
    print(f"Training set size: {len(train_df)}, Test set size: {len(test_df)}")

    # Prepare data for clustering on training set, including magnitude
    # Numeric features are scaled with a RobustScaler fitted on the training rows
    # only; the activity label is one-hot encoded and appended without scaling.
    X_cluster_train = train_df[['age', 'gender', 'height', 'weight', 'magnitude', 'activity']]
    scaler = RobustScaler()
    X_numeric_train = scaler.fit_transform(X_cluster_train[['age', 'gender', 'height', 'weight', 'magnitude']])
    X_activity_train = pd.get_dummies(X_cluster_train['activity'], prefix='activity')
    X_scaled_train = np.hstack([X_numeric_train, X_activity_train])

    # Determine optimal number of clusters using the elbow method on training set
    # wcss[j] holds the inertia of the model with k = j + 2 clusters (k = 2..max_clusters).
    wcss = []
    max_clusters = 11
    for i in range(2, max_clusters + 1):
        kmeans = KMeans(n_clusters=i, random_state=42)
        kmeans.fit(X_scaled_train)
        wcss.append(kmeans.inertia_)

    # Plot the elbow curve for this fold (optional, can be disabled for efficiency)
    # The figure is written to Drive and closed immediately, so nothing is shown inline.
    plt.figure(figsize=(10, 5))
    plt.plot(range(2, max_clusters + 1), wcss, marker='o')
    plt.title(f'Elbow Method for Optimal Number of Clusters - Fold {fold}')
    plt.xlabel('Number of Clusters')
    plt.ylabel('WCSS')
    plt.savefig(f'/content/drive/My Drive/ColabNotebooks/elbow_curve_fold_{fold}.png')
    plt.close()

    # Choose the optimal number of clusters
    # Scans list positions i = 2 .. len(wcss) - 2 (i.e. k = 4 .. 10) and picks the
    # first k whose WCSS drop from k-1 to k is less than half the drop from k to
    # k+1; if no position qualifies the default of 11 is kept.
    # NOTE(review): an elbow is usually the point after which the drop flattens
    # (previous drop much LARGER than the next one) - confirm that the direction
    # of this comparison is intended. The ratio also divides by zero if two
    # consecutive WCSS values are equal.
    optimal_clusters = 11  # Placeholder, adjust based on elbow curve
    for i in range(2, len(wcss) - 1):
        if (wcss[i-1] - wcss[i]) / (wcss[i] - wcss[i+1]) < 0.5:
            optimal_clusters = i + 2
            break
    print(f"Optimal number of clusters for Fold {fold}: {optimal_clusters}")

    # Perform K-means clustering with optimal number of clusters on training set
    kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)
    train_cluster_labels = kmeans.fit_predict(X_scaled_train)

    # Assign clusters to test set using the trained K-means model
    # The test rows reuse the scaler fitted on the training rows (transform only).
    X_cluster_test = test_df[['age', 'gender', 'height', 'weight', 'magnitude', 'activity']]
    X_numeric_test = scaler.transform(X_cluster_test[['age', 'gender', 'height', 'weight', 'magnitude']])
    X_activity_test = pd.get_dummies(X_cluster_test['activity'], prefix='activity')
    # Align columns between train and test activity dummies
    # (activities missing from the test rows become all-zero columns; activities
    # unseen in training are dropped)
    X_activity_test = X_activity_test.reindex(columns=X_activity_train.columns, fill_value=0)
    X_scaled_test = np.hstack([X_numeric_test, X_activity_test])
    test_cluster_labels = kmeans.predict(X_scaled_test)

    # Add cluster labels to train and test dataframes
    train_df['cluster'] = train_cluster_labels
    test_df['cluster'] = test_cluster_labels

    # Combine train and test dataframes for this fold
    # fold_df therefore contains every row of merged_df; anything computed on it
    # covers training and held-out rows alike.
    fold_df = pd.concat([train_df, test_df], axis=0)

    # Assign cluster mean and std for Z-score calculation using K-means clusters
    # Statistics come from the training rows only and are mapped onto all rows
    # through the cluster label. A missing std (NaN) falls back to global_activity_std.
    cluster_means = train_df.groupby('cluster')['magnitude'].mean().to_dict()
    cluster_stds = train_df.groupby('cluster')['magnitude'].std().to_dict()
    fold_df['cluster_mu'] = fold_df['cluster'].map(cluster_means)
    fold_df['cluster_sigma'] = fold_df['cluster'].map(cluster_stds).fillna(global_activity_std)

    # Compute Z-scores using cluster-based statistics
    # A zero sigma is replaced by NaN to avoid dividing by zero; the resulting NaN
    # Z-score never exceeds the threshold, so such rows are never flagged.
    fold_df['Z_score'] = (fold_df['magnitude'] - fold_df['cluster_mu']) / fold_df['cluster_sigma'].replace(0, np.nan)
    fold_df['traditional_Z_score'] = (fold_df['magnitude'] - global_magnitude_mean) / global_magnitude_std

    # Add global mean and std to output
    fold_df['global_magnitude_mean'] = global_magnitude_mean
    fold_df['global_magnitude_std'] = global_magnitude_std

    # Apply activity-specific plausible ranges
    # Activity codes are lower-cased before the lookup in activity_ranges;
    # activities without an entry fall back to the default range [0, 10].
    fold_df['activity'] = fold_df['activity'].str.lower()
    default_min_plausible = 0
    default_max_plausible = 10
    fold_df['min_plausible'] = fold_df['activity'].map(lambda x: activity_ranges.get(x, {'min': default_min_plausible})['min'])
    fold_df['max_plausible'] = fold_df['activity'].map(lambda x: activity_ranges.get(x, {'max': default_max_plausible})['max'])

    # Determine if observations are outside plausible range
    fold_df['outside_plausible'] = (fold_df['magnitude'] < fold_df['min_plausible']) | (fold_df['magnitude'] > fold_df['max_plausible'])

    # Determine anomaly
    # A row is anomalous when its absolute Z-score exceeds `threshold`.
    threshold = 2
    fold_df['is_anomaly'] = np.abs(fold_df['Z_score']) > threshold
    fold_df['is_anomaly_traditional'] = np.abs(fold_df['traditional_Z_score']) > threshold

    # Label noise
    # Noise = statistically anomalous AND outside the plausible range.
    fold_df['is_noise'] = fold_df['is_anomaly'] & fold_df['outside_plausible']
    fold_df['is_noise_traditional'] = fold_df['is_anomaly_traditional'] & fold_df['outside_plausible']

    # Round numerical columns in fold_df to 2 decimal places
    # NOTE(review): the rounding is applied to fold_df itself, so any later
    # computation that reads fold_df works on magnitude / cluster_mu /
    # cluster_sigma rounded to 2 decimals, and a cluster_sigma below 0.005
    # becomes 0.0 - confirm this is intended rather than output formatting only.
    numerical_cols = fold_df.select_dtypes(include=['float64', 'int64']).columns
    fold_df[numerical_cols] = fold_df[numerical_cols].round(2)

    # Function to introduce Gaussian noise
    def introduce_gaussian_noise(df, noise_level=0.1, noise_fraction=0.05, random_state=None):
        """Return a copy of ``df`` with Gaussian noise added to a random subset of rows.

        Parameters
        ----------
        df : pandas.DataFrame
            Must contain a numeric ``'magnitude'`` column. It is not modified.
        noise_level : float
            Standard deviation of the zero-mean Gaussian noise.
        noise_fraction : float
            Probability with which each row is selected for corruption.
        random_state : int or None
            Seed for a private random generator. ``None`` (default) draws from
            NumPy's global generator, exactly as before this parameter existed.

        Returns
        -------
        pandas.DataFrame
            Copy of ``df`` with the perturbed ``'magnitude'`` and a new
            ``'is_true_noise'`` column (1 for selected rows, 0 otherwise).
        """
        # np.random (module) and RandomState expose the same rand/normal API.
        rng = np.random if random_state is None else np.random.RandomState(random_state)
        noisy_df = df.copy()
        mask = rng.rand(len(noisy_df)) < noise_fraction
        noise = rng.normal(0, noise_level, size=len(noisy_df))
        # Multiplying by the boolean mask zeroes the noise on unselected rows.
        noisy_df['magnitude'] = noisy_df['magnitude'] + noise * mask
        noisy_df['is_true_noise'] = mask.astype(int)  # Ground truth: 1 for noisy, 0 for normal
        return noisy_df

    # List of noise fractions to test
    noise_fractions = [0.02, 0.05, 0.10, 0.20]

    # Iterate over noise levels
    for noise_fraction in noise_fractions:
        print(f"Processing noise fraction: {noise_fraction*100}% in Fold {fold}")
        # The noise injection called here draws from NumPy's global random generator.
        # Seed it deterministically per (fold, noise level) so that the injected
        # noise - and therefore every metric below - is identical on each run.
        np.random.seed(42 + 1000 * fold + int(round(noise_fraction * 100)))
        # Introduce noise to the dataset (std of the noise = 2 x global magnitude std)
        noisy_df = introduce_gaussian_noise(fold_df, noise_level=2 * global_magnitude_std, noise_fraction=noise_fraction)

        # Apply anomaly detection - Hybrid method
        # Z-scores are recomputed on the noisy magnitude with the per-cluster statistics.
        noisy_df['Z_score'] = (noisy_df['magnitude'] - noisy_df['cluster_mu']) / noisy_df['cluster_sigma'].replace(0, np.nan)
        noisy_df['is_anomaly'] = np.abs(noisy_df['Z_score']) > threshold
        noisy_df['outside_plausible'] = (noisy_df['magnitude'] < noisy_df['min_plausible']) | (noisy_df['magnitude'] > noisy_df['max_plausible'])
        noisy_df['is_noise'] = noisy_df['is_anomaly'] & noisy_df['outside_plausible']

        # Apply anomaly detection - Traditional method
        # Same rule, but the Z-score uses the global mean/std instead of cluster statistics.
        noisy_df['traditional_Z_score'] = (noisy_df['magnitude'] - global_magnitude_mean) / global_magnitude_std
        noisy_df['is_anomaly_traditional'] = np.abs(noisy_df['traditional_Z_score']) > threshold
        noisy_df['is_noise_traditional'] = noisy_df['is_anomaly_traditional'] & noisy_df['outside_plausible']

        # Evaluate performance
        y_true = noisy_df['is_true_noise']
        y_pred_hybrid = noisy_df['is_noise'].astype(int)
        y_pred_traditional = noisy_df['is_noise_traditional'].astype(int)

        # zero_division=0 states explicitly what is reported when a method flags
        # nothing (or nothing was injected) instead of relying on suppressed warnings.
        precision_hybrid = precision_score(y_true, y_pred_hybrid, zero_division=0)
        recall_hybrid = recall_score(y_true, y_pred_hybrid, zero_division=0)
        f1_hybrid = f1_score(y_true, y_pred_hybrid, zero_division=0)

        precision_traditional = precision_score(y_true, y_pred_traditional, zero_division=0)
        recall_traditional = recall_score(y_true, y_pred_traditional, zero_division=0)
        f1_traditional = f1_score(y_true, y_pred_traditional, zero_division=0)

        # Calculate percentage of noise detected
        # (share of injected-noise rows that each method flagged)
        total_noise_points = y_true.sum()
        hybrid_detected = y_pred_hybrid[y_true == 1].sum()
        traditional_detected = y_pred_traditional[y_true == 1].sum()
        hybrid_noise_percentage = (hybrid_detected / total_noise_points * 100) if total_noise_points > 0 else 0
        traditional_noise_percentage = (traditional_detected / total_noise_points * 100) if total_noise_points > 0 else 0

        # Store results for this noise level and fold
        results_list.append({
            'Fold': fold,
            'Algorithm': 'K-means',
            'Noise Fraction (%)': noise_fraction * 100,
            'Method': 'Hybrid (Cluster-based)',
            'Precision': precision_hybrid,
            'Recall': recall_hybrid,
            'F1 Score': f1_hybrid,
            'Noise Detected (%)': hybrid_noise_percentage
        })
        results_list.append({
            'Fold': fold,
            'Algorithm': 'K-means',
            'Noise Fraction (%)': noise_fraction * 100,
            'Method': 'Traditional (Global)',
            'Precision': precision_traditional,
            'Recall': recall_traditional,
            'F1 Score': f1_traditional,
            'Noise Detected (%)': traditional_noise_percentage
        })

# Build the consolidated table: mean of every metric across folds,
# grouped by noise fraction and detection method.
results_df = pd.DataFrame(results_list)
metric_columns = ['Precision', 'Recall', 'F1 Score', 'Noise Detected (%)']
consolidated_means = (
    results_df
    .groupby(['Noise Fraction (%)', 'Method'])[metric_columns]
    .mean()
    .reset_index()
)
print("\nConsolidated Evaluation Results:")
print(consolidated_means)

# Persist the last noisy dataset (last noise level of the last fold) and the
# per-fold metrics; a failed write is reported but does not stop the cell.
try:
    noisy_df.to_csv('/content/drive/My Drive/ColabNotebooks/noisy_results_kmeans_gaussian.csv', index=False)
    print(f"Noisy dataset saved to '/content/drive/My Drive/ColabNotebooks/noisy_results_kmeans_gaussian.csv'.")
    results_df.to_csv('/content/drive/My Drive/ColabNotebooks/evaluation_results_kmeans_gaussian.csv', index=False)
    print(f"Evaluation results saved to '/content/drive/My Drive/ColabNotebooks/evaluation_results_kmeans_gaussian.csv'.")
except Exception as save_error:
    print(f"Error saving files: {save_error}")

# Debug summary; every value below is whatever the final loop iteration left behind.
zscore_summary = noisy_df[['Z_score', 'traditional_Z_score']].describe()
hybrid_flag_counts = noisy_df['is_noise'].value_counts()
traditional_flag_counts = noisy_df['is_noise_traditional'].value_counts()
print("\nZ-score stats (noisy dataset, last noise level of last fold):\n", zscore_summary)
print("Noise detection summary (hybrid, noisy dataset, last noise level of last fold):\n", hybrid_flag_counts)
print("Noise detection summary (traditional, noisy dataset, last noise level of last fold):\n", traditional_flag_counts)
print(f"Total noise points introduced (last noise level of last fold): {total_noise_points}")
print(f"Hybrid method detected (last noise level of last fold): {hybrid_detected} ({hybrid_noise_percentage:.2f}%)")
print(f"Traditional method detected (last noise level of last fold): {traditional_detected} ({traditional_noise_percentage:.2f}%)")

Global magnitude stats (after outlier removal): Mean = 0.753240, Std = 0.638734
Per-activity magnitude stats:
   activity  actual_mean  actual_std  sample_count
0      dws     0.575685    0.426152        131856
1      jog     1.445474    0.905656        134231
2      ups     0.447230    0.337824        157285
3      wlk     0.691151    0.471883        344288
Original data size: 767660
Processing Fold 1
Training set size: 614128, Test set size: 153532
Optimal number of clusters for Fold 1: 7
Processing noise fraction: 2.0% in Fold 1
Processing noise fraction: 5.0% in Fold 1
Processing noise fraction: 10.0% in Fold 1
Processing noise fraction: 20.0% in Fold 1
Processing Fold 2
Training set size: 614128, Test set size: 153532
Optimal number of clusters for Fold 2: 11
Processing noise fraction: 2.0% in Fold 2
Processing noise fraction: 5.0% in Fold 2
Processing noise fraction: 10.0% in Fold 2
Processing noise fraction: 20.0% in Fold 2
Processing Fold 3
Training set size: 614128, Test set s

In [4]:
import pandas as pd
import numpy as np
from rdflib import Graph
from sklearn.preprocessing import RobustScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import warnings
from scipy.stats import gennorm

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# Paths to sensor reading csv file data and knowledge graph
data_path = "/content/drive/My Drive/ColabNotebooks/motion_sensor_readings.csv"
kg_path = "/content/drive/My Drive/ColabNotebooks/motion_sense_ssn_kg.ttl"

# Failure handling: every fatal condition below raises an exception.
# Inside an IPython/Jupyter kernel, `exit()` only requests a kernel shutdown
# and does not stop the cell that is currently running, so execution would
# continue past the error and fail later in a confusing place.

# Load knowledge graph
g = Graph()
try:
    g.parse(kg_path, format='turtle')
except Exception as e:
    raise RuntimeError(f"Error parsing knowledge graph: {e}") from e

# Query for participant characteristics
query = """
PREFIX ms: <http://example.org/motion-sense#>
SELECT ?p ?code ?age ?gender ?height ?weight
WHERE {
    ?p a ms:Participant ;
       ms:hasCode ?code ;
       ms:hasAge ?age ;
       ms:hasGender ?gender ;
       ms:hasHeightCm ?height ;
       ms:hasWeightKg ?weight .
}
"""
results = g.query(query)
# One record per participant; the code is normalised (stripped, upper-cased)
# so it can be joined against the participant parsed from sensor ids.
participants = []
for row in results:
    participants.append({
        'participant': str(row.code).strip().upper(),
        'age': int(row.age),
        'gender': int(row.gender),
        'height': float(row.height),
        'weight': float(row.weight)
    })
participants_df = pd.DataFrame(participants)

if participants_df.empty:
    raise ValueError("Error: No participants found in knowledge graph.")

# Query for activity plausible ranges
query_ranges = """
PREFIX activity: <http://example.org/activity-recognition#>
SELECT ?activityCode ?min ?max
WHERE {
    ?act a activity:Activity ;
         activity:hasActivityCode ?activityCode ;
         activity:hasMinAcceleration ?min ;
         activity:hasMaxAcceleration ?max .
}
"""
results_ranges = g.query(query_ranges)
# Maps lower-cased activity code -> {'min': ..., 'max': ...} acceleration bounds.
activity_ranges = {}
for row in results_ranges:
    activity_code = str(row.activityCode).strip().lower()
    activity_ranges[activity_code] = {
        'min': float(row.min),
        'max': float(row.max)
    }

if not activity_ranges:
    raise ValueError("Error: No activity ranges found in knowledge graph.")

# Load sensor readings csv file data
try:
    df = pd.read_csv(data_path)
except Exception as e:
    raise RuntimeError(f"Error loading sensor data: {e}") from e

# Parse sensor_id with validation
def parse_sensor_id(sid):
    """Split a sensor id into ``(participant, activity, sensor_type)``.

    The id is split on ``'_'``: field 0 is returned stripped and upper-cased
    as the participant code, fields 3 and 4 are returned unchanged as the
    activity and the sensor type.

    Returns ``(None, None, None)`` when ``sid`` is not a string (for example
    NaN read from an empty CSV cell) or contains fewer than five fields.
    """
    # Missing values arrive as float NaN / None and have no .split().
    if not isinstance(sid, str):
        return None, None, None
    parts = sid.split('_')
    if len(parts) >= 5:
        return parts[0].strip().upper(), parts[3], parts[4]
    return None, None, None

# Split each sensor_id into participant / activity / sensor_type columns.
# Rows whose id could not be parsed carry None in all three and are dropped.
df['participant'], df['activity'], df['sensor_type'] = zip(*df['sensor_id'].apply(parse_sensor_id))
df = df.dropna(subset=['participant'])

# Filter for accelerometer data
# .copy() detaches the filtered frame, so the 'measurement' assignment below
# is a plain column write rather than a write into a slice of the full frame.
acceleration_columns = ['UserAccelerationX', 'UserAccelerationY', 'UserAccelerationZ']
df = df[(df['sensor_type'] == 'Accelerometer') &
        (df['property'].isin(acceleration_columns))].copy()

# Convert measurement to numeric, drop invalid entries
df['measurement'] = pd.to_numeric(df['measurement'], errors='coerce')
df = df.dropna(subset=['measurement'])

# Fatal conditions raise: inside an IPython/Jupyter kernel `exit()` only
# requests a kernel shutdown and does not stop the currently running cell.
if df.empty:
    raise ValueError("Error: No valid accelerometer data after cleaning.")

# Pivot to get X, Y, Z
acc_df = df.pivot_table(index=['timestamp', 'sensor_id', 'participant', 'activity'],
                        columns='property', values='measurement', aggfunc='first').reset_index()

# Drop rows with missing acceleration components. The values are already
# numeric here ('measurement' was coerced before the pivot), so no second
# conversion pass is needed.
acc_df = acc_df.dropna(subset=acceleration_columns)

if acc_df.empty:
    raise ValueError("Error: No valid data after pivot and cleaning.")

# Compute magnitude
acc_df['magnitude'] = np.sqrt(acc_df['UserAccelerationX']**2 +
                              acc_df['UserAccelerationY']**2 +
                              acc_df['UserAccelerationZ']**2)

# Compute global mean and std for magnitude (for traditional Z-score)
global_magnitude_mean = acc_df['magnitude'].mean()
global_magnitude_std = acc_df['magnitude'].std()
print(f"Global magnitude stats: Mean = {global_magnitude_mean:.6f}, Std = {global_magnitude_std:.6f}")

# Compute per-activity stats for reference on cleaned data
activity_stats_df = acc_df.groupby('activity')['magnitude'].agg(['mean', 'std', 'count']).reset_index()
activity_stats_df.columns = ['activity', 'actual_mean', 'actual_std', 'sample_count']

# Ensure sufficient samples and smooth std
# Activities with fewer than 100 samples are excluded; each remaining std is
# floored at 10% of the mean per-activity std. .copy() keeps the assignment
# below from writing into a slice of the unfiltered stats frame.
activity_stats_df = activity_stats_df[activity_stats_df['sample_count'] >= 100].copy()
global_activity_std = activity_stats_df['actual_std'].mean()
activity_stats_df['actual_std'] = activity_stats_df['actual_std'].apply(lambda x: max(x, global_activity_std * 0.1))

# Debug: Print per-activity stats
print("Per-activity magnitude stats:\n", activity_stats_df[['activity', 'actual_mean', 'actual_std', 'sample_count']])

# Merge with participant characteristics for clustering
# Inner join: readings whose participant code is absent from the KG are dropped.
merged_df = pd.merge(acc_df, participants_df, on='participant', how='inner')

if merged_df.empty:
    raise ValueError("Error: No matching participants found after merging.")

# Debug: Print original data size
print("Original data size:", len(merged_df))

# Initialize 5-fold cross-validation
# Rows are shuffled with a fixed seed, so the same split is produced on every run.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# Collects one metrics dict per (fold, noise fraction, method); filled later in the loop.
results_list = []

# Iterate over folds
for fold, (train_index, test_index) in enumerate(kf.split(merged_df), 1):
    print(f"Processing Fold {fold}")
    # Copies, so adding the 'cluster' column below does not write into merged_df.
    train_df = merged_df.iloc[train_index].copy()
    test_df = merged_df.iloc[test_index].copy()
    print(f"Training set size: {len(train_df)}, Test set size: {len(test_df)}")

    # Prepare data for clustering on training set, including magnitude
    # Numeric features are scaled with a RobustScaler fitted on the training rows
    # only; the activity label is one-hot encoded and appended without scaling.
    X_cluster_train = train_df[['age', 'gender', 'height', 'weight', 'magnitude', 'activity']]
    scaler = RobustScaler()
    X_numeric_train = scaler.fit_transform(X_cluster_train[['age', 'gender', 'height', 'weight', 'magnitude']])
    X_activity_train = pd.get_dummies(X_cluster_train['activity'], prefix='activity')
    X_scaled_train = np.hstack([X_numeric_train, X_activity_train])

    # Determine optimal number of clusters using the elbow method on training set
    # wcss[j] holds the inertia of the model with k = j + 2 clusters (k = 2..max_clusters).
    wcss = []
    max_clusters = 11
    for i in range(2, max_clusters + 1):
        kmeans = KMeans(n_clusters=i, random_state=42)
        kmeans.fit(X_scaled_train)
        wcss.append(kmeans.inertia_)

    # Plot the elbow curve for this fold (optional, can be disabled for efficiency)
    # The figure is written to Drive and closed immediately, so nothing is shown inline.
    plt.figure(figsize=(10, 5))
    plt.plot(range(2, max_clusters + 1), wcss, marker='o')
    plt.title(f'Elbow Method for Optimal Number of Clusters - Fold {fold}')
    plt.xlabel('Number of Clusters')
    plt.ylabel('WCSS')
    plt.savefig(f'/content/drive/My Drive/ColabNotebooks/elbow_curve_gnn_further_fold_{fold}.png')
    plt.close()

    # Choose the optimal number of clusters
    # Scans list positions i = 2 .. len(wcss) - 2 (i.e. k = 4 .. 10) and picks the
    # first k whose WCSS drop from k-1 to k is less than half the drop from k to
    # k+1; if no position qualifies the default of 11 is kept.
    # NOTE(review): an elbow is usually the point after which the drop flattens
    # (previous drop much LARGER than the next one) - confirm that the direction
    # of this comparison is intended. The ratio also divides by zero if two
    # consecutive WCSS values are equal.
    optimal_clusters = 11  # Placeholder, adjust based on elbow curve
    for i in range(2, len(wcss) - 1):
        if (wcss[i-1] - wcss[i]) / (wcss[i] - wcss[i+1]) < 0.5:
            optimal_clusters = i + 2
            break
    print(f"Optimal number of clusters for Fold {fold}: {optimal_clusters}")

    # Perform K-means clustering with optimal number of clusters on training set
    kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)
    train_cluster_labels = kmeans.fit_predict(X_scaled_train)

    # Assign clusters to test set using the trained K-means model
    # The test rows reuse the scaler fitted on the training rows (transform only).
    X_cluster_test = test_df[['age', 'gender', 'height', 'weight', 'magnitude', 'activity']]
    X_numeric_test = scaler.transform(X_cluster_test[['age', 'gender', 'height', 'weight', 'magnitude']])
    X_activity_test = pd.get_dummies(X_cluster_test['activity'], prefix='activity')
    # Align columns between train and test activity dummies
    # (activities missing from the test rows become all-zero columns; activities
    # unseen in training are dropped)
    X_activity_test = X_activity_test.reindex(columns=X_activity_train.columns, fill_value=0)
    X_scaled_test = np.hstack([X_numeric_test, X_activity_test])
    test_cluster_labels = kmeans.predict(X_scaled_test)

    # Add cluster labels to train and test dataframes
    train_df['cluster'] = train_cluster_labels
    test_df['cluster'] = test_cluster_labels

    # Combine train and test dataframes for this fold
    # fold_df therefore contains every row of merged_df; anything computed on it
    # covers training and held-out rows alike.
    fold_df = pd.concat([train_df, test_df], axis=0)

    # Assign cluster mean and std for Z-score calculation using K-means clusters
    # Statistics come from the training rows only and are mapped onto all rows
    # through the cluster label. A missing std (NaN) falls back to global_activity_std.
    cluster_means = train_df.groupby('cluster')['magnitude'].mean().to_dict()
    cluster_stds = train_df.groupby('cluster')['magnitude'].std().to_dict()
    fold_df['cluster_mu'] = fold_df['cluster'].map(cluster_means)
    fold_df['cluster_sigma'] = fold_df['cluster'].map(cluster_stds).fillna(global_activity_std)

    # Compute Z-scores using cluster-based statistics
    # A zero sigma is replaced by NaN to avoid dividing by zero; the resulting NaN
    # Z-score never exceeds the threshold, so such rows are never flagged.
    fold_df['Z_score'] = (fold_df['magnitude'] - fold_df['cluster_mu']) / fold_df['cluster_sigma'].replace(0, np.nan)
    fold_df['traditional_Z_score'] = (fold_df['magnitude'] - global_magnitude_mean) / global_magnitude_std

    # Add global mean and std to output
    fold_df['global_magnitude_mean'] = global_magnitude_mean
    fold_df['global_magnitude_std'] = global_magnitude_std

    # Apply activity-specific plausible ranges
    # Activity codes are lower-cased before the lookup in activity_ranges;
    # activities without an entry fall back to the default range [0, 10].
    fold_df['activity'] = fold_df['activity'].str.lower()
    default_min_plausible = 0
    default_max_plausible = 10
    fold_df['min_plausible'] = fold_df['activity'].map(lambda x: activity_ranges.get(x, {'min': default_min_plausible})['min'])
    fold_df['max_plausible'] = fold_df['activity'].map(lambda x: activity_ranges.get(x, {'max': default_max_plausible})['max'])

    # Determine if observations are outside plausible range
    fold_df['outside_plausible'] = (fold_df['magnitude'] < fold_df['min_plausible']) | (fold_df['magnitude'] > fold_df['max_plausible'])

    # Determine anomaly
    # A row is anomalous when its absolute Z-score exceeds `threshold`.
    threshold = 2
    fold_df['is_anomaly'] = np.abs(fold_df['Z_score']) > threshold
    fold_df['is_anomaly_traditional'] = np.abs(fold_df['traditional_Z_score']) > threshold

    # Label noise
    # Noise = statistically anomalous AND outside the plausible range.
    fold_df['is_noise'] = fold_df['is_anomaly'] & fold_df['outside_plausible']
    fold_df['is_noise_traditional'] = fold_df['is_anomaly_traditional'] & fold_df['outside_plausible']

    # Round numerical columns in fold_df to 2 decimal places
    # NOTE(review): the rounding is applied to fold_df itself, so any later
    # computation that reads fold_df works on magnitude / cluster_mu /
    # cluster_sigma rounded to 2 decimals, and a cluster_sigma below 0.005
    # becomes 0.0 - confirm this is intended rather than output formatting only.
    numerical_cols = fold_df.select_dtypes(include=['float64', 'int64']).columns
    fold_df[numerical_cols] = fold_df[numerical_cols].round(2)

    # Function to introduce Generalized Normal noise with adjusted beta
    def introduce_gnn_noise(df, beta=1.5, scale=0.1, noise_fraction=0.05, random_state=None):
        """Return a copy of ``df`` with generalized-normal noise added to a random subset of rows.

        Parameters
        ----------
        df : pandas.DataFrame
            Must contain a numeric ``'magnitude'`` column. It is not modified.
        beta : float
            Shape parameter passed to ``scipy.stats.gennorm``.
        scale : float
            Scale parameter of the zero-centred noise distribution.
        noise_fraction : float
            Probability with which each row is selected for corruption.
        random_state : int or None
            Seed for a private random generator. ``None`` (default) draws from
            NumPy's global generator, exactly as before this parameter existed.

        Returns
        -------
        pandas.DataFrame
            Copy of ``df`` with the perturbed ``'magnitude'`` and a new
            ``'is_true_noise'`` column (1 for selected rows, 0 otherwise).
        """
        # One private generator drives both the row selection and the noise draw
        # when a seed is given; otherwise both fall back to the global generator.
        seeded_rng = None if random_state is None else np.random.RandomState(random_state)
        uniform_source = np.random if seeded_rng is None else seeded_rng
        noisy_df = df.copy()
        mask = uniform_source.rand(len(noisy_df)) < noise_fraction
        noise = gennorm.rvs(beta=beta, loc=0, scale=scale, size=len(noisy_df), random_state=seeded_rng)
        # Multiplying by the boolean mask zeroes the noise on unselected rows.
        noisy_df['magnitude'] = noisy_df['magnitude'] + noise * mask
        noisy_df['is_true_noise'] = mask.astype(int)  # Ground truth: 1 for noisy, 0 for normal
        return noisy_df

    # List of noise fractions to test
    noise_fractions = [0.02, 0.05, 0.10, 0.20]

    # Iterate over noise levels
    for noise_fraction in noise_fractions:
        print(f"Processing noise fraction: {noise_fraction*100}% in Fold {fold}")
        # The noise injection called here draws from NumPy's global random generator.
        # Seed it deterministically per (fold, noise level) so that the injected
        # noise - and therefore every metric below - is identical on each run.
        np.random.seed(42 + 1000 * fold + int(round(noise_fraction * 100)))
        # Introduce noise to the dataset with adjusted beta (scale = 2 x global magnitude std)
        noisy_df = introduce_gnn_noise(fold_df, beta=1.5, scale=2 * global_magnitude_std, noise_fraction=noise_fraction)

        # Apply anomaly detection - Hybrid method
        # Z-scores are recomputed on the noisy magnitude with the per-cluster statistics.
        noisy_df['Z_score'] = (noisy_df['magnitude'] - noisy_df['cluster_mu']) / noisy_df['cluster_sigma'].replace(0, np.nan)
        noisy_df['is_anomaly'] = np.abs(noisy_df['Z_score']) > threshold
        noisy_df['outside_plausible'] = (noisy_df['magnitude'] < noisy_df['min_plausible']) | (noisy_df['magnitude'] > noisy_df['max_plausible'])
        noisy_df['is_noise'] = noisy_df['is_anomaly'] & noisy_df['outside_plausible']

        # Apply anomaly detection - Traditional method
        # Same rule, but the Z-score uses the global mean/std instead of cluster statistics.
        noisy_df['traditional_Z_score'] = (noisy_df['magnitude'] - global_magnitude_mean) / global_magnitude_std
        noisy_df['is_anomaly_traditional'] = np.abs(noisy_df['traditional_Z_score']) > threshold
        noisy_df['is_noise_traditional'] = noisy_df['is_anomaly_traditional'] & noisy_df['outside_plausible']

        # Evaluate performance
        y_true = noisy_df['is_true_noise']
        y_pred_hybrid = noisy_df['is_noise'].astype(int)
        y_pred_traditional = noisy_df['is_noise_traditional'].astype(int)

        # zero_division=0 states explicitly what is reported when a method flags
        # nothing (or nothing was injected) instead of relying on suppressed warnings.
        precision_hybrid = precision_score(y_true, y_pred_hybrid, zero_division=0)
        recall_hybrid = recall_score(y_true, y_pred_hybrid, zero_division=0)
        f1_hybrid = f1_score(y_true, y_pred_hybrid, zero_division=0)

        precision_traditional = precision_score(y_true, y_pred_traditional, zero_division=0)
        recall_traditional = recall_score(y_true, y_pred_traditional, zero_division=0)
        f1_traditional = f1_score(y_true, y_pred_traditional, zero_division=0)

        # Calculate percentage of noise detected
        # (share of injected-noise rows that each method flagged)
        total_noise_points = y_true.sum()
        hybrid_detected = y_pred_hybrid[y_true == 1].sum()
        traditional_detected = y_pred_traditional[y_true == 1].sum()
        hybrid_noise_percentage = (hybrid_detected / total_noise_points * 100) if total_noise_points > 0 else 0
        traditional_noise_percentage = (traditional_detected / total_noise_points * 100) if total_noise_points > 0 else 0

        # Store results for this noise level and fold
        results_list.append({
            'Fold': fold,
            'Algorithm': 'K-means',
            'Noise Fraction (%)': noise_fraction * 100,
            'Method': 'Hybrid (Cluster-based)',
            'Precision': precision_hybrid,
            'Recall': recall_hybrid,
            'F1 Score': f1_hybrid,
            'Noise Detected (%)': hybrid_noise_percentage
        })
        results_list.append({
            'Fold': fold,
            'Algorithm': 'K-means',
            'Noise Fraction (%)': noise_fraction * 100,
            'Method': 'Traditional (Global)',
            'Precision': precision_traditional,
            'Recall': recall_traditional,
            'F1 Score': f1_traditional,
            'Noise Detected (%)': traditional_noise_percentage
        })

# Build the consolidated table: mean of every metric across folds,
# grouped by noise fraction and detection method.
results_df = pd.DataFrame(results_list)
metric_columns = ['Precision', 'Recall', 'F1 Score', 'Noise Detected (%)']
consolidated_means = (
    results_df
    .groupby(['Noise Fraction (%)', 'Method'])[metric_columns]
    .mean()
    .reset_index()
)
print("\nConsolidated Evaluation Results:")
print(consolidated_means)

# Persist the last noisy dataset (last noise level of the last fold) and the
# per-fold metrics; a failed write is reported but does not stop the cell.
try:
    noisy_df.to_csv('/content/drive/My Drive/ColabNotebooks/noisy_results_kmeans_gnn_further.csv', index=False)
    print(f"Noisy dataset saved to '/content/drive/My Drive/ColabNotebooks/noisy_results_kmeans_gnn_further.csv'.")
    results_df.to_csv('/content/drive/My Drive/ColabNotebooks/evaluation_results_kmeans_gnn_further.csv', index=False)
    print(f"Evaluation results saved to '/content/drive/My Drive/ColabNotebooks/evaluation_results_kmeans_gnn_further.csv'.")
except Exception as save_error:
    print(f"Error saving files: {save_error}")

# Debug summary; every value below is whatever the final loop iteration left behind.
zscore_summary = noisy_df[['Z_score', 'traditional_Z_score']].describe()
hybrid_flag_counts = noisy_df['is_noise'].value_counts()
traditional_flag_counts = noisy_df['is_noise_traditional'].value_counts()
print("\nZ-score stats (noisy dataset, last noise level of last fold):\n", zscore_summary)
print("Noise detection summary (hybrid, noisy dataset, last noise level of last fold):\n", hybrid_flag_counts)
print("Noise detection summary (traditional, noisy dataset, last noise level of last fold):\n", traditional_flag_counts)
print(f"Total noise points introduced (last noise level of last fold): {total_noise_points}")
print(f"Hybrid method detected (last noise level of last fold): {hybrid_detected} ({hybrid_noise_percentage:.2f}%)")
print(f"Traditional method detected (last noise level of last fold): {traditional_detected} ({traditional_noise_percentage:.2f}%)")

Global magnitude stats: Mean = 0.753240, Std = 0.638734
Per-activity magnitude stats:
   activity  actual_mean  actual_std  sample_count
0      dws     0.575685    0.426152        131856
1      jog     1.445474    0.905656        134231
2      ups     0.447230    0.337824        157285
3      wlk     0.691151    0.471883        344288
Original data size: 767660
Processing Fold 1
Training set size: 614128, Test set size: 153532
Optimal number of clusters for Fold 1: 7
Processing noise fraction: 2.0% in Fold 1
Processing noise fraction: 5.0% in Fold 1
Processing noise fraction: 10.0% in Fold 1
Processing noise fraction: 20.0% in Fold 1
Processing Fold 2
Training set size: 614128, Test set size: 153532
Optimal number of clusters for Fold 2: 11
Processing noise fraction: 2.0% in Fold 2
Processing noise fraction: 5.0% in Fold 2
Processing noise fraction: 10.0% in Fold 2
Processing noise fraction: 20.0% in Fold 2
Processing Fold 3
Training set size: 614128, Test set size: 153532
Optimal numb

In [None]:
import pandas as pd
import numpy as np
from rdflib import Graph
from sklearn.preprocessing import RobustScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import warnings
from scipy.stats import gennorm
import itertools

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# Paths to sensor reading csv file data and knowledge graph
data_path = "/content/drive/My Drive/ColabNotebooks/motion_sensor_readings.csv"
kg_path = "/content/drive/My Drive/ColabNotebooks/motion_sense_ssn_kg.ttl"

# Load knowledge graph.
# Every fatal condition below raises instead of calling exit(): exit() is not a
# reliable way to stop a notebook cell (in a Jupyter kernel it only requests a
# shutdown), so the remaining statements could keep running on missing data.
g = Graph()
try:
    g.parse(kg_path, format='turtle')
except Exception as e:
    raise RuntimeError(f"Error parsing knowledge graph: {e}") from e

# Query for participant characteristics
query = """
PREFIX ms: <http://example.org/motion-sense#>
SELECT ?p ?code ?age ?gender ?height ?weight
WHERE {
    ?p a ms:Participant ;
       ms:hasCode ?code ;
       ms:hasAge ?age ;
       ms:hasGender ?gender ;
       ms:hasHeightCm ?height ;
       ms:hasWeightKg ?weight .
}
"""
results = g.query(query)
# Participant codes are upper-cased here so they line up with the codes that
# parse_sensor_id extracts from the sensor ids (also upper-cased).
participants = [
    {
        'participant': str(row.code).strip().upper(),
        'age': int(row.age),
        'gender': int(row.gender),
        'height': float(row.height),
        'weight': float(row.weight)
    }
    for row in results
]
participants_df = pd.DataFrame(participants)

if participants_df.empty:
    raise ValueError("Error: No participants found in knowledge graph.")

# Query for activity plausible ranges
query_ranges = """
PREFIX activity: <http://example.org/activity-recognition#>
SELECT ?activityCode ?min ?max
WHERE {
    ?act a activity:Activity ;
         activity:hasActivityCode ?activityCode ;
         activity:hasMinAcceleration ?min ;
         activity:hasMaxAcceleration ?max .
}
"""
results_ranges = g.query(query_ranges)
# Keyed by lower-cased activity code -> {'min': ..., 'max': ...}
activity_ranges = {
    str(row.activityCode).strip().lower(): {
        'min': float(row.min),
        'max': float(row.max)
    }
    for row in results_ranges
}

if not activity_ranges:
    raise ValueError("Error: No activity ranges found in knowledge graph.")

# Load sensor readings csv file data
try:
    df = pd.read_csv(data_path)
except Exception as e:
    raise RuntimeError(f"Error loading sensor data: {e}") from e

# Parse sensor_id with validation
def parse_sensor_id(sid):
    """Split an underscore-delimited sensor id into its parts.

    Args:
        sid: Sensor id string with at least five '_'-separated fields.

    Returns:
        (participant, activity, sensor_type): field 0 stripped and upper-cased,
        field 3 and field 4 unchanged. (None, None, None) when ``sid`` is not a
        string (e.g. NaN from a missing CSV cell) or has fewer than five fields.
    """
    # Missing values arrive as float NaN / None and have no .split().
    if not isinstance(sid, str):
        return None, None, None
    parts = sid.split('_')
    if len(parts) >= 5:
        return parts[0].strip().upper(), parts[3], parts[4]
    return None, None, None

# Split each sensor_id into participant / activity / sensor_type columns.
# Going through a DataFrame (rather than zip(*...)) keeps this working when the
# input has no rows, where tuple unpacking would fail.
parsed_ids = pd.DataFrame(
    df['sensor_id'].map(parse_sensor_id).tolist(),
    index=df.index,
    columns=['participant', 'activity', 'sensor_type'],
)
for id_column in parsed_ids.columns:
    df[id_column] = parsed_ids[id_column]
df = df.dropna(subset=['participant'])

# Filter for accelerometer data
acceleration_axes = ['UserAccelerationX', 'UserAccelerationY', 'UserAccelerationZ']
# .copy() so the column assignment below writes to an independent frame, not a slice.
df = df[(df['sensor_type'] == 'Accelerometer') &
        (df['property'].isin(acceleration_axes))].copy()

# Convert measurement to numeric, drop invalid entries
df['measurement'] = pd.to_numeric(df['measurement'], errors='coerce')
df = df.dropna(subset=['measurement'])

# Raise rather than exit(): exit() is not a reliable way to stop a notebook cell.
if df.empty:
    raise ValueError("Error: No valid accelerometer data after cleaning.")

# Pivot to get X, Y, Z
acc_df = df.pivot_table(index=['timestamp', 'sensor_id', 'participant', 'activity'],
                        columns='property', values='measurement', aggfunc='first').reset_index()

# Drop rows with missing or invalid acceleration components
acc_df = acc_df.dropna(subset=acceleration_axes)
for col in acceleration_axes:
    acc_df[col] = pd.to_numeric(acc_df[col], errors='coerce')
acc_df = acc_df.dropna(subset=acceleration_axes)

if acc_df.empty:
    raise ValueError("Error: No valid data after pivot and cleaning.")

# Compute magnitude
acc_df['magnitude'] = np.sqrt(acc_df['UserAccelerationX']**2 +
                              acc_df['UserAccelerationY']**2 +
                              acc_df['UserAccelerationZ']**2)

# Compute global mean and std for magnitude (for traditional Z-score).
# The message no longer says "after outlier removal": only missing/non-numeric
# rows have been dropped at this point, no outlier filter has been applied.
global_magnitude_mean = acc_df['magnitude'].mean()
global_magnitude_std = acc_df['magnitude'].std()
print(f"Global magnitude stats: Mean = {global_magnitude_mean:.6f}, Std = {global_magnitude_std:.6f}")

# Compute per-activity stats for reference on cleaned data
activity_stats_df = acc_df.groupby('activity')['magnitude'].agg(['mean', 'std', 'count']).reset_index()
activity_stats_df.columns = ['activity', 'actual_mean', 'actual_std', 'sample_count']

# Ensure sufficient samples and smooth std: activities with fewer than 100 samples
# are dropped, and each std is floored at 10% of the mean per-activity std.
activity_stats_df = activity_stats_df[activity_stats_df['sample_count'] >= 100]
global_activity_std = activity_stats_df['actual_std'].mean()
activity_stats_df['actual_std'] = activity_stats_df['actual_std'].apply(lambda x: max(x, global_activity_std * 0.1))

# Debug: Print per-activity stats
print("Per-activity magnitude stats:\n", activity_stats_df[['activity', 'actual_mean', 'actual_std', 'sample_count']])

# Merge with participant characteristics
merged_df = pd.merge(acc_df, participants_df, on='participant', how='inner')

if merged_df.empty:
    raise ValueError("Error: No matching participants found after merging.")

# Debug: Print original data size
print("Original data size:", len(merged_df))

# Split into training and test sets.
# NOTE(review): the cross-validation loop further down splits merged_df itself,
# so test_df is not held out from it — confirm whether that is intended.
train_df, test_df = train_test_split(merged_df, test_size=0.2, random_state=42)
print(f"Training set size: {len(train_df)}, Test set size: {len(test_df)}")

# Candidate feature sets: the magnitude-only baseline first, then magnitude
# combined with every non-empty subset of the semantic features, ordered by
# subset size (itertools.combinations order within each size).
semantic_features = ['age', 'gender', 'height', 'weight', 'activity']
feature_combinations = [['magnitude']]
feature_combinations.extend(
    ['magnitude', *subset]
    for subset_size in range(1, len(semantic_features) + 1)
    for subset in itertools.combinations(semantic_features, subset_size)
)

# Function to preprocess features for clustering
def preprocess_features(df, features, activity_train=None, scaler=None):
    """Build the clustering feature matrix for ``df``.

    Column layout of the result: the scaled numeric features ('magnitude',
    'age', 'height', 'weight', in the order they appear in ``features``), then
    'gender' as stored in ``df``, then the one-hot encoded 'activity' columns.
    Names in ``features`` outside those six are ignored.

    Args:
        df: DataFrame holding a column for every requested feature.
        features: Feature names to include.
        activity_train: One-hot activity frame returned by an earlier call.
            Only its ``columns`` are read, to give ``df`` the same dummy
            columns in the same order (missing ones are filled with 0).
        scaler: An already fitted scaler. When given, it is applied with
            ``transform`` so held-out data is scaled with the training
            statistics. When omitted, a new ``RobustScaler`` is fitted on
            ``df`` (the previous behaviour).

    Returns:
        (X_scaled, scaler, activity_train): the feature matrix, the scaler that
        was used, and the one-hot activity frame to pass to later calls (None
        if 'activity' was not requested and none was supplied).
    """
    numerical_features = [f for f in features if f in ('magnitude', 'age', 'height', 'weight')]
    categorical_features = [f for f in features if f in ('gender', 'activity')]

    fit_new_scaler = scaler is None
    if fit_new_scaler:
        scaler = RobustScaler()

    blocks = []
    if numerical_features:
        numeric_input = df[numerical_features]
        if fit_new_scaler:
            blocks.append(scaler.fit_transform(numeric_input))
        else:
            blocks.append(scaler.transform(numeric_input))

    if 'gender' in categorical_features:
        blocks.append(df['gender'].values.reshape(-1, 1))

    if 'activity' in categorical_features:
        X_activity = pd.get_dummies(df['activity'], prefix='activity')
        if activity_train is not None:
            X_activity = X_activity.reindex(columns=activity_train.columns, fill_value=0)
        else:
            activity_train = X_activity
        # Force float: a reindexed frame can mix bool dummies with integer fill
        # columns, which would otherwise come out as an object-dtype array.
        blocks.append(X_activity.to_numpy(dtype=float))

    if not blocks:
        X_scaled = np.array([])
    elif len(blocks) == 1:
        X_scaled = blocks[0]
    else:
        X_scaled = np.hstack(blocks)

    return X_scaled, scaler, activity_train

# Function to introduce GNN noise
def introduce_gnn_noise(df, beta=1.5, scale=0.1, noise_fraction=0.05, random_state=None):
    """Return a copy of ``df`` with generalized-normal noise added to 'magnitude'.

    Each row is selected independently with probability ``noise_fraction``;
    selected rows get a draw from ``scipy.stats.gennorm`` (shape ``beta``,
    location 0, scale ``scale``) added to their magnitude. ``df`` itself is not
    modified.

    Args:
        df: DataFrame with a numeric 'magnitude' column.
        beta: Shape parameter of the generalized normal distribution.
        scale: Scale parameter of the generalized normal distribution.
        noise_fraction: Per-row probability of receiving noise.
        random_state: Optional int seed or ``numpy.random.Generator`` for
            reproducible noise. When omitted, NumPy's global random state is
            used, exactly as before this parameter existed.

    Returns:
        The noisy copy, with an added 'is_true_noise' column (1 for rows that
        received noise, 0 otherwise).
    """
    noisy_df = df.copy()
    n_rows = len(noisy_df)

    if random_state is None:
        rng = None  # gennorm.rvs then falls back to the global NumPy state too
        selection_draws = np.random.rand(n_rows)
    else:
        rng = np.random.default_rng(random_state)
        selection_draws = rng.random(n_rows)

    mask = selection_draws < noise_fraction
    noise = gennorm.rvs(beta=beta, loc=0, scale=scale, size=n_rows, random_state=rng)
    # Multiplying by the boolean mask zeroes the noise on unselected rows.
    noisy_df['magnitude'] = noisy_df['magnitude'] + noise * mask
    noisy_df['is_true_noise'] = mask.astype(int)
    return noisy_df

# Evaluate each feature combination
results_list = []
noise_fraction = 0.2  # Test at 20% noise level
max_clusters = 11     # Largest k tried in the elbow sweep (also the fallback k)
threshold = 2         # |Z| above this marks a point as anomalous
noise_seed = 42       # Base seed for the injected noise (offset by fold index)

for features in feature_combinations:
    print(f"\nTesting feature combination: {features}")

    # Cross-validation setup
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    cv_f1_scores = []
    cv_precisions = []
    cv_recalls = []
    cv_noise_percentages = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(merged_df)):
        # Fold-local names so the module-level acc_df / train_df are not overwritten.
        fold_train_df = merged_df.iloc[train_idx].copy()
        fold_val_df = merged_df.iloc[val_idx].copy()

        # Preprocess training data
        X_scaled_train, scaler, activity_train = preprocess_features(fold_train_df, features)

        # Elbow method to determine optimal clusters. wcss[i] belongs to k = i + 2.
        # The fitted models are kept so the chosen k does not have to be refit
        # (same data and random_state would give the same model).
        fitted_models = {}
        wcss = []
        for k in range(2, max_clusters + 1):
            candidate = KMeans(n_clusters=k, random_state=42)
            candidate.fit(X_scaled_train)
            fitted_models[k] = candidate
            wcss.append(candidate.inertia_)

        # NOTE(review): this picks the first k whose preceding WCSS drop is less
        # than half of the following drop; that is the heuristic as originally
        # written and is kept unchanged — confirm it is the intended elbow rule.
        optimal_clusters = max_clusters  # Fallback when no k satisfies the rule
        for i in range(2, len(wcss) - 1):
            next_drop = wcss[i] - wcss[i + 1]
            if next_drop == 0:
                continue  # Ratio undefined; avoid dividing by zero
            if (wcss[i - 1] - wcss[i]) / next_drop < 0.5:
                optimal_clusters = i + 2
                break

        # K-means model for the selected k
        kmeans = fitted_models[optimal_clusters]
        train_cluster_labels = kmeans.labels_

        # Assign clusters to validation set.
        # preprocess_features fits a fresh scaler on whatever frame it is given,
        # so the leading numeric columns are overwritten here with the TRAINING
        # scaler's transform; otherwise validation points would be placed in a
        # differently scaled space than the cluster centres.
        X_scaled_val, _, _ = preprocess_features(fold_val_df, features, activity_train)
        numeric_features = [f for f in features if f in ('magnitude', 'age', 'height', 'weight')]
        if numeric_features:
            X_scaled_val = np.array(X_scaled_val, dtype=float)
            X_scaled_val[:, :len(numeric_features)] = scaler.transform(fold_val_df[numeric_features])
        val_cluster_labels = kmeans.predict(X_scaled_val)

        fold_train_df['cluster'] = train_cluster_labels
        fold_val_df['cluster'] = val_cluster_labels

        # Combine for anomaly detection.
        # NOTE(review): metrics below are computed on training + validation rows
        # together, as in the original code — confirm that is intended.
        eval_df = pd.concat([fold_train_df, fold_val_df], axis=0)

        # Compute cluster stats (from training rows only)
        train_cluster_magnitude = fold_train_df.groupby('cluster')['magnitude']
        cluster_means = train_cluster_magnitude.mean().to_dict()
        cluster_stds = train_cluster_magnitude.std().to_dict()
        eval_df['cluster_mu'] = eval_df['cluster'].map(cluster_means)
        eval_df['cluster_sigma'] = eval_df['cluster'].map(cluster_stds).fillna(global_activity_std)

        # NOTE(review): computed from the pre-noise magnitudes and not used for
        # any decision in this cell; kept so noisy_df retains the column.
        eval_df['traditional_Z_score'] = (eval_df['magnitude'] - global_magnitude_mean) / global_magnitude_std

        # Apply activity-specific plausible ranges
        eval_df['activity'] = eval_df['activity'].str.lower()
        default_min_plausible = 0
        default_max_plausible = 10
        eval_df['min_plausible'] = eval_df['activity'].map(lambda x: activity_ranges.get(x, {'min': default_min_plausible})['min'])
        eval_df['max_plausible'] = eval_df['activity'].map(lambda x: activity_ranges.get(x, {'max': default_max_plausible})['max'])

        # Introduce noise. Seeding the global NumPy state per fold makes the run
        # reproducible and gives every feature combination the same noise for a
        # given fold (the fold row order does not depend on the feature set).
        np.random.seed(noise_seed + fold)
        noisy_df = introduce_gnn_noise(eval_df, beta=1.5, scale=2 * global_magnitude_std, noise_fraction=noise_fraction)

        # Apply anomaly detection - Hybrid method:
        # a point is noise only if it is BOTH a cluster-level Z-score outlier
        # and outside the activity's plausible range. A zero sigma becomes NaN,
        # which makes the Z-score NaN and the anomaly test False.
        noisy_df['Z_score'] = (noisy_df['magnitude'] - noisy_df['cluster_mu']) / noisy_df['cluster_sigma'].replace(0, np.nan)
        noisy_df['is_anomaly'] = np.abs(noisy_df['Z_score']) > threshold
        noisy_df['outside_plausible'] = (noisy_df['magnitude'] < noisy_df['min_plausible']) | (noisy_df['magnitude'] > noisy_df['max_plausible'])
        noisy_df['is_noise'] = noisy_df['is_anomaly'] & noisy_df['outside_plausible']

        # Evaluate performance
        y_true = noisy_df['is_true_noise']
        y_pred_hybrid = noisy_df['is_noise'].astype(int)

        precision_hybrid = precision_score(y_true, y_pred_hybrid, zero_division=0)
        recall_hybrid = recall_score(y_true, y_pred_hybrid, zero_division=0)
        f1_hybrid = f1_score(y_true, y_pred_hybrid, zero_division=0)

        total_noise_points = y_true.sum()
        hybrid_detected = y_pred_hybrid[y_true == 1].sum()
        hybrid_noise_percentage = (hybrid_detected / total_noise_points * 100) if total_noise_points > 0 else 0

        # Print results for this fold
        print(f"Fold {fold + 1} Results - Features: {features}")
        print(f"  Precision: {precision_hybrid:.6f}")
        print(f"  Recall: {recall_hybrid:.6f}")
        print(f"  F1 Score: {f1_hybrid:.6f}")
        print(f"  Noise Detected (%): {hybrid_noise_percentage:.6f}")

        cv_f1_scores.append(f1_hybrid)
        cv_precisions.append(precision_hybrid)
        cv_recalls.append(recall_hybrid)
        cv_noise_percentages.append(hybrid_noise_percentage)

    # Average F1-score across folds
    avg_f1_score = np.mean(cv_f1_scores)

    # Store results for this feature combination. Every metric is the mean over
    # the folds, so the columns are comparable with 'Avg F1 Score' (previously
    # the other three came from the last fold only).
    results_list.append({
        'Features': ', '.join(features),
        'Avg F1 Score': avg_f1_score,
        'Precision': np.mean(cv_precisions),
        'Recall': np.mean(cv_recalls),
        'Noise Detected (%)': np.mean(cv_noise_percentages)
    })

# Create results DataFrame, best average F1 first
results_df = pd.DataFrame(results_list).sort_values(by='Avg F1 Score', ascending=False)
print("\nFeature Selection Results (sorted by Avg F1 Score):")
print(results_df)

# Save results. The path is defined once so the confirmation message can never
# drift from the location actually written. Saving is best-effort: a failure is
# reported but does not discard the results already held in results_df.
results_path = '/content/drive/My Drive/ColabNotebooks/feature_selection_results.csv'
try:
    results_df.to_csv(results_path, index=False)
    print(f"Feature selection results saved to '{results_path}'.")
except Exception as e:
    print(f"Error saving files: {e}")

Global magnitude stats (after outlier removal): Mean = 0.753240, Std = 0.638734
Per-activity magnitude stats:
   activity  actual_mean  actual_std  sample_count
0      dws     0.575685    0.426152        131856
1      jog     1.445474    0.905656        134231
2      ups     0.447230    0.337824        157285
3      wlk     0.691151    0.471883        344288
Original data size: 767660
Training set size: 614128, Test set size: 153532

Testing feature combination: ['magnitude']
Fold 1 Results - Features: ['magnitude']
  Precision: 0.995267
  Recall: 0.345343
  F1 Score: 0.512765
  Noise Detected (%): 34.534349
Fold 2 Results - Features: ['magnitude']
  Precision: 0.994994
  Recall: 0.345821
  F1 Score: 0.513255
  Noise Detected (%): 34.582099
Fold 3 Results - Features: ['magnitude']
  Precision: 0.996198
  Recall: 0.344464
  F1 Score: 0.511918
  Noise Detected (%): 34.446406
Fold 4 Results - Features: ['magnitude']
  Precision: 0.994923
  Recall: 0.343369
  F1 Score: 0.510540
  Noise Det