# Data Preprocessing of UWB BLE Raw Data

In [14]:
import pandas as pd
import numpy as np
from scipy import signal
import os
import re
from pykalman import KalmanFilter
import matplotlib.pyplot as plt

# Thai to English translation mapping
# Keys are Thai strings and values are their English replacements.
# translate_thai_to_english() looks text up with dict.get(text, text), so only
# whole-value exact matches are translated; any string that is not a key
# (including one that differs only by surrounding whitespace) is returned unchanged.
thai_to_english_mappings = {
    # Location zones (inExpectedZoneName)
    'โถงกลางชั้น 1': 'Central Hall Floor 1',
    'ล็อบบี้ ชั้น 1': 'Lobby Floor 1',
    'ห้องกระจก ชั้น 1': 'Glass Room Floor 1',
    'ทางเดินชั้น1': 'Corridor Floor 1',
    'ห้องน้ำชายด้านหลัง': 'Male Restroom Back Area',
    'ลิฟท์ด้านหน้า': 'Front Elevator',
    'ทางเดินชั้น3ด้านหน้า': 'Front Corridor Floor 3',
    'ทางเดินชั้น2': 'Corridor Floor 2',
    
    # Group names (group_name)
    'ทีมวิจัยอิเล็กทรอนิกส์สำหรับนวัตกรรมไร้สาย': 'Electronics Research Team for Wireless Innovation',
    'กลุ่มงานประเมินองค์กร': 'Organization Assessment Team'
}

## STAGE 1: Data Loading and Initial Preprocessing

In [15]:
# Matches any character in the Unicode Thai block (U+0E00-U+0E7F).
_THAI_CHAR_RE = re.compile(r'[\u0E00-\u0E7F]')


def _contains_thai(text):
    """Return True when ``text`` is a str containing at least one Thai-block character."""
    return isinstance(text, str) and _THAI_CHAR_RE.search(text) is not None


def _is_text_column(series):
    """Return True for columns that can hold strings (object dtype or a string dtype)."""
    return pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series)


def load_and_preprocess_data(file_paths):
    """
    Load data from multiple CSV files, translate Thai text to English,
    combine the datasets, sort them by timestamp and drop unneeded columns.
    
    Args:
        file_paths (list): List of paths to CSV files. A single path
            (str or os.PathLike) is also accepted and treated as a one-item list.
        
    Returns:
        pd.DataFrame: Combined preprocessed dataframe, ordered by 'timestamp',
            with a 'time_seconds' column holding (timestamp - min timestamp) / 1000.
            The row index is the post-concat index in sorted order (not reset).
    
    Raises:
        ValueError: If no file paths are given.
        KeyError: If the combined data has no 'timestamp' column.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(file_paths, (str, os.PathLike)):
        file_paths = [file_paths]
    file_paths = list(file_paths)
    if not file_paths:
        raise ValueError("file_paths must contain at least one CSV path")
    
    dfs = []
    
    for file_path in file_paths:
        # Load data
        df = pd.read_csv(file_path)
        
        # Add source file column for traceability (dropped again by the column filter below)
        df['source_file'] = os.path.basename(file_path)
        
        # Identify all text columns that contain Thai characters
        thai_cols = [
            col for col in df.columns
            if _is_text_column(df[col]) and df[col].apply(_contains_thai).any()
        ]
        
        if thai_cols:
            print(f"Found Thai text in these columns in {file_path}: {thai_cols}")
        
        # Translate all columns with Thai text; non-string cells (e.g. NaN) pass through
        for col in thai_cols:
            df[col] = df[col].apply(lambda x: translate_thai_to_english(x) if isinstance(x, str) else x)
        
        dfs.append(df)
    
    # Combine all dataframes
    combined_df = pd.concat(dfs, ignore_index=True)
    
    # Sort by timestamp; a stable sort keeps file order for rows with equal timestamps
    combined_df = combined_df.sort_values('timestamp', kind='mergesort')
    
    # Create a time-based feature (timestamp offset from the first sample, divided by 1000)
    min_timestamp = combined_df['timestamp'].min()
    combined_df['time_seconds'] = (combined_df['timestamp'] - min_timestamp) / 1000
    
    # Remove unnecessary columns
    columns_to_keep = [
        'id', 'x', 'y', 'timestamp', 'time_seconds',
        'inExpectedZoneName', 
        'group_name', 'floor', 'inZones'
    ]
    
    # Make sure we only keep columns that actually exist in the dataframe
    columns_to_keep = [col for col in columns_to_keep if col in combined_df.columns]
    
    combined_df = combined_df[columns_to_keep]
    
    # Check if there are any remaining Thai characters in the processed data
    remaining_thai = False
    for col in combined_df.columns:
        if not _is_text_column(combined_df[col]):
            continue
        thai_mask = combined_df[col].apply(_contains_thai)
        if not thai_mask.any():
            continue
        
        remaining_thai = True
        print(f"WARNING: Column '{col}' still contains Thai characters after translation.")
        # Show examples of untranslated Thai text
        thai_examples = combined_df.loc[thai_mask.astype(bool), col].unique()
        print("Examples of untranslated Thai text:")
        for example in thai_examples[:5]:  # Show at most 5 examples
            print(f"  - {example}")
    
    if remaining_thai:
        print("\nTo fix this issue, add these Thai texts to the thai_to_english_mappings dictionary.")
    
    return combined_df

def translate_thai_to_english(text):
    """
    Look up the English equivalent of a Thai string in thai_to_english_mappings.
    
    Args:
        text (str): Text to translate; must match a mapping key exactly
        
    Returns:
        str: The mapped English text, or ``text`` itself when there is no entry
    """
    try:
        return thai_to_english_mappings[text]
    except KeyError:
        # No translation registered: hand the input back untouched.
        return text

## STAGE 2: Outlier Detection and Removal using IQR Method

In [16]:
def remove_outliers_iqr(df, columns=['x', 'y', 'z'], multiplier=1.5):
    """
    Remove outliers using the Interquartile Range (IQR) method
    
    A row is dropped when, for any checked column, its value lies below
    Q1 - multiplier * IQR or above Q3 + multiplier * IQR, with the quartiles
    computed over the whole input column.
    
    Args:
        df (pd.DataFrame): Input dataframe (not modified)
        columns (list): Columns to check for outliers; names missing from
            ``df`` are skipped
        multiplier (float): IQR multiplier for defining outliers
        
    Returns:
        pd.DataFrame: Copy of ``df`` with outlier rows removed; the surviving
            rows keep their original index labels
    """
    n_rows = len(df)
    
    # Nothing to analyse: avoid quantiles on empty data and a 0/0 percentage.
    if n_rows == 0:
        print("Identified 0 outliers out of 0 rows (0.00%)")
        return df.copy()
    
    # Positional mask, so duplicated index labels cannot remove non-outlier rows.
    outlier_mask = np.zeros(n_rows, dtype=bool)
    
    for col in columns:
        if col not in df.columns:
            continue
        
        values = df[col]
        Q1 = values.quantile(0.25)
        Q3 = values.quantile(0.75)
        IQR = Q3 - Q1
        
        lower_bound = Q1 - multiplier * IQR
        upper_bound = Q3 + multiplier * IQR
        
        # NaN compares False on both sides, so missing values are never flagged.
        outlier_mask |= ((values < lower_bound) | (values > upper_bound)).to_numpy()
    
    # Print outlier statistics
    n_outliers = int(outlier_mask.sum())
    print(f"Identified {n_outliers} outliers out of {n_rows} rows ({n_outliers/n_rows*100:.2f}%)")
    
    # Remove outliers
    return df.loc[~outlier_mask].copy()

## STAGE 3: Kalman Filtering

In [17]:
def apply_kalman_filtering(df, columns=['x', 'y'], process_variance=0.01, measurement_variance=1.0):
    """
    Apply Kalman smoothing to position data
    
    Each column is smoothed independently with a two-state (position, velocity)
    model whose transition matrix advances position by one velocity unit per
    row, i.e. it treats consecutive rows as equally spaced.
    NOTE(review): rows are processed as one sequence regardless of the 'id'
    column — confirm this is intended when several tags are mixed.
    
    Args:
        df (pd.DataFrame): Input dataframe (not modified)
        columns (list): Columns to apply Kalman filtering; names missing from
            ``df`` are skipped
        process_variance (float): Scale of the 2x2 identity used as the
            process (transition) noise covariance
        measurement_variance (float): Observation noise covariance
        
    Returns:
        pd.DataFrame: Copy of ``df`` with a '<col>_kalman' column added for
            every processed column
    """
    df_kalman = df.copy()
    
    # Transition matrix (position and velocity)
    transition_matrix = np.array([[1, 1], [0, 1]])
    
    # Observation matrix (we observe only position)
    observation_matrix = np.array([[1, 0]])
    
    # Process noise (uncertainty in our model)
    process_noise = np.eye(2) * process_variance
    
    # For each position dimension, apply Kalman filter
    for col in columns:
        if col not in df.columns:
            continue
        
        # An empty series has no first sample to seed the state with;
        # emit an empty result column instead of failing on .iloc[0].
        if len(df) == 0:
            df_kalman[f'{col}_kalman'] = np.array([], dtype=float)
            continue
        
        # Create Kalman filter, seeded with the first observation and zero velocity
        kf = KalmanFilter(
            initial_state_mean=[df[col].iloc[0], 0],
            initial_state_covariance=np.eye(2),
            transition_matrices=transition_matrix,
            observation_matrices=observation_matrix,
            transition_covariance=process_noise,
            observation_covariance=measurement_variance
        )
        
        # kf.smooth returns (state means, state covariances); only the means are used
        smoothed_state_means, _ = kf.smooth(df[col].values)
        
        # First state component is the position estimate
        df_kalman[f'{col}_kalman'] = smoothed_state_means[:, 0]
    
    return df_kalman

## STAGE 4: Noise Filtering (Median + Low Pass)

In [19]:
def apply_noise_filtering(df, columns=['x', 'y'], window_size=5, cutoff=0.1):
    """
    Apply median and low-pass filters to remove noise
    
    For every processed column a median filter is applied first, and a
    zero-phase 3rd-order Butterworth low-pass filter is then run on the
    median-filtered signal.
    
    Args:
        df (pd.DataFrame): Input dataframe (not modified)
        columns (list): Columns to filter; names missing from ``df`` are skipped
        window_size (int): Window size for median filter (scipy requires an
            odd value)
        cutoff (float): Cutoff frequency for low-pass filter, passed to
            scipy.signal.butter as the normalized critical frequency
        
    Returns:
        pd.DataFrame: Copy of ``df`` with '<col>_median' and '<col>_lowpass'
            columns added for every processed column
    
    Raises:
        ValueError: Propagated from scipy for an invalid ``window_size`` or
            ``cutoff``.
    """
    df_filtered = df.copy()
    
    present = [col for col in columns if col in df.columns]
    if not present:
        return df_filtered
    
    # Design Butterworth filter once; it does not depend on the column.
    b, a = signal.butter(3, cutoff, 'low')
    # filtfilt's default edge padding; it requires more samples than this.
    default_padlen = 3 * max(len(a), len(b))
    
    for col in present:
        values = df[col].values
        n_samples = len(values)
        
        if n_samples == 0:
            # Nothing to filter; still add the output columns for a stable schema.
            df_filtered[f'{col}_median'] = np.array([], dtype=float)
            df_filtered[f'{col}_lowpass'] = np.array([], dtype=float)
            continue
        
        # Apply median filter
        median_filtered = signal.medfilt(values, window_size)
        df_filtered[f'{col}_median'] = median_filtered
        
        # Apply low-pass filter. Short inputs get a reduced pad length so that
        # filtfilt does not reject them; longer inputs use the default padding.
        padlen = min(default_padlen, n_samples - 1)
        low_pass_filtered = signal.filtfilt(b, a, median_filtered, padlen=padlen)
        df_filtered[f'{col}_lowpass'] = low_pass_filtered
    
    return df_filtered

## STAGE 5: Normalization

In [20]:
def normalize_data(df, columns=['x', 'y', 'x_kalman', 'y_kalman', 'x_lowpass', 'y_lowpass']):
    """
    Min-Max scale the requested columns into new '<col>_norm' columns.
    
    Args:
        df (pd.DataFrame): Input dataframe (left untouched)
        columns (list): Columns to normalize; names absent from ``df`` are ignored
        
    Returns:
        pd.DataFrame: Copy of ``df`` with one '<col>_norm' column per scaled column.
            When a column's range is not positive, its values are copied unscaled.
    """
    scaled = df.copy()
    
    for name in (candidate for candidate in columns if candidate in df.columns):
        series = df[name]
        lowest = series.min()
        span = series.max() - lowest
        
        # A non-positive (or NaN) span cannot be divided by: keep the raw values.
        scaled[f'{name}_norm'] = (series - lowest) / span if span > 0 else series
    
    return scaled

## STAGE 6: Feature Engineering

In [21]:
def engineer_features(df):
    """
    Engineer additional features from the data
    
    Depending on which input columns exist, adds:
      * 'x_kalman' and 'y_kalman': dx, dy, distance_from_origin, distance,
        cumulative_distance, direction, direction_degrees
      * additionally 'time_seconds': dt, velocity_x/y, speed,
        acceleration_x/y, acceleration
      * 'timestamp': datetime, hour, minute, second, day_of_week
    All differences are taken between consecutive rows, so the first row of
    every difference-based feature is NaN.
    NOTE(review): differences ignore the 'id' column — confirm rows belong to
    a single track before relying on velocity/distance.
    
    Args:
        df (pd.DataFrame): Input dataframe (not modified)
        
    Returns:
        pd.DataFrame: Copy of ``df`` with engineered features added
    """
    df_features = df.copy()
    
    has_positions = 'x_kalman' in df.columns and 'y_kalman' in df.columns
    has_time = 'time_seconds' in df.columns
    
    # Position change is needed by the distance/direction features below even
    # when no time axis is available, so it must not depend on 'time_seconds'.
    if has_positions:
        df_features['dx'] = df_features['x_kalman'].diff()
        df_features['dy'] = df_features['y_kalman'].diff()
    
    # Calculate velocity between consecutive positions (using Kalman-filtered positions)
    if has_positions and has_time:
        # Calculate time difference
        df_features['dt'] = df_features['time_seconds'].diff()
        
        # Calculate velocity only where dt > 0; other rows stay NaN
        mask = df_features['dt'] > 0
        df_features['velocity_x'] = (df_features['dx'] / df_features['dt']).where(mask)
        df_features['velocity_y'] = (df_features['dy'] / df_features['dt']).where(mask)
        
        # Calculate speed (magnitude of velocity)
        df_features['speed'] = np.sqrt(df_features['velocity_x']**2 + df_features['velocity_y']**2)
        
        # Calculate acceleration
        df_features['acceleration_x'] = df_features['velocity_x'].diff() / df_features['dt']
        df_features['acceleration_y'] = df_features['velocity_y'].diff() / df_features['dt']
        df_features['acceleration'] = np.sqrt(df_features['acceleration_x']**2 + df_features['acceleration_y']**2)
    
    # Calculate distance from origin
    if has_positions:
        df_features['distance_from_origin'] = np.sqrt(df_features['x_kalman']**2 + df_features['y_kalman']**2)
    
    # Create time-based features
    if 'timestamp' in df.columns:
        # Convert timestamp (interpreted as epoch milliseconds) to datetime
        df_features['datetime'] = pd.to_datetime(df_features['timestamp'], unit='ms')
        df_features['hour'] = df_features['datetime'].dt.hour
        df_features['minute'] = df_features['datetime'].dt.minute
        df_features['second'] = df_features['datetime'].dt.second
        df_features['day_of_week'] = df_features['datetime'].dt.dayofweek
    
    # Calculate distance between consecutive points
    if has_positions:
        df_features['distance'] = np.sqrt(df_features['dx']**2 + df_features['dy']**2)
        
        # Calculate cumulative distance traveled (cumsum skips the leading NaN)
        df_features['cumulative_distance'] = df_features['distance'].cumsum()
    
    # Calculate direction/angle of movement (in radians)
    if 'dx' in df_features.columns and 'dy' in df_features.columns:
        df_features['direction'] = np.arctan2(df_features['dy'], df_features['dx'])
        
        # Convert to degrees for easier interpretation
        df_features['direction_degrees'] = df_features['direction'] * 180 / np.pi
    
    return df_features


## STAGE 7: Visualization and Quality Check

In [22]:
def plot_trajectories(df, output_path='trajectory_plot.png'):
    """
    Draw the raw positions together with whichever filtered trajectories are
    present in ``df`` and write the figure to disk.
    
    Args:
        df (pd.DataFrame): Input dataframe with 'x' and 'y' columns; the
            '*_kalman' and '*_lowpass' pairs are drawn only if both exist
        output_path (str): Path to save the plot
    """
    fig, ax = plt.subplots(figsize=(12, 10))
    
    # Raw samples as a translucent scatter
    ax.scatter(df['x'], df['y'], color='blue', alpha=0.3, label='Original')
    
    # Optional overlays: (x column, y column, colour, legend label)
    overlays = (
        ('x_kalman', 'y_kalman', 'red', 'Kalman Filtered'),
        ('x_lowpass', 'y_lowpass', 'green', 'Low-pass Filtered'),
    )
    for x_name, y_name, colour, legend_label in overlays:
        if x_name in df.columns and y_name in df.columns:
            ax.plot(df[x_name], df[y_name], color=colour, linewidth=2, label=legend_label)
    
    ax.set_title('Position Trajectories: Original vs Filtered')
    ax.set_xlabel('X Position')
    ax.set_ylabel('Y Position')
    ax.legend()
    ax.grid(True)
    fig.savefig(output_path)
    plt.close(fig)
    
    print(f"Saved trajectory plot to {output_path}")


## Main Processing Pipeline

In [23]:
def main():
    """
    Run the seven preprocessing stages on the three raw CSV files, save the
    result to 'uwb_preprocessing.csv' and return the final dataframe.
    
    Note: a later cell in this notebook defines another ``main`` which
    replaces this one once that cell has been executed.
    
    Returns:
        pd.DataFrame: Fully processed dataframe with engineered features
    """
    input_csvs = ['3.4.2025.csv', '6.5.2025(1).csv', '6.5.2025(2).csv']
    result_csv = 'uwb_preprocessing.csv'
    
    # STAGE 1: load, translate and combine
    print("STAGE 1: Loading and preprocessing data...")
    combined = load_and_preprocess_data(input_csvs)
    print(f"Combined data shape: {combined.shape}")
    
    # STAGE 2: drop IQR outliers
    print("\nSTAGE 2: Removing outliers...")
    staged = remove_outliers_iqr(combined)
    print(f"Data shape after outlier removal: {staged.shape}")
    
    # STAGES 3-5: each announces itself, then transforms the running frame
    quiet_stages = (
        ("\nSTAGE 3: Applying Kalman filtering...", apply_kalman_filtering),
        ("\nSTAGE 4: Applying noise filtering...", apply_noise_filtering),
        ("\nSTAGE 5: Normalizing data...", normalize_data),
    )
    for banner, transform in quiet_stages:
        print(banner)
        staged = transform(staged)
    
    # STAGE 6: derived features
    print("\nSTAGE 6: Engineering features...")
    featured = engineer_features(staged)
    print(f"Final data shape: {featured.shape}")
    
    # STAGE 7: trajectory plot
    print("\nSTAGE 7: Visualizing data...")
    plot_trajectories(featured)
    
    # Persist and summarise
    featured.to_csv(result_csv, index=False)
    print(f"\nProcessed data saved to {result_csv}")
    
    print("\nSummary of processed data:")
    print(featured.describe())
    
    return featured

if __name__ == "__main__":
    main()

STAGE 1: Loading and preprocessing data...
Found Thai text in these columns in 3.4.2025.csv: ['group_name']
Found Thai text in these columns in 6.5.2025(1).csv: ['group_name']
Found Thai text in these columns in 6.5.2025(2).csv: ['group_name']
Combined data shape: (1453, 9)

STAGE 2: Removing outliers...
Identified 169 outliers out of 1453 rows (11.63%)
Data shape after outlier removal: (1284, 9)

STAGE 3: Applying Kalman filtering...

STAGE 4: Applying noise filtering...

STAGE 5: Normalizing data...

STAGE 6: Engineering features...
Final data shape: (1284, 40)

STAGE 7: Visualizing data...
Saved trajectory plot to trajectory_plot.png

Processed data saved to uwb_preprocessing.csv

Summary of processed data:
                id            x            y     timestamp  time_seconds  \
count  1284.000000  1284.000000  1284.000000  1.284000e+03  1.284000e+03   
mean   5406.158879    27.719597    21.640298  1.745666e+12  2.001728e+06   
min     478.000000     7.701000    16.549000  1.7436

## Run Preprocessing

In [24]:

import os
import time
import re
# from uwb_preprocessing import (
#     load_and_preprocess_data,
#     remove_outliers_iqr,
#     apply_kalman_filtering,
#     apply_noise_filtering,
#     normalize_data,
#     engineer_features,
#     plot_trajectories,
#     translate_thai_to_english
# )

def check_thai_characters_in_data(file_paths):
    """
    Report every distinct value containing Thai characters in the given CSV files
    
    For each file a line is printed per distinct Thai-containing value (per
    column), or a single "No Thai characters found" line when there is none.
    
    Args:
        file_paths (list): List of paths to CSV files
    """
    import pandas as pd
    
    # Unicode Thai block; compiled once instead of once per inspected value.
    thai_pattern = re.compile(r'[\u0E00-\u0E7F]')
    
    for file_path in file_paths:
        print(f"\nChecking for Thai characters in {file_path}...")
        df = pd.read_csv(file_path)
        
        thai_found = False
        
        # Check each column that might contain Thai characters
        for col in df.columns:
            series = df[col]
            # Text may be stored as object dtype or as a dedicated string dtype,
            # depending on the pandas version/configuration.
            if not (pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series)):
                continue
            
            # Check if any distinct value in this column contains Thai characters
            for val in series.dropna().unique():
                if isinstance(val, str) and thai_pattern.search(val):
                    print(f"  Thai characters found in column '{col}': {val}")
                    thai_found = True
        
        if not thai_found:
            print(f"  No Thai characters found in {file_path}")

def validate_translations(output_file):
    """
    Validate that all Thai characters have been translated in the output file
    
    Every distinct non-null value of every text column is inspected, so Thai
    text anywhere in the file is detected regardless of its row position.
    
    Args:
        output_file (str): Path to the output CSV file
    
    Returns:
        bool: True if no Thai characters remain, False if some remain or the
            file does not exist
    """
    import pandas as pd
    
    print(f"\nValidating translations in {output_file}...")
    
    if not os.path.exists(output_file):
        print(f"  Error: Output file {output_file} does not exist")
        return False
    
    df = pd.read_csv(output_file)
    
    # Unicode Thai block
    thai_pattern = re.compile(r'[\u0E00-\u0E7F]')
    columns_with_thai = []
    
    # Check each text column for Thai characters
    for col in df.columns:
        series = df[col]
        # Text may be stored as object dtype or as a dedicated string dtype.
        if not (pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series)):
            continue
        
        # Distinct values keep the scan cheap without skipping any rows.
        if any(isinstance(val, str) and thai_pattern.search(val)
               for val in series.dropna().unique()):
            columns_with_thai.append(col)
    
    if columns_with_thai:
        print(f"  Thai characters found in output file in columns: {columns_with_thai}")
        print("  Translation was incomplete. Please update the translation dictionary.")
        return False
    
    print("  No Thai characters found in output file. Translation successful!")
    return True

def main():
    """
    Run the full preprocessing pipeline, writing every intermediate stage and
    the final result under 'processed_data/', validating the Thai-to-English
    translation after stage 1 and on the final file, and printing a summary.
    
    Returns:
        pd.DataFrame: Final processed dataframe with engineered features
    """
    # Define file paths (update these to match your file locations)
    file_paths = ['3.4.2025.csv', '6.5.2025(1).csv', '6.5.2025(2).csv']
    
    # Everything this run produces lands in one directory
    output_dir = 'processed_data'
    os.makedirs(output_dir, exist_ok=True)
    final_output = os.path.join(output_dir, 'uwb_processed_final.csv')
    plot_output = os.path.join(output_dir, 'trajectory_plot.png')
    
    def save_stage(frame, stage_number, file_name):
        """Write one stage's dataframe to output_dir, report it, return the path."""
        target = os.path.join(output_dir, file_name)
        frame.to_csv(target, index=False)
        print(f"Stage {stage_number} output saved to {target}")
        return target
    
    # Report Thai text present in the raw inputs before anything is changed
    check_thai_characters_in_data(file_paths)
    
    started_at = time.time()
    
    banner_rule = "="*50
    print(banner_rule)
    print("UWB BLE DATA PREPROCESSING PIPELINE")
    print(banner_rule)
    
    # STAGE 1: Load and preprocess data
    print("\nSTAGE 1: Loading and preprocessing data...")
    df = load_and_preprocess_data(file_paths)
    print(f"Combined data shape: {df.shape}")
    stage1_output = save_stage(df, 1, 'stage1_preprocessed.csv')
    
    # Validate translations on what was just written
    if not validate_translations(stage1_output):
        print("Warning: Some Thai characters remain in the data. Consider updating the translation dictionary.")
    
    # STAGE 2: Remove outliers
    print("\nSTAGE 2: Removing outliers...")
    df_clean = remove_outliers_iqr(df)
    print(f"Data shape after outlier removal: {df_clean.shape}")
    save_stage(df_clean, 2, 'stage2_outliers_removed.csv')
    
    # STAGE 3: Apply Kalman filtering
    print("\nSTAGE 3: Applying Kalman filtering...")
    df_kalman = apply_kalman_filtering(df_clean)
    save_stage(df_kalman, 3, 'stage3_kalman_filtered.csv')
    
    # STAGE 4: Apply noise filtering
    print("\nSTAGE 4: Applying noise filtering...")
    df_filtered = apply_noise_filtering(df_kalman)
    save_stage(df_filtered, 4, 'stage4_noise_filtered.csv')
    
    # STAGE 5: Normalize data
    print("\nSTAGE 5: Normalizing data...")
    df_norm = normalize_data(df_filtered)
    save_stage(df_norm, 5, 'stage5_normalized.csv')
    
    # STAGE 6: Engineer features
    print("\nSTAGE 6: Engineering features...")
    df_features = engineer_features(df_norm)
    print(f"Final data shape: {df_features.shape}")
    
    # STAGE 7: Visualize data
    print("\nSTAGE 7: Visualizing data...")
    plot_trajectories(df_features, output_path=plot_output)
    
    # Save final processed data
    df_features.to_csv(final_output, index=False)
    print(f"\nFinal processed data saved to {final_output}")
    
    # Final validation of translations
    if validate_translations(final_output):
        print("All Thai text successfully translated to English in the final output.")
    else:
        print("Warning: Some Thai characters remain in the final output. Please update the translation dictionary.")
    
    # Print processing summary
    processing_time = time.time() - started_at
    print("\nPROCESSING SUMMARY:")
    print(f"Total processing time: {processing_time:.2f} seconds")
    print(f"Input files: {len(file_paths)}")
    print(f"Total rows processed: {len(df)}")
    print(f"Final output rows: {len(df_features)}")
    print(f"Final output columns: {len(df_features.columns)}")
    
    # Print column information
    print("\nFinal dataset columns:")
    for column_name in sorted(df_features.columns):
        print(f"- {column_name}")
    
    return df_features

if __name__ == "__main__":
    main()


Checking for Thai characters in 3.4.2025.csv...
  Thai characters found in column 'group_name': ทีมวิจัยอิเล็กทรอนิกส์สำหรับนวัตกรรมไร้สาย
  Thai characters found in column 'group_name': กลุ่มงานประเมินองค์กร

Checking for Thai characters in 6.5.2025(1).csv...
  Thai characters found in column 'group_name': ทีมวิจัยอิเล็กทรอนิกส์สำหรับนวัตกรรมไร้สาย

Checking for Thai characters in 6.5.2025(2).csv...
  Thai characters found in column 'group_name': ทีมวิจัยอิเล็กทรอนิกส์สำหรับนวัตกรรมไร้สาย
UWB BLE DATA PREPROCESSING PIPELINE

STAGE 1: Loading and preprocessing data...
Found Thai text in these columns in 3.4.2025.csv: ['group_name']
Found Thai text in these columns in 6.5.2025(1).csv: ['group_name']
Found Thai text in these columns in 6.5.2025(2).csv: ['group_name']
Combined data shape: (1453, 9)
Stage 1 output saved to processed_data/stage1_preprocessed.csv

Validating translations in processed_data/stage1_preprocessed.csv...
  No Thai characters found in output file. Translation succ

## Data Analysis

In [25]:
#!/usr/bin/env python3
"""
UWB BLE Data Analysis

This script provides functions for analyzing the processed UWB BLE data,
including visualization, clustering, and statistical analysis.

Usage:
    python data_analysis.py

Requirements:
    - pandas
    - numpy
    - matplotlib
    - seaborn
    - scikit-learn
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from matplotlib.animation import FuncAnimation
import os

def load_processed_data(file_path='processed_data/uwb_processed_final.csv'):
    """
    Read the processed UWB dataset from a CSV file.
    
    Args:
        file_path (str): Location of the processed CSV
        
    Returns:
        pd.DataFrame: Contents of the file as parsed by pandas.read_csv
    """
    processed = pd.read_csv(file_path)
    return processed

def plot_position_heatmap(df, output_path='analysis_output/position_heatmap.png'):
    """
    Create a heatmap of positions
    
    Positions are binned into a 50x50 2-D histogram. The Kalman-filtered
    columns are used when both exist in ``df``; otherwise 'x' / 'y'.
    Rows with a missing coordinate are ignored.
    
    Args:
        df (pd.DataFrame): Input dataframe
        output_path (str): Path to save the plot; its parent directory is
            created if needed
    """
    # Create output directory if it doesn't exist. A bare file name has an
    # empty dirname, which os.makedirs would reject.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    
    # Use the Kalman filtered positions if available
    x_col = 'x_kalman' if 'x_kalman' in df.columns else 'x'
    y_col = 'y_kalman' if 'y_kalman' in df.columns else 'y'
    
    # histogram2d cannot derive bin edges from NaN coordinates
    positions = df[[x_col, y_col]].dropna()
    
    fig = plt.figure(figsize=(12, 10))
    
    # Create heatmap
    heatmap, xedges, yedges = np.histogram2d(positions[x_col], positions[y_col], bins=50)
    extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
    
    # histogram2d indexes as [x_bin, y_bin]; transpose so rows run along y
    plt.imshow(heatmap.T, extent=extent, origin='lower', cmap='viridis')
    plt.colorbar(label='Frequency')
    plt.title('Position Heatmap')
    plt.xlabel('X Position')
    plt.ylabel('Y Position')
    plt.savefig(output_path)
    plt.close(fig)
    
    print(f"Saved position heatmap to {output_path}")

def visualize_movement_over_time(df, output_path='analysis_output/movement_animation.mp4', 
                              max_frames=300, interval=50):
    """
    Create an animation of movement over time
    
    The Kalman-filtered columns are used when present, otherwise 'x' / 'y'.
    The animation is saved with Matplotlib's 'ffmpeg' writer at 30 fps.
    
    Args:
        df (pd.DataFrame): Input dataframe
        output_path (str): Path to save the animation; its parent directory
            is created if needed
        max_frames (int): Maximum number of frames to include; longer inputs
            are subsampled at a regular stride
        interval (int): Interval between frames in milliseconds
    """
    # Create output directory if it doesn't exist. A bare file name has an
    # empty dirname, which os.makedirs would reject.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    
    # Use the Kalman filtered positions if available
    x_col = 'x_kalman' if 'x_kalman' in df.columns else 'x'
    y_col = 'y_kalman' if 'y_kalman' in df.columns else 'y'
    
    # Subsample data if too large. Ceiling division guarantees the stride is
    # large enough that at most max_frames rows remain.
    if len(df) > max_frames:
        step = -(-len(df) // max_frames)
        df_anim = df.iloc[::step].reset_index(drop=True)
    else:
        df_anim = df.reset_index(drop=True)
    
    # Pull coordinates out once; the frame callback only slices these arrays
    xs = df_anim[x_col].to_numpy()
    ys = df_anim[y_col].to_numpy()
    
    # Create figure
    fig, ax = plt.subplots(figsize=(10, 8))
    
    # Set axis limits with a one-unit margin
    ax.set_xlim(df_anim[x_col].min() - 1, df_anim[x_col].max() + 1)
    ax.set_ylim(df_anim[y_col].min() - 1, df_anim[y_col].max() + 1)
    
    # Initialize line (path so far) and point (current position)
    line, = ax.plot([], [], 'r-', linewidth=1.5, alpha=0.7)
    point, = ax.plot([], [], 'bo', markersize=8)
    
    # Set up text for time display
    time_text = ax.text(0.02, 0.95, '', transform=ax.transAxes)
    
    def init():
        line.set_data([], [])
        point.set_data([], [])
        time_text.set_text('')
        return line, point, time_text
    
    def animate(i):
        line.set_data(xs[:i + 1], ys[:i + 1])
        # set_data expects sequences, so the single point is wrapped in lists
        point.set_data([xs[i]], [ys[i]])
        
        # Show time information if available
        if 'datetime' in df_anim.columns:
            time_text.set_text(f'Time: {df_anim["datetime"].iloc[i]}')
        else:
            time_text.set_text(f'Frame: {i}')
            
        return line, point, time_text
    
    # Create animation
    anim = FuncAnimation(fig, animate, init_func=init,
                         frames=len(df_anim), interval=interval, blit=True)
    
    # Save animation
    anim.save(output_path, writer='ffmpeg', fps=30)
    plt.close(fig)
    
    print(f"Saved movement animation to {output_path}")

def identify_stationary_points(df, speed_threshold=0.1, min_duration=5):
    """
    Identify periods when the subject was stationary
    
    A row is "stationary" when its speed is below ``speed_threshold``. Runs of
    at least ``min_duration`` consecutive stationary rows form a stationary
    period. If ``df`` has no 'speed' column, speed is derived from consecutive
    positions (Kalman-filtered columns when both exist, otherwise 'x' / 'y')
    and 'time_seconds'; rows without a positive time step get speed 0.
    
    Args:
        df (pd.DataFrame): Input dataframe (not modified)
        speed_threshold (float): Maximum speed to consider stationary
        min_duration (int): Minimum number of consecutive points to consider a stationary period
        
    Returns:
        tuple[pd.DataFrame, pd.DataFrame]:
            * copy of ``df`` with 'is_stationary', 'stationary_group' and
              'valid_stationary' columns (plus 'speed' if it was derived)
            * one row per stationary period with columns group_id, center_x,
              center_y, start_time, end_time, duration, point_count
    
    Raises:
        KeyError: If 'speed' is absent and the position/time columns needed to
            derive it are missing, or if 'time_seconds' is missing when a
            stationary period has to be summarised.
    """
    df_stat = df.copy()
    
    # Use Kalman filtered coordinates if available
    x_col = 'x_kalman' if 'x_kalman' in df.columns else 'x'
    y_col = 'y_kalman' if 'y_kalman' in df.columns else 'y'
    
    # Tag points with speed below threshold as potentially stationary
    if 'speed' in df.columns:
        df_stat['is_stationary'] = df['speed'] < speed_threshold
    else:
        missing = [col for col in (x_col, y_col, 'time_seconds') if col not in df.columns]
        if missing:
            raise KeyError(f"Cannot derive speed: missing column(s) {missing}")
        
        dt_series = df['time_seconds'].diff()
        # NaN (first row) and non-positive steps compare False and keep speed 0
        moving = (dt_series > 0).to_numpy()
        dx = df[x_col].diff().to_numpy(dtype=float)
        dy = df[y_col].diff().to_numpy(dtype=float)
        dt = dt_series.to_numpy(dtype=float)
        
        speed = np.zeros(len(df))
        speed[moving] = np.sqrt(dx[moving]**2 + dy[moving]**2) / dt[moving]
        
        df_stat['speed'] = speed
        df_stat['is_stationary'] = speed < speed_threshold
    
    # Group consecutive rows sharing the same flag: the id increments at every flip
    df_stat['stationary_group'] = (df_stat['is_stationary'].shift(1) != df_stat['is_stationary']).cumsum()
    
    # Find groups that are stationary and have at least min_duration points
    run_sizes = df_stat.loc[df_stat['is_stationary'], 'stationary_group'].value_counts()
    valid_groups = np.sort(run_sizes[run_sizes >= min_duration].index.to_numpy())
    
    # Identify all points in these valid stationary groups
    df_stat['valid_stationary'] = df_stat['stationary_group'].isin(valid_groups)
    
    # Calculate center and time span of each stationary cluster
    stationary_clusters = []
    
    for group_id in valid_groups:
        group_data = df_stat[df_stat['stationary_group'] == group_id]
        start_time = group_data['time_seconds'].min()
        end_time = group_data['time_seconds'].max()
        
        stationary_clusters.append({
            'group_id': group_id,
            'center_x': group_data[x_col].mean(),
            'center_y': group_data[y_col].mean(),
            'start_time': start_time,
            'end_time': end_time,
            'duration': end_time - start_time,
            'point_count': len(group_data)
        })
    
    # Fixed column list so an empty result still has the expected schema
    stationary_df = pd.DataFrame(
        stationary_clusters,
        columns=['group_id', 'center_x', 'center_y',
                 'start_time', 'end_time', 'duration', 'point_count']
    )
    
    return df_stat, stationary_df

def cluster_locations(df, eps=0.5, min_samples=5):
    """
    Group position samples into spatial clusters with DBSCAN.

    Coordinates are taken from the 'x_kalman' / 'y_kalman' columns when they
    exist in ``df``; otherwise the 'x' / 'y' columns are used.

    Args:
        df (pd.DataFrame): Input dataframe holding the position columns
        eps (float): Neighbourhood radius handed to DBSCAN
        min_samples (int): Minimum neighbourhood size handed to DBSCAN

    Returns:
        pd.DataFrame, pd.DataFrame: A copy of ``df`` with an added 'cluster'
        label column, and a summary with one row per cluster (the noise
        label -1 is left out of the summary)
    """
    columns = df.columns
    x_col = 'x_kalman' if 'x_kalman' in columns else 'x'
    y_col = 'y_kalman' if 'y_kalman' in columns else 'y'

    # Fit DBSCAN directly on the (x, y) coordinate array
    coords = df[[x_col, y_col]].values
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit(coords).labels_

    # Attach the labels to a copy so the caller's frame is left untouched
    df_clustered = df.copy()
    df_clustered['cluster'] = labels

    total_points = len(df)
    has_zone = 'inExpectedZoneName' in columns

    def summarise(label):
        """Build the summary record for a single cluster label."""
        members = df_clustered[df_clustered['cluster'] == label]
        size = len(members)
        record = {
            'cluster_id': label,
            'center_x': members[x_col].mean(),
            'center_y': members[y_col].mean(),
            'point_count': size,
            'percentage': size / total_points * 100
        }

        # Zone columns are only added when the input carries zone names
        if has_zone:
            per_zone = members['inExpectedZoneName'].value_counts()
            if per_zone.empty:
                record['most_common_zone'] = 'Unknown'
                record['zone_percentage'] = 0
            else:
                record['most_common_zone'] = per_zone.idxmax()
                record['zone_percentage'] = per_zone.max() / size * 100

        return record

    # One record per real cluster, in ascending label order; -1 marks noise
    clusters_df = pd.DataFrame(
        [summarise(label) for label in sorted(set(labels)) if label != -1]
    )

    return df_clustered, clusters_df

def visualize_clusters(df_clustered, clusters_df, output_path='analysis_output/clusters.png'):
    """
    Visualize the identified clusters as a scatter plot saved to disk.

    Noise points (cluster label -1) are drawn in light gray and every row of
    ``clusters_df`` gets its own color. Each cluster center is marked with a
    red 'X' and annotated with its most common zone when that column is
    present, otherwise with its cluster id.

    Args:
        df_clustered (pd.DataFrame): Dataframe with a 'cluster' label column
            and position columns ('x_kalman'/'y_kalman' if present, else 'x'/'y')
        clusters_df (pd.DataFrame): One row per cluster with 'cluster_id',
            'center_x', 'center_y' and optionally 'most_common_zone'
        output_path (str): Path to save the plot
    """
    # os.makedirs('') raises, so only create a directory when the path names one
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Use the Kalman filtered positions if available
    x_col = 'x_kalman' if 'x_kalman' in df_clustered.columns else 'x'
    y_col = 'y_kalman' if 'y_kalman' in df_clustered.columns else 'y'

    fig, ax = plt.subplots(figsize=(12, 10))
    try:
        # Plot noise points (cluster = -1)
        noise = df_clustered[df_clustered['cluster'] == -1]
        ax.scatter(noise[x_col], noise[y_col], c='lightgray', label='Noise', alpha=0.3, s=10)

        # plt.get_cmap replaces the deprecated plt.cm.get_cmap; keep at least
        # one color so an empty summary does not request a zero-size colormap
        cmap = plt.get_cmap('tab10', max(len(clusters_df), 1))

        # Records keep each column's own type, so integer cluster ids are not
        # upcast to float the way iterrows() does for all-numeric frames
        for i, cluster in enumerate(clusters_df.to_dict('records')):
            cluster_id = cluster['cluster_id']
            cluster_points = df_clustered[df_clustered['cluster'] == cluster_id]

            ax.scatter(cluster_points[x_col], cluster_points[y_col],
                       c=[cmap(i)], label=f'Cluster {cluster_id}', alpha=0.7, s=30)

            # Mark the cluster centroid
            ax.scatter(cluster['center_x'], cluster['center_y'],
                       marker='X', c='red', s=100, edgecolor='black')

            # Prefer the zone name as annotation when the summary provides it
            if 'most_common_zone' in cluster:
                annotation = f"{cluster['most_common_zone']}"
            else:
                annotation = f"Cluster {cluster_id}"

            ax.annotate(annotation,
                        (cluster['center_x'], cluster['center_y']),
                        xytext=(10, 10),
                        textcoords='offset points',
                        fontsize=10,
                        bbox=dict(boxstyle='round,pad=0.3', fc='yellow', alpha=0.7))

        ax.set_title('Location Clusters')
        ax.set_xlabel('X Position')
        ax.set_ylabel('Y Position')
        ax.grid(True, alpha=0.3)
        # The scatter labels above are only visible through a legend
        ax.legend(loc='best')
        fig.savefig(output_path)
    finally:
        # Release the figure even if plotting or saving fails
        plt.close(fig)

    print(f"Saved cluster visualization to {output_path}")

def analyze_transitions(df_clustered, output_path='analysis_output/transitions.png'):
    """
    Analyze transitions between clusters and save them as a heatmap.

    Consecutive rows of ``df_clustered`` are compared; a transition is counted
    when both rows belong to a (non-noise) cluster and the cluster changes.
    Pairs in which either row is noise (label -1) are not counted, so a move
    that passes through noise points does not register as a transition.

    Args:
        df_clustered (pd.DataFrame): Dataframe with cluster labels in a
            'cluster' column, in the row order to be analyzed
        output_path (str): Path to save the plot
    """
    # os.makedirs('') raises, so only create a directory when the path names one
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Get non-noise clusters
    labels = df_clustered['cluster'].tolist()
    clusters = sorted(set(labels) - {-1})

    if not clusters:
        print("No clusters to analyze transitions between.")
        return

    # Map each cluster label to its row/column in the matrix once, instead of
    # searching the list for every sample
    index_of = {label: position for position, label in enumerate(clusters)}
    n_clusters = len(clusters)
    transition_matrix = np.zeros((n_clusters, n_clusters))

    # Count transitions between consecutive samples
    for prev_label, curr_label in zip(labels, labels[1:]):
        prev_idx = index_of.get(prev_label)
        curr_idx = index_of.get(curr_label)
        if prev_idx is None or curr_idx is None:
            continue  # at least one side is noise
        if prev_idx != curr_idx:
            transition_matrix[prev_idx, curr_idx] += 1

    # Create heatmap
    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        tick_labels = [f'Cluster {c}' for c in clusters]
        sns.heatmap(transition_matrix, annot=True, fmt='g', cmap='viridis',
                    xticklabels=tick_labels,
                    yticklabels=tick_labels,
                    ax=ax)

        ax.set_title('Transition Matrix Between Clusters')
        ax.set_xlabel('To Cluster')
        ax.set_ylabel('From Cluster')
        fig.tight_layout()
        fig.savefig(output_path)
    finally:
        # Release the figure even if plotting or saving fails
        plt.close(fig)

    print(f"Saved transition matrix to {output_path}")

def analyze_speed_distribution(df, output_path='analysis_output/speed_distribution.png'):
    """
    Analyze the distribution of speeds.

    Saves a two-panel figure: a histogram with a KDE overlay on the left and
    a boxplot on the right, both built from the non-null values of the
    'speed' column. Nothing is plotted when that column is missing.

    Args:
        df (pd.DataFrame): Input dataframe
        output_path (str): Path to save the plot
    """
    # os.makedirs('') raises, so only create a directory when the path names one
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    if 'speed' not in df.columns:
        print("Speed column not found in dataframe.")
        return

    # Drop missing values once and share the result between both panels
    speed = df['speed'].dropna()

    fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(12, 6))
    try:
        # Histogram panel
        sns.histplot(speed, kde=True, ax=ax_hist)
        ax_hist.set_title('Speed Distribution')
        ax_hist.set_xlabel('Speed')
        ax_hist.set_ylabel('Frequency')

        # Boxplot panel
        sns.boxplot(y=speed, ax=ax_box)
        ax_box.set_title('Speed Boxplot')
        ax_box.set_ylabel('Speed')

        fig.tight_layout()
        fig.savefig(output_path)
    finally:
        # Release the figure even if plotting or saving fails
        plt.close(fig)

    print(f"Saved speed distribution analysis to {output_path}")

def analyze_by_zone(df, output_path='analysis_output/zone_analysis.png'):
    """
    Analyze data by zone.

    Saves a figure with the number of points per zone and, when a
    'time_seconds' column exists, a second panel with the time span per zone.
    Nothing is plotted when the 'inExpectedZoneName' column is missing.

    Args:
        df (pd.DataFrame): Input dataframe
        output_path (str): Path to save the plot
    """
    # os.makedirs('') raises, so only create a directory when the path names one
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    if 'inExpectedZoneName' not in df.columns:
        print("Zone information not found in dataframe.")
        return

    # Count points per zone (most frequent zone first)
    zone_counts = df['inExpectedZoneName'].value_counts()

    has_time = 'time_seconds' in df.columns
    zone_duration = None
    if has_time:
        # NOTE(review): this is the span between the first and last sample seen
        # in a zone, not accumulated dwell time; repeat visits (or several
        # recordings sharing one frame) would inflate it - confirm intent.
        zone_span = df.groupby('inExpectedZoneName')['time_seconds'].agg(['min', 'max'])
        # Reindex so both panels list the zones in the same order
        zone_duration = (zone_span['max'] - zone_span['min']).reindex(zone_counts.index)

    fig = plt.figure(figsize=(14, 8))
    try:
        # Plot zone counts
        ax_counts = fig.add_subplot(2, 1, 1)
        zone_counts.plot(kind='bar', color='skyblue', ax=ax_counts)
        ax_counts.set_title('Points per Zone')
        ax_counts.set_xlabel('Zone')
        ax_counts.set_ylabel('Count')
        plt.setp(ax_counts.get_xticklabels(), rotation=45, ha='right')

        # Plot zone durations if available
        if has_time:
            ax_time = fig.add_subplot(2, 1, 2)
            zone_duration.plot(kind='bar', color='salmon', ax=ax_time)
            ax_time.set_title('Time Spent per Zone')
            ax_time.set_xlabel('Zone')
            ax_time.set_ylabel('Duration (seconds)')
            plt.setp(ax_time.get_xticklabels(), rotation=45, ha='right')

        fig.tight_layout()
        fig.savefig(output_path)
    finally:
        # Release the figure even if plotting or saving fails
        plt.close(fig)

    print(f"Saved zone analysis to {output_path}")

def dimensionality_reduction(df, output_path='analysis_output/pca_visualization.png'):
    """
    Apply PCA for dimensionality reduction and visualization.

    All numeric columns except 'id', 'timestamp', 'time_seconds', 'floor' and
    'cluster' are standardized and projected onto two principal components.
    The projection is saved as a scatter plot, colored by the 'cluster'
    column when it exists. Nothing is plotted when fewer than three usable
    numeric columns are available.

    Args:
        df (pd.DataFrame): Input dataframe
        output_path (str): Path to save the plot
    """
    # os.makedirs('') raises, so only create a directory when the path names one
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Select numerical features for PCA
    numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    # Exclude specific columns that shouldn't be used for PCA
    exclude_cols = {'id', 'timestamp', 'time_seconds', 'floor', 'cluster'}
    features = [col for col in numerical_cols if col not in exclude_cols]

    if len(features) < 3:
        print("Not enough numerical features for PCA.")
        return

    # Handle NaN values
    X = df[features].fillna(0).values

    # Standardize features. A constant column has zero standard deviation;
    # dividing by it would yield NaN and make PCA fail, so such columns are
    # divided by 1 instead and end up as all zeros after centering.
    col_std = np.std(X, axis=0)
    col_std = np.where(col_std == 0, 1.0, col_std)
    X_std = (X - np.mean(X, axis=0)) / col_std

    # Apply PCA
    pca = PCA(n_components=2)
    principal_components = pca.fit_transform(X_std)

    # Create dataframe with principal components
    pca_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'])

    # Add color information if clusters exist
    if 'cluster' in df.columns:
        pca_df['cluster'] = df['cluster'].values

    explained_variance = pca.explained_variance_ratio_

    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        if 'cluster' in pca_df.columns:
            labels = sorted(set(pca_df['cluster']))
            real_clusters = [label for label in labels if label != -1]

            # plt.get_cmap replaces the deprecated plt.cm.get_cmap; keep at
            # least one color so an all-noise frame still gets a valid map
            cmap = plt.get_cmap('tab10', max(len(real_clusters), 1))

            # Plot noise points first, in a neutral color
            if -1 in labels:
                points = pca_df[pca_df['cluster'] == -1]
                ax.scatter(points['PC1'], points['PC2'], c='lightgray', label='Noise', alpha=0.5, s=10)

            # Color indices are counted over real clusters only, so the noise
            # entry no longer shifts them past the end of the colormap
            for color_idx, cluster in enumerate(real_clusters):
                points = pca_df[pca_df['cluster'] == cluster]
                ax.scatter(points['PC1'], points['PC2'], c=[cmap(color_idx)],
                           label=f'Cluster {cluster}', alpha=0.7)

            ax.legend()
        else:
            # If no clusters, plot all points the same
            ax.scatter(pca_df['PC1'], pca_df['PC2'], alpha=0.7)

        # Add explained variance information
        ax.set_xlabel(f'Principal Component 1 ({explained_variance[0]:.2%} variance)')
        ax.set_ylabel(f'Principal Component 2 ({explained_variance[1]:.2%} variance)')
        ax.set_title('PCA of UWB Data')
        ax.grid(alpha=0.3)
        fig.savefig(output_path)
    finally:
        # Release the figure even if plotting or saving fails
        plt.close(fig)

    print(f"Saved PCA visualization to {output_path}")
    print(f"PCA Explained variance: PC1={explained_variance[0]:.2%}, PC2={explained_variance[1]:.2%}")

def main():
    """
    Main function to run all analyses.

    Loads the processed CSV, then runs the position heatmap, stationary-period
    detection, location clustering, cluster visualization, transition
    analysis, speed analysis, zone analysis and PCA in that order, printing
    progress along the way. Plots whose ``output_path`` parameter is visible
    in this file are written below ``output_dir``.
    """
    # Set paths
    input_file = 'processed_data/uwb_processed_final.csv'
    output_dir = 'analysis_output'

    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    print("="*50)
    print("UWB BLE DATA ANALYSIS")
    print("="*50)

    # Load processed data
    print("\nLoading processed data...")
    df = load_processed_data(input_file)
    print(f"Loaded {len(df)} rows of data")

    # Analyze position distribution
    # NOTE(review): plot_position_heatmap's signature is not visible here, so
    # it is left on its own default output location - confirm it honors
    # output_dir if that value is ever changed.
    print("\nAnalyzing position distribution...")
    plot_position_heatmap(df)

    # Identify stationary points
    print("\nIdentifying stationary periods...")
    df_stationary, stationary_clusters = identify_stationary_points(df)
    print(f"Identified {len(stationary_clusters)} stationary periods")

    # Cluster locations
    print("\nClustering locations...")
    df_clustered, clusters = cluster_locations(df)
    print(f"Identified {len(clusters)} location clusters")

    # Visualize clusters
    print("\nVisualizing clusters...")
    visualize_clusters(df_clustered, clusters,
                       output_path=f"{output_dir}/clusters.png")

    # Analyze transitions
    print("\nAnalyzing transitions between clusters...")
    analyze_transitions(df_clustered,
                        output_path=f"{output_dir}/transitions.png")

    # Analyze speed distribution
    # Presumably the stationary-analysis frame carries a computed 'speed'
    # column when the loaded data has none; fall back to it in that case.
    print("\nAnalyzing speed distribution...")
    speed_source = df if 'speed' in df.columns else df_stationary
    analyze_speed_distribution(speed_source,
                               output_path=f"{output_dir}/speed_distribution.png")

    # Analyze by zone
    print("\nAnalyzing data by zone...")
    analyze_by_zone(df, output_path=f"{output_dir}/zone_analysis.png")

    # Apply dimensionality reduction
    print("\nApplying dimensionality reduction...")
    dimensionality_reduction(df_clustered,
                             output_path=f"{output_dir}/pca_visualization.png")

    print(f"\nAnalysis complete. All outputs saved to the '{output_dir}' directory.")

# Run the full analysis pipeline when this code executes as the main module
# (the captured output below shows it running when the cell is executed).
if __name__ == "__main__":
    main()

UWB BLE DATA ANALYSIS

Loading processed data...
Loaded 1284 rows of data

Analyzing position distribution...
Saved position heatmap to analysis_output/position_heatmap.png

Identifying stationary periods...
Identified 24 stationary periods

Clustering locations...
Identified 11 location clusters

Visualizing clusters...


  cmap = plt.cm.get_cmap('tab10', len(clusters_df))


Saved cluster visualization to analysis_output/clusters.png

Analyzing transitions between clusters...
Saved transition matrix to analysis_output/transitions.png

Analyzing speed distribution...
Saved speed distribution analysis to analysis_output/speed_distribution.png

Analyzing data by zone...
Saved zone analysis to analysis_output/zone_analysis.png

Applying dimensionality reduction...
Saved PCA visualization to analysis_output/pca_visualization.png
PCA Explained variance: PC1=29.39%, PC2=17.46%

Analysis complete. All outputs saved to the 'analysis_output' directory.


  cmap = plt.cm.get_cmap('tab10', len(clusters) if -1 not in clusters else len(clusters)-1)
