# Imports

In [None]:
import os
import gc
import numpy as np
import pandas as pd
import seaborn as sns

from scipy.signal import resample
from collections import Counter

from sklearn.preprocessing import MinMaxScaler

import matplotlib.pyplot as plt

# Functions
## map_labels
This function simplifies the labels: given a label dictionary, it maps each fine-grained annotation to a coarser, less specific label.

In [None]:
def map_labels(data, label_dict, label_mapping):
    """
    Attach a 'label' column to `data` by translating each annotation through a label mapping.

    Args:
        data (pd.DataFrame): Frame with an 'annotation' column holding the keys to translate.
        label_dict (pd.DataFrame or dict): Collection of label mappings. `label_dict[label_mapping]`
                                is reindexed by the annotation values, so its index must hold
                                the annotation keys and its values the target labels.
        label_mapping (str): Name of the mapping inside `label_dict` to apply.

    Returns:
        pd.DataFrame: The same `data` object (modified in place) with the 'label' column
        added or overwritten. Annotations that are absent from the mapping get a missing value.
    """
    lookup = label_dict[label_mapping]

    # Align the lookup with the row order of `data`. to_numpy() drops the annotation
    # index so the assignment below is positional rather than aligned on data's index.
    translated = lookup.reindex(data['annotation'])
    data['label'] = translated.to_numpy()

    return data

## normalize
Normalize the accelerometer data (Min-Max scaling to the range [0, 1]) so that models trained on it generalize better.

In [None]:
def normalize(data, feature_cols):
    """
    Rescales the given feature columns with Min-Max scaling.

    Args:
        data (pd.DataFrame): Frame that holds the columns to rescale.
        feature_cols (list of str): Names of the columns in `data` to rescale.

    Returns:
        pd.DataFrame: The same `data` object that was passed in; the selected columns
        are overwritten in place with their scaled values (no copy is made).

    Notes:
        - A fresh MinMaxScaler is fitted on every call, so the minimum and maximum are
          taken from the `data` passed to that call, independently for each column.
        - With the scaler's default settings every column is mapped onto [0, 1].
    """
    # Fit and apply in one step, then write the result back over the source columns.
    scaled_values = MinMaxScaler().fit_transform(data[feature_cols])
    data[feature_cols] = scaled_values
    return data

## calculate_window_purity

In [None]:
def calculate_window_purity(window_labels):
    """
    Calculates the purity of a label window based on the proportion of the most frequent label.

    Args:
        window_labels (pd.Series or np.ndarray): Labels in a time window.

    Returns:
        float: Purity score between 0 and 1. Higher means purer (more consistent label).
        Missing labels are never counted as a label but still count towards the window
        length, so they lower the score. An empty window, or a window that contains
        only missing labels, scores 0.0.
    """
    total = len(window_labels)
    if total == 0:
        return 0.0

    # value_counts() drops missing values and sorts by frequency (most frequent first).
    counts = pd.Series(window_labels).value_counts()
    if counts.empty:
        # Every label in the window is missing: there is no dominant label.
        return 0.0

    return float(counts.iloc[0] / total)

## extract_windows
A function that splits a recording into fixed-length windows. The sampling frequency of each window can also be changed by resampling (e.g. downsampling from 100 Hz to 64 Hz).

In [None]:
def extract_windows(data, winsize=90, input_frequency=100, output_frequency=64):
    """
    Cuts a recording into consecutive, non-overlapping windows and resamples each one.

    Windows are taken by row position (every `winsize * input_frequency` rows), not by
    timestamp. Trailing rows that do not fill a complete window are dropped, and any
    window that contains a missing value in any column is skipped.

    Args:
        data (pd.DataFrame): Time series DataFrame with 'x', 'y', 'z' (accelerometer data) and 'label'.
        winsize (int): Window size in seconds.
        input_frequency (int): Sampling frequency of input data in Hz.
        output_frequency (int): Desired output frequency in Hz. Each window is resampled
                                to `winsize * output_frequency` samples with
                                `scipy.signal.resample`, so it does not have to divide
                                `input_frequency`.

    Returns:
        X (np.ndarray): Shape (n_windows, winsize * output_frequency, 3), resampled accelerometer windows.
        Y (np.ndarray): Shape (n_windows,), most frequent label per window.
        purities (list of float): Purity of each kept window, as computed by
                                  `calculate_window_purity`, in the same order as X and Y.

    Raises:
        ValueError: If `winsize * input_frequency` is not positive.
    """
    # Window length before and after resampling, in samples
    window_samples = winsize * input_frequency
    output_samples = winsize * output_frequency

    if window_samples <= 0:
        raise ValueError("winsize * input_frequency must be a positive number of samples.")

    X, Y = [], []
    purities = []

    # The range stops early enough that every slice holds exactly `window_samples` rows,
    # so no extra length check is needed inside the loop.
    for start in range(0, len(data) - window_samples + 1, window_samples):
        window = data.iloc[start:start + window_samples]

        # Skip windows with missing values (this also covers unlabeled rows)
        if window.isna().any().any():
            continue

        # Resample the three accelerometer axes along the time axis
        x = resample(window[['x', 'y', 'z']].to_numpy(), output_samples)

        window_labels = window['label']

        X.append(x)
        # mode() returns the most frequent label(s) sorted, so ties resolve to the first one
        Y.append(window_labels.mode().iloc[0])
        purities.append(calculate_window_purity(window_labels))

    # Keep well-defined shapes even when no window survived
    X = np.stack(X) if X else np.empty((0, output_samples, 3))
    Y = np.array(Y) if Y else np.empty((0,))

    return X, Y, purities

# Print input files

In [None]:
# Directory that holds the Capture-24 files (kept in one place instead of repeating the literal)
capture24_dir = '/kaggle/input/capture-24-human-activity-recognition/capture24'

# Dictionary of labels: one row per raw annotation, one column per label mapping
label_dict = pd.read_csv(os.path.join(capture24_dir, 'annotation-label-dictionary.csv'), index_col='annotation', dtype='string')

# Chosen label mapping
label_mapping = "label:Willetts2018"
print(label_dict[label_mapping])
print()

# Print files
print('Content of data/')
print(sorted(os.listdir(capture24_dir)))
print()

# All of the labels.
# Sorted so that the order (and therefore the bar order in the plots) is the same on
# every run; building the list from a set gives an order that can change between runs.
# Missing entries are dropped because they are not labels and cannot be sorted.
all_labels = sorted(label_dict[label_mapping].dropna().unique())
print('All of the possible labels:')
print(all_labels)

# Training the model
## Training parameters

In [None]:
# Path of the dataset
path = '/kaggle/input/capture-24-human-activity-recognition/capture24/'

# The participant files in the dataset (entries named P*.csv), in a stable order
file_list = sorted(f for f in os.listdir(path) if f.startswith('P') and f.endswith('.csv'))

# Data and window parameters
winsize = 90
num_train_files = 100
num_test_files = 51
frequency = 64

# Compare against the files that are actually present rather than a hard-coded count,
# so the train and test selections below can never overlap.
if num_train_files + num_test_files > len(file_list):
    raise ValueError("Number of training and testing files exceeds the total number of files.")

# Choose test files: the last `num_test_files` entries.
# An explicit start index is used because file_list[-0:] would select every file.
test_files = file_list[len(file_list) - num_test_files:]

# Choose train files: the first `num_train_files` entries
train_files = file_list[:num_train_files]

num_train_files = len(train_files)
num_test_files = len(test_files)

print("Number of files used for training:", num_train_files)
print("Number of files used for testing:", num_test_files)

## Training set

In [None]:
Y_train = []

# Loop through the training files
for file_id, file in enumerate(train_files, start=1):
    print(f"Processing file {file_id}")

    # Each recording is stored as <path>/<name>/<name>
    file_path = os.path.join(path, file, file)

    data = pd.read_csv(file_path, index_col='time', parse_dates=['time'],
                       dtype={'x': 'f4', 'y': 'f4', 'z': 'f4', 'annotation': 'string'})

    data = map_labels(data, label_dict, label_mapping)
    data = normalize(data, ['x', 'y', 'z'])

    # Extract windows and labels with the configured window parameters
    # (only the labels and purities are needed here, the signal windows are discarded)
    _, Y_, purities = extract_windows(data, winsize=winsize, output_frequency=frequency)
    Y_train.append(Y_)

    # Print the average purity of the windows
    print(sum(purities) / len(purities) if purities else 0)

    # Release the large per-file frame before loading the next one
    del data
    gc.collect()

# Concatenate the labels (np.concatenate refuses an empty list)
Y_train = np.concatenate(Y_train, axis=0) if Y_train else np.empty((0,))

# Count the labels
label_counts = Counter(Y_train)

# Ensure all labels are present (even with 0)
label_series = pd.Series({label: label_counts.get(label, 0) for label in all_labels})

# Normalize to fractions of the total; skipped when nothing was counted to avoid 0/0
total_windows = label_series.sum()
if total_windows > 0:
    label_series = label_series / total_windows

# Set a clean style
sns.set(style="whitegrid")

# Create figure and axis
plt.figure(figsize=(12, 6))
ax = sns.barplot(x=label_series.index, y=label_series.values, palette='tab10')

# Titles and labels
plt.title("Label Distribution in the Training Set", fontsize=16, weight='bold')
plt.xlabel("Activity Label", fontsize=12)
plt.ylabel("Percentage", fontsize=12)

# Rotate x-axis labels and style
plt.xticks(rotation=30, ha='right', fontsize=11)

# Annotate each bar with its value
for i, v in enumerate(label_series.values):
    ax.text(i, v + 0.01, f"{v:.2f}", ha='center', va='bottom', fontsize=10, weight='bold')

# Leave headroom above the tallest bar for its annotation
plt.ylim(0, max(label_series.values) + 0.1)
plt.tight_layout()
plt.show()

## Testing set

In [None]:
Y_test = []

# Loop through the testing files
for file_id, file in enumerate(test_files, start=1):
    print(f"Processing file {file_id}")

    # Each recording is stored as <path>/<name>/<name>
    file_path = os.path.join(path, file, file)

    data = pd.read_csv(file_path, index_col='time', parse_dates=['time'],
                       dtype={'x': 'f4', 'y': 'f4', 'z': 'f4', 'annotation': 'string'})

    data = map_labels(data, label_dict, label_mapping)
    data = normalize(data, ['x', 'y', 'z'])

    # Extract windows and labels with the configured window parameters
    # (only the labels and purities are needed here, the signal windows are discarded)
    _, Y_, purities = extract_windows(data, winsize=winsize, output_frequency=frequency)
    Y_test.append(Y_)

    # Print the average purity of the windows
    print(sum(purities) / len(purities) if purities else 0)

    # Release the large per-file frame before loading the next one
    del data
    gc.collect()

# Concatenate the labels (np.concatenate refuses an empty list)
Y_test = np.concatenate(Y_test, axis=0) if Y_test else np.empty((0,))

# Count the labels
label_counts = Counter(Y_test)

# Ensure all labels are included and ordered like in all_labels
label_series = pd.Series({label: label_counts.get(label, 0) for label in all_labels})

# Normalize to fractions of the total; skipped when nothing was counted to avoid 0/0
total_windows = label_series.sum()
if total_windows > 0:
    label_series = label_series / total_windows

# Set a clean style
sns.set(style="whitegrid")

# Create figure and axis
plt.figure(figsize=(12, 6))
ax = sns.barplot(x=label_series.index, y=label_series.values, palette='tab10')

# Titles and labels
plt.title("Label Distribution in the Testing Set", fontsize=16, weight='bold')
plt.xlabel("Activity Label", fontsize=12)
plt.ylabel("Percentage", fontsize=12)

# Rotate x-axis labels and style
plt.xticks(rotation=30, ha='right', fontsize=11)

# Annotate each bar with its value
for i, v in enumerate(label_series.values):
    ax.text(i, v + 0.01, f"{v:.2f}", ha='center', va='bottom', fontsize=10, weight='bold')

# Leave headroom above the tallest bar for its annotation
plt.ylim(0, max(label_series.values) + 0.1)
plt.tight_layout()
plt.show()

## Purity per file

In [None]:
file_purities = []

# Loop through the files
for file in file_list:
    # Each recording is stored as <path>/<name>/<name>
    file_path = os.path.join(path, file, file)

    data = pd.read_csv(file_path, index_col='time', parse_dates=['time'],
                       dtype={'x': 'f4', 'y': 'f4', 'z': 'f4', 'annotation': 'string'})

    data = map_labels(data, label_dict, label_mapping)
    data = normalize(data, ['x', 'y', 'z'])

    # Only the per-window purities are needed here; windows and labels are discarded
    _, _, purities = extract_windows(data, winsize=winsize, output_frequency=frequency)
    purity = sum(purities) / len(purities) if purities else 0

    # Append the file name and purity (`file` is already a bare file name)
    file_purities.append((file, purity))

# Create a DataFrame and sort by file name
purity_df = pd.DataFrame(file_purities, columns=["File", "Average Window Purity"])
purity_df = purity_df.sort_values("File")

fig, ax = plt.subplots(figsize=(22, 6))  # Wider figure for breathing room
ax.bar(purity_df["File"], purity_df["Average Window Purity"], color='steelblue', edgecolor='black', linewidth=0.4)

# Titles and labels
ax.set_title("Average Window Purity per File", fontsize=18, weight='bold', pad=15)
ax.set_ylabel("Purity", fontsize=14)
# Report the number of files that were actually processed instead of a fixed count
ax.set_xlabel(f"Files (n={len(purity_df)})", fontsize=14, labelpad=10)
ax.set_ylim(0, 1.05)
ax.grid(axis='y', linestyle='--', alpha=0.6)

# Hide x-axis tick labels (too many files)
ax.set_xticks([])

# Add mean purity reference line
mean_purity = purity_df["Average Window Purity"].mean()
ax.axhline(mean_purity, color='crimson', linestyle='--', linewidth=1.5, label=f'Mean Purity: {mean_purity:.2f}')
ax.legend(fontsize=12, loc='upper right')

plt.tight_layout()
plt.show()

## Visualize acceleration (x, y, z)

In [None]:
# Location of the first training recording (stored as <path>/<name>/<name>)
file_path = os.path.join(os.path.join(path, train_files[0]), train_files[0])

# Load the first file, then label and normalize it like the other cells do
day_data = normalize(
    map_labels(
        pd.read_csv(file_path, index_col='time', parse_dates=['time'],
                    dtype={'x': 'f4', 'y': 'f4', 'z': 'f4', 'annotation': 'string'}),
        label_dict,
        label_mapping,
    ),
    ['x', 'y', 'z'],
)

# One stacked panel per axis, all sharing the same horizontal axis
fig, axs = plt.subplots(3, 1, figsize=(15, 8), sharex=True)

# (column, line colour, y-axis label, panel title) for each panel, top to bottom
panels = [
    ('x', 'blue', "X Acceleration", "X-axis"),
    ('y', 'green', "Y Acceleration", "Y-axis"),
    ('z', 'red', "Z Acceleration", "Z-axis"),
]

for ax, (column, colour, y_label, title) in zip(axs, panels):
    ax.plot(day_data[column], color=colour)
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.grid(True)

# Only the bottom panel carries the shared x-axis label
axs[2].set_xlabel("Timestep")

plt.tight_layout()
plt.show()

## Visualize the labels of a day

In [None]:
valid_labels = day_data['label'].dropna()

# Get unique labels and, for every labeled row, the index of its label in `label_values`
label_values, label_indices = np.unique(valid_labels, return_inverse=True)

# Create a label array with -1 for missing labels.
# A boolean mask places the codes positionally in a single vectorized step; it avoids
# one index lookup per row and also works when timestamps are duplicated.
has_label = day_data['label'].notna().to_numpy()
label_array = np.full(day_data.shape[0], -1)
label_array[has_label] = label_indices

# Thin the series out if there are too many columns to draw
max_columns = 10000
if label_array.shape[0] > max_columns:
    factor = label_array.shape[0] // max_columns
    label_array = label_array[::factor]

# Mask the missing entries so they are left blank; otherwise -1 is clipped to vmin
# and drawn in the colour of the first label.
label_matrix = np.ma.masked_less(label_array.reshape(1, -1), 0)

fig, ax = plt.subplots(figsize=(15, 2.5))
# 'nearest' keeps the categorical codes intact instead of blending neighbouring values
im = ax.imshow(label_matrix, aspect='auto', cmap='tab10', interpolation='nearest',
               vmin=0, vmax=len(label_values) - 1)

ax.set_title("Activity Labels Over Time", fontsize=14)
ax.set_yticks([])
ax.set_xlabel("Timestep")

# One colorbar tick per label, named after the label it stands for
cbar = fig.colorbar(im, ax=ax, orientation='vertical', ticks=np.arange(len(label_values)))
cbar.ax.set_yticklabels(label_values)
cbar.set_label("Activity", fontsize=12)

plt.tight_layout()
plt.show()