### HAR dataset

In [None]:
# wget https://archive.ics.uci.edu/dataset/240/human+activity+recognition+using+smartphones

import os
from src import project_dir
import pandas as pd
import numpy as np

# Directory the HAR extraction reads from; the per-phase folders are resolved beneath it.
dataset_dir = os.path.join(project_dir, *('data', 'HAR', 'UCI HAR Dataset'))

# Output schema: one column per inertial axis, followed by the per-sample tags.
columns = [f'{sensor}_{axis}' for sensor in ('acc', 'gyro') for axis in 'xyz']
columns += ['subject', 'label']


In [28]:
def _stitch_overlapping_windows(windows):
    """Flatten 50%-overlapping windows (one per row) into a single 1-D signal."""
    half = windows.shape[1] // 2
    # Keep the first window whole, then only the non-overlapping tail of every later one.
    return np.concatenate((windows[0], windows[1:, half:].ravel()))


def _expand_window_tags(tags, window_len):
    """Repeat one tag per window so it lines up sample-for-sample with the stitched signal."""
    tail_len = window_len - window_len // 2
    return np.concatenate((np.repeat(tags[:1], window_len), np.repeat(tags[1:], tail_len)))


def extract_har_data(phase='train', base_dir=None):
    """Rebuild per-sample HAR signals from the windowed 'Inertial Signals' files.

    Each signal file holds one window per row. Consecutive windows are treated
    as overlapping by 50%, so the first window is kept whole and only the second
    half of every following window is appended.

    NOTE(review): windows are stitched across the whole file, including places
    where the subject changes; the 50% overlap presumably only holds inside one
    continuous recording -- confirm against the dataset description.

    Parameters
    ----------
    phase : str
        Either 'train' or 'test'; selects the sub-folder and the file suffix.
    base_dir : str, optional
        Directory that contains the 'train' and 'test' folders. Defaults to the
        notebook-level ``dataset_dir``. Later cells of this notebook rebind that
        name, so pass ``base_dir`` explicitly when calling out of order.

    Returns
    -------
    pandas.DataFrame
        Columns acc_x, acc_y, acc_z, gyro_x, gyro_y, gyro_z, subject, label,
        one row per reconstructed sample.

    Raises
    ------
    AssertionError
        If ``phase`` is not 'train' or 'test'.
    FileNotFoundError
        If one of the six signal files is missing.
    ValueError
        If a signal file is empty or its window count does not match the
        subject/label files.
    """
    assert phase in ['train', 'test'], "Phase must be either 'train' or 'test'."

    # The schema lives inside the function so that rebinding a notebook-level
    # `columns` variable elsewhere cannot change the result.
    signal_columns = ['acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z']
    file_stems = ['body_acc_x', 'body_acc_y', 'body_acc_z',
                  'body_gyro_x', 'body_gyro_y', 'body_gyro_z']

    if base_dir is None:
        base_dir = dataset_dir
    phase_dir = os.path.join(base_dir, phase)

    # atleast_1d: np.loadtxt returns a 0-d array for a single-line file.
    subjects = np.atleast_1d(np.loadtxt(os.path.join(phase_dir, f'subject_{phase}.txt'), dtype=int))
    labels = np.atleast_1d(np.loadtxt(os.path.join(phase_dir, f'y_{phase}.txt'), dtype=int))
    if subjects.shape != labels.shape:
        raise ValueError(
            f"subject_{phase}.txt has {subjects.shape[0]} rows but y_{phase}.txt has {labels.shape[0]}."
        )

    series = {}
    for column_name, stem in zip(signal_columns, file_stems):
        x_file_path = os.path.join(phase_dir, 'Inertial Signals', f'{stem}_{phase}.txt')
        if not os.path.exists(x_file_path):
            raise FileNotFoundError(f"File {x_file_path} does not exist.")

        # ndmin=2 keeps a one-window file as shape (1, window_len) instead of 1-D.
        data = np.loadtxt(x_file_path, ndmin=2)
        if data.size == 0:
            raise ValueError(f"File {x_file_path} contains no samples.")
        if data.shape[0] != subjects.shape[0]:
            raise ValueError(
                f"File {x_file_path} has {data.shape[0]} windows but {subjects.shape[0]} subject/label rows."
            )

        series[column_name] = _stitch_overlapping_windows(data)

        if 'subject' not in series:
            # Tags are expanded once, using the window length of the first signal file.
            series['subject'] = _expand_window_tags(subjects, data.shape[1])
            series['label'] = _expand_window_tags(labels, data.shape[1])

    # Fix the column order: signals first, then the tags.
    return pd.DataFrame(series)[signal_columns + ['subject', 'label']]

In [30]:
# Run the same extraction over both phases of the dataset.
train_df, test_df = (extract_har_data(phase=split) for split in ('train', 'test'))

# Stack train on top of test under a fresh 0..n-1 index and persist as a single CSV.
df = pd.concat((train_df, test_df), axis=0, ignore_index=True)
df.to_csv(os.path.join(project_dir, 'data', 'HAR.csv'), index=False)

### WISDM dataset

In [None]:
# wget https://archive.ics.uci.edu/dataset/507/wisdm+smartphone+and+smartwatch+activity+and+biometrics+dataset

import os
import pandas as pd
import numpy
import re
from src import project_dir

dataset_dir = os.path.join(project_dir, 'data', 'wisdm', 'wisdm-dataset')

individuals = list(range(1600, 1651))  # 1600 to 1650 inclusive
devices = ['phone', 'watch']
sensors = ['accel', 'gyro']

raw_columns = ['subject', 'activity', 'timestamp', 'x', 'y', 'z']

# The four streams of one individual differ in length (e.g. 64311 vs 65462 rows
# for individual 1600), so pairing them by raw row position lets rows drift
# across activity boundaries and attaches the wrong label to later streams.
# Instead, rows are paired by activity code plus their position inside that
# activity; only positions present in every stream survive the inner merges.
align_keys = ['activity', 'seq']

indiv_frames = []
for indiv in individuals:
    indiv_df = None

    for device in devices:
        for sensor in sensors:
            datadir = os.path.join(dataset_dir, 'raw', device, sensor)
            datafile = os.path.join(datadir, f'data_{indiv}_{sensor}_{device}.txt')
            if not os.path.isfile(datafile):
                raise FileNotFoundError(f"Data file {datafile} does not exist.")

            df = (
                pd.read_csv(datafile, header=None, names=raw_columns)
                # The last field carries a trailing ';' -- strip it and convert to a
                # real number so z ends up numeric like x and y rather than a string.
                .assign(z=lambda d: pd.to_numeric(d['z'].astype(str).str.rstrip(';'), errors='coerce'))
                .dropna(subset=['activity', 'x', 'y', 'z'])
                # Running sample index within each activity of this stream.
                .assign(seq=lambda d: d.groupby('activity').cumcount())
            )

            axis_names = {axis: f'{device}_{sensor}_{axis}' for axis in ('x', 'y', 'z')}
            stream = df[align_keys + list(axis_names)].rename(columns=axis_names)

            if indiv_df is None:
                # The first stream also supplies the subject id.
                indiv_df = stream.assign(subject=df['subject'])
            else:
                # Inner merge keeps the left (first stream) row order.
                indiv_df = indiv_df.merge(stream, on=align_keys, how='inner', validate='one_to_one')

    indiv_df = (
        indiv_df
        # Activity letter -> integer: 'A' becomes 0, 'B' becomes 1, ...
        .assign(label=lambda d: d['activity'].apply(lambda x: ord(x.upper()) - ord('A')))
        .drop(columns=align_keys)
        # remove any remaining NaN rows
        .dropna()
    )
    indiv_frames.append(indiv_df)

# Concatenate once instead of re-copying the growing frame on every iteration.
alldf = pd.concat(indiv_frames, axis=0, ignore_index=True)

# Sensor columns first, then the tags.
columns = [col for col in alldf.columns if col not in ['subject', 'label']] + ['subject', 'label']
alldf = alldf[columns]
alldf.to_csv(os.path.join(project_dir, 'data', 'wisdm.csv'), index=False)


Processing indiv 1600 device phone sensor accel...
Data shape: (64311, 6)
Processing indiv 1600 device phone sensor gyro...
Data shape: (64247, 6)
Processing indiv 1600 device watch sensor accel...
Data shape: (65462, 6)
Processing indiv 1600 device watch sensor gyro...
Data shape: (65435, 6)
Processing indiv 1601 device phone sensor accel...
Data shape: (81457, 6)
Processing indiv 1601 device phone sensor gyro...
Data shape: (81193, 6)
Processing indiv 1601 device watch sensor accel...
Data shape: (64840, 6)
Processing indiv 1601 device watch sensor gyro...
Data shape: (64829, 6)
Processing indiv 1602 device phone sensor accel...
Data shape: (84890, 6)
Processing indiv 1602 device phone sensor gyro...
Data shape: (64286, 6)
Processing indiv 1602 device watch sensor accel...
Data shape: (64985, 6)
Processing indiv 1602 device watch sensor gyro...
Data shape: (64950, 6)
Processing indiv 1603 device phone sensor accel...
Data shape: (81841, 6)
Processing indiv 1603 device phone sensor gy

### HARTH dataset

In [None]:
# wget https://archive.ics.uci.edu/static/public/779/harth.zip

import os
import pandas as pd
from src import project_dir

datadir = os.path.join(project_dir, 'data', 'harth')

# sorted() makes the read order independent of the filesystem's listing order.
csv_files = sorted(name for name in os.listdir(datadir) if name.endswith('.csv'))
if not csv_files:
    raise FileNotFoundError(f"No .csv files found in {datadir}.")

subject_frames = []
for csv_file in csv_files:
    csv_path = os.path.join(datadir, csv_file)
    df = pd.read_csv(csv_path)

    # The file name without its extension is used as the subject identifier.
    subject = os.path.splitext(csv_file)[0]

    # Add a subject column to the dataframe
    df['subject'] = subject
    subject_frames.append(df)

# Concatenate once; growing a frame with pd.concat inside the loop re-copies
# all previously loaded rows on every iteration.
dataset = pd.concat(subject_frames, ignore_index=True)

dataset = dataset.drop(columns=['Unnamed: 0', 'index'], errors='ignore')  # Remove unwanted columns
dataset = dataset.sort_values(by=['subject', 'timestamp'])
dataset.to_csv(os.path.join(project_dir, 'data', 'harth.csv'), index=False)


### Coswara dataset

In [None]:
# Coswara is handled in a separate notebook: see src/misc/extract_coswara.ipynb

### DaphNET dataset

In [5]:
# https://ieeexplore.ieee.org/document/6240371
# wget https://archive.ics.uci.edu/static/public/245/daphnet+freezing+of+gait.zip

import os
import pandas as pd
from src import project_dir

dataset_dir = os.path.join(project_dir, 'data', 'dataset_fog_release', 'dataset')


# Each file comprises the data in a matrix format, with one line per sample, and one column per channel. The channels are as follows:
# Time of sample in millisecond
# Ankle (shank) acceleration - horizontal forward acceleration [mg]
# Ankle (shank) acceleration - vertical [mg]
# Ankle (shank) acceleration - horizontal lateral [mg]
# Upper leg (thigh) acceleration - horizontal forward acceleration [mg]
# Upper leg (thigh) acceleration - vertical [mg]
# Upper leg (thigh) acceleration - horizontal lateral [mg]
# Trunk acceleration - horizontal forward acceleration [mg]
# Trunk acceleration - vertical [mg]
# Trunk acceleration - horizontal lateral [mg]
# Annotations (see Annotations section)

column_names = [
    'time', 'ankle_acc_x', 'ankle_acc_y', 'ankle_acc_z',
    'thigh_acc_x', 'thigh_acc_y', 'thigh_acc_z',
    'trunk_acc_x', 'trunk_acc_y', 'trunk_acc_z', 'annotation',
    'individual'
]

# Channels stored in the raw files: everything except the 'individual' tag added per file.
raw_channels = column_names[:-1]

# Time and accelerations are kept as floats and the annotation as an integer.
channel_dtypes = {name: float for name in raw_channels}
channel_dtypes['annotation'] = int

recordings = []
# sorted() makes the output row order independent of the filesystem's listing order.
for txt_file in sorted(os.listdir(dataset_dir)):
    if not txt_file.endswith('.txt'):
        print(f"Skipping non-text file: {txt_file}")
        continue

    file_path = os.path.join(dataset_dir, txt_file)

    # Split on any run of whitespace so repeated spaces or tabs cannot produce empty
    # fields. index_col=False stops pandas from promoting a column to the index.
    recording = pd.read_csv(
        file_path, sep=r'\s+', header=None, names=raw_channels, index_col=False
    ).astype(channel_dtypes)

    # The file name without its extension identifies the recording.
    recording['individual'] = os.path.splitext(txt_file)[0]
    recordings.append(recording)

if not recordings:
    raise FileNotFoundError(f"No .txt files found in {dataset_dir}.")

dataset = pd.concat(recordings, ignore_index=True)
dataset = dataset[column_names]  # Reorder columns
dataset.to_csv(os.path.join(project_dir, 'data', 'daphnet.csv'), index=False)

In [7]:
# Inspect the column labels of the DaphNET frame assembled in the previous cell.
dataset.columns

Index(['time', 'ankle_acc_x', 'ankle_acc_y', 'ankle_acc_z', 'thigh_acc_x',
       'thigh_acc_y', 'thigh_acc_z', 'trunk_acc_x', 'trunk_acc_y',
       'trunk_acc_z', 'annotation', 'individual'],
      dtype='object')