In [1]:
import os
from collections import Counter

import numpy as np
import pandas as pd
from scipy.stats import skew, kurtosis

# Root folder for the raw participant CSVs and the generated outputs.
# Paths are built from it by plain string concatenation, so the trailing
# slash is required.
datadir = "../Collected Data/"

def read_file(num):
    """Load one participant's labeled CSV and normalize its ``Label`` column.

    Parameters
    ----------
    num : str or int
        Participant identifier exactly as it appears in the file name
        (the notebook passes zero-padded strings such as ``'01'``).

    Returns
    -------
    pandas.DataFrame
        The file's contents without the ``Time``/``Start``/``End`` columns
        (when present). Labels are lower-cased, except the "no label"
        marker, which is always spelled ``'Null'``.
    """
    dataset = pd.read_csv(
        datadir + 'Collected Updated Labeled Data - Phase 01/participant ' + str(num) + '.csv'
    )

    # errors='ignore' skips whichever of these columns the file does not have.
    dataset = dataset.drop(columns=['Time', 'Start', 'End'], errors='ignore')

    def _normalize_label(value):
        # With pandas' default NA parsing, a bare `null`/`NULL` cell (and an
        # empty one) arrives here as NaN, which has no .lower().
        # NOTE(review): assumes an empty Label cell also means "no label" - confirm.
        if pd.isna(value):
            return 'Null'
        lowered = str(value).lower()
        # Any casing / surrounding whitespace of "null" collapses to 'Null';
        # every other label is only lower-cased.
        return 'Null' if lowered.strip() == 'null' else lowered

    # Normalization is applied to the Label column only, so other columns
    # are never rewritten.
    dataset['Label'] = dataset['Label'].apply(_normalize_label)
    return dataset

In [2]:
def calculate_slope(data):
    """Return the slope of a least-squares line fitted through ``data``.

    Infinite and missing values are discarded first. The remaining values
    are fitted against their positions ``0, 1, 2, ...`` (not against any
    timestamp). With fewer than two usable values the slope is ``0``.
    """
    usable = data.replace([np.inf, -np.inf], np.nan).dropna()
    n_points = len(usable)
    if n_points <= 1:
        return 0
    positions = np.arange(n_points)
    # polyfit returns coefficients highest degree first; [0] is the slope.
    return np.polyfit(positions, usable, 1)[0]

In [3]:
def summarize_interval(group):
    """Summarize one window of sensor rows into a single feature row.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one window. Reads the columns ``Timestamp``, ``Label``,
        ``X``, ``Y``, ``Z``, ``Magnitude`` and ``Pressure``.

    Returns
    -------
    pandas.Series or None
        The feature row, labeled with the window's most frequent label.
        ``None`` when the window is rejected: the most frequent label
        covers 79% of the rows or less, or the span between the first and
        last timestamp is 7.8 or less.
    """
    timestamps = group['Timestamp']
    labels = group['Label']
    duration = timestamps.max() - timestamps.min()

    # The most frequent label represents the window; on ties mode() lists
    # the candidates in sorted order and the first one is taken.
    dominant_label = labels.mode().iloc[0]
    dominant_share = (Counter(labels)[dominant_label] / len(labels)) * 100

    # Reject windows that are too mixed or too short.
    if not (dominant_share > 79 and duration > 7.8):
        return None

    features = {}

    # Same five statistics for each acceleration axis and for the magnitude.
    # NOTE(review): np.std defaults to ddof=0 whereas Series.var defaults to
    # ddof=1, so std_* is not sqrt(var_*); kept exactly as in the pipeline.
    signal_columns = (
        ('X', 'accX'),
        ('Y', 'accY'),
        ('Z', 'accZ'),
        ('Magnitude', 'magnitude'),
    )
    for column, suffix in signal_columns:
        values = group[column]
        features['avg_' + suffix] = values.mean()
        features['min_' + suffix] = values.min()
        features['max_' + suffix] = values.max()
        features['var_' + suffix] = values.var()
        features['std_' + suffix] = np.std(values)

    # Pressure gets spread, trend and shape descriptors instead of avg/min/max.
    pressure = group['Pressure']
    features['var_pressure'] = pressure.var()
    features['range_pressure'] = (pressure.max() - pressure.min())
    features['std_pressure'] = np.std(pressure)
    features['slope_pressure'] = calculate_slope(pressure)
    features['kurtosis_pressure'] = kurtosis(pressure)
    features['skew_pressure'] = skew(pressure)

    features['Label'] = dominant_label
    return pd.Series(features)

In [4]:
def preprocessing(num, dataset):
    """Turn one participant's rows into per-window features and save them.

    Parameters
    ----------
    num : int or str
        Participant identifier, used only to name the output file.
    dataset : pandas.DataFrame
        Rows for one participant; must contain a ``Timestamp`` column plus
        the columns read by ``summarize_interval``.

    Writes ``preprocessed/preprocessed_data<num>.csv`` under ``datadir``
    and returns nothing.
    """
    # Rows are bucketed into consecutive 8-second windows by integer-dividing
    # the timestamp; each window is reduced to one feature row.
    interval_seconds = 8
    window_ids = dataset['Timestamp'] // interval_seconds
    result_df = dataset.groupby(window_ids).apply(summarize_interval)

    # Discard the window ids and use a plain 0..n-1 index instead.
    result_df = result_df.reset_index(drop=True)
    # Drop rows that contain any NaN feature.
    result_df = result_df.dropna(how='any')

    # Create the output folder on first use so a fresh checkout does not
    # fail inside to_csv.
    out_dir = datadir + 'preprocessed/'
    os.makedirs(out_dir, exist_ok=True)
    result_df.to_csv(out_dir + 'preprocessed_data' + str(num) + '.csv', index=False)

In [5]:
import glob

# Preprocess participants 1-20; the raw file names use two-digit,
# zero-padded ids ('01' ... '20').
for i in range(1, 21):
    preprocessing(i, read_file(f"{i:02d}"))


In [6]:
import glob
import re

def merge_files(test_participant, datadir):
    """Build the training CSV for one held-out participant.

    Concatenates every ``*.csv`` directly inside ``datadir + 'preprocessed/'``
    except ``preprocessed_data<test_participant>.csv`` and writes the result
    to ``preprocessed/traindata/preprocessed_traindata<test_participant>.csv``.

    Parameters
    ----------
    test_participant : int or str
        Identifier of the participant whose file is left out.
    datadir : str
        Data root, ending with a path separator.

    Raises
    ------
    ValueError
        If no file remains to merge after the held-out one is excluded.
    """
    print(test_participant)

    preprocessed_dir = datadir + 'preprocessed/'
    held_out_name = 'preprocessed_data' + str(test_participant) + '.csv'

    # Sorted so the row order of the merged file does not depend on the
    # order in which the filesystem lists the directory.
    csv_files = sorted(glob.glob(preprocessed_dir + '*.csv'))

    # Compare base names, not full paths: glob joins the file name with the
    # platform separator, so matching on a hard-coded '\' or '/' would skip
    # the exclusion on the other platform and leak the held-out rows.
    training_files = [
        path for path in csv_files
        if os.path.basename(path) != held_out_name
    ]
    if not training_files:
        raise ValueError(
            'No preprocessed CSV files to merge in ' + preprocessed_dir
        )

    # Stack all remaining participants row-wise with a fresh 0..n-1 index.
    merged_df = pd.concat(
        [pd.read_csv(path) for path in training_files],
        ignore_index=True,
    )

    out_dir = preprocessed_dir + 'traindata/'
    os.makedirs(out_dir, exist_ok=True)
    merged_df.to_csv(
        out_dir + 'preprocessed_traindata' + str(test_participant) + '.csv',
        index=False,
    )


In [7]:
# Run merge_files for participants 1-20, writing one training CSV each.
for i in range(1, 21):
    merge_files(i, datadir)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
