In [1]:
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis

datadir = '../Collected Data/'
test_participant = 20

def read_file(num):
    """Load one participant's labelled CSV and strip bookkeeping columns.

    Parameters
    ----------
    num : str
        Participant identifier exactly as it appears in the file name
        ('participant <num>.csv'); it is concatenated into the path as-is,
        so any zero-padding must already be applied by the caller.

    Returns
    -------
    pandas.DataFrame
        The file contents without the 'Time', 'Start' and 'End' columns
        (each is removed only if it is present).
    """
    csv_path = datadir+'Collected Labeled Data - Phase 01/participant '+ num +'.csv'
    dataset = pd.read_csv(csv_path)

    # Only drop the columns that actually exist in this participant's file.
    unwanted = [name for name in ('Time', 'Start', 'End') if name in dataset.columns]
    return dataset.drop(columns=unwanted)

In [2]:
def calculate_slope(data):
    """Return the slope of a least-squares straight line through `data`.

    Infinite values and NaNs are discarded first; the remaining samples are
    fitted against their position (0, 1, 2, ...) with a degree-1 polynomial.

    Parameters
    ----------
    data : pandas.Series
        Numeric samples in order.

    Returns
    -------
    float or int
        The fitted slope, or 0 when fewer than two usable samples remain.
    """
    usable = data.replace([np.inf, -np.inf], np.nan).dropna()
    n_points = len(usable)

    # A line needs at least two points; fall back to a flat slope otherwise.
    if n_points <= 1:
        return 0

    positions = np.arange(n_points)
    return np.polyfit(positions, usable, 1)[0]

In [19]:
def avg_crossing(avg_value, pressure_values, flag):
    """Return the fraction of samples strictly below or strictly above a value.

    Parameters
    ----------
    avg_value : float
        Reference value the samples are compared against (the caller passes
        the interval's mean pressure).
    pressure_values : sequence of float
        Samples to classify; anything supporting ``len`` and iteration works
        (list, pandas Series, ...).
    flag : bool or int
        Truthy -> count samples strictly below ``avg_value``;
        falsy  -> count samples strictly above ``avg_value``.

    Returns
    -------
    float
        A fraction in [0, 1] (not a percentage), rounded to two decimals.
        Samples equal to ``avg_value`` are counted in neither direction.
        An empty input yields 0.0 instead of raising ZeroDivisionError.
    """
    total = len(pressure_values)
    if total == 0:
        # Nothing to compare against; avoid dividing by zero.
        return 0.0

    if flag:
        count = sum(1 for x in pressure_values if x < avg_value)
    else:
        count = sum(1 for x in pressure_values if x > avg_value)
    return round(count / total, 2)

In [12]:
def summarize_interval(group):
    """Collapse one interval of raw samples into a single feature row.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to one interval. Must contain the columns
        'X', 'Y', 'Z', 'Magnitude', 'Pressure' and 'Label'.

    Returns
    -------
    pandas.Series
        Mean / min / max / variance / standard deviation for each of
        X, Y, Z, Magnitude and Pressure, plus pressure range, slope,
        kurtosis, skew, the fraction of pressure samples below and above
        the interval mean, and the most frequent 'Label' in the interval.

    Notes
    -----
    NOTE(review): the var_* features come from ``Series.var()`` while the
    std_* features go through ``np.std``; the two libraries default to
    different ``ddof`` values, so std**2 will not equal var. Confirm which
    convention is intended before changing either.
    """
    acc_x = group['X']
    acc_y = group['Y']
    acc_z = group['Z']
    magnitude = group['Magnitude']
    pressure = group['Pressure']

    avg_pressure = pressure.mean()
    lowest_pressure = pressure.min()
    highest_pressure = pressure.max()

    # Key order is significant: it becomes the column order of the output CSV.
    summary = {
        'avg_accX': acc_x.mean(),
        'min_accX': acc_x.min(),
        'max_accX': acc_x.max(),
        'var_accX': acc_x.var(),
        'std_accX': np.std(acc_x),
        'avg_accY': acc_y.mean(),
        'min_accY': acc_y.min(),
        'max_accY': acc_y.max(),
        'var_accY': acc_y.var(),
        'std_accY': np.std(acc_y),
        'avg_accZ': acc_z.mean(),
        'min_accZ': acc_z.min(),
        'max_accZ': acc_z.max(),
        'var_accZ': acc_z.var(),
        'std_accZ': np.std(acc_z),
        'avg_magnitude': magnitude.mean(),
        'min_magnitude': magnitude.min(),
        'max_magnitude': magnitude.max(),
        'var_magnitude': magnitude.var(),
        'std_magnitude': np.std(magnitude),
        'avg_pressure': avg_pressure,
        'min_pressure': lowest_pressure,
        'max_pressure': highest_pressure,
        'var_pressure': pressure.var(),
        'range_pressure': (highest_pressure - lowest_pressure),
        'std_pressure': np.std(pressure),
        'slope_pressure': calculate_slope(pressure),
        'kurtosis_pressure': kurtosis(pressure),
        # flag 1 -> share of samples below the mean, flag 0 -> share above it
        'Pressure_below_avg': avg_crossing(avg_pressure, pressure, 1),
        'Pressure_above_avg': avg_crossing(avg_pressure, pressure, 0),
        'skew_pressure': skew(pressure),
        # mode() returns the most frequent label(s); take the first one
        'Label': group['Label'].mode().iloc[0],
    }
    return pd.Series(summary)

In [20]:
def preprocessing(num, dataset, interval_seconds=2):
    """Summarise a participant's samples per time window and save them as CSV.

    Rows are bucketed by ``dataset['Timestamp'] // interval_seconds`` and each
    bucket is reduced to one feature row by ``summarize_interval``. The result
    is written to ``<datadir>preprocessed/preprocessed_data<num>.csv``.

    Parameters
    ----------
    num : int or str
        Participant identifier used in the output file name.
    dataset : pandas.DataFrame
        Raw samples; must contain a 'Timestamp' column plus every column
        ``summarize_interval`` reads.
    interval_seconds : int or float, optional
        Window length used for bucketing, default 2 (the original hard-coded
        value). Assumes 'Timestamp' is expressed in seconds - TODO confirm.

    Returns
    -------
    pandas.DataFrame
        The per-interval feature table that was written to disk, so a
        notebook cell can inspect it (e.g. ``preprocessing(...).head(24)``).
    """
    import os  # local import: keeps this cell runnable on its own

    # One group per window: integer-divide the timestamp by the window length.
    interval_id = dataset['Timestamp'] // interval_seconds
    result_df = dataset.groupby(interval_id).apply(summarize_interval)

    # The window id is not a feature; replace it with a plain 0..n-1 index.
    result_df = result_df.reset_index(drop=True)

    # Make sure the output folder exists so the first run does not fail.
    out_dir = datadir+'preprocessed/'
    os.makedirs(out_dir, exist_ok=True)
    result_df.to_csv(out_dir+'preprocessed_data'+str(num)+'.csv', index=False)
    return result_df

In [23]:
import glob

for i in range(1, 21):
    if i != test_participant:
        preprocessing(i, read_file(f"0{i}" if i < 9 else str(i)))

dir = (datadir+'preprocessed/'+str(test_participant)+'/')
csv_files = glob.glob(datadir+'preprocessed/'+'*.csv')
print(len(csv_files))

  'kurtosis_pressure': kurtosis(group['Pressure']),
  'skew_pressure': skew(group['Pressure']),


19


In [24]:
# One DataFrame per preprocessed training file is collected here.
dfs = []

# Only the first 19 entries are merged.
# NOTE(review): this relies on csv_files holding nothing but training files
# in its first 19 positions - verify against the cell that builds the list.
csv_files = csv_files[:19]

# Labels are lower-cased while reading, except the exact spellings
# 'Null', 'null' and ' null', which are passed through unchanged.
label_converters = {'Label': lambda x: x if x in ['Null', 'null', ' null'] else x.lower()}

for file in csv_files:
    df = pd.read_csv(file, converters=label_converters)
    dfs.append(df)
    print(file)

# Stack all per-participant frames row-wise with a fresh 0..n-1 index.
merged_df = pd.concat(dfs, ignore_index=True)


../Collected Data/preprocessed\preprocessed_data1.csv
../Collected Data/preprocessed\preprocessed_data10.csv
../Collected Data/preprocessed\preprocessed_data11.csv
../Collected Data/preprocessed\preprocessed_data12.csv
../Collected Data/preprocessed\preprocessed_data13.csv
../Collected Data/preprocessed\preprocessed_data14.csv
../Collected Data/preprocessed\preprocessed_data15.csv
../Collected Data/preprocessed\preprocessed_data16.csv
../Collected Data/preprocessed\preprocessed_data17.csv
../Collected Data/preprocessed\preprocessed_data18.csv
../Collected Data/preprocessed\preprocessed_data19.csv
../Collected Data/preprocessed\preprocessed_data2.csv
../Collected Data/preprocessed\preprocessed_data3.csv
../Collected Data/preprocessed\preprocessed_data4.csv
../Collected Data/preprocessed\preprocessed_data5.csv
../Collected Data/preprocessed\preprocessed_data6.csv
../Collected Data/preprocessed\preprocessed_data7.csv
../Collected Data/preprocessed\preprocessed_data8.csv
../Collected Data/

In [25]:
# Persist the merged training set next to the per-participant files.
train_csv_path = datadir+'preprocessed/'+'preprocessed_traindata.csv'
merged_df.to_csv(train_csv_path, index=False)

In [26]:
# Build the file-name id for the held-out participant (ids below 9 get a leading zero).
test_id = str(test_participant) if test_participant >= 9 else f"0{test_participant}"
preprocessing(test_participant, read_file(test_id))

# Re-read the per-interval file so labels get the same normalisation as the
# training data: lower-cased, except 'Null', 'null' and ' null' which are kept as-is.
test_interval_csv = datadir+'preprocessed/'+'preprocessed_data'+str(test_participant)+'.csv'
untouched_labels = ['Null', 'null', ' null']
newdf = pd.read_csv(test_interval_csv, converters={'Label': lambda x: x if x in untouched_labels else x.lower()})

# Save the held-out participant's features as the test set.
newdf.to_csv(datadir+'preprocessed/'+'preprocessed_testdata.csv', index=False)