In [74]:
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis
# Root folder that holds both the raw recordings and the preprocessed outputs.
datadir = '../Collected Data/'

# Raw labelled recording of the participant processed by this run.
raw_csv_path = datadir + 'Collected Labeled Data - Phase 01/participant 10.csv'
dataset = pd.read_csv(raw_csv_path)

In [3]:
# Preview the first ten raw samples.
dataset.head(n=10)

Unnamed: 0,Time,Timestamp,X,Y,Z,Magnitude,Pressure,Label
0,09:56:48.703,0.160706,-0.999268,0.120605,-0.02124,1.006744,972.628618,Null
1,09:56:48.728,0.185913,-0.997559,0.122925,-0.020386,1.005311,972.630476,Null
2,09:56:48.746,0.203033,-0.997559,0.122925,-0.020386,1.005311,972.630476,Null
3,09:56:48.763,0.22049,-0.997559,0.122925,-0.020386,1.005311,972.630476,Null
4,09:56:48.783,0.240662,-0.997559,0.122925,-0.020386,1.005311,972.630476,Null
5,09:56:48.807,0.26416,-0.998901,0.12085,-0.024048,1.006472,972.630476,Null
6,09:56:48.824,0.281097,-0.998901,0.12085,-0.024048,1.006472,972.630476,Null
7,09:56:48.843,0.300598,-0.998901,0.12085,-0.024048,1.006472,972.630476,Null
8,09:56:48.863,0.320557,-0.998901,0.12085,-0.024048,1.006472,972.630476,Null
9,09:56:48.889,0.345978,-1.000488,0.119995,-0.02063,1.00787,972.631825,Null


In [75]:
# Remove the 'Time' column in place; errors='ignore' makes this a no-op when
# the column is already gone, so the cell can be re-run safely.
dataset.drop(columns='Time', errors='ignore', inplace=True)

In [5]:
import numpy as np

def calculate_slope(data):
    """Return the slope of a least-squares straight line fitted through ``data``.

    Non-finite entries (NaN, +inf, -inf) are discarded first. The remaining
    values are fitted against their position 0, 1, 2, ... in the cleaned
    sequence, so the slope is expressed per retained sample, not per unit of
    time.

    Parameters
    ----------
    data : pandas.Series or 1-D array-like of numbers
        Samples to fit. Lists and NumPy arrays are accepted as well as Series.

    Returns
    -------
    float
        Slope of the fitted line, or ``0.0`` when fewer than two finite
        samples remain (a line cannot be fitted through a single point).
    """
    # Wrapping in a Series lets plain lists / ndarrays use the same cleaning
    # path as the Series passed in by the groupby pipeline.
    finite = pd.Series(data).replace([np.inf, -np.inf], np.nan).dropna()
    if len(finite) < 2:
        return 0.0
    positions = np.arange(len(finite))
    # polyfit returns coefficients highest degree first: [slope, intercept].
    return np.polyfit(positions, finite, 1)[0]

In [17]:
def summarize_interval(group):
    """Reduce the raw samples of one time window to a single row of features.

    Parameters
    ----------
    group : pandas.DataFrame
        Samples belonging to one window. Must provide the columns 'X', 'Y',
        'Z', 'Magnitude', 'Pressure' and 'Label'.

    Returns
    -------
    pandas.Series
        * ``avg_/min_/max_/var_/std_`` for X, Y, Z (suffix ``accX`` ...) and
          Magnitude (suffix ``magnitude``);
        * the same five statistics for Pressure plus ``range_pressure``,
          ``slope_pressure``, ``kurtosis_pressure``,
          ``Interqartile_percentage_Pressure`` and ``skew_pressure``;
        * ``Label``: the most frequent label of the window, or NaN when the
          window has no non-missing label at all.

    Notes
    -----
    * ``var_*`` is the pandas default sample variance (ddof=1) while ``std_*``
      is the population standard deviation (ddof=0), so ``std_* ** 2`` is not
      equal to ``var_*``. The mismatch is kept on purpose so that the feature
      values stay identical to previously generated files.
    * ``Interqartile_percentage_Pressure`` holds the inter-quartile range
      (75th minus 25th percentile) in the units of the Pressure column, not a
      percentage. The key spelling is kept because it is the column name
      written to the output CSV.
    """
    summary = {}

    # The same five descriptive statistics are computed for every signal; the
    # (column, key suffix) pairs fix the order of the resulting features.
    for column, suffix in (('X', 'accX'), ('Y', 'accY'), ('Z', 'accZ'),
                           ('Magnitude', 'magnitude')):
        values = group[column]
        summary['avg_' + suffix] = values.mean()
        summary['min_' + suffix] = values.min()
        summary['max_' + suffix] = values.max()
        summary['var_' + suffix] = values.var()
        summary['std_' + suffix] = values.std(ddof=0)  # population std, see Notes

    pressure = group['Pressure']
    lowest, highest = pressure.min(), pressure.max()
    q25, q75 = np.percentile(pressure, [25, 75])
    summary['avg_pressure'] = pressure.mean()
    summary['min_pressure'] = lowest
    summary['max_pressure'] = highest
    summary['var_pressure'] = pressure.var()
    summary['range_pressure'] = highest - lowest
    summary['std_pressure'] = pressure.std(ddof=0)  # population std, see Notes
    summary['slope_pressure'] = calculate_slope(pressure)
    summary['kurtosis_pressure'] = kurtosis(pressure)
    summary['Interqartile_percentage_Pressure'] = q75 - q25
    summary['skew_pressure'] = skew(pressure)

    # mode() drops missing values, so it is empty when every label of the
    # window is NaN; indexing it unconditionally would raise IndexError.
    label_modes = group['Label'].mode()
    summary['Label'] = label_modes.iloc[0] if not label_modes.empty else np.nan

    return pd.Series(summary)

In [76]:
# Bucket the samples into fixed-width windows of `interval_seconds` seconds
# (Timestamp floor-divided by the window width) and reduce every window to a
# single feature row with summarize_interval.
interval_seconds = 4
interval_id = dataset['Timestamp'] // interval_seconds
result_df = dataset.groupby(interval_id).apply(summarize_interval)

# Replace the window-number index with a plain 0..n-1 index.
result_df = result_df.reset_index(drop=True)

# Save the per-window features. The participant number in the file name is
# hard-coded and has to match the raw file loaded at the top of the notebook.
output_csv_path = datadir + 'preprocessed/preprocessed_data10.csv'
result_df.to_csv(output_csv_path, index=False)
result_df.head(24)

Unnamed: 0,avg_accX,min_accX,max_accX,var_accX,std_accX,avg_accY,min_accY,max_accY,var_accY,std_accY,...,min_pressure,max_pressure,var_pressure,range_pressure,std_pressure,slope_pressure,kurtosis_pressure,Interqartile_percentage_Pressure,skew_pressure,Label
0,0.043943,0.035522,0.054688,2.7e-05,0.005184,0.549046,0.53186,0.563965,4.4e-05,0.006581,...,970.750213,970.767564,2.5e-05,0.017352,0.00495,5.8e-05,-0.93622,0.006492,-0.281756,Null
1,0.05788,0.044312,0.091064,0.000101,0.010032,0.546731,0.53479,0.557007,3.4e-05,0.005855,...,970.753004,970.76742,1.1e-05,0.014415,0.003341,1.8e-05,-0.082931,0.003664,-0.292987,Null
2,0.058664,0.050415,0.070557,1.9e-05,0.004333,0.553639,0.539551,0.567261,3.4e-05,0.005795,...,970.756573,970.769603,1e-05,0.013031,0.003212,-2.3e-05,-0.745849,0.004894,-0.029762,Null
3,0.080873,-0.154785,0.184448,0.005851,0.076299,0.775188,0.440552,0.95166,0.015411,0.123831,...,970.668815,970.773334,0.000725,0.104518,0.026865,0.000136,4.036888,0.0091,-2.341516,Null
4,0.132888,0.119751,0.146118,4.6e-05,0.006739,0.753831,0.740356,0.765503,3.7e-05,0.006077,...,970.744277,970.76503,2e-05,0.020754,0.004494,-5e-05,0.266637,0.004582,-0.153587,Null
5,0.135353,0.120728,0.148804,4.3e-05,0.006512,0.732424,0.702026,0.748779,0.000146,0.012037,...,970.738104,970.757382,2e-05,0.019279,0.004512,-2.4e-05,-0.43957,0.006066,0.183128,Null
6,0.153969,0.098389,0.203613,0.000514,0.022626,0.003414,-0.216187,0.773438,0.125596,0.353508,...,970.714929,970.770196,0.000101,0.055267,0.01003,-6.4e-05,3.218477,0.007177,1.554673,Null
7,0.153834,-0.179321,1.211548,0.102808,0.319834,0.188464,-0.144287,0.754883,0.053521,0.230767,...,970.656542,970.788801,0.000561,0.132259,0.023619,-0.000138,0.722746,0.029586,-0.409583,Null
8,0.978551,0.777832,1.180298,0.007924,0.088795,0.222027,-0.09436,0.549072,0.0139,0.117603,...,970.694411,970.772816,0.000168,0.078405,0.012947,7.7e-05,1.222789,0.015733,-0.919482,Null
9,0.980381,0.774658,1.339233,0.020065,0.141295,0.247191,0.08728,0.488403,0.006694,0.081615,...,970.732451,970.791697,0.000169,0.059245,0.012976,4.7e-05,-0.321032,0.018066,0.268816,Null


In [82]:
import glob
# Folder that receives the per-participant feature CSVs.
# NOTE: `dir` shadows the Python builtin of the same name; the name is kept
# unchanged here.
dir = datadir + 'preprocessed/'

# Every CSV in that folder. The pattern also matches
# preprocessed_traindata.csv / preprocessed_testdata.csv once the cells below
# have written them, so the count is not necessarily the participant count.
csv_files = glob.glob(dir + '*.csv')
print(len(csv_files))

21


In [83]:
# Number of per-participant files (in file-name order) used for training.
N_TRAIN_FILES = 19

# glob returns paths in an arbitrary, filesystem-dependent order, so sort
# before slicing to make the selection deterministic. The per-participant
# files (preprocessed_dataNN.csv) sort ahead of preprocessed_testdata.csv and
# preprocessed_traindata.csv, so outputs left over from an earlier run in the
# same folder are not picked up by the slice.
csv_files = sorted(csv_files)[:N_TRAIN_FILES]

# Read every selected file, echoing its path so the selection can be checked.
dfs = []
for file in csv_files:
    df = pd.read_csv(file)
    dfs.append(df)
    print(file)

# Stack all per-participant tables row-wise into one training table.
merged_df = pd.concat(dfs, ignore_index=True)


../Collected Data/preprocessed\preprocessed_data01.csv
../Collected Data/preprocessed\preprocessed_data02.csv
../Collected Data/preprocessed\preprocessed_data03.csv
../Collected Data/preprocessed\preprocessed_data04.csv
../Collected Data/preprocessed\preprocessed_data05.csv
../Collected Data/preprocessed\preprocessed_data06.csv
../Collected Data/preprocessed\preprocessed_data07.csv
../Collected Data/preprocessed\preprocessed_data08.csv
../Collected Data/preprocessed\preprocessed_data09.csv
../Collected Data/preprocessed\preprocessed_data10.csv
../Collected Data/preprocessed\preprocessed_data11.csv
../Collected Data/preprocessed\preprocessed_data12.csv
../Collected Data/preprocessed\preprocessed_data13.csv
../Collected Data/preprocessed\preprocessed_data14.csv
../Collected Data/preprocessed\preprocessed_data15.csv
../Collected Data/preprocessed\preprocessed_data16.csv
../Collected Data/preprocessed\preprocessed_data17.csv
../Collected Data/preprocessed\preprocessed_data18.csv
../Collect

In [84]:
# Normalise label casing, then persist the merged training table.
# NOTE(review): the 'Null' label seen in the previews above becomes 'null'
# here, and pandas.read_csv treats the string 'null' as a missing value by
# default. Anyone loading this file gets NaN for that class unless they pass
# keep_default_na=False (or na_filter=False).
train_csv_path = datadir + 'preprocessed/preprocessed_traindata.csv'
merged_df['Label'] = merged_df['Label'].str.lower()
merged_df.to_csv(train_csv_path, index=False)

In [85]:
# Participant 20's feature file is held out as the test set: lower-case its
# labels and store it under the test-data file name.
# NOTE(review): if this file contains the 'Null' label like participant 10's
# data does, it is written as 'null', which pandas.read_csv parses as NaN by
# default -- load the result with keep_default_na=False to keep that class.
test_source_path = datadir + 'preprocessed/preprocessed_data20.csv'
test_output_path = datadir + 'preprocessed/preprocessed_testdata.csv'
newdf = pd.read_csv(test_source_path)
newdf['Label'] = newdf['Label'].str.lower()
newdf.to_csv(test_output_path, index=False)