In [1]:
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis
from collections import Counter

# Root folder of the collected data: raw participant CSVs are read from below it
# and the preprocessed CSVs are written to its 'preprocessed/' sub-folder.
datadir = '../Collected Data/'
# Participant held out as the test subject: skipped by the training loop,
# preprocessed separately, and used to name the output sub-folder.
test_participant = 20

def read_file(num):
    """Load one participant's labelled recording and unify its null labels.

    Parameters
    ----------
    num : str or int
        Participant identifier as it appears in the file name
        ``participant <num>.csv``. Strings are used verbatim (callers pass
        zero-padded values such as ``"07"``); integers are zero-padded to two
        digits so ``7`` and ``"07"`` address the same file.

    Returns
    -------
    pandas.DataFrame
        The file contents without the 'Time', 'Start' and 'End' columns (when
        present), with every cell equal to one of the null spellings replaced
        by ``'Null'``.

    Notes
    -----
    NOTE(review): ``pandas.read_csv`` parses a bare ``null`` cell as missing
    (NaN) by default, so such cells never reach the replacement below --
    confirm whether ``keep_default_na=False`` is wanted for the label column.
    """
    if not isinstance(num, str):
        num = f"{int(num):02d}"

    dataset = pd.read_csv(datadir+'Collected Updated Labeled Data - Phase 01/participant '+ num +'.csv')

    # Bookkeeping columns are not features; errors='ignore' skips absent ones.
    dataset = dataset.drop(columns=['Time', 'Start', 'End'], errors='ignore')

    # '\tnull' is included so the spellings treated as null here match the
    # ones the rest of the notebook filters on.
    strings_to_replace = ['null', ' null', '\tnull']
    replacement_string = 'Null'
    return dataset.replace(strings_to_replace, replacement_string)

In [2]:
# Scratch check of the Counter-based share calculation: for every distinct value
# in the sample list, print the percentage of entries equal to it, then print
# the raw counts.
l= [1,2,3,4,1,2,3,3,1,2,4,4,5,4,5,4,5,4,4,3,2]
d = Counter(l)  # value -> number of occurrences
n = len(l)
for each in set(l):
    print((d[each]/n)*100)
print(d)

14.285714285714285
19.047619047619047
19.047619047619047
33.33333333333333
14.285714285714285
Counter({4: 7, 2: 4, 3: 4, 1: 3, 5: 3})


In [3]:
def calculate_slope(data):
    """Return the slope of a straight line fitted through ``data``.

    Infinite and missing entries are discarded first; the remaining values are
    fitted (degree-1 ``np.polyfit``) against their positions ``0..k-1``. When
    fewer than two usable values remain the slope is reported as ``0``.
    """
    usable = data.replace([np.inf, -np.inf], np.nan).dropna()
    positions = np.arange(len(usable))
    if len(positions) <= 1:
        return 0
    # polyfit returns coefficients highest degree first, so [0] is the slope.
    return np.polyfit(positions, usable, 1)[0]

In [4]:
def avg_crossing(avg_value, pressure_values, flag):
    """Return the share of values lying strictly on one side of ``avg_value``.

    Parameters
    ----------
    avg_value : number
        Reference value (the callers pass the mean of ``pressure_values``).
    pressure_values : sized iterable of numbers
        Values to compare against ``avg_value``.
    flag : truthy / falsy
        Truthy counts values strictly below ``avg_value``; falsy counts values
        strictly above it.

    Returns
    -------
    float
        A fraction in ``[0, 1]`` rounded to two decimals (not a 0-100
        percentage). Values equal to ``avg_value`` count on neither side.
        ``nan`` is returned for empty input, where the share is undefined.
    """
    total = len(pressure_values)
    if total == 0:
        # Avoid a ZeroDivisionError; an undefined share is reported as NaN.
        return float('nan')

    if flag:
        count = sum(1 for x in pressure_values if x < avg_value)
    else:
        count = sum(1 for x in pressure_values if x > avg_value)
    return round(count / total, 2)

In [5]:
def summarize_interval(group):
    """Collapse one time window of samples into a single feature row.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one window; must provide the columns 'X', 'Y', 'Z',
        'Magnitude', 'Pressure' and 'Label'.

    Returns
    -------
    pandas.Series or None
        Feature row labelled with the window's most frequent label, or
        ``None`` when the window is rejected. A window is rejected when
        * every label is missing,
        * the most frequent label covers 79 % of the rows or less, or
        * the most frequent label is one of the null spellings.

    Notes
    -----
    NOTE(review): the ``std_*`` features use ``np.std`` (ddof=0 by default)
    while the ``var_*`` features use ``Series.var`` (ddof=1 by default), so
    ``std**2 != var`` -- kept as-is to leave feature values unchanged; confirm
    this is intended.
    """
    null_labels = ['Null', 'null', ' null', '\tnull']

    labels = group['Label']
    label_modes = labels.mode()
    if label_modes.empty:
        # mode() ignores missing values, so an all-missing window has no mode;
        # indexing it would raise IndexError. Such a window is unusable.
        return None
    label_mode = label_modes.iloc[0]  # most frequent label; ties resolved by mode() order

    # Share of rows (missing ones included in the denominator) carrying the mode.
    label_percentage = (Counter(labels)[label_mode] / len(labels)) * 100
    if label_percentage <= 79 or label_mode in null_labels:
        return None

    summary = {}

    # Same five statistics for each motion channel, in the original column order.
    for column, suffix in (('X', 'accX'), ('Y', 'accY'), ('Z', 'accZ'), ('Magnitude', 'magnitude')):
        values = group[column]
        summary['avg_' + suffix] = values.mean()
        summary['min_' + suffix] = values.min()
        summary['max_' + suffix] = values.max()
        summary['var_' + suffix] = values.var()
        summary['std_' + suffix] = np.std(values)

    pressure = group['Pressure']
    avg_pressure = pressure.mean()
    min_pressure = pressure.min()
    max_pressure = pressure.max()
    summary['avg_pressure'] = avg_pressure
    summary['min_pressure'] = min_pressure
    summary['max_pressure'] = max_pressure
    summary['var_pressure'] = pressure.var()
    summary['range_pressure'] = max_pressure - min_pressure
    summary['std_pressure'] = np.std(pressure)
    summary['slope_pressure'] = calculate_slope(pressure)
    summary['kurtosis_pressure'] = kurtosis(pressure)
    summary['Pressure_below_avg'] = avg_crossing(avg_pressure, pressure, 1)  # flag 1: values below the mean
    summary['Pressure_above_avg'] = avg_crossing(avg_pressure, pressure, 0)  # flag 0: values above the mean
    summary['skew_pressure'] = skew(pressure)
    summary['Label'] = label_mode  # most frequent label in the window
    return pd.Series(summary)

In [6]:
def preprocessing(num, dataset, interval_seconds=2):
    """Window a participant's recording, summarise each window and save it.

    Parameters
    ----------
    num : int or str
        Participant identifier; only used to name the output file
        ``preprocessed_data<num>.csv``.
    dataset : pandas.DataFrame
        Recording with a 'Timestamp' column plus the columns required by
        ``summarize_interval``.
    interval_seconds : number, default 2
        Window length; rows sharing the same ``Timestamp // interval_seconds``
        form one window. Assumes 'Timestamp' is expressed in seconds -- TODO
        confirm against the recording format.

    The result is written to
    ``<datadir>preprocessed/<test_participant>/preprocessed_data<num>.csv``;
    nothing is returned.
    """
    import os  # local import: os is not among the imports visible at the top of the notebook

    window_id = dataset['Timestamp'] // interval_seconds
    # Windows for which summarize_interval returns None contribute no row.
    result_df = dataset.groupby(window_id).apply(summarize_interval)
    # Flatten the index left behind by the group-by.
    result_df = result_df.reset_index(drop=True)
    # Discard windows where any feature could not be computed.
    result_df = result_df.dropna(how='any')

    # Create the output folder on first use so to_csv does not fail on a
    # fresh checkout or a new test participant.
    out_dir = datadir+'preprocessed/'+str(test_participant)
    os.makedirs(out_dir, exist_ok=True)
    result_df.to_csv(out_dir+'/preprocessed_data'+str(num)+'.csv', index=False)

In [7]:
import glob

# Preprocess participants 1..20, leaving out the held-out test participant.
for i in range(1, 21):
    if i == test_participant:
        continue
    # File names carry a two-digit, zero-padded participant number.
    preprocessing(i, read_file(f"{i:02d}"))


  'kurtosis_pressure': kurtosis(group['Pressure']),
  'skew_pressure': skew(group['Pressure']),


In [8]:
# Collect only the per-participant training files. A bare '*.csv' pattern also
# matches the test participant's file and the merged train/test CSVs that later
# cells write into this same folder, so re-running the notebook would leak test
# data into the training set. Sorting makes the order independent of the OS.
test_file_name = 'preprocessed_data'+str(test_participant)+'.csv'
csv_files = sorted(
    path
    for path in glob.glob(datadir+'preprocessed/'+str(test_participant)+'/preprocessed_data*.csv')
    if not path.endswith(test_file_name)
)
print(len(csv_files))

19


In [9]:
dfs = []

# Keep only per-participant training files. Selecting them by name (instead of
# slicing the first 19 entries) does not depend on glob order or on which other
# CSVs already exist in the folder, and can never pick up the test participant.
excluded_suffixes = (
    'preprocessed_data'+str(test_participant)+'.csv',
    'preprocessed_traindata.csv',
    'preprocessed_testdata.csv',
)
csv_files = [file for file in csv_files if not file.endswith(excluded_suffixes)]
# Read each CSV file, normalising labels: null spellings become 'Null', every
# other label is lower-cased.
for file in csv_files:
    df = pd.read_csv(file,converters={'Label': lambda x: x.lower() if x not in ['Null', 'null', ' null', '\tnull'] else 'Null'})
    dfs.append(df)
    print(file)
# Concatenate all DataFrames in the list along rows (axis=0)
merged_df = pd.concat(dfs, ignore_index=True)


../Collected Data/preprocessed/20\preprocessed_data1.csv
../Collected Data/preprocessed/20\preprocessed_data10.csv
../Collected Data/preprocessed/20\preprocessed_data11.csv
../Collected Data/preprocessed/20\preprocessed_data12.csv
../Collected Data/preprocessed/20\preprocessed_data13.csv
../Collected Data/preprocessed/20\preprocessed_data14.csv
../Collected Data/preprocessed/20\preprocessed_data15.csv
../Collected Data/preprocessed/20\preprocessed_data16.csv
../Collected Data/preprocessed/20\preprocessed_data17.csv
../Collected Data/preprocessed/20\preprocessed_data18.csv
../Collected Data/preprocessed/20\preprocessed_data19.csv
../Collected Data/preprocessed/20\preprocessed_data2.csv
../Collected Data/preprocessed/20\preprocessed_data3.csv
../Collected Data/preprocessed/20\preprocessed_data4.csv
../Collected Data/preprocessed/20\preprocessed_data5.csv
../Collected Data/preprocessed/20\preprocessed_data6.csv
../Collected Data/preprocessed/20\preprocessed_data7.csv
../Collected Data/pre

In [10]:
# Persist the merged training set (without the index column) in the same folder
# as the per-participant preprocessed files.
merged_df.to_csv(datadir+'preprocessed/'+str(test_participant)+'/preprocessed_traindata.csv', index=False)

In [11]:
# Preprocess the held-out participant, then re-read the result to normalise its
# labels exactly like the training files (null spellings -> 'Null', everything
# else lower-cased) before saving it as the test set.
preprocessing(test_participant, read_file(f"0{test_participant}" if test_participant < 10 else str(test_participant)))
newdf = pd.read_csv(
    datadir+'preprocessed/'+str(test_participant)+'/preprocessed_data'+str(test_participant)+'.csv',
    # '\tnull' added so this list matches the converter used for the training files.
    converters={'Label': lambda x: x.lower() if x not in ['Null', 'null', ' null', '\tnull'] else 'Null'},
)
newdf.to_csv(datadir+'preprocessed/'+str(test_participant)+'/preprocessed_testdata.csv', index=False)