In [1]:
import pandas as pd
from scipy import signal
import numpy as np
import matplotlib.pyplot as plt
import seaborn

from os import listdir
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer, PolynomialFeatures, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

In [2]:
seaborn.set()

In [3]:
# Sampling frequency [Hz]
Fs = -1

In [4]:
# Cutoff frequency [Hz]
cutoff = 5 

In [5]:
# Directory of input csv files
INPUT_DIRECTORY = "data/"

## Functions (may not all be needed)

In [6]:
# Calculate sampling rate (rounded to nearest integer) based on recorded data
# [IN]
#  x: Series containing the relative time values (from 0-##.##)
# [OUT]
#  Fs: samples per second, Hz 
def get_sampling_rate(x):
    return round(len(x.index) / x.iloc[-1])

In [7]:
# Shows plot of total acceleration values
# [IN]
#  df: Dataframe containing x, y, z, total acceleration, and time (from 0 to ## seconds)
def plot_accel(df):
    plt.figure(figsize=(10,5))
    plt.title('Total Acceleration')
    plt.xlabel('Time [s]')
    plt.ylabel('Acceleration [m/s^2]')
    plt.plot(df['time'].values, df['atotal'].values, 'b-', linewidth=1)
    plt.show()

In [8]:
# Apply butterworth filter to values in Dataframe; use with df.apply()
# [IN]
#  df: Dataframe containing x-,y-,z- acceleration values ('atotal' not needed, can be calculated after)
def butterworth_lowpass(df):
    nyq = 0.5 * Fs
    normalized_cutoff = cutoff / nyq
    b, a = signal.butter(3, normalized_cutoff, btype='lowpass')
    return signal.filtfilt(b, a, df)

In [9]:
# Shows FFT of the total acceleration
# [IN]
#  df: Dataframe containing acceleration values, must have 'atotal'
def plot_fft(df):
    w = np.fft.fft(df['atotal'])
    freqs = np.fft.fftfreq(len(df['atotal']))
    freqs = freqs * Fs

    n_samples = len(w)
    middle = -1
    if (n_samples % 2 == 0):
        middle = (n_samples // 2) - 1
    else:
        middle = (n_samples // 2)

    plt.figure(figsize=(10,5))
    plt.title('One-sided Frequency Spectrum (0 Hz excluded)')
    plt.xlabel('Frequency [Hz]')
    plt.ylabel('Magnitude')
    plt.plot(freqs[1:middle+1], np.abs(w[1:middle+1]))

    plt.figure(figsize=(10,5))
    plt.title('Frequency Spectrum')
    plt.xlabel('Frequency [Hz]')
    plt.ylabel('Magnitude')
    plt.plot(freqs, np.abs(w))

    plt.show()

In [10]:
# Calculates the FFT of the total acceleration into a Dataframe
# [IN]
#  df: Dataframe containing acceleration values, must have 'atotal'
# [OUT]
#  fft_df: Dataframe with two columns ['freq', 'value'], containing the result of applying FFT to total acceleration
def get_fft(df):
    w = np.fft.fft(df['atotal'])
    freqs = np.fft.fftfreq(len(df['atotal']))
    freqs = freqs * Fs
    
    fft_df = pd.DataFrame({
        'freq': freqs,
        'value': np.abs(w)
    })
    
    return fft_df

In [11]:
# Formats column names to be ['ax', 'ay', 'az', 'atotal']; Alfred's phone recorded it with units, and 'aT' label for last column
# [IN]
#  col_name: the column name as a string
# [OUT]
#  new_col: properly formatted column name (to conform to other group members' data)
def format_column(col_name):
    new_col = col_name.split('(')[0].strip()
    if (new_col == 'aT'):
        new_col = 'atotal'
    return new_col

In [12]:
# For filename with <id>_<pos>_<step_count>.csv format, returns the step count as an integer
# [IN]
#  filename: string with the filename of input csv
# [OUT]
#  int(count): the actual step count as an integer
def extract_step_count(filename):
    if (len(filename.split('_')) < 2):
        return np.nan
    count_with_csv = filename.split('_')[1]
    count = count_with_csv.split('.')[0]
    return int(count)

In [13]:
# Prints predicted step frequencies and step count, and compares with real step count if available
# [IN]
#  pred_freq: the predicted step frequency
#  duration: the entire time taken for data recording
#  real_step_count: the actual step count from data collection (default -1 if not available)
def print_step_comparisons(pred_freq, duration, real_step_count=-1):
    predicted_step_count = pred_freq * duration
    predicted_step_60 = pred_freq * 60
    
    print('Predicted step frequency: ', pred_freq)
    print('Predicted step frequency (assuming 60s walking): ', round(predicted_step_60))
    print('Predicted step count (using total duration): ', round(predicted_step_count))
    if real_step_count > 0:
        print('Real step count: ', real_step_count)

In [14]:
# Returns predicted step counts with given step frequency and duration
# [IN]
#  pred_freq: the predicted step frequency
#  duration: the entire time taken for data recording
# [OUT]
#  predicted_step_count: use given frequency to calculate steps throughout entire duration
#  predicted_step_60: use given frequency to calculate steps throught 60 seconds
def get_predicted_steps(pred_freq, duration):
    predicted_step_count = round(pred_freq * duration)
    predicted_step_60 = round(pred_freq * 60)
    
#     return (predicted_step_count, predicted_step_60)
    return predicted_step_count

In [15]:
def read_data(data):
    df = pd.read_csv(data)
    # Filtering dataset
    df = df[df.time != 'Time'] # there were 'time' values in the Time column
    df = df.drop([0]) # OPTIONAL: drop the first value because there is a gap between the starting time and the subsequent time
    df = df.reset_index(drop=True)

    # Convert date into a DateTime object
    # https://stackoverflow.com/questions/38110263/in-pandas-how-to-convert-a-string-to-a-datetime-object-with-milliseconds
    df['time'] = pd.to_datetime(df['time'], format="%Y-%m-%d %H:%M:%S.%f")
    return df

In [16]:
def get_step_freqs_with_count(INPUT_DIRECTORY):
    results = pd.DataFrame(columns=['filename', 'real_steps', 'freq1', 'steps1', 'freq2','steps2', 'freq3', 'steps3'])

    input_list = listdir(INPUT_DIRECTORY)

    for f in input_list:
        if '.csv' not in f.lower():
            continue

        new_row = []

        new_row.append(f)

        real_step_count = extract_step_count(f)
        new_row.append(real_step_count)

        global Fs  # 'global' keyword needed for assigning value
        data = pd.read_csv(INPUT_DIRECTORY + f)
        Fs = get_sampling_rate(data['time'])
        total_duration = data['time'].iloc[-1]
        
        data = data.dropna(axis=1, how='all')
        data['time'] = data['time'] - data['time'].iloc[0]
        data.rename(format_column, axis=1, inplace=True)

        
        # Filtering data
        data_accel = data[['ax','ay','az']].copy()
        data_accel_filtered = data_accel.apply(butterworth_lowpass, axis=0)
        data_accel_filtered['atotal'] = np.sqrt(data_accel_filtered['ax']**2 + data_accel_filtered['ay']**2 + data_accel_filtered['az']**2)
        data_accel_filtered['time'] = data['time'].copy()

        
        # Look for step frequency
        accel_fft = get_fft(data_accel_filtered)
        candidate_freqs = accel_fft[accel_fft.freq > 0].nlargest(10, ['value'])

        # Method 1:
        # Take the candidate frequency with largest magnitude as the step frequency

        estimated_freq = candidate_freqs['freq'].iloc[0]
        new_row.append(estimated_freq)
        new_row.append(get_predicted_steps(estimated_freq, total_duration))

        # Method 2:
        # Take the mean of the candidate frequencies as step frequency

        estimated_freq = candidate_freqs['freq'].mean()
        new_row.append(estimated_freq)
        new_row.append(get_predicted_steps(estimated_freq, total_duration))

        # Method 3:
        # Take the mean of candidate frequencies whose magnitudes are greater than half of the maximum magnitude (excluding 0 Hz)

        max_value = candidate_freqs['value'].iloc[0]
        best_freq = candidate_freqs[candidate_freqs['value'] > (max_value / 2)]

        estimated_freq = best_freq['freq'].mean()
        new_row.append(estimated_freq)
        new_row.append(get_predicted_steps(estimated_freq, total_duration))

        results.loc[len(results)] = new_row
    
    return results

In [17]:
def get_step_freqs(INPUT_DIRECTORY):
    results = pd.DataFrame(columns=['filename', 'freq1', 'freq2', 'freq3'])

    input_list = listdir(INPUT_DIRECTORY)

    for f in input_list:
        if '.csv' not in f.lower():
            continue

        new_row = []

        new_row.append(f)

        total_duration = 0
        global Fs  # 'global' keyword needed for assigning value
        Fs = 98
        data = read_data(INPUT_DIRECTORY + f)
        
        data = data.dropna(axis=1, how='all')
        data['time'] = data['time'] - data['time'].iloc[0]
        data.rename(format_column, axis=1, inplace=True)

        
        # Filtering data
        data_accel = data[['ax','ay','az']].copy()
        data_accel_filtered = data_accel.apply(butterworth_lowpass, axis=0)
        data_accel_filtered['atotal'] = np.sqrt(data_accel_filtered['ax']**2 + data_accel_filtered['ay']**2 + data_accel_filtered['az']**2)
        data_accel_filtered['time'] = data['time'].copy()

        
        # Look for step frequency
        accel_fft = get_fft(data_accel_filtered)
        candidate_freqs = accel_fft[accel_fft.freq > 0].nlargest(10, ['value'])

        # Method 1:
        # Take the candidate frequency with largest magnitude as the step frequency

        estimated_freq = candidate_freqs['freq'].iloc[0]
        new_row.append(estimated_freq)

        # Method 2:
        # Take the mean of the candidate frequencies as step frequency

        estimated_freq = candidate_freqs['freq'].mean()
        new_row.append(estimated_freq)

        # Method 3:
        # Take the mean of candidate frequencies whose magnitudes are greater than half of the maximum magnitude (excluding 0 Hz)

        max_value = candidate_freqs['value'].iloc[0]
        best_freq = candidate_freqs[candidate_freqs['value'] > (max_value / 2)]

        estimated_freq = best_freq['freq'].mean()
        new_row.append(estimated_freq)

        results.loc[len(results)] = new_row
    
    return results

## Step frequency for files in directory

In [18]:
INPUT_DIRECTORY = "alfred_data/"
results = get_step_freqs_with_count(INPUT_DIRECTORY)
results

Unnamed: 0,filename,real_steps,freq1,steps1,freq2,steps2,freq3,steps3
0,ankle4_112.csv,112,0.011193,1,1.28948,115,0.870847,78
1,pocket4_114.csv,114,0.012461,1,1.262325,101,0.72026,58
2,ankle2_108.csv,108,0.011696,1,1.632796,139,1.148182,98
3,hand4_111.csv,111,1.713106,130,1.043677,79,1.726283,131
4,ankle3_112.csv,112,0.011057,1,1.479388,134,1.159112,105
5,hand1_114.csv,114,1.758506,126,0.815053,58,1.758506,126
6,pocket2_117.csv,117,0.012046,1,1.874362,155,0.368608,31
7,pocket3_117.csv,117,0.012424,1,2.226293,179,1.276072,103
8,hand2_114.csv,114,1.739272,122,1.069225,75,1.739272,122
9,ankle1_100.csv,100,0.011661,1,1.315357,113,0.876904,75


In [19]:
INPUT_DIRECTORY = "huy_data/"
results = get_step_freqs(INPUT_DIRECTORY)
results

Unnamed: 0,filename,freq1,freq2,freq3
0,ankle2.csv,0.861538,1.461026,1.286325
1,ankle1.csv,0.857068,1.802402,1.509464
2,pocket2.csv,3.323073,3.114615,3.277528
3,hand1.csv,3.412018,3.334292,3.334292
4,pocket1.csv,3.578763,3.033365,3.105556
5,hand2.csv,3.276957,2.937388,3.265084


In [20]:
INPUT_DIRECTORY = "janit_data/"
results = get_step_freqs(INPUT_DIRECTORY)
results

Unnamed: 0,filename,freq1,freq2,freq3
0,Ankel1.1.csv,0.838348,1.87472,1.058776
1,Ankel1.2.csv,0.893529,1.970088,0.751814
2,Pocket1.1.csv,1.782387,2.451563,2.896378
3,Hand1.1.csv,0.014833,1.090207,0.652641
4,Pocket1.2.csv,0.048507,0.515789,0.515789
5,Hand1.2.csv,1.919081,1.173653,2.145088


In [21]:
def classifiers(X, y):

   X_train, X_test, y_train, y_test = train_test_split(X, y)

   bayes_model = GaussianNB()
   knn_model = KNeighborsClassifier(n_neighbors=3)
   svc_model = SVC(kernel='linear')

   models = [bayes_model, knn_model, svc_model]

   for i, m in enumerate(models):
      m.fit(X_train, y_train)

   print(OUTPUT_TEMPLATE_CLASSIFIER.format(
       bayes = bayes_model.score(X_test, y_test),
       knn = knn_model.score(X_test, y_test),
       svm = svc_model.score(X_test, y_test),
    ))

   OUTPUT_TEMPLATE_CLASSIFIER = (
       'Bayesian classifier: {bayes:.3g}\n'
       'kNN classifier:      {knn:.3g}\n'
       'SVM classifier:      {svm:.3g}\n'
)