In [2]:
#############
## IMPORTS ##
#############

import pandas as pd
import os.path
import numpy as np
import matplotlib
from scipy import signal


In [3]:
###################
## PREPROCESSING ##
###################
## Set the participant we're analyzing

PARTICIPANT = "user1"
DATA_1 = PARTICIPANT+'_1.csv'
DATA_2 = PARTICIPANT+'_2.csv'
DATA_3 = PARTICIPANT+'_3.csv'

#loading data in
file_paths = ['user_data\\'+PARTICIPANT+'\\'+DATA_1, 'user_data\\'+PARTICIPANT+'\\'+DATA_2, 'user_data\\'+PARTICIPANT+'\\'+DATA_1 ]
f = file_paths[0]
gen = pd.read_csv(f, names=['Sensor', 'Time', 'X', 'Y', 'Z'], on_bad_lines='skip') #general data

#accel is 10, gyro is 4
GYRO_ID = 4
ACCEL_ID = 10

#Separate data by sensor id 
gyro_df = gen[gen.Sensor == GYRO_ID]
accel_df = gen[gen.Sensor == ACCEL_ID]


#sort data by time 
gyro_df.sort_values('Time', inplace=True)
accel_df.sort_values('Time', inplace=True)

#data is now separated and ordered by time, ready for M-point filer
print(gyro_df.head())
print(accel_df.head())


    Sensor          Time        X         Y         Z
11       4  7.210914e+11 -0.11638  0.049002  0.068976
8        4  7.210947e+11 -0.11638  0.049002  0.068976
4        4  7.210969e+11 -0.11638  0.049002  0.068976
13       4  7.211012e+11 -0.11638  0.049002  0.068976
9        4  7.211076e+11 -0.11638  0.049002  0.068976
    Sensor          Time         X         Y         Z
12      10  7.210794e+11 -1.297232 -1.685075  0.404865
10      10  7.210914e+11 -1.297232 -1.685075  0.404865
2       10  7.210969e+11 -1.297232 -1.685075  0.404865
7       10  7.211076e+11 -1.297232 -1.685075  0.404865
1       10  7.211231e+11 -1.297232 -1.685075  0.404865


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [36]:
##################################
# APPLYING MOVING AVERAGE FILTER #
##################################

#Applying Moving Average Filter 


M = 10 #M-point filter

#starts averaging at Mth point rather than 0th as opposed to vice versa in original paper 
gyro_df['MA_Time'] = gyro_df['Time'].rolling(M).mean()
gyro_df['MA_X'] = gyro_df['X'].rolling(M).mean()
gyro_df['MA_Y'] = gyro_df['Y'].rolling(M).mean()
gyro_df['MA_Z'] = gyro_df['Z'].rolling(M).mean()

accel_df['MA_Time'] = accel_df['Time'].rolling(M).mean()
accel_df['MA_X'] = accel_df['X'].rolling(M).mean()
accel_df['MA_Y'] = accel_df['Y'].rolling(M).mean()
accel_df['MA_Z'] = accel_df['Z'].rolling(M).mean()

#Creating axis vectors 

N = 300 #number of samples for a profile feature 
M_IDX = N + M -1#index of the Nth sample (accounts for NaNs of first M rows)


#X axis 
x_a = accel_df.loc[:, "MA_X"]
x_a = list(x_a[M-1:M_IDX]) #this is done to avoid a keyerror in the loc function 

x_g = gyro_df.loc[:, "MA_X"]
x_g = list(x_g[M-1:M_IDX])  

#y axis
y_a = accel_df.loc[:, "MA_Y"]
y_a = list(y_a[M-1:M_IDX])

y_g = gyro_df.loc[:, "MA_Y"]
y_g = list(y_g[M-1:M_IDX]) 

#z axis
z_a = accel_df.loc[:, "MA_Z"]
z_a = list(z_a[M-1:M_IDX])

z_g = accel_df.loc[:, "MA_Z"]
z_g = list(z_g[M-1:M_IDX])

features = {'x_a': x_a, 'y_a': y_a, 'z_a': z_a, 'x_g': x_g, 'y_g': y_g, 'z_g': z_g}
features_df = pd.DataFrame.from_dict(features) #mostly for presentation purposes, will come in handy for feature extraction


start_time = gyro_df.MA_Time.loc[gyro_df.MA_Time.first_valid_index()]
end_time = gyro_df.MA_Time.iloc[-1]

print(features_df)

#preprocessing is now completed

          x_a       y_a       z_a       x_g       y_g       z_g
0   -0.862802 -1.437326  0.404291 -0.102052  0.046153  0.404291
1   -0.781090 -1.364904  0.396533 -0.089642  0.040480  0.396533
2   -0.687134 -1.295530  0.390052 -0.074222  0.037284  0.390052
3   -0.594113 -1.219587  0.376317 -0.054382  0.036272  0.376317
4   -0.491355 -1.144852  0.359657 -0.034728  0.035287  0.359657
..        ...       ...       ...       ...       ...       ...
295 -0.226977 -0.217326  0.843916 -0.117925  0.076992  0.843916
296 -0.257069 -0.236286  0.839527 -0.134277  0.075607  0.839527
297 -0.261433 -0.229763  0.794429 -0.149217  0.071080  0.794429
298 -0.255331 -0.214992  0.744446 -0.153025  0.061572  0.744446
299 -0.246414 -0.208342  0.665234 -0.135795  0.051426  0.665234

[300 rows x 6 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gyro_df['MA_Time'] = gyro_df['Time'].rolling(M).mean()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gyro_df['MA_X'] = gyro_df['X'].rolling(M).mean()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gyro_df['MA_Y'] = gyro_df['Y'].rolling(M).mean()
A value is trying to be set on a copy of a slice fro

In [37]:
###########################################
## FEATURE EXTRACTION and USER PROFILING ##
###########################################


#Mean, Median, Variance, Average Absolute
#Difference of Peaks, Range, Mode, Covariance,
#Mewan Absolute Deviation (MAD), Inter-
#quartile Range (IQR), correlation between axes
#(xy, yz, xz), Skewness, Kurtosis




extracted_features = { 'mean': [], 'median': [], 'variance': [], 'AADP': [], 'range': [], 'mode':[], 
                'covariance': [], 'mad': [], 'iqr': [], 'correlation': [], 'skewness': [], 'kurtosis': [],
                'entropy': [], 's_nrg': []} #features in the domain of frequency

for column in features_df:
    extracted_features['mean'].append(features_df[column].mean())
    extracted_features['median'].append(features_df[column].median())
    extracted_features['variance'].append(features_df.var()[column])
    extracted_features['AADP'].append(0) #FIX
    extracted_features['range'].append(features_df[column].max() - features_df[column].min())
    extracted_features['mode'].append(features_df[column].mode().iat[0])

#################
    extracted_features['covariance'].append(0) #covariance of column with 
     #check thesis for which columns 
    extracted_features['mad'].append(0) #FIX
######################
    extracted_features['iqr'].append( features_df[column].quantile(0.75) - features_df[column].quantile(0.25))

    ##########
    extracted_features['correlation'].append(0) #FIX
    ############
    extracted_features['skewness'].append(features_df[column].skew())
    extracted_features['kurtosis'].append(features_df[column].kurtosis())

    extracted_features['entropy'].append(0)
    extracted_features['s_nrg'].append(0)

labels = ['x_a', 'y_a', 'z_a', 'x_g', 'y_g', 'z_g']
feature_set = pd.DataFrame.from_dict(extracted_features, orient='index', columns=labels)
print(feature_set)

user_id = PARTICIPANT
t_start = start_time
t_end = end_time
f_vec = feature_set

user_profile = [user_id, t_start, t_end, f_vec]

                  x_a       y_a       z_a       x_g       y_g       z_g
mean         0.010654 -0.044117  0.261542 -0.023587  0.062408  0.261542
median      -0.064134  0.015688  0.326910 -0.006192  0.002943  0.326910
variance     0.132197  0.761849  0.463795  0.288607  0.165307  0.463795
AADP         0.000000  0.000000  0.000000  0.000000  0.000000  0.000000
range        1.824250  4.810850  4.196424  3.712074  2.107386  4.196424
mode        -0.862802 -2.430417 -2.625243 -1.406389 -0.495907 -2.625243
covariance   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000
mad          0.000000  0.000000  0.000000  0.000000  0.000000  0.000000
iqr          0.412589  0.727518  0.553754  0.174584  0.209877  0.553754
correlation  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000
skewness     0.620581  0.117822 -1.237798  1.386776  2.287149 -1.237798
kurtosis     0.159654  1.014060  3.153945  6.352682  5.467729  3.153945
entropy      0.000000  0.000000  0.000000  0.000000  0.000000  0

In [None]:
#Euclidean Distance
#distance between two vectors or data points in a coordinate plane.


def minkow_dist(x, y, p=2):
    '''takes two vectors stored as lists to return minkowski distance. '''
    p = 2  #measurement for minkowski distance, euclidean distance when set to 2 
    distance_sum = 0   
    for i in range(0, len(x)):
        distance_sum += (x[i] - y[i])** p

    return distance_sum ** (1/p)


#if distance < threshold, genuine. if distance >= threshold, imposter

#paper also tests cosine distance, correlation distance, manhattan distance and minkowski with p=5. 


#performance evaluation is calculated using EER; need to do this. 