In [1]:
#############
## IMPORTS ##
#############

import pandas as pd
import os.path
import numpy as np
import matplotlib
import scipy
from scipy.fft import fft

In [2]:
###################
## PREPROCESSING ##
###################
## Set the participant we're analyzing

PARTICIPANT = "user1"
DATA_1 = PARTICIPANT+'_1.csv'
DATA_2 = PARTICIPANT+'_2.csv'
DATA_3 = PARTICIPANT+'_3.csv'

#loading data in
file_paths = ['user_data\\'+PARTICIPANT+'\\'+DATA_1, 'user_data\\'+PARTICIPANT+'\\'+DATA_2, 'user_data\\'+PARTICIPANT+'\\'+DATA_1 ]
f = file_paths[0]
gen = pd.read_csv(f, names=['Sensor', 'Time', 'X', 'Y', 'Z'], on_bad_lines='skip') #general data

#accel is 10, gyro is 4
GYRO_ID = 4
ACCEL_ID = 10

#Separate data by sensor id 
gyro_df = gen.loc[gen.Sensor == GYRO_ID]
accel_df = gen.loc[gen.Sensor == ACCEL_ID]

#sort data by time 
gyro_df.sort_values('Time', inplace=True)
accel_df.sort_values('Time', inplace=True)

#data is now separated and ordered by time, ready for M-point filer
print(gyro_df.head())
print(accel_df.head())


    Sensor          Time        X         Y         Z
11       4  7.210914e+11 -0.11638  0.049002  0.068976
8        4  7.210947e+11 -0.11638  0.049002  0.068976
4        4  7.210969e+11 -0.11638  0.049002  0.068976
13       4  7.211012e+11 -0.11638  0.049002  0.068976
9        4  7.211076e+11 -0.11638  0.049002  0.068976
    Sensor          Time         X         Y         Z
12      10  7.210794e+11 -1.297232 -1.685075  0.404865
10      10  7.210914e+11 -1.297232 -1.685075  0.404865
2       10  7.210969e+11 -1.297232 -1.685075  0.404865
7       10  7.211076e+11 -1.297232 -1.685075  0.404865
1       10  7.211231e+11 -1.297232 -1.685075  0.404865


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gyro_df.sort_values('Time', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  accel_df.sort_values('Time', inplace=True)


In [15]:
 ##################################
# APPLYING MOVING AVERAGE FILTER #
##################################

#Applying Moving Average Filter 


M = 9 #M-point filter

#starts averaging at Mth point rather than 0th as opposed to vice versa in original paper 
gyro_df.loc['MA_Time'] = gyro_df['Time'].rolling(M).mean()
gyro_df.loc['MA_X'] = gyro_df['X'].rolling(M).mean()
gyro_df.loc['MA_Y'] = gyro_df['Y'].rolling(M).mean()
gyro_df.loc['MA_Z'] = gyro_df['Z'].rolling(M).mean()

accel_df.loc['MA_Time'] = accel_df['Time'].rolling(M).mean()
accel_df.loc['MA_X'] = accel_df['X'].rolling(M).mean()
accel_df.loc['MA_Y'] = accel_df['Y'].rolling(M).mean()
accel_df.loc['MA_Z'] = accel_df['Z'].rolling(M).mean()

#Creating axis vectors 

N = 1500 #number of samples for a profile feature 
M_IDX = N + M -1#index of the Nth sample (accounts for NaNs of first M rows)


#X axis 
x_a = accel_df.loc[:, "MA_X"]
x_a = list(x_a[M-1:M_IDX]) #this is done to avoid a keyerror in the loc function 

x_g = gyro_df.loc[:, "MA_X"]
x_g = list(x_g[M-1:M_IDX])  

#y axis
y_a = accel_df.loc[:, "MA_Y"]
y_a = list(y_a[M-1:M_IDX])

y_g = gyro_df.loc[:, "MA_Y"]
y_g = list(y_g[M-1:M_IDX]) 

#z axis
z_a = accel_df.loc[:, "MA_Z"]
z_a = list(z_a[M-1:M_IDX])

z_g = gyro_df.loc[:, "MA_Z"]
z_g = list(z_g[M-1:M_IDX])

features = {'x_a': x_a, 'y_a': y_a, 'z_a': z_a, 'x_g': x_g, 'y_g': y_g, 'z_g': z_g}
features_df = pd.DataFrame.from_dict(features) #mostly for presentation purposes, will come in handy for feature extraction
print(features_df)


           x_a       y_a       z_a       x_g       y_g       z_g
0    -0.905323 -1.490267  0.412848 -0.116380  0.049002  0.068976
1    -0.814532 -1.409798  0.404227 -0.100460  0.045836  0.059477
2    -0.723741 -1.329329  0.395607 -0.086671  0.039533  0.053352
3    -0.619345 -1.252247  0.388406 -0.069538  0.035982  0.048558
4    -0.515988 -1.167866  0.373145 -0.047493  0.034858  0.041812
...        ...       ...       ...       ...       ...       ...
1495 -0.021500  0.059583 -0.017527 -0.044031  0.225451 -0.099218
1496  0.007907  0.018100 -0.135498  0.020388  0.249479 -0.097472
1497  0.026544  0.100127 -0.236050  0.065987  0.264570 -0.099158
1498  0.030339  0.089896 -0.281496  0.073414  0.273625 -0.098567
1499  0.015208  0.005428 -0.261732  0.078090  0.276051 -0.113036

[1500 rows x 6 columns]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, v, pi)


In [8]:
###########################################
## FEATURE EXTRACTION and USER PROFILING ##
###########################################


#Mean, Median, Variance, Average Absolute
#Difference of Peaks, Range, Mode, Covariance,
#Mewan Absolute Deviation (MAD), Inter-
#quartile Range (IQR), correlation between axes
#(xy, yz, xz), Skewness, Kurtosis


def spectral_energy(X):
    ''' X is a list of FFT data '''
    sum = 0
    for item in X:
        sum += item #is it supposed to be the square of each item or the square of the total sum?
    return sum**2 / len(X)

def shannon_entropy(label):
    vc = pd.Series(label).value_counts(normalize=True, sort=False)
    base = 2
    return -(vc * np.log(vc)/np.log(base)).sum()

extracted_features = { 'mean': [], 'median': [], 'variance': [], 'AADP': [], 'range': [], 'mode':[], 
                'covariance': [], 'mad': [], 'iqr': [], 'correlation': [], 'skewness': [], 'kurtosis': [],
                'entropy': [], 's_nrg': []} #features in the domain of frequency

for column in features_df:
    extracted_features['mean'].append(features_df[column].mean())
    extracted_features['median'].append(features_df[column].median())
    extracted_features['variance'].append(features_df.var()[column])
    extracted_features['range'].append(features_df[column].max() - features_df[column].min())
    extracted_features['mode'].append(features_df[column].mode().iat[0])
    extracted_features['iqr'].append( features_df[column].quantile(0.75) - features_df[column].quantile(0.25))
    extracted_features['skewness'].append(features_df[column].skew())
    extracted_features['kurtosis'].append(features_df[column].kurtosis())
    extracted_features['mad'].append(features_df[column].mad()) 
    #calculate the FFT for next calculations
    col_fft = fft(features_df[column].to_numpy())
    extracted_features['entropy'].append(shannon_entropy(column))  
    extracted_features['s_nrg'].append(spectral_energy(col_fft)) #what is up with the +0.00j??? 


    #need fixing :
    extracted_features['AADP'].append(0) ######################FIX
    

labels = ['x_a', 'y_a', 'z_a', 'x_g', 'y_g', 'z_g']
extracted_features['covariance'].append(features_df['x_a'].cov(features_df['y_a']))
extracted_features['covariance'].append(features_df['x_a'].cov(features_df['z_a']))
extracted_features['covariance'].append(features_df['y_a'].cov(features_df['z_a']))
extracted_features['covariance'].append(features_df['x_g'].cov(features_df['y_g']))
extracted_features['covariance'].append(features_df['x_g'].cov(features_df['z_g']))
extracted_features['covariance'].append(features_df['y_g'].cov(features_df['z_g']))

extracted_features['correlation'].append(features_df['x_a'].corr(features_df['y_a']))
extracted_features['correlation'].append(features_df['x_a'].corr(features_df['z_a']))
extracted_features['correlation'].append(features_df['y_a'].corr(features_df['z_a']))
extracted_features['correlation'].append(features_df['x_g'].corr(features_df['y_g']))
extracted_features['correlation'].append(features_df['x_g'].corr(features_df['z_g']))
extracted_features['correlation'].append(features_df['y_g'].corr(features_df['z_g']))

feature_set = pd.DataFrame.from_dict(extracted_features, orient='index', columns=labels)


user_id = PARTICIPANT
t_start = gyro_df.Time.loc[gyro_df.Time.first_valid_index()]
t_end = gyro_df.Time.iloc[-1]
f_vec = feature_set.unstack().to_frame().sort_index(level=1).T
f_vec.columns = f_vec.columns.map('_'.join)

user_profile = [user_id, t_start, t_end, f_vec]
print(f_vec)

   x_a_AADP  x_g_AADP  y_a_AADP  y_g_AADP  z_a_AADP  z_g_AADP  \
0  0.0+0.0j  0.0+0.0j  0.0+0.0j  0.0+0.0j  0.0+0.0j  0.0+0.0j   

      x_a_correlation     x_g_correlation     y_a_correlation  \
0  0.410265+0.000000j -0.138125+0.000000j -0.164253+0.000000j   

      y_g_correlation  ...        y_a_skewness        y_g_skewness  \
0 -0.107646+0.000000j  ...  0.121019+0.000000j  3.290726+0.000000j   

       z_a_skewness      z_g_skewness        x_a_variance        x_g_variance  \
0 -1.10947+0.00000j -1.10947+0.00000j  0.180408+0.000000j  0.115553+0.000000j   

         y_a_variance        y_g_variance        z_a_variance  \
0  0.311966+0.000000j  0.063252+0.000000j  0.231176+0.000000j   

         z_g_variance  
0  0.231176+0.000000j  

[1 rows x 84 columns]


In [42]:
#Euclidean Distance
#distance between two vectors or data points in a coordinate plane.


def minkow_dist(x, y, p=2):
    '''takes two vectors stored as lists to return minkowski distance. '''
    p = 2  #measurement for minkowski distance, euclidean distance when set to 2 
    distance_sum = 0   
    for i in range(0, len(x)):
        distance_sum += (x[i] - y[i])** p

    return distance_sum ** (1/p)

#manhattan_dist = scipy.cityblock()


#if distance < threshold, genuine. if distance >= threshold, imposter

#paper also tests cosine distance, correlation distance, manhattan distance and minkowski with p=5. 


#performance evaluation is calculated using EER; need to do this. 

#FAR = false acceptance rate, rate of incorrectly accepted unauthorized users among all the unauthorized attempts
#FRR = the rate of incorrectly rejected authorized users among all the legitimate authentication attempts 
#EER = intersection point of FAR and FRR (lower than EER, better the authentication system)
