In [1]:
# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)
simplefilter(action='ignore', category=RuntimeWarning)

In [2]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
from scipy.signal import argrelextrema
import matplotlib.pyplot as plt
import glob
import os


In [3]:
# Get the list of all files in PaHaW dataset
files_PaHaW = []
fileDir = 'PaHaW_csv'
for root, dirs, files in os.walk(fileDir, topdown=False):
    for name in dirs:
        files_PaHaW.append(os.path.join(root, name))
        
charts = []

for i in files_PaHaW[2:]:
    charts.append(os.path.join(i, ''.join([j for j in os.listdir(i) if '__1_1.csv' in j])))
 

In [4]:
def getListOfFiles(dirName):
    # create a list of file and sub directories 
    # names in the given directory 
    listOfFile = os.listdir(dirName)
    allFiles = list()
    # Iterate over all the entries
    for entry in listOfFile:
        # Create full path
        fullPath = os.path.join(dirName, entry)
        # If entry is a directory then get the list of files in this directory 
        if os.path.isdir(fullPath):
            allFiles = allFiles + getListOfFiles(fullPath)
        else:
            allFiles.append(fullPath)
                
    return allFiles

In [5]:
# Get the list of all files in directory tree at given path
files_HW = getListOfFiles('hw_dataset')

files_HW.remove('hw_dataset\\SVM_HW.ipynb')
files_HW.remove('hw_dataset\\readme.txt')
# files_HW

In [6]:
# calculate the whole length of the spiral drawing
def calculate_displacement(data):
    
    temp = np.sum((data['X coordinate'] - data['X coordinate'].shift(1))**2 + (data['Y coordinate'] - data['Y coordinate'].shift(1))**2)
    
    return np.sqrt(temp)

In [7]:
# For PaHaW dataset, the last position value is chosen for the same time
def calculate_Vel_X(data):

    new_df = data.groupby(['time stamp']).nth([-1]).reset_index()
    new_df['velocity_x'] = (new_df['X coordinate'] - new_df['X coordinate'].shift(1)) / (new_df['time stamp'] - new_df['time stamp'].shift(1))
    
    new_df = pd.merge(data, new_df[['time stamp', 'velocity_x']], on='time stamp', how='left')
    
    return new_df['velocity_x']

def calculate_Vel_Y(data):

    new_df = data.groupby(['time stamp']).nth([-1]).reset_index()
    new_df['velocity_y'] = (new_df['Y coordinate'] - new_df['Y coordinate'].shift(1)) / (new_df['time stamp'] - new_df['time stamp'].shift(1))
    
    new_df = pd.merge(data, new_df[['time stamp', 'velocity_y']], on='time stamp', how='left')
    
    return new_df['velocity_y']

In [8]:
# Calculate velocity for ParkingsonHW dataset
def cal_Vel_X(data):

    velocity_x = (data['X coordinate'] - data['X coordinate'].shift(1)) / (data['time stamp'] - data['time stamp'].shift(1))
        
    return velocity_x

def cal_Vel_Y(data):

    velocity_y = (data['Y coordinate'] - data['Y coordinate'].shift(1)) / (data['time stamp'] - data['time stamp'].shift(1))
        
    return velocity_y

In [9]:
# Calcuate real time velocity for both datasets
def calculate_Vel(data):

    velocity = np.sqrt(data['vel_x']**2 + data['vel_y']**2)
    
    return velocity

In [10]:
# For PaHaW dataset
def calculate_Acc_X(data):
    
    new_df = data.groupby(['time stamp']).nth([-1]).reset_index()
    new_df['acc_x'] = (data['vel_x'] - data['vel_x'].shift(1)) / (new_df['time stamp'] - new_df['time stamp'].shift(1))
    
    new_df = pd.merge(data, new_df[['time stamp', 'acc_x']], on='time stamp', how='left')
    
    return new_df['acc_x']

def calculate_Acc_Y(data):
    
    new_df = data.groupby(['time stamp']).nth([-1]).reset_index()
    new_df['acc_y'] = (data['vel_y'] - data['vel_y'].shift(1)) / (new_df['time stamp'] - new_df['time stamp'].shift(1))
    
    new_df = pd.merge(data, new_df[['time stamp', 'acc_y']], on='time stamp', how='left')

    return new_df['acc_y']

In [11]:
# For ParkinsonHW dataset
def cal_Acc_X(data):

    acc_x = (data['vel_x'] - data['vel_x'].shift(1)) / (data['time stamp'] - data['time stamp'].shift(1))
        
    return acc_x

def cal_Acc_Y(data):

    acc_y = (data['vel_y'] - data['vel_y'].shift(1)) / (data['time stamp'] - data['time stamp'].shift(1))
        
    return acc_y

In [12]:
def calculate_Acc(data):
    
    new_df = data.groupby(['time stamp']).nth([-1]).reset_index()
    new_df['acc'] = (data['velocity'] - data['velocity'].shift(1)) / (new_df['time stamp'] - new_df['time stamp'].shift(1))
    
    new_df = pd.merge(data, new_df[['time stamp', 'acc']], on='time stamp', how='left')

    return new_df['acc']

def cal_Acc(data):

    acc = (data['velocity'] - data['velocity'].shift(1)) / (data['time stamp'] - data['time stamp'].shift(1))
        
    return acc

In [13]:
# Calculate Normalised Velocity Variance on x_axis velocity, y_axis velocity and velocity
def calculate_NVV_X(x):
    x = x.replace([np.inf, -np.inf], np.nan)
    sum_x = np.nansum(abs((x['vel_x'] - x['vel_x'].shift(1))))
    NVV_x = sum_x / (x.iloc[-1, 2]-x.iloc[0, 2])
    
    return NVV_x
    
def calculate_NVV_Y(x):
    x = x.replace([np.inf, -np.inf], np.nan)
    sum_y = np.nansum(abs((x['vel_y'] - x['vel_y'].shift(1))))
    NVV_y = sum_y / (x.iloc[-1, 2]-x.iloc[0, 2])
    
    return NVV_y

def calculate_NVV(x):
#     x = x.replace([np.inf, -np.inf], np.nan)
    sum_y = np.nansum(abs((x['velocity'] - x['velocity'].shift(1))))
    NVV = sum_y / (x.iloc[-1, 2]-x.iloc[0, 2])
    
    return NVV

In [14]:
#Calculate Normalised Pressure Variance
def calculate_NPV_pre(p):
    sum_p = np.sum(abs(p['pressure'] - p['pressure'].shift(1)))
    NVV_p = sum_p / (p.iloc[-1, 2]-p.iloc[0, 2])
    
    return NVV_p

In [15]:
def calculate_Jerk_x(x):
    
    jerk_x = np.sum(abs(x['acc_x'] - x['acc_x'].shift(1))) / (x.iloc[-1, 2]-x.iloc[0, 2])
    return jerk_x

def calculate_Jerk_y(x):
            
    jerk_y = np.sum(abs(x['acc_y'] - x['acc_y'].shift(1))) / (x.iloc[-1, 2]-x.iloc[0, 2])
    return jerk_y

def calculate_Jerk(x):
            
    jerk = np.sum(abs(x['acc'] - x['acc'].shift(1))) / (x.iloc[-1, 2]-x.iloc[0, 2])
    return jerk

In [16]:
def conventional_energy_x(data):
    
    x_mean = np.mean(data['X coordinate'])
    x_var = np.var(data['X coordinate'], ddof=1)
    nor_x = (data['X coordinate']-x_mean)/x_var
    
    energy_x = np.sum(nor_x ** 2)/len(data['X coordinate'])
    
    return energy_x

def conventional_energy_y(data):
    
    y_mean = np.mean(data['Y coordinate'])
    y_var = np.var(data['Y coordinate'], ddof=1)
    nor_y = (data['Y coordinate']-y_mean)/y_var
    
    energy_y = np.sum(nor_y ** 2)/len(data['Y coordinate'])
    
    return energy_y

In [17]:
# Number of changes in velocity normalized on its duration
def NCV_max_vel(df):
    n=5 
    # number of points to be checked before and after 
    max_vel = df.iloc[argrelextrema(df['velocity'].values, np.greater_equal, order=n)[0]]['velocity']
    
    max_vel = max_vel.replace([np.inf, -np.inf], np.nan)
    NCV_max_vel = np.nanmean(max_vel)/(df.iloc[-1, 2]-df.iloc[0, 2])

    return NCV_max_vel

def NCV_min_vel(df):
    n=5 
    # number of points to be checked before and after 
    min_vel = df.iloc[argrelextrema(df['velocity'].values, np.less_equal, order=n)[0]]['velocity']
    
    min_vel = min_vel.replace([np.inf, -np.inf], np.nan)
    NCV_min_vel = np.nanmean(min_vel)/(df.iloc[-1, 2]-df.iloc[0, 2])

    return NCV_min_vel

In [18]:
# NCV on x-axis velocity
def NCV_max_x(df):
    n=5 
    # number of points to be checked before and after 
    max_vel_x = df.iloc[argrelextrema(df['vel_x'].values, np.greater_equal, order=n)[0]]['vel_x']
    
    max_vel_x = max_vel_x.replace([np.inf, -np.inf], np.nan)
    NCV_max_x = np.nanmean(max_vel_x)/(df.iloc[-1, 2]-df.iloc[0, 2])

    return NCV_max_x

def NCV_min_x(df):
    n=5 
    # number of points to be checked before and after 
    # Find local peaks
    min_vel_x = df.iloc[argrelextrema(df['vel_x'].values, np.less_equal, order=n)[0]]['vel_x']
    
    min_vel_x = min_vel_x.replace([np.inf, -np.inf], np.nan)
    NCV_min_x = np.nanmean(min_vel_x)/(df.iloc[-1, 2]-df.iloc[0, 2])

    return NCV_min_x


In [19]:
# NCV on y-axis velocity
def NCV_max_y(df):
    n=5 
    # number of points to be checked before and after 
    # Find local peaks
    max_vel_y = df.iloc[argrelextrema(df['vel_y'].values, np.greater_equal, order=n)[0]]['vel_y']
    
    max_vel_y = max_vel_y.replace([np.inf, -np.inf], np.nan)
    NCV_max_y = np.nanmean(max_vel_y)/(df.iloc[-1, 2]-df.iloc[0, 2])

    return NCV_max_y

def NCV_min_y(df):
    n=5 
    # number of points to be checked before and after 
    # Find local peaks
    min_vel_y = df.iloc[argrelextrema(df['vel_y'].values, np.less_equal, order=n)[0]]['vel_y']
    
    min_vel_y = min_vel_y.replace([np.inf, -np.inf], np.nan)
    NCV_min_y = np.nanmean(min_vel_y)/(df.iloc[-1, 2]-df.iloc[0, 2])

    return NCV_min_y
    

In [20]:
# Number of changes in acceleration normalised on its duration
def NCA_min_x(df):
    n=5
    # number of points to be checked before and after 
    # Find local peaks
    min_acc_x = df.iloc[argrelextrema(df['acc_x'].values, np.less_equal, order=n)[0]]['acc_x']
    
    min_acc_x = min_acc_x.replace([np.inf, -np.inf], np.nan)
    NCA_min_x = np.nanmean(min_acc_x)/(df.iloc[-1, 2]-df.iloc[0, 2])

    return NCA_min_x

def NCA_max_x(df):
    
    n=5
    # number of points to be checked before and after 
    # Find local peaks
    max_acc_x = df.iloc[argrelextrema(df['acc_x'].values, np.greater_equal, order=n)[0]]['acc_x']
    
    max_acc_x = max_acc_x.replace([np.inf, -np.inf], np.nan)
    NCA_max_x = np.nanmean(max_acc_x)/(df.iloc[-1, 2]-df.iloc[0, 2])

    return NCA_max_x
    

In [21]:
def NCA_min_y(df):
    n=5
    # number of points to be checked before and after 
    # Find local peaks
    min_acc_y = df.iloc[argrelextrema(df['acc_y'].values, np.less_equal, order=n)[0]]['acc_y']
    
    min_acc_y = min_acc_y.replace([np.inf, -np.inf], np.nan)
    NCA_min_y = np.nanmean(min_acc_y)/(df.iloc[-1, 2]-df.iloc[0, 2])

    return NCA_min_y

def NCA_max_y(df):
    n=5
    # number of points to be checked before and after 
    # Find local peaks
    max_acc_y = df.iloc[argrelextrema(df['acc_y'].values, np.greater_equal, order=n)[0]]['acc_y']
    
    max_acc_y = max_acc_y.replace([np.inf, -np.inf], np.nan)
    NCA_max_y = np.nanmean(max_acc_y)/(df.iloc[-1, 2]-df.iloc[0, 2])

    return NCA_max_y
    

In [22]:
def NCA_min(df):
    n=5
    # number of points to be checked before and after 
    # Find local peaks
    min_acc = df.iloc[argrelextrema(df['acc'].values, np.less_equal, order=n)[0]]['acc']
    
    min_acc = min_acc.replace([np.inf, -np.inf], np.nan)
    NCA_min = np.nanmean(min_acc)/(df.iloc[-1, 2]-df.iloc[0, 2])

    return NCA_min

def NCA_max(df):  
    n=5
    # number of points to be checked before and after 
    # Find local peaks
    max_acc = df.iloc[argrelextrema(df['acc'].values, np.greater_equal, order=n)[0]]['acc']
    
    max_acc = max_acc.replace([np.inf, -np.inf], np.nan)
    NCA_max = np.nanmean(max_acc)/(df.iloc[-1, 2]-df.iloc[0, 2])

    return NCA_max

In [23]:
# Number of changes in pressure
def NCP_min(df):
    
    n=5 
    # number of points to be checked before and after 
    # Find local peaks
    min_pre = df.iloc[argrelextrema(df['pressure'].values, np.less_equal, order=n)[0]]['pressure']
    
    min_pre = min_pre.replace([np.inf, -np.inf], np.nan)
    NCP_min = np.nanmean(min_pre)/(df.iloc[-1, 2]-df.iloc[0, 2])

    return NCP_min

def NCP_max(df):
    
    n=5 
    # number of points to be checked before and after 
    # Find local peaks
    max_pre = df.iloc[argrelextrema(df['pressure'].values, np.greater_equal, order=n)[0]]['pressure']
    
    max_pre = max_pre.replace([np.inf, -np.inf], np.nan)
    NCP_max = np.nanmean(max_pre)/(df.iloc[-1, 2]-df.iloc[0, 2])

    return NCP_max

In [24]:
from sklearn.neighbors.kde import KernelDensity
from scipy.stats import entropy

def calculate_entropy(data):
    
    coor_arr = np.column_stack((data['X coordinate'], data['Y coordinate']))

    kde = KernelDensity(kernel='gaussian', bandwidth = 0.25).fit(coor_arr)
    logprob = kde.score_samples(coor_arr)
    
    entropy_data = entropy(logprob, base = 2)
   
    return entropy_data

In [25]:
from scipy.stats import iqr

def calculate_iqr(data):
    
    return iqr(data)

In [26]:
# Normalise data and delete nan or inf values

def normalise_data(x):
    
    x = x.replace([np.inf, -np.inf], np.nan)
    x.dropna(axis=0, how='any', inplace=True)
    
    for column in x:
        if column != 'ID' and  column != 'result' and column != 'label':
            x_mean = np.mean(x[column])
            var_x = np.var(x[column], ddof=1)
            x[column] = (x[column]-x_mean)/(var_x)
       
    return x

In [27]:
#label = 2
#test_id = 1, dynamic drawing
final_HW = []

for i in files_HW:
    try:
        table = pd.read_csv(i, sep=';', index_col=False, header=None, names=['X coordinate', 'Y coordinate', 'Z', 'pressure', 'Grip Angle', 'time stamp', 'Test ID'],
                          lineterminator='\n')
        
        table = table[table['Test ID'] == 1]
        table = table.drop('Z', axis=1)
        table = table[['X coordinate', 'Y coordinate', 'time stamp', 'pressure', 'Grip Angle', 'Test ID']]

        table['vel_x'] = cal_Vel_X(table)
        table['vel_y'] = cal_Vel_Y(table)
        table['velocity'] = calculate_Vel(table)
        table['acc_x'] = cal_Acc_X(table)
        table['acc_y'] = cal_Acc_Y(table)
        table['acc'] = cal_Acc(table)
        
        width = np.amax(table['X coordinate'])-np.amin(table['X coordinate'])
        height = np.amax(table['Y coordinate'])-np.amin(table['Y coordinate'])
        nvv_x = calculate_NVV_X(table)
        nvv_y = calculate_NVV_Y(table)
        nvv = calculate_NVV(table)
        npv = calculate_NPV_pre(table)
        jerk_x = calculate_Jerk_x(table)
        jerk_y = calculate_Jerk_y(table)
        jerk = calculate_Jerk(table)
        nca_max_x = NCA_max_x(table)
        nca_min_x = NCA_min_x(table)
        nca_max_y = NCA_max_y(table)
        nca_min_y = NCA_min_y(table)
        nca_max = NCA_max(table)
        nca_min = NCA_min(table)
        ncv_max_x = NCV_max_x(table)
        ncv_max_y = NCV_max_y(table)
        ncv_min_x = NCV_min_x(table)
        ncv_min_y = NCV_min_y(table)
        ncv_max = NCV_max_vel(table)
        ncv_min = NCV_min_vel(table)
        ncv_nca_max_x = ncv_max_x/nca_max_x
        ncv_nca_max_y = ncv_max_y/nca_max_y
        ncv_nca_min_x = ncv_min_x/nca_min_x
        ncv_nca_min_y = ncv_min_y/nca_min_y
        ncv_nca_max = ncv_max/nca_max
        ncv_nca_min = ncv_min/nca_min
        ncp_max = NCP_max(table)
        ncp_min = NCP_min(table)
        std_pre = np.std(table['pressure'])
        std_grip = np.std(table['Grip Angle'])
        mean_grip = np.mean(table['Grip Angle'])
        duration = table.iloc[-1, 2]-table.iloc[0, 2]
        displacement = calculate_displacement(table)
        entropy_S = calculate_entropy(table)
        energy_x = conventional_energy_x(table)
        energy_y = conventional_energy_y(table)
        pressure_iqr = calculate_iqr(table['pressure'])
        grip_iqr = calculate_iqr(table['Grip Angle'])
        mv = displacement/duration
#         sdv = np.std(table['velocity'], ddof=1)
        
        file_name = os.path.realpath(i).split('\\')[-1].split('.')[0]
        result = 0
        if file_name.split('_')[0] == 'P':
            result = 1
            ids = int(file_name.split('_')[1])
        if file_name.split('_')[0] == 'H':
            result = 1
            numb = file_name.split('_')[1]
            ids = int(numb.split('-')[1])
        if file_name.split('_')[0] == 'C':
            result = 0
            ids = int(file_name.split('_')[1])
        label = 2
        
        final_HW.append([ids, nvv_x, nvv_y, nvv, npv, jerk_x, jerk_y, jerk, nca_max_x, nca_max_y, nca_min_x, nca_min_y, nca_max, nca_min, ncv_max_x, ncv_max_y, ncv_min_x, ncv_min_y, ncv_max, ncv_min, ncv_nca_max_x, ncv_nca_max_y, ncv_nca_min_x, ncv_nca_min_y, ncv_nca_max, ncv_nca_min, ncp_max, ncp_min, std_pre, std_grip, mean_grip, duration, entropy_S, pressure_iqr, grip_iqr, mv, width, height, displacement, energy_x, energy_y, label, result])

#         final_HW.append([ids, std_grip, entropy_S, label, result])
#         final_HW = normalise_data(final_HW)
        
    except:
        pass
    
# final_HW


In [28]:
#label = 1
final = []

for i in charts:
    try:
        df = pd.read_csv(i).reset_index()
        if(len(df.columns)==7):
            cols = ['Y coordinate', 'X coordinate', 'time stamp', 'on/off state', 'azimuth', 'altitude', 'pressure']
            df.columns = cols
        
        df['vel_x'] = calculate_Vel_X(df)
        df['vel_y'] = calculate_Vel_Y(df)
        df['velocity'] = calculate_Vel(df)
        df['acc_x'] = calculate_Acc_X(df)
        df['acc_y'] = calculate_Acc_Y(df)
        df['acc'] = calculate_Acc(df)

        width = np.amax(df['X coordinate'])-np.amin(df['X coordinate'])
        height = np.amax(df['Y coordinate'])-np.amin(df['Y coordinate'])
        nvv_x = calculate_NVV_X(df)
        nvv_y = calculate_NVV_Y(df)
        nvv = calculate_NVV(df)
        npv = calculate_NPV_pre(df)
        jerk_x = calculate_Jerk_x(df)
        jerk_y = calculate_Jerk_y(df)
        jerk = calculate_Jerk(df)
        nca_max_x = NCA_max_x(df)
        nca_min_x = NCA_min_x(df)
        nca_max_y = NCA_max_y(df)
        nca_min_y = NCA_min_y(df)
        nca_max = NCA_max(df)
        nca_min = NCA_min(df)
        ncv_max_x = NCV_max_x(df)
        ncv_max_y = NCV_max_y(df)
        ncv_min_x = NCV_min_x(df)
        ncv_min_y = NCV_min_y(df)
        ncv_max = NCV_max_vel(df)
        ncv_min = NCV_min_vel(df)
        ncv_nca_max_x = ncv_max_x/nca_max_x
        ncv_nca_max_y = ncv_max_y/nca_max_y
        ncv_nca_min_x = ncv_min_x/nca_min_x
        ncv_nca_min_y = ncv_min_y/nca_min_y
        ncv_nca_max = ncv_max/nca_max
        ncv_nca_min = ncv_min/nca_min
        ncp_max = NCP_max(df)
        ncp_min = NCP_min(df)
        std_pre = np.std(df['pressure'])
        std_grip = np.std(df['altitude'])
        mean_grip = np.mean(df['altitude'])
        duration = df.iloc[-1, 2]-df.iloc[0, 2]
        displacement = calculate_displacement(df)
        entropy_S = calculate_entropy(df)
        energy_x = conventional_energy_x(df)
        energy_y = conventional_energy_y(df)
        pressure_iqr = calculate_iqr(df['pressure'])
        grip_iqr = calculate_iqr(df['altitude'])
        mv = displacement/duration
#         sdv = np.std(table['velocity'], ddof=1)
        
        ids = os.path.dirname(i).split('\\')[-1]
        label = 1
        final.append([ids, nvv_x, nvv_y, nvv, npv, jerk_x, jerk_y, jerk, nca_max_x, nca_max_y, nca_min_x, nca_min_y, nca_max, nca_min, ncv_max_x, ncv_max_y, ncv_min_x, ncv_min_y, ncv_max, ncv_min, ncv_nca_max_x, ncv_nca_max_y, ncv_nca_min_x, ncv_nca_min_y, ncv_nca_max, ncv_nca_min, ncp_max, ncp_min, std_pre, std_grip, mean_grip, duration, entropy_S, pressure_iqr, grip_iqr, mv, width, height, displacement, energy_x, energy_y, label])

#         final.append([ids, std_grip, entropy_S, label]) 
        final.replace('nan', np.nan, inplace=True)
        final.isnull().any()
        final.fillna(0, inplace=True)
        
        
    except:
        pass


# final
### 01-05, 61, 80, 89 missing


In [29]:
#PaHaW: combine results in the dataframe
final = pd.DataFrame(final)
final.columns = ['ID', "nvv_x", "nvv_y", "nvv", "npv", "jerk_x", "jerk_y", "jerk", "nca_max_x", "nca_max_y", "nca_min_x", "nca_min_y", "nca_max", "nca_min", "ncv_max_x", "ncv_max_y", "ncv_min_x", "ncv_min_y", "ncv_max", "ncv_min", "ncv_nca_max_x", "ncv_nca_max_y", "ncv_nca_min_x", "ncv_nca_min_y", "ncv_nca_max", "ncv_nca_min", "ncp_max", "ncp_min", "std_pre", 'std_grip', 'mean_grip', "duration", 'entropy_S', 'pressure_iqr', 'grip_iqr', 'mv', 'width', 'height', 'displacement', 'energy_x', 'energy_y', 'label']
# final.columns = ['ID', 'std_grip', 'entropy_S', 'label']
final['ID'] = final['ID'].apply(lambda x: x.split('000')[1]).astype('int32')

df = pd.read_excel('PaHaW_csv/corpus_PaHaW.xlsx')
df['result'] = np.int32(df['PD status']=='ON')
final_data = pd.merge(final, df[['result', 'ID']], on='ID', how='left')
### 11, 12, 21, 35, 37, 38, 42, 45-47, 50, 56, 58-59, 63-65, 68, 79, 81, 86, 88, 93 missing
# final_data = normalise_data(final_data)

In [36]:
final_data = normalise_data(final_data)

In [30]:
final_HW = pd.DataFrame(final_HW)
final_HW.columns = ['ID', "nvv_x", "nvv_y", "nvv", "npv", "jerk_x", "jerk_y", "jerk", "nca_max_x", "nca_max_y", "nca_min_x", "nca_min_y", "nca_max", "nca_min", "ncv_max_x", "ncv_max_y", "ncv_min_x", "ncv_min_y", "ncv_max", "ncv_min", "ncv_nca_max_x", "ncv_nca_max_y", "ncv_nca_min_x", "ncv_nca_min_y", "ncv_nca_max", "ncv_nca_min", "ncp_max", "ncp_min", "std_pre", 'std_grip', 'mean_grip', "duration", 'entropy_S', 'pressure_iqr', 'grip_iqr', 'mv', 'width', 'height', 'displacement', 'energy_x', 'energy_y', 'label', 'result']
# final_HW.columns = ['ID','std_grip', 'entropy_S', 'label', 'result']


In [35]:
final_HW = normalise_data(final_HW)

In [31]:
final_data = final_data.append(final_HW)
final_data = normalise_data(final_data)

### 67 PaHaW, 76 ParkinsonHW
### 51 health controls (15 in parkinson_HW, 36 in PaHaW)
### 92 PD patients (61 in parkinson_HW, 31 in PaHaW)

In [36]:
X = np.array(final_HW[["nvv_x", "nvv_y", "nvv", "npv", "jerk_x", "jerk_y", "jerk", "nca_max_x", "nca_max_y", "nca_min_x", "nca_min_y", "nca_max", "nca_min", "ncv_max_x", "ncv_max_y", "ncv_min_x", "ncv_min_y", "ncv_max", "ncv_min", "ncv_nca_max_x", "ncv_nca_max_y", "ncv_nca_min_x", "ncv_nca_min_y", "ncv_nca_max", "ncv_nca_min", "ncp_max", "ncp_min", "std_pre", 'std_grip', 'mean_grip', "duration", 'entropy_S', 'pressure_iqr', 'grip_iqr', 'mv', 'width', 'height', 'displacement', 'energy_x', 'energy_y', 'label']])
# X = np.array(final_data[['std_grip', 'entropy_S']])
y = np.array(final_HW['result'])

In [44]:
##Decision Tree 10-FOLD
from sklearn import tree
from sklearn.model_selection import KFold
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

kf = KFold(n_splits=10, shuffle=True)

acc = []
pre = []
rec = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    clf = tree.DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    
    acc.append(accuracy_score(y_test, pred))
    pre.append(precision_score(y_test, pred))
    rec.append(recall_score(y_test, pred))

print(np.mean(acc))
print(np.mean(pre))
print(np.mean(rec))

0.40476190476190477
0.37
0.4666666666666666


In [70]:
##Random Forest 10-FOLD
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

kf = KFold(n_splits=10, shuffle=True)

acc = []
pre = []

for train_index, test_index in kf.split(X):
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    
    acc.append(accuracy_score(y_test, pred))
    pre.append(precision_score(y_test, pred))
    rec.append(recall_score(y_test, pred))

print(np.mean(acc))
print(np.mean(pre))
print(np.mean(rec))

0.6019047619047619
0.642006327006327
0.8035714285714285


In [71]:
##Naive Bayes 10-FOLD
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

kf = KFold(n_splits=10, shuffle=True)

acc = []

for train_index, test_index in kf.split(X):
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    clf = GaussianNB()
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    
    acc.append(accuracy_score(y_test, pred))
    pre.append(precision_score(y_test, pred))
    rec.append(recall_score(y_test, pred))

print(np.mean(acc))
print(np.mean(pre))
print(np.mean(rec))

0.5585714285714286
0.6883642746142746
0.6958573833573833


In [55]:
##LR 10-FOLD
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

kf = KFold(n_splits=10, shuffle=True)

logisticRegr = LogisticRegression()

acc = []

for train_index, test_index in kf.split(X):
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    logisticRegr.fit(X_train, y_train)
    pred = logisticRegr.predict(X_test)
    
    acc.append(accuracy_score(y_test, pred))
    pre.append(precision_score(y_test, pred))
    rec.append(recall_score(y_test, pred))

print(np.mean(acc))
print(np.mean(pre))
print(np.mean(rec))

0.6100000000000001
0.6682738095238094
0.7930429292929293


In [None]:
##SVM 10-FOLD
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn import metrics
import matplotlib
from matplotlib import pyplot as plt
import scipy
from sklearn import svm

#y = final_data['result']
#X = final_data.drop(['ID', 'result'], axis=1)

kf = KFold(n_splits=10, shuffle=True)
#print(kf)

acc = []


predict=[]
test=[]


for train_index, test_index in kf.split(X):
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    

    clf = SVC(kernel = 'poly',  gamma=0.7, C=1.0)
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    
    acc.append(accuracy_score(y_test, pred))
    print('True:', y_test)
    print('False:', pred)
    
    predict.append(pred)
    test.append(X_test)
    
print(np.mean(acc))
print(np.mean(pre))
print(np.mean(rec))


print(metrics.confusion_matrix(test, predict))


In [None]:
h = .02  # step size in the mesh
# create a mesh to plot in
x_min, x_max = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1
y_min, y_max = X_train[:, 1].min() - 1, X_train[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))


# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, m_max]x[y_min, y_max].
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.contour(xx, yy, Z, cmap=plt.cm.Paired)

In [None]:
h = .02  # step size in the mesh
# create a mesh to plot in
x_min, x_max = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1
y_min, y_max = X_train[:, 1].min() - 1, X_train[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))


# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, m_max]x[y_min, y_max].
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.contour(xx, yy, Z, cmap=plt.cm.Paired)

In [37]:
pd.DataFrame(X).to_csv('HW_X_nor.csv', index=False)

In [38]:
pd.DataFrame(y).to_csv('HW_y_nor.csv', index=False)