In [104]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import sklearn.metrics as metrics
from sklearn import svm

In [105]:
# Folder that read_file scans for the input .csv datasets.
data_path = 'Data_new/data/'

In [106]:
def read_file(data_path):
    """Collect the CSV files that live directly inside ``data_path``.

    Parameters
    ----------
    data_path : str
        Directory to scan. A trailing path separator is optional.

    Returns
    -------
    tuple of (list of str, int)
        The paths of all entries whose name ends with ".csv", ordered by
        file name, followed by how many were found.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` when the directory cannot be read.
    """
    # os.listdir returns entries in arbitrary order; sorting makes positional
    # lookups into the result reproducible from one machine to the next.
    csv_names = sorted(name for name in os.listdir(data_path) if name.endswith(".csv"))
    # os.path.join only inserts a separator when data_path does not end in one,
    # so both 'dir/' and 'dir' produce valid paths.
    file_to_work = [os.path.join(data_path, name) for name in csv_names]
    return file_to_work, len(file_to_work)

In [107]:
# Scan the data folder once: file_list holds the CSV paths, file_number their count.
file_list, file_number = read_file(data_path)

In [108]:
# Report how many CSV files read_file found.
print('All datasets:', file_number)

All datasets: 15


In [109]:
# Display the discovered dataset paths.
file_list

['Data_new/data/1.csv',
 'Data_new/data/10.csv',
 'Data_new/data/11.csv',
 'Data_new/data/12.csv',
 'Data_new/data/13.csv',
 'Data_new/data/14.csv',
 'Data_new/data/15.csv',
 'Data_new/data/2.csv',
 'Data_new/data/3.csv',
 'Data_new/data/4.csv',
 'Data_new/data/5.csv',
 'Data_new/data/6.csv',
 'Data_new/data/7.csv',
 'Data_new/data/8.csv',
 'Data_new/data/9.csv']

In [110]:
def get_data_from_file(file):
    """Load one dataset CSV and rename its columns to the notebook's standard names.

    ``pd.read_csv`` is called with its defaults, so the first line of the file
    is consumed as a header; those header names are then overwritten.

    Parameters
    ----------
    file : str or file-like
        Anything ``pd.read_csv`` accepts.

    Returns
    -------
    pandas.DataFrame
        Frame with columns ``Seq_number, X, Y, Z, Mark`` (in that order).

    Raises
    ------
    ValueError
        Raised by pandas when the file does not have exactly five columns.
    """
    frame = pd.read_csv(file)
    frame.columns = ['Seq_number', 'X', 'Y', 'Z', 'Mark']
    return frame

In [111]:
def check_data(data):
    """Print, for every column of ``data``, whether it holds missing values.

    One line is printed per column: ``<column> contains NaNs`` when at least
    one value is missing, otherwise ``No missing values in column <column>``.
    A blank separator is printed after the last column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    None
    """
    for c in data.columns:
        # NaN never compares equal to anything, itself included, so an
        # equality test against np.nan can never find it; isnull() can.
        # Deciding per column also keeps one bad column from silencing the
        # report for the columns that follow it.
        if data[c].isnull().any():
            print(c, 'contains NaNs')
        else:
            print('No missing values in column', c)
    print('\n')

In [112]:
def data_clean(data):
    """Return the rows of ``data`` whose ``Mark`` value is not 0.

    The input frame is left untouched; surviving rows keep their original
    index labels.
    """
    keep_row = data.Mark.ne(0)
    return data[keep_row]

In [113]:
def preapre_test_marks(y_test):
    """Overwrite every label in ``y_test`` with 1 and return the same Series.

    Parameters
    ----------
    y_test : pandas.Series
        Labels to relabel. The Series is modified in place; its index and
        length are unchanged.

    Returns
    -------
    pandas.Series
        The very object that was passed in, now holding 1 everywhere.
    """
    # One vectorised positional assignment replaces the per-element .iat loop.
    y_test.iloc[:] = 1
    return y_test

In [114]:
def data_prepare(data):
    """Split a dataset frame into its feature columns and its ``Mark`` labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the ``Seq_number`` and ``Mark`` columns.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series)
        ``data`` without ``Seq_number`` and ``Mark``, and the ``Mark`` column.
    """
    non_feature_columns = ['Seq_number', 'Mark']
    features = data.drop(non_feature_columns, axis=1)
    labels = data['Mark']
    return features, labels

In [115]:
def split_data(data):
    """Split a cleaned dataset into train/test features and all-ones labels.

    The split is 70/30, stratified on ``Mark`` and seeded with
    ``random_state=42``. After splitting, the labels of both parts are
    replaced by 1.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``Seq_number``, ``Mark`` and the feature columns.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``; each ``y_*`` has the same
        length as its matching ``X_*`` and contains only 1.
    """
    X, y = data_prepare(data)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y)
    # Relabel each split from its OWN labels. Deriving y_train from y_test
    # would return training labels with the test set's length.
    # The copies keep the in-place relabelling away from the caller's frame.
    y_train = preapre_test_marks(y_train.copy())
    y_test = preapre_test_marks(y_test.copy())
    return X_train, X_test, y_train, y_test
    

In [116]:
def model_build(model, X_train):
    """Fit ``model`` on ``X_train`` (no labels are passed) and return it.

    The object handed back is the ``model`` argument itself; whatever
    ``fit`` returns is ignored.
    """
    estimator = model
    estimator.fit(X_train)
    return estimator

In [117]:
def model_evaluation(model, X_test, y_test, algorithm_name):
    """Predict on ``X_test`` and print accuracy and F1 against ``y_test``.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    X_test : features to predict on
    y_test : reference labels, compared with the predictions
    algorithm_name : str
        Prefix printed in front of each metric line.

    Returns
    -------
    None
        The scores are only printed.
    """
    predicted = model.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, predicted)
    print(algorithm_name, 'Accuracy:', accuracy)
    # f1_score runs with its defaults here, so sklearn's default positive
    # label applies to the score that is printed.
    f1 = metrics.f1_score(y_test, predicted)
    print(algorithm_name, 'F1 score:', f1, '\n')

In [118]:
# Passed as the `nu` argument wherever this notebook constructs a OneClassSVM.
outliers_fraction = 0.15

In [119]:
# (display name, unfitted estimator) pairs; run_model fits and scores each entry.
anomaly_algorithms = [('One-Class SVM', svm.OneClassSVM(nu=outliers_fraction, kernel='rbf', gamma=0.00001))]

For a one-class model, `predict` returns +1 for inliers and -1 for outliers.

## Algorithm comparison on a single data set: file_list[1] (Data_new/data/10.csv)

In [120]:
# Dataset examined in this section.
file = 'Data_new/data/10.csv'
data = get_data_from_file(file)
# data_clean keeps only the rows whose Mark is not 0.
data_cleaned = data_clean(data)

In [121]:
# Row count, dtypes and non-null counts of the cleaned frame.
data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 126799 entries, 0 to 126798
Data columns (total 5 columns):
Seq_number    126799 non-null float64
X             126799 non-null int64
Y             126799 non-null int64
Z             126799 non-null int64
Mark          126799 non-null int64
dtypes: float64(1), int64(4)
memory usage: 5.8 MB


In [122]:
# Number of rows carrying each Mark value.
data_cleaned['Mark'].value_counts()

1    44049
3    23596
7    22231
4    22149
6     7449
5     3890
2     3435
Name: Mark, dtype: int64

In [123]:
# NOTE(review): repeats the earlier data_cleaned.info() call; the only cell in
# between (value_counts) does not modify data_cleaned.
data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 126799 entries, 0 to 126798
Data columns (total 5 columns):
Seq_number    126799 non-null float64
X             126799 non-null int64
Y             126799 non-null int64
Z             126799 non-null int64
Mark          126799 non-null int64
dtypes: float64(1), int64(4)
memory usage: 5.8 MB


In [124]:
# 70/30 stratified split of the cleaned frame (see split_data for the settings).
X_train, X_test, y_train, y_test = split_data(data_cleaned)

In [125]:
# Number of rows in the training split.
len(X_train)

88759

In [126]:
# The training rows are the samples the one-class model learns as "normal",
# so they are labelled +1, the value OneClassSVM.predict uses for inliers.
# (The previous np.ones(...) followed by fill(-1) discarded the ones again.)
y_train = np.ones(len(X_train), dtype=int)
len(y_train)

88759

In [127]:
# Number of rows in the held-out test split.
len(X_test)

38040

In [128]:
# Should equal len(X_test): one label per test row.
len(y_test)

38040

In [132]:
def run_model(file, anomaly_algorithms):
    """Load one dataset, fit every algorithm on its train split and print test scores.

    Steps: read the CSV, print the missing-value report, drop the rows
    ``data_clean`` filters out, split 70/30, then fit and evaluate each
    estimator in turn.

    Parameters
    ----------
    file : str
        Path of the CSV dataset to use.
    anomaly_algorithms : iterable of (str, estimator)
        Display name and unfitted estimator for each algorithm. The
        estimators are fitted in place.

    Returns
    -------
    None
        Results are printed by ``check_data`` and ``model_evaluation``.
    """
    data = get_data_from_file(file)
    check_data(data)
    data_cleaned = data_clean(data)
    # Training labels are not needed: model_build fits on the features alone.
    X_train, X_test, _, y_test = split_data(data_cleaned)
    for entry in anomaly_algorithms:
        algorithm_name, model = entry[0], entry[1]
        fitted = model_build(model, X_train)
        model_evaluation(fitted, X_test, y_test, algorithm_name)

In [133]:
# Use the explicit path of the dataset explored above instead of a position in
# file_list, whose order comes from the directory listing.
run_model(file, anomaly_algorithms)

No missing values in column Seq_number
No missing values in column X
No missing values in column Y
No missing values in column Z
No missing values in column Mark


One-Class SVM Accuracy: 0.851393270242
One-Class SVM F1 score: 0.91973248896 



In [134]:
#fit model at full dataset
data = get_data_from_file(file)
data_cleaned = data_clean(data)
X = data.drop(['Seq_number', 'Mark'], axis=1)
model = svm.OneClassSVM(nu=outliers_fraction, kernel='rbf', gamma=0.00001)
full_data_model = model_build(model, X)

## Model evaluation on all remaining data sets

In [135]:
# Validate on every discovered dataset except the one the model was fitted on,
# so the list stays in step with the contents of the data folder.
training_file = os.path.join(data_path, '10.csv')
file_list_val = [path for path in file_list if path != training_file]

In [138]:
for val_file in file_list_val:
    # Loop-local names keep this cell from overwriting `file`, `data` and `X`,
    # which earlier cells read when they are re-run.
    val_data = data_clean(get_data_from_file(val_file))
    X_val = val_data.drop(['Seq_number', 'Mark'], axis=1)
    # Every sample is labelled -1, the value OneClassSVM.predict uses for outliers.
    y_val = np.full(len(X_val), -1, dtype=int)
    # basename works at any directory depth, unlike indexing a fixed path segment.
    print(os.path.basename(val_file))
    # NOTE(review): model_evaluation computes F1 with sklearn's default positive
    # label (1), which never occurs in y_val, so the F1 printed here is 0.0.
    model_evaluation(full_data_model, X_val, y_val, 'One-Class SVM')

1.csv
One-Class SVM Accuracy: 0.999993846116
One-Class SVM F1 score: 0.0 

11.csv


  'recall', 'true', average, warn_for)


One-Class SVM Accuracy: 0.993106683645
One-Class SVM F1 score: 0.0 

12.csv
One-Class SVM Accuracy: 0.999982563208
One-Class SVM F1 score: 0.0 

13.csv
One-Class SVM Accuracy: 0.994663631391
One-Class SVM F1 score: 0.0 

14.csv
One-Class SVM Accuracy: 1.0
One-Class SVM F1 score: 0.0 

15.csv
One-Class SVM Accuracy: 0.386235615803
One-Class SVM F1 score: 0.0 

2.csv
One-Class SVM Accuracy: 0.997371669208
One-Class SVM F1 score: 0.0 

3.csv
One-Class SVM Accuracy: 0.998749254927
One-Class SVM F1 score: 0.0 

4.csv
One-Class SVM Accuracy: 0.99033543646
One-Class SVM F1 score: 0.0 

5.csv
One-Class SVM Accuracy: 0.999937499609
One-Class SVM F1 score: 0.0 

6.csv
One-Class SVM Accuracy: 0.996189636665
One-Class SVM F1 score: 0.0 

7.csv
One-Class SVM Accuracy: 0.999877299861
One-Class SVM F1 score: 0.0 

8.csv
One-Class SVM Accuracy: 0.99999274279
One-Class SVM F1 score: 0.0 

9.csv
One-Class SVM Accuracy: 0.149927628726
One-Class SVM F1 score: 0.0 



Reference Papers


https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4029702/
    
https://www.researchgate.net/figure/In-the-dynamic-windowing-strategy-the-feature-extraction-uses-a-sliding-window-of-width_fig2_279137283

https://dsp.stackexchange.com/questions/36513/applying-a-window-function-to-a-speech-signal

https://pdfs.semanticscholar.org/8c18/b86e75f068b59a5ac9c334c1bc59fff5d328.pdf

https://cs.uwaterloo.ca/~s255khan/files/KER_sskmm_final.pdf

http://scikit-learn.org/stable/modules/generated/sklearn.svm.OneClassSVM.html

http://scikit-learn.org/stable/modules/outlier_detection.html

https://www.researchgate.net/publication/221258784_Human_Activity_Recognition_from_Accelerometer_Data_Using_at_Wearable_Device?ev=prf_pub

https://www.researchgate.net/publication/227192676_Personalization_and_user_verification_in_wearable_systems_using_biometric_walking_patterns?ev=prf_pub