In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from DataTransformation import LowPassFilter, PrincipalComponentAnalysis
from TemporalAbstraction import NumericalAbstraction


# --------------------------------------------------------------
# Load data
# --------------------------------------------------------------
df = pd.read_csv("../Files/cleaned_file_outlier.csv")
predictor_columns = list(df.columns[1:6])


plt.style.use("fivethirtyeight")
plt.rcParams["figure.figsize"] = (20,5)
plt.rcParams["figure.dpi"] = 100
plt.rcParams["lines.linewidth"] = 2


# --------------------------------------------------------------
# Dealing with missing values (imputation)
# --------------------------------------------------------------
print(predictor_columns)

['acc_x', 'acc_y', 'acc_z', 'gyr_x', 'gyr_y']


In [3]:
# --------------------------------------------------------------
# Calculating set duration
# --------------------------------------------------------------
# df[df["set"] == 25]["acc_y"].plot()
import numpy as np

# duration = df[df["set"] == 1].index[-1] - df[df["set"] == 1].index[0]

for s in df["set"].unique():
    stop = df[df["set"] == s].index[-1] 
    start = df[df["set"] == s].index[0]
    
    duration = stop - start
    df.loc[(df["set"] == s), "duration"] = duration

df

Unnamed: 0,epoch (ms),acc_x,acc_y,acc_z,gyr_x,gyr_y,gyr_z,participant,label,category,set,duration
0,2019-01-11 15:08:05.200,0.013500,0.977000,-0.071000,-1.8904,2.4392,0.9388,B,bench,heavy,30,84.0
1,2019-01-11 15:08:05.400,-0.001500,0.970500,-0.079500,-1.6826,-0.8904,2.1708,B,bench,heavy,30,84.0
2,2019-01-11 15:08:05.600,0.001333,0.971667,-0.064333,2.5608,-0.2560,-1.4146,B,bench,heavy,30,84.0
3,2019-01-11 15:08:05.800,-0.024000,0.957000,-0.073500,8.0610,-4.5244,-2.0730,B,bench,heavy,30,84.0
4,2019-01-11 15:08:06.000,-0.028000,0.957667,-0.115000,2.4390,-1.5486,-3.6098,B,bench,heavy,30,84.0
...,...,...,...,...,...,...,...,...,...,...,...,...
9004,2019-01-20 17:33:27.000,-0.048000,-1.041500,-0.076500,1.4146,-5.6218,0.2926,E,row,medium,90,97.0
9005,2019-01-20 17:33:27.200,-0.037000,-1.030333,-0.053333,-2.7684,-0.5854,2.2440,E,row,medium,90,97.0
9006,2019-01-20 17:33:27.400,-0.060000,-1.031000,-0.082000,2.8416,-5.1342,-0.1220,E,row,medium,90,97.0
9007,2019-01-20 17:33:27.600,-0.038667,-1.025667,-0.044667,-0.2318,0.2562,1.1220,E,row,medium,90,97.0


In [4]:
# --------------------------------------------------------------
# Butterworth lowpass filter
# --------------------------------------------------------------
df_lowpass = df.copy()
LowPaas = LowPassFilter()

fs = 1000 / 200
cutoff = 1.3

# df_lowpass = LowPaas.low_pass_filter(df_lowpass, "acc_y" , fs,cutoff,order=5)

subset = df_lowpass[df_lowpass["set"] == 45]
# print(subset["label"][0])

# fig,ax = plt.subplots(nrows=2, sharex=True, figsize=(20, 10))
# ax[0].plot(subset["acc_y"].reset_index(drop=True), label="raw data")
# ax[1].plot(subset["acc_y_lowpass"].reset_index(drop=True), label="butterworth filter")


for col in predictor_columns:
    df_lowpass = LowPaas.low_pass_filter(df_lowpass, col , fs,cutoff,order=5)    
    df_lowpass[col] = df_lowpass[col + "_lowpass"]
    del df_lowpass[col + "_lowpass"]


In [5]:
# --------------------------------------------------------------
# Principal component analysis PCA
# --------------------------------------------------------------
df_pca = df_lowpass.copy()
PCA = PrincipalComponentAnalysis()

pc_values = PCA.determine_pc_explained_variance(df_pca,predictor_columns)

df_pca = PCA.apply_pca(df_pca, predictor_columns, 3)
df_pca

Unnamed: 0,epoch (ms),acc_x,acc_y,acc_z,gyr_x,gyr_y,gyr_z,participant,label,category,set,duration,pca_1,pca_2,pca_3
0,2019-01-11 15:08:05.200,0.013503,0.977007,-0.071001,-1.886218,2.438803,0.9388,B,bench,heavy,30,84.0,-0.310954,-0.065022,0.018133
1,2019-01-11 15:08:05.400,0.008515,0.973137,-0.066481,-0.367396,0.439794,2.1708,B,bench,heavy,30,84.0,-0.310105,-0.064149,0.004579
2,2019-01-11 15:08:05.600,-0.008450,0.966876,-0.071895,4.320608,-2.083978,-1.4146,B,bench,heavy,30,84.0,-0.309846,-0.071799,-0.034890
3,2019-01-11 15:08:05.800,-0.024359,0.962305,-0.084774,4.737252,-3.456339,-2.0730,B,bench,heavy,30,84.0,-0.310403,-0.083174,-0.039008
4,2019-01-11 15:08:06.000,-0.022092,0.959566,-0.096776,0.808615,-1.194575,-3.6098,B,bench,heavy,30,84.0,-0.309629,-0.089140,-0.005712
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9004,2019-01-20 17:33:27.000,-0.043276,-0.988153,-0.062545,2.032956,-4.019557,0.2926,E,row,medium,90,97.0,0.346652,-0.149024,-0.009988
9005,2019-01-20 17:33:27.200,-0.041403,-1.053448,-0.064530,-2.967835,-2.660315,2.2440,E,row,medium,90,97.0,0.368681,-0.151900,0.031684
9006,2019-01-20 17:33:27.400,-0.048048,-1.044737,-0.065951,-0.043109,-2.022091,-0.1220,E,row,medium,90,97.0,0.365233,-0.153618,0.008472
9007,2019-01-20 17:33:27.600,-0.050572,-1.010599,-0.060327,3.033517,-2.427441,1.1220,E,row,medium,90,97.0,0.353752,-0.149695,-0.016798


In [6]:
# --------------------------------------------------------------
# Sum of squares attributes
# --------------------------------------------------------------
df_squared = df_pca.copy()

acc_r = df_squared["acc_x"] **2 + df_squared["acc_y"] **2 + df_squared["acc_z"] **2
gyr_r = df_squared["gyr_x"] **2 + df_squared["gyr_y"] **2 + df_squared["gyr_z"] **2

df_squared["acc_r"] = np.sqrt(acc_r)
df_squared["gyr_r"] = np.sqrt(gyr_r)

df_squared

Unnamed: 0,epoch (ms),acc_x,acc_y,acc_z,gyr_x,gyr_y,gyr_z,participant,label,category,set,duration,pca_1,pca_2,pca_3,acc_r,gyr_r
0,2019-01-11 15:08:05.200,0.013503,0.977007,-0.071001,-1.886218,2.438803,0.9388,B,bench,heavy,30,84.0,-0.310954,-0.065022,0.018133,0.979677,3.222875
1,2019-01-11 15:08:05.400,0.008515,0.973137,-0.066481,-0.367396,0.439794,2.1708,B,bench,heavy,30,84.0,-0.310105,-0.064149,0.004579,0.975442,2.245166
2,2019-01-11 15:08:05.600,-0.008450,0.966876,-0.071895,4.320608,-2.083978,-1.4146,B,bench,heavy,30,84.0,-0.309846,-0.071799,-0.034890,0.969582,5.001171
3,2019-01-11 15:08:05.800,-0.024359,0.962305,-0.084774,4.737252,-3.456339,-2.0730,B,bench,heavy,30,84.0,-0.310403,-0.083174,-0.039008,0.966339,6.219740
4,2019-01-11 15:08:06.000,-0.022092,0.959566,-0.096776,0.808615,-1.194575,-3.6098,B,bench,heavy,30,84.0,-0.309629,-0.089140,-0.005712,0.964687,3.887354
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9004,2019-01-20 17:33:27.000,-0.043276,-0.988153,-0.062545,2.032956,-4.019557,0.2926,E,row,medium,90,97.0,0.346652,-0.149024,-0.009988,0.991076,4.513907
9005,2019-01-20 17:33:27.200,-0.041403,-1.053448,-0.064530,-2.967835,-2.660315,2.2440,E,row,medium,90,97.0,0.368681,-0.151900,0.031684,1.056234,4.573933
9006,2019-01-20 17:33:27.400,-0.048048,-1.044737,-0.065951,-0.043109,-2.022091,-0.1220,E,row,medium,90,97.0,0.365233,-0.153618,0.008472,1.047919,2.026227
9007,2019-01-20 17:33:27.600,-0.050572,-1.010599,-0.060327,3.033517,-2.427441,1.1220,E,row,medium,90,97.0,0.353752,-0.149695,-0.016798,1.013660,4.043956


In [17]:
# --------------------------------------------------------------
# Temporal abstraction
# --------------------------------------------------------------
df_temporal = df_squared.copy()
NumAbs = NumericalAbstraction()

predictor_columns = predictor_columns + ["acc_r","gyr_r"]

ws = int(1000 / 200)

for col in predictor_columns:
    df_temporal = NumAbs.abstract_numerical(df_temporal, [col] , ws, "mean")
    df_temporal = NumAbs.abstract_numerical(df_temporal, [col] , ws, "std")
    
    
df_temporal_list = []

for s in df_temporal["set"].unique():
    subset = df_temporal[df_temporal["set"] == s].copy()
    for col in predictor_columns:
        subset = NumAbs.abstract_numerical(subset, [col], ws, "mean")
        subset = NumAbs.abstract_numerical(subset, [col], ws, "std")
    df_temporal_list.append(subset)
    
df_temporal = pd.concat(df_temporal_list)


In [18]:
from FrequencyAbstraction import FourierTransformation
# --------------------------------------------------------------
# Frequency features
# --------------------------------------------------------------
df_freq = df_temporal.copy().reset_index()
FreqAbs = FourierTransformation()

fs = int(1000 / 200)
ws = int(2800 / 200)

df_freq = FreqAbs.abstract_frequency(df_freq, ["acc_y"], ws, fs)

df_freq_list = []
for s in df_freq["set"].unique():
    print("Applying Fourier transformation to set {s}")
    subset = df_freq[df_freq["set"] == s].reset_index(drop=True).copy()
    subset = FreqAbs.abstract_frequency(subset, predictor_columns, ws, fs)
    df_freq_list.append(subset)
    
df_freq = pd.concat(df_freq_list).set_index("epoch (ms)", drop=True)

Applying Fourier transformation to set {s}
Applying Fourier transformation to set {s}
Applying Fourier transformation to set {s}
Applying Fourier transformation to set {s}
Applying Fourier transformation to set {s}
Applying Fourier transformation to set {s}
Applying Fourier transformation to set {s}
Applying Fourier transformation to set {s}
Applying Fourier transformation to set {s}
Applying Fourier transformation to set {s}
Applying Fourier transformation to set {s}
Applying Fourier transformation to set {s}
Applying Fourier transformation to set {s}
Applying Fourier transformation to set {s}
Applying Fourier transformation to set {s}
Applying Fourier transformation to set {s}
Applying Fourier transformation to set {s}
Applying Fourier transformation to set {s}
Applying Fourier transformation to set {s}
Applying Fourier transformation to set {s}
Applying Fourier transformation to set {s}
Applying Fourier transformation to set {s}
Applying Fourier transformation to set {s}
Applying Fo

In [23]:
# --------------------------------------------------------------
# Dealing with overlapping windows
# --------------------------------------------------------------
df_freq = df_freq.dropna()

df_freq = df_freq.iloc[::2]


In [25]:
# --------------------------------------------------------------
# Clustering
# --------------------------------------------------------------
from sklearn.cluster import KMeans

df_cluster = df_freq.copy()
cluster_columns = ["acc_x","acc_y","acc_z"]
k_values = range(2, 10)
inertias = []

for k in k_values:
    subset = df_cluster[cluster_columns]
    kmeans = KMeans(n_clusters=k, n_init=20, random_state=0)
    cluster_labels = kmeans.fit_predict(subset)
    inertias.append(kmeans.inertia_)


In [27]:
# --------------------------------------------------------------
# Export dataset
# --------------------------------------------------------------

df_cluster.to_csv("../Files/cleaned_feature.csv")

In [26]:
kmeans = KMeans(n_clusters=5, n_init=20, random_state=0)
subset = df_cluster[cluster_columns]
df_cluster["cluster"] = kmeans.fit_predict(subset)

