In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn import preprocessing

In [None]:
# Data Preprocessing
data_trail1 = pd.read_csv("Trail1_extracted_features_acceleration_m1ai1-2.csv")
data_trail2 = pd.read_csv("Trail2_extracted_features_acceleration_m1ai1-1.csv")
data_trail3 = pd.read_csv("Trail3_extracted_features_acceleration_m2ai0-1.csv")

df  = pd.concat([data_trail1,data_trail2,data_trail3], ignore_index=True)

df.drop(columns=["start_time", "axle", "cluster", "tsne_1", "tsne_2",], inplace=True)

df["event"] = df["event"].apply(lambda x : 1 if x == "normal" else 0)

In [None]:
# Data Transformation

#drop event cause I don't want to normalize 0 and 1
x = df.drop(columns="event").values

normalizer = preprocessing.Normalizer().fit(x)
norm_x = normalizer.transform(x)

#create the dataframe again with normalized values
df_norm = pd.DataFrame(norm_x, columns=df.drop(columns="event").columns)

#add the dropped event values
df_norm["event"] = df["event"].values

print(df_norm)

             mean           std       max           min     range  \
0   -7.224506e-10  2.046288e-07  0.000001 -9.382634e-07  0.000002   
1   -1.926502e-09  8.495266e-06  0.000075 -8.686790e-05  0.000162   
2    5.290685e-09  1.008559e-06  0.000004 -4.887409e-06  0.000009   
3    5.217376e-08  1.868642e-05  0.000233 -2.262712e-04  0.000459   
4   -1.961913e-07  1.069893e-05  0.000033 -5.626494e-05  0.000089   
..            ...           ...       ...           ...       ...   
145  3.260127e-08  3.194933e-06  0.000009 -1.356323e-05  0.000022   
146 -5.143785e-09  2.451955e-06  0.000019 -1.354632e-05  0.000032   
147  1.163203e-10  1.008091e-06  0.000003 -4.161629e-06  0.000008   
148 -3.566081e-09  5.450296e-07  0.000002 -2.230301e-06  0.000004   
149  9.639884e-10  3.609248e-07  0.000002 -1.788247e-06  0.000003   

         skewness  kurtosis           rms  crest_factor      variance  \
0   -7.258889e-07  0.000072  2.046301e-07      0.000847  2.762046e-10   
1   -1.280454e-05  0.0093

In [None]:
# Dataset Splitting
from sklearn.model_selection import train_test_split

X = df_norm.drop(columns=["event"])
Y = df_norm["event"]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, train_size=0.8, random_state=42)


1. Feature extraction: In order to indicate events successfully, it is important to first identify key attributes that correlates to these events.
2. Data preprocessing: To get the most realiable model the data needs to be inspected and cleaned. This is done by handling missing values and outliers. Normalizing the data is also common for different things but one is to scale the features to a similar range which gives all the features the same "chance".
3. Selection of Model: What type of method benefits the specific goal in mind.
4. Train: The model algorithm is then ready to be fitted to the training dataset.
5. Eval: What type of performance metrics give the best and clear representation of how well it did the job?