In [10]:

import pandas as pd
import numpy as np

from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


In [11]:
#Функція створення features
def create_features(frame, rows):
  features = []
  #rows = frame.shape[0]

  for id in range(0, 3):
    col = frame.iloc[:,id]
    
    features.append(pd.DataFrame([col.min()] * rows,columns=[f'{col.name}_min']))
    features.append(pd.DataFrame([col.max()] * rows,columns=[f'{col.name}_max']))
    features.append(pd.DataFrame([col.mean()] * rows,columns=[f'{col.name}_mean']))
    features.append(pd.DataFrame([col.quantile(0.75)-col.quantile(0.25)] * rows,columns=[f'{col.name}_interquartile_range']))
    features.append(pd.DataFrame([col.idxmin()] * rows,columns=[f'{col.name}_index_of_minimum_value']))
    features.append(pd.DataFrame([np.mean(np.abs(col - col.mean()))] * rows,columns=[f'{col.name}_mean_of_absolute_deviation']))
    features.append(pd.DataFrame([col.median()] * rows,columns=[f'{col.name}_median']))
    features.append(pd.DataFrame([col.std()] * rows,columns=[f'{col.name}_standard_deviation']))
    features.append(pd.DataFrame([np.sqrt(np.mean(col)**2)] * rows,columns=[f'{col.name}_root_mean_square_error']))

  res = pd.concat(features, axis=1)
  return res

In [12]:
def line_frame(frame):
        columns=[f"{col}_{i}" for i in range(frame.shape[0]) for col in frame.columns]
        return pd.DataFrame(frame.values.reshape(1, -1), columns=columns)

Creating date frames.   
I wasn't sure how best to create a date frame, so I created several.

In [13]:
directory = Path("data")
classes= list(directory.iterdir())

df_list_f = []
df_list_l = []
df_list_fl = []
for class_id, class_path in enumerate(classes):
    #files = list(class_path.iterdir())
    files = list(sorted(class_path.glob('*.csv'), key=lambda path: int(path.stem.rsplit("-", 1)[1])))
    print(f"class '{class_path.name:7}' : {class_id}. frames: {len(files)}")
    for file_name in files:
        df_file = pd.read_csv(file_name)

        #df_features
        add_features = [df_file]
        add_features.append(create_features(df_file, add_features[0].shape[0]))
        df = pd.concat(add_features, axis=1)
        df['class'] = class_id
        df_list_f.append(df)
        
        #df_line
        df = line_frame(df_file)
        df['class'] = class_id
        df_list_l.append(df)

        #df_line_features
        add_features = [line_frame(df_file)]
        add_features.append(create_features(df_file, add_features[0].shape[0]))
        df = pd.concat(add_features, axis=1)
        df['class'] = class_id
        df_list_fl.append(df)

df_features = pd.concat(df_list_f, axis=0, ignore_index=True)
df_line = pd.concat(df_list_l, axis=0, ignore_index=True)
df_line_features = pd.concat(df_list_fl, axis=0, ignore_index=True)
#вийшов хардкод, але я ще не вигадала як це можна гарно покращити


class 'idle   ' : 0. frames: 1039
class 'running' : 1. frames: 3408
class 'stairs ' : 2. frames: 165
class 'walking' : 3. frames: 1850


In [14]:
df_line_features.head()

Unnamed: 0,accelerometer_X_0,accelerometer_Y_0,accelerometer_Z_0,accelerometer_X_1,accelerometer_Y_1,accelerometer_Z_1,accelerometer_X_2,accelerometer_Y_2,accelerometer_Z_2,accelerometer_X_3,...,accelerometer_Z_min,accelerometer_Z_max,accelerometer_Z_mean,accelerometer_Z_interquartile_range,accelerometer_Z_index_of_minimum_value,accelerometer_Z_mean_of_absolute_deviation,accelerometer_Z_median,accelerometer_Z_standard_deviation,accelerometer_Z_root_mean_square_error,class
0,1.000776,4.616021,8.576031,0.718261,4.209007,8.446744,-0.909797,-0.282516,9.203311,5.09965,...,8.418014,9.80665,9.605697,0.034716,3,0.275546,9.770737,0.406903,9.605697,0
1,-0.909797,-0.282516,9.203311,5.09965,0.148441,8.418014,1.762132,-0.162806,9.251195,-0.086191,...,8.418014,9.80665,9.690771,0.019153,1,0.148185,9.773131,0.278368,9.690771,0
2,1.762132,-0.162806,9.251195,-0.086191,-0.148441,9.787497,-0.062249,-0.191536,9.773131,-0.057461,...,9.251195,9.80665,9.755414,0.017956,0,0.040925,9.773131,0.098319,9.755414,0
3,-0.062249,-0.191536,9.773131,-0.057461,-0.114922,9.763555,-0.124498,-0.162806,9.782708,-0.138864,...,9.667787,9.80665,9.772653,0.014365,22,0.014365,9.775526,0.02433,9.772653,0
4,-0.124498,-0.162806,9.782708,-0.138864,-0.124498,9.768343,-0.095768,-0.129287,9.773131,-0.114922,...,9.667787,9.80665,9.771216,0.017956,20,0.016153,9.775526,0.025356,9.771216,0


Training of models and results.

In [15]:
def model_training(df):
    X_set = df.iloc[:,:-1]
    y_set = df.iloc[:,-1]

    X_train, X_test, y_train, y_test = train_test_split(X_set, y_set, test_size = 0.3, random_state = 42, stratify = y_set)

    class_list = list(d.name for d in classes)
    models = {"SVC": lambda: SVC(), "RandomForestClassifier": lambda: RandomForestClassifier()}
    conclusion = {}
    for model, classification in models.items():
        clf = classification()
        clf.fit(X_train, y_train)
        y_test_predict = clf.predict(X_test)
        conclusion[model] = classification_report(y_test, y_test_predict, digits=4, target_names=class_list)

    for model in conclusion:
        print(f"model: {model}")
        print(conclusion[model])

In [19]:
model_training(df_line)

model: SVC
              precision    recall  f1-score   support

        idle     0.9968    1.0000    0.9984       312
     running     1.0000    0.9990    0.9995      1023
      stairs     0.9412    0.3265    0.4848        49
     walking     0.9438    0.9982    0.9702       555

    accuracy                         0.9819      1939
   macro avg     0.9704    0.8309    0.8632      1939
weighted avg     0.9819    0.9819    0.9779      1939

model: RandomForestClassifier
              precision    recall  f1-score   support

        idle     1.0000    1.0000    1.0000       312
     running     1.0000    1.0000    1.0000      1023
      stairs     1.0000    0.0408    0.0784        49
     walking     0.9219    1.0000    0.9594       555

    accuracy                         0.9758      1939
   macro avg     0.9805    0.7602    0.7595      1939
weighted avg     0.9777    0.9758    0.9651      1939



As seen from the expanded dataframe without additional features, the SVC model performs slightly better than the RandomForestClassifier. The RandomForestClassifier particularly struggles to identify the stairs class. As evident from the recall (0.0408), there are almost no instances that the model predicts as stairs, even though it doesn't make errors in precision (1.0000). This could be due to the relatively small amount of stairs data in the training dataset.

In [20]:
model_training(df_line_features)

model: SVC
              precision    recall  f1-score   support

        idle     1.0000    1.0000    1.0000       312
     running     1.0000    1.0000    1.0000      1023
      stairs     0.9412    0.3265    0.4848        49
     walking     0.9438    0.9982    0.9702       555

    accuracy                         0.9825      1939
   macro avg     0.9712    0.8312    0.8638      1939
weighted avg     0.9824    0.9825    0.9785      1939

model: RandomForestClassifier
              precision    recall  f1-score   support

        idle     1.0000    1.0000    1.0000       312
     running     1.0000    1.0000    1.0000      1023
      stairs     1.0000    0.7347    0.8471        49
     walking     0.9771    1.0000    0.9884       555

    accuracy                         0.9933      1939
   macro avg     0.9943    0.9337    0.9589      1939
weighted avg     0.9934    0.9933    0.9928      1939



After adding features, the SVC model performs slightly worse than the RandomForestClassifier in terms of accuracy. The RandomForestClassifier has become significantly better at identifying the stairs class. In contrast, the SVC model has not improved in this aspect compared to before.

In [18]:
model_training(df_features)

model: SVC
              precision    recall  f1-score   support

        idle     1.0000    1.0000    1.0000      9351
     running     1.0000    1.0000    1.0000     30672
      stairs     0.9658    0.8949    0.9290      1485
     walking     0.9907    0.9972    0.9939     16650

    accuracy                         0.9965     58158
   macro avg     0.9891    0.9730    0.9807     58158
weighted avg     0.9965    0.9965    0.9964     58158

model: RandomForestClassifier
              precision    recall  f1-score   support

        idle     1.0000    1.0000    1.0000      9351
     running     1.0000    1.0000    1.0000     30672
      stairs     1.0000    1.0000    1.0000      1485
     walking     1.0000    1.0000    1.0000     16650

    accuracy                         1.0000     58158
   macro avg     1.0000    1.0000    1.0000     58158
weighted avg     1.0000    1.0000    1.0000     58158



On the unexpanded dataframe with additional features, both models outperform all previous versions. Although the SVC model still does not identify all stairs (recall 0.8949), the RandomForestClassifier has perfectly identified everything on this test set.