In [18]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [19]:
dataset_path = Path('data')
csv_files = dataset_path.glob('**/*.csv')

dataframes = []

for file_path in csv_files:
    activity = file_path.parent.name
    df = pd.read_csv(file_path)
    df['activity'] = activity
    dataframes.append(df)
combined_df = pd.concat(dataframes, ignore_index=True)

combined_df

Unnamed: 0,accelerometer_X,accelerometer_Y,accelerometer_Z,activity
0,1.000776,4.616021,8.576031,idle
1,0.718261,4.209007,8.446744,idle
2,-0.909797,-0.282516,9.203311,idle
3,5.099650,0.148441,8.418014,idle
4,1.762132,-0.162806,9.251195,idle
...,...,...,...,...
193855,5.109226,-15.452178,-1.470040,walking
193856,6.373365,-11.741165,-8.226476,walking
193857,3.289633,-9.993398,-0.383072,walking
193858,-2.978387,-3.050213,1.273715,walking


#### Calculation of time domain features for each axis of the accelerometer:

In [20]:
time_features = combined_df.groupby('activity').agg({'accelerometer_X': ['mean', 'std', 'median', 'max', 'min', 'sum'],
                                                     'accelerometer_Y': ['mean', 'std', 'median', 'max', 'min', 'sum'],
                                                     'accelerometer_Z': ['mean', 'std', 'median', 'max', 'min', 'sum']})

# Renaming columns
time_features.columns = ['_'.join(col) for col in time_features.columns]

time_features['activity'] = time_features.index
activity_col = time_features.pop('activity')
time_features.insert(0, 'activity', activity_col)

time_features.head()

Unnamed: 0_level_0,activity,accelerometer_X_mean,accelerometer_X_std,accelerometer_X_median,accelerometer_X_max,accelerometer_X_min,accelerometer_X_sum,accelerometer_Y_mean,accelerometer_Y_std,accelerometer_Y_median,accelerometer_Y_max,accelerometer_Y_min,accelerometer_Y_sum,accelerometer_Z_mean,accelerometer_Z_std,accelerometer_Z_median,accelerometer_Z_max,accelerometer_Z_min,accelerometer_Z_sum
activity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
idle,idle,0.096767,0.691892,0.234632,8.135497,-10.448297,3016.239864,2.225971,2.821235,0.02873,9.916783,-2.595315,69383.508951,8.973247,1.211058,9.749189,21.988348,-0.296881,279696.094989
running,running,5.045705,9.952696,3.605668,39.188293,-39.188293,515872.914034,8.083135,12.441402,7.967903,39.188293,-39.188293,826419.709676,1.387356,7.004236,0.407014,39.188293,-39.188293,141843.236611
stairs,stairs,0.353135,3.845949,0.483629,14.164097,-11.238382,1748.016884,-9.574353,4.922294,-8.944737,4.994305,-25.828648,-47393.045005,-1.841333,5.297376,-1.58975,17.492037,-38.08696,-9114.596029
walking,walking,-2.661943,4.648378,-2.298433,20.57577,-26.81506,-147737.820092,-9.703702,5.931165,-9.974244,14.441825,-39.188293,-538555.441522,-1.126623,6.927679,-0.933739,38.426937,-39.16914,-62527.593189


#### We reduce the data to one format using normalization:

In [21]:
columns_to_normalize = ['accelerometer_X', 'accelerometer_Y', 'accelerometer_Z']

scaler = MinMaxScaler()
combined_df[columns_to_normalize] = scaler.fit_transform(combined_df[columns_to_normalize])

combined_df

Unnamed: 0,accelerometer_X,accelerometer_Y,accelerometer_Z,activity
0,0.512769,0.558895,0.609421,idle
1,0.509164,0.553702,0.607771,idle
2,0.488392,0.496395,0.617424,idle
3,0.565066,0.501894,0.607405,idle
4,0.522483,0.497923,0.618035,idle
...,...,...,...,...
193855,0.565188,0.302847,0.481244,walking
193856,0.581317,0.350196,0.395039,walking
193857,0.541972,0.372495,0.495112,walking
193858,0.461999,0.461083,0.516251,walking


#### Let's divide the data into training and testing sets:

In [22]:
X = combined_df[['accelerometer_X', 'accelerometer_Y', 'accelerometer_Z']] 
y = combined_df['activity']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [23]:
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (135702, 3)
Shape of X_test: (58158, 3)
Shape of y_train: (135702,)
Shape of y_test: (58158,)


#### Creating SVM model:

In [24]:
svm_model = SVC()
svm_model.fit(X_train, y_train)

#### Creating Random forest model:

In [25]:
rf_model = RandomForestClassifier(n_jobs=-1)
rf_model.fit(X_train, y_train)

#### Comparing the accuracy of models:

In [26]:
score_svm = svm_model.score(X_test, y_test)
score_rf = rf_model.score(X_test, y_test)

print(f"Accuracy of the SVM model:", score_svm)
print(f"Accuracy of the RF model:", score_rf)

Accuracy of the SVM model: 0.8927232710891021
Accuracy of the RF model: 0.999535747446611


#### We compare the performance results of both algorithms on different features using the 'classification report' method:

In [27]:
# SVM predictions
svm_predictions = svm_model.predict(X_test)

# RF predicitons
rf_predictions = rf_model.predict(X_test)

print("SVM Classification Report:")
print(classification_report(y_test, svm_predictions))

print("Random Forest Classification Report:")
print(classification_report(y_test, rf_predictions))

SVM Classification Report:
              precision    recall  f1-score   support

        idle       0.96      0.99      0.97      9306
     running       0.93      0.90      0.92     30609
      stairs       1.00      0.00      0.01      1537
     walking       0.80      0.91      0.85     16706

    accuracy                           0.89     58158
   macro avg       0.92      0.70      0.69     58158
weighted avg       0.90      0.89      0.88     58158

Random Forest Classification Report:
              precision    recall  f1-score   support

        idle       1.00      1.00      1.00      9306
     running       1.00      1.00      1.00     30609
      stairs       1.00      0.99      0.99      1537
     walking       1.00      1.00      1.00     16706

    accuracy                           1.00     58158
   macro avg       1.00      1.00      1.00     58158
weighted avg       1.00      1.00      1.00     58158



#### The Random Forest model has significantly better indicators than the SVM model, in particular in terms of accuracy, macro avg precision, macro avg weighted precision, and f1 score. 
#### So let's summarize: both models achieve a fairly high level of accuracy, but the random forest model turned out to be more reliable, it has better accuracy indicators and greater completeness on all classes.