In [1]:
!nvidia-smi

Tue Jul  2 16:35:12 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 551.23                 Driver Version: 551.23         CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 2080 ...  WDDM  |   00000000:01:00.0  On |                  N/A |
|  0%   40C    P8              8W /  280W |     290MiB /   8192MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import signal
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report,accuracy_score,ConfusionMatrixDisplay,confusion_matrix,precision_score,recall_score,roc_curve,roc_auc_score,balanced_accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
import datetime
from dataclasses import dataclass
from statsmodels.tsa.holtwinters import SimpleExpSmoothing
from scipy.ndimage import gaussian_filter1d
from itertools import combinations as comb
from pyts.image import RecurrencePlot
import PIL
from sklearn.utils import class_weight
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV,StratifiedKFold
from unidecode import unidecode

In [4]:
df = pd.read_csv('./dataset/dataset_1/train_motion_data.csv')
# test_data = pd.read_csv('./dataset/dataset_1/test_motion_data.csv')
# print(train_data.shape)
# print(test_data.shape)

In [5]:
df.describe()

Unnamed: 0,AccX,AccY,AccZ,GyroX,GyroY,GyroZ,Timestamp
count,3644.0,3644.0,3644.0,3644.0,3644.0,3644.0,3644.0
mean,0.040467,-0.073418,0.008271,0.001593,-0.001273,0.007949,3582707.0
std,0.985653,0.903408,0.985061,0.066918,0.126205,0.115687,642.1479
min,-4.636523,-4.699795,-7.143998,-0.751822,-1.587028,-1.236468,3581629.0
25%,-0.550695,-0.59254,-0.558464,-0.028558,-0.053756,-0.029398,3582121.0
50%,0.003931,-0.080833,0.002262,0.001985,-0.001833,0.002978,3582702.0
75%,0.595987,0.452401,0.556157,0.031918,0.051313,0.040852,3583270.0
max,4.985548,4.245151,5.171739,0.849255,1.679879,1.1905,3583791.0


In [6]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
# Columns to be scaled
columns_to_scale = ['AccX', 'AccY', 'AccZ', 'GyroX', 'GyroY', 'GyroZ']

# Extract the selected columns
data_to_scale = df[columns_to_scale]

# Initialize MinMaxScaler
scaler = MinMaxScaler()

# Fit and transform the selected columns
scaled_data = scaler.fit_transform(data_to_scale)

# Replace the original columns with the scaled values
df[columns_to_scale] = scaled_data

In [7]:
df.describe()

Unnamed: 0,AccX,AccY,AccZ,GyroX,GyroY,GyroZ,Timestamp
count,3644.0,3644.0,3644.0,3644.0,3644.0,3644.0,3644.0
mean,0.486069,0.517206,0.580742,0.470567,0.485399,0.512745,3582707.0
std,0.102437,0.100996,0.079984,0.041795,0.038631,0.047667,642.1479
min,0.0,0.0,0.0,0.0,0.0,0.0,3581629.0
25%,0.424631,0.459171,0.534725,0.451736,0.469334,0.497357,3582121.0
50%,0.482272,0.516377,0.580254,0.470813,0.485228,0.510697,3582702.0
75%,0.543803,0.57599,0.625229,0.489508,0.501496,0.526303,3583270.0
max,1.0,1.0,1.0,1.0,1.0,1.0,3583791.0


In [8]:
# Update Timestamp column to be index of respective row + 1
df['Timestamp'] = (df.index + 1)/2

# Map 'SLOW', 'NORMAL', 'AGGRESSIVE' to 0, 1, 2 respectively in the 'Class' column
class_mapping = {'SLOW': 0, 'NORMAL': 1, 'AGGRESSIVE': 2}
df['Class'] = df['Class'].replace(class_mapping)
                                  
df

Unnamed: 0,AccX,AccY,AccZ,GyroX,GyroY,GyroZ,Class,Timestamp
0,0.481863,0.525413,0.580071,0.506677,0.432311,0.551472,1,0.5
1,0.312995,0.404396,0.563492,0.451736,0.501496,0.565316,1,1.0
2,0.420062,0.511729,0.597975,0.457459,0.476814,0.545683,1,1.5
3,0.558612,0.499873,0.634289,0.513163,0.476627,0.532092,1,2.0
4,0.492437,0.612342,0.574652,0.488745,0.484667,0.532092,1,2.5
...,...,...,...,...,...,...,...,...
3639,0.577029,0.299868,0.717091,0.750858,0.603590,0.000000,0,1820.0
3640,0.280846,0.627697,0.572275,0.670355,0.684555,0.312862,0,1820.5
3641,0.458704,0.608958,0.508014,0.695155,0.361257,0.531840,0,1821.0
3642,0.445549,0.532933,0.612092,0.386875,0.491960,0.507425,0,1821.5


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3644 entries, 0 to 3643
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   AccX       3644 non-null   float64
 1   AccY       3644 non-null   float64
 2   AccZ       3644 non-null   float64
 3   GyroX      3644 non-null   float64
 4   GyroY      3644 non-null   float64
 5   GyroZ      3644 non-null   float64
 6   Class      3644 non-null   int64  
 7   Timestamp  3644 non-null   float64
dtypes: float64(7), int64(1)
memory usage: 227.9 KB


In [10]:
# 1. Signal Smoothing
# Calculate window size and overlap based on the sampling rate
sampling_rate = 12  
window_size = int(8 * sampling_rate)  # Size of each window (equivalent to two-second windows)
overlap = int(window_size * 0.25)  # 25% overlap

# Group data into windows with the specified size and overlap for each column
grouped_data = {}
for column in df.columns:  # Exclude Timestamp column
    # Aggregate data into windows using rolling window function
    windowed_data = df[column]
    
    # Check if the column contains numeric data
    if df[column].dtype in ['int64', 'float64']:
        # Calculate mean with 25% overlap for numeric columns
        grouped_data[column] = windowed_data.rolling(window=window_size, min_periods=1).mean().shift(-overlap).dropna().reset_index(drop=True)
    else:
        # For non-numeric columns, retain the original values (e.g., Timestamp)
        grouped_data[column] = windowed_data.iloc[range(0, len(windowed_data), overlap)].reset_index(drop=True)

# Create a new DataFrame with aggregated data for each column
aggregated_df = pd.DataFrame(grouped_data)

# Display the aggregated DataFrame
print(aggregated_df)

          AccX      AccY      AccZ     GyroX     GyroY     GyroZ  Class  \
0     0.465175  0.522596  0.588665  0.471408  0.489581  0.508704    1.0   
1     0.463894  0.525826  0.586893  0.471649  0.489263  0.508751    1.0   
2     0.464486  0.528651  0.588440  0.472226  0.488594  0.508646    1.0   
3     0.463826  0.528032  0.587821  0.471603  0.488634  0.508513    1.0   
4     0.466263  0.531408  0.586203  0.471102  0.488890  0.508701    1.0   
...        ...       ...       ...       ...       ...       ...    ...   
3615  0.512700  0.518178  0.579959  0.464696  0.488604  0.494290    0.0   
3616  0.508393  0.518513  0.579572  0.466612  0.490664  0.492774    0.0   
3617  0.505880  0.519916  0.578555  0.469434  0.489128  0.493396    0.0   
3618  0.503947  0.518555  0.579467  0.468297  0.488631  0.494020    0.0   
3619  0.502466  0.519530  0.579269  0.467824  0.488734  0.494796    0.0   

      Timestamp  
0          6.50  
1          6.75  
2          7.00  
3          7.25  
4        

In [11]:
# Drop the first two rows of aggregated_df
aggregated_df = aggregated_df.iloc[2:].drop('Timestamp', axis=1)

# Display the updated DataFrame
print(aggregated_df)

          AccX      AccY      AccZ     GyroX     GyroY     GyroZ  Class
2     0.464486  0.528651  0.588440  0.472226  0.488594  0.508646    1.0
3     0.463826  0.528032  0.587821  0.471603  0.488634  0.508513    1.0
4     0.466263  0.531408  0.586203  0.471102  0.488890  0.508701    1.0
5     0.464204  0.532356  0.586362  0.470991  0.489398  0.508784    1.0
6     0.457026  0.532131  0.589392  0.471219  0.487846  0.508213    1.0
...        ...       ...       ...       ...       ...       ...    ...
3615  0.512700  0.518178  0.579959  0.464696  0.488604  0.494290    0.0
3616  0.508393  0.518513  0.579572  0.466612  0.490664  0.492774    0.0
3617  0.505880  0.519916  0.578555  0.469434  0.489128  0.493396    0.0
3618  0.503947  0.518555  0.579467  0.468297  0.488631  0.494020    0.0
3619  0.502466  0.519530  0.579269  0.467824  0.488734  0.494796    0.0

[3618 rows x 7 columns]


In [12]:
print(aggregated_df.columns)

Index(['AccX', 'AccY', 'AccZ', 'GyroX', 'GyroY', 'GyroZ', 'Class'], dtype='object')


In [13]:
df2 = aggregated_df

In [15]:
df2.head()

Unnamed: 0,AccX,AccY,AccZ,GyroX,GyroY,GyroZ,Class
2,0.464486,0.528651,0.58844,0.472226,0.488594,0.508646,1.0
3,0.463826,0.528032,0.587821,0.471603,0.488634,0.508513,1.0
4,0.466263,0.531408,0.586203,0.471102,0.48889,0.508701,1.0
5,0.464204,0.532356,0.586362,0.470991,0.489398,0.508784,1.0
6,0.457026,0.532131,0.589392,0.471219,0.487846,0.508213,1.0


In [14]:
# This is the rolling averages values for each column. Timestamp value is the middle of the interval time value.
df2['Class'].unique()

array([1.        , 1.01041667, 1.02083333, 1.03125   , 1.04166667,
       1.05208333, 1.0625    , 1.07291667, 1.08333333, 1.09375   ,
       1.10416667, 1.11458333, 1.125     , 1.13541667, 1.14583333,
       1.15625   , 1.16666667, 1.17708333, 1.1875    , 1.19791667,
       1.20833333, 1.21875   , 1.22916667, 1.23958333, 1.25      ,
       1.26041667, 1.27083333, 1.28125   , 1.29166667, 1.30208333,
       1.3125    , 1.32291667, 1.33333333, 1.34375   , 1.35416667,
       1.36458333, 1.375     , 1.38541667, 1.39583333, 1.40625   ,
       1.41666667, 1.42708333, 1.4375    , 1.44791667, 1.45833333,
       1.46875   , 1.47916667, 1.48958333, 1.5       , 1.51041667,
       1.52083333, 1.53125   , 1.54166667, 1.55208333, 1.5625    ,
       1.57291667, 1.58333333, 1.59375   , 1.60416667, 1.61458333,
       1.625     , 1.63541667, 1.64583333, 1.65625   , 1.66666667,
       1.67708333, 1.6875    , 1.69791667, 1.70833333, 1.71875   ,
       1.72916667, 1.73958333, 1.75      , 1.76041667, 1.77083

In [16]:
# Sample rate assumption
sampling_rate = 0.5  # 0.5 Hz (two-second windows)
window_size = int(8 * sampling_rate)  # Size of each window
overlap = int(window_size * 0.25)  # 25% overlap

# Create an empty list to store extracted features for each window interval
features_list = []

# Iterate through each window interval
for i in range(0, len(df), overlap):
    if i + window_size <= len(df):
        windowed_data = df.iloc[i:i+window_size, :-1]  # Select the window interval for sensor data
        windowed_timestamp = df.iloc[i:i+window_size, -1]  # Select the window interval for timestamps
        
        # Calculate features for each column in the window interval
        features_dict = {'Timestamp': windowed_timestamp.iloc[-1]}  # Use the last timestamp in the window
        for column in windowed_data.columns:
            if windowed_data[column].dtype in ['int64', 'float64']:
                mean_value = windowed_data[column].mean()
                min_value = windowed_data[column].min()
                max_value = windowed_data[column].max()
                std_dev = windowed_data[column].std()
                # 중위값
                simple_average = (min_value + max_value) / 2.0  # Simple average between min and max
                
                # Store the features in the dictionary
                features_dict[f"{column}_mean"] = mean_value
                features_dict[f"{column}_min"] = min_value
                features_dict[f"{column}_max"] = max_value
                features_dict[f"{column}_simple_average"] = simple_average
                features_dict[f"{column}_std"] = std_dev
        
        # Append features for the window interval as a new dictionary in the list
        features_list.append(features_dict)

# Create a DataFrame from the list of dictionaries
features_df = pd.DataFrame(features_list)

# Display the DataFrame with extracted features for each window interval
features_df

Unnamed: 0,Timestamp,AccX_mean,AccX_min,AccX_max,AccX_simple_average,AccX_std,AccY_mean,AccY_min,AccY_max,AccY_simple_average,...,GyroZ_mean,GyroZ_min,GyroZ_max,GyroZ_simple_average,GyroZ_std,Class_mean,Class_min,Class_max,Class_simple_average,Class_std
0,2.0,0.443383,0.312995,0.558612,0.435803,0.103768,0.485353,0.404396,0.525413,0.464905,...,0.548641,0.532092,0.565316,0.548704,0.013768,1.0,1,1,1.0,0.0
1,2.5,0.446026,0.312995,0.558612,0.435803,0.105200,0.507085,0.404396,0.612342,0.508369,...,0.543796,0.532092,0.565316,0.548704,0.015713,1.0,1,1,1.0,0.0
2,3.0,0.492361,0.420062,0.558612,0.489337,0.056722,0.547006,0.499873,0.612342,0.556107,...,0.531337,0.515479,0.545683,0.530581,0.012361,1.0,1,1,1.0,0.0
3,3.5,0.509842,0.489987,0.558612,0.524300,0.032701,0.540669,0.486377,0.612342,0.549359,...,0.527498,0.515479,0.532092,0.523786,0.008055,1.0,1,1,1.0,0.0
4,4.0,0.473436,0.412985,0.498333,0.455659,0.040452,0.534120,0.473678,0.612342,0.543010,...,0.526680,0.515479,0.532092,0.523786,0.007586,1.0,1,1,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3636,1820.0,0.349872,0.000000,0.577029,0.288514,0.254729,0.381986,0.299868,0.503853,0.401860,...,0.386547,0.000000,1.000000,0.500000,0.442677,0.0,0,0,0.0,0.0
3637,1820.5,0.297027,0.000000,0.577029,0.288514,0.236637,0.435651,0.299868,0.627697,0.463783,...,0.362761,0.000000,1.000000,0.500000,0.443694,0.0,0,0,0.0,0.0
3638,1821.0,0.411703,0.280846,0.577029,0.428937,0.133295,0.510094,0.299868,0.627697,0.463783,...,0.461175,0.000000,1.000000,0.500000,0.420319,0.0,0,0,0.0,0.0
3639,1821.5,0.440532,0.280846,0.577029,0.428937,0.121773,0.517364,0.299868,0.627697,0.463783,...,0.338032,0.000000,0.531840,0.265920,0.245733,0.0,0,0,0.0,0.0


In [18]:
print(features_df.columns)
print(len(features_df.columns))

Index(['Timestamp', 'AccX_mean', 'AccX_min', 'AccX_max', 'AccX_simple_average',
       'AccX_std', 'AccY_mean', 'AccY_min', 'AccY_max', 'AccY_simple_average',
       'AccY_std', 'AccZ_mean', 'AccZ_min', 'AccZ_max', 'AccZ_simple_average',
       'AccZ_std', 'GyroX_mean', 'GyroX_min', 'GyroX_max',
       'GyroX_simple_average', 'GyroX_std', 'GyroY_mean', 'GyroY_min',
       'GyroY_max', 'GyroY_simple_average', 'GyroY_std', 'GyroZ_mean',
       'GyroZ_min', 'GyroZ_max', 'GyroZ_simple_average', 'GyroZ_std',
       'Class_mean', 'Class_min', 'Class_max', 'Class_simple_average',
       'Class_std'],
      dtype='object')
36


In [19]:
# Drop the specified columns from features_df
# columns_to_drop = ['Class_mean', 'Class_min', 'Class_max', 'Class_simple_average','Class_std','Timestamp_mean', 'Timestamp_min', 'Timestamp_max','Timestamp_simple_average', 'Timestamp_std']
# features_df = features_df.drop(columns_to_drop, axis=1)

# Display the updated DataFrame
print(features_df.columns)

Index(['Timestamp', 'AccX_mean', 'AccX_min', 'AccX_max', 'AccX_simple_average',
       'AccX_std', 'AccY_mean', 'AccY_min', 'AccY_max', 'AccY_simple_average',
       'AccY_std', 'AccZ_mean', 'AccZ_min', 'AccZ_max', 'AccZ_simple_average',
       'AccZ_std', 'GyroX_mean', 'GyroX_min', 'GyroX_max',
       'GyroX_simple_average', 'GyroX_std', 'GyroY_mean', 'GyroY_min',
       'GyroY_max', 'GyroY_simple_average', 'GyroY_std', 'GyroZ_mean',
       'GyroZ_min', 'GyroZ_max', 'GyroZ_simple_average', 'GyroZ_std',
       'Class_mean', 'Class_min', 'Class_max', 'Class_simple_average',
       'Class_std'],
      dtype='object')


In [20]:
df3 = features_df

In [21]:
# Compute the time difference
time_diff = df['Timestamp'].diff().fillna(0)  # Assuming Timestamp is in seconds

# Compute the derivative (jerk) of acceleration
jerk_df = pd.DataFrame()
jerk_df['JerkX'] = df['AccX'].diff().div(time_diff, fill_value=0)
jerk_df['JerkY'] = df['AccY'].diff().div(time_diff, fill_value=0)
jerk_df['JerkZ'] = df['AccZ'].diff().div(time_diff, fill_value=0)
jerk_df['Timestamp'] = df['Timestamp']

# Drop the first three rows from jerk_df
jerk_df = jerk_df.iloc[3:]

# Display the updated DataFrame
print(jerk_df)

         JerkX     JerkY     JerkZ  Timestamp
3     0.277100 -0.023711  0.072627        2.0
4    -0.132349  0.224937 -0.119272        2.5
5     0.011791 -0.096519  0.068531        3.0
6    -0.016690 -0.155411 -0.013742        3.5
7    -0.154004 -0.025399 -0.060159        4.0
...        ...       ...       ...        ...
3639  0.493589 -0.407969  0.177149     1820.0
3640 -0.592365  0.655658 -0.289632     1820.5
3641  0.355715 -0.037478 -0.128521     1821.0
3642 -0.026310 -0.152051  0.208156     1821.5
3643 -0.011017  0.075787 -0.132737     1822.0

[3641 rows x 4 columns]


In [22]:
df4 = jerk_df

In [23]:
# Create a new dataframe
mag_df = pd.DataFrame()

# Compute the magnitude of acceleration
mag_df['AccMagnitude'] = np.sqrt(df['AccX']**2 + df['AccY']**2 + df['AccZ']**2)

# Compute the magnitude of gyroscope values
mag_df['GyroMagnitude'] = np.sqrt(df['GyroX']**2 + df['GyroY']**2 + df['GyroZ']**2)

# Create mag_df to store the magnitude values
mag_df['Timestamp'] = df['Timestamp']

# Drop the first three rows from jerk_df
mag_df = mag_df.iloc[3:]

# Display the new DataFrame with magnitude values
print(mag_df)

      AccMagnitude  GyroMagnitude  Timestamp
3         0.981958       0.879563        2.0
4         0.973490       0.869997        2.5
5         0.968145       0.877841        3.0
6         0.916030       0.865655        3.5
7         0.849749       0.859289        4.0
...            ...            ...        ...
3639      0.968041       0.963384     1820.0
3640      0.894638       1.007906     1820.5
3641      0.916143       0.946890     1821.0
3642      0.925844       0.805715     1821.5
3643      0.904042       0.827173     1822.0

[3641 rows x 3 columns]


In [24]:
df5 = mag_df