<a href="https://colab.research.google.com/github/idanh8/IoT_project_accelerometer_data/blob/main/step_counter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Imports

In [2]:
import numpy as np
import pandas as pd
import zipfile
import io
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb

Utilities

In [5]:
def norm(x, y, z):
  """Return the Euclidean magnitude of the vector (x, y, z).

  Only `**` and `+` are applied to the arguments, so any operands that
  support those operators are accepted.
  """
  sum_of_squares = x**2 + y**2 + z**2
  return sum_of_squares**0.5

In [3]:
def process_raw(zip_path):
  """Load every recording in a zip archive and add a per-row 'Norm' column.

  Each archive member is read as text. Its first five lines are treated as
  metadata: the activity is 'walk' when line 1 contains "walk" or line 3
  contains "walking" (case-insensitive), and the step count is the second
  comma-separated field of line 4. Everything from line 6 onwards is parsed
  as a CSV table.

  After loading, a hand-curated list of bad rows is removed from specific
  recordings, and a 'Norm' column is computed from the columns at positions
  1, 2 and 3 of every table.

  Args:
    zip_path: Path (or file-like object) accepted by `zipfile.ZipFile`.

  Returns:
    dict mapping a running integer index (0, 1, ...) to a dict with keys
    'Name' (archive member name), 'Data' (DataFrame including 'Norm'),
    'Steps' (int) and 'Activity' (0 for walk, 1 otherwise).

  Raises:
    KeyError: if a curated row label is missing from its recording.
    ValueError / IndexError: if a member's metadata lines are malformed.
  """
  # Archive member that is excluded from the dataset entirely.
  skipped_file = '8_walk_4_3.csv'

  # Cleaning passes per recording. Each inner list is one pass: its row labels
  # are dropped together and the index is renumbered before the next pass, so
  # labels in a later pass refer to the already renumbered table.
  cleaning_passes = {
      name: [[0]]
      for name in ('11_walk_5_1.csv', '6_run_3_1.csv', '6_run_4_1.csv',
                   '11_walk_1_1.csv', '11_walk_2_1.csv', '11_walk_3_1.csv',
                   '6_walk_5_1.csv')
  }
  cleaning_passes.update({
      '16_run_3_1.csv': [[106]],
      '31_walk_2_1.csv': [[207]],
      '4_run_2_2.csv': [[185]],
      '4_walk_1_3.csv': [[368]],
      '4_walk_2_3.csv': [[95]],
      '4_walk_4_2.csv': [[599]],
      '4_walk_3_2.csv': [[39, 41]],
      '5_run_3_1.csv': [[352]],
      # NOTE(review): this file was cleaned by two consecutive blocks; both are
      # kept as separate passes to preserve the result. Confirm the second
      # pass is intentional and not a copy-paste meant for another file.
      '8_run_3_1.csv': [[264, 605, 606], [603, 604, 605]],
  })

  # Recordings that are cut to their first N rows.
  keep_first_rows = {'1_walk_4_1.csv': 995}

  datasets = {}
  with zipfile.ZipFile(zip_path, 'r') as z:
    for file_name in z.namelist():
      if file_name == skipped_file:
        continue

      # Decode once and reuse the lines for both the metadata and the table.
      lines = z.read(file_name).decode().split('\n')
      header = [line.lower().replace('"', '') for line in lines[:5]]
      df = pd.read_csv(io.StringIO('\n'.join(lines[5:])))
      activity = 0 if ('walk' in header[0] or 'walking' in header[2]) else 1
      steps = int(header[3].split(',')[1])

      # Keys are consecutive integers in archive order (skipped file excluded).
      datasets[len(datasets)] = {
          'Name': file_name,
          'Data': df,
          'Steps': steps,
          'Activity': activity,
      }
  print(f'There are {len(datasets)} files in the dataset')

  for dataset in datasets.values():
    name = dataset['Name']
    df = dataset['Data']

    for labels in cleaning_passes.get(name, []):
      df = df.drop(labels).reset_index(drop=True)
    if name in keep_first_rows:
      df = df[:keep_first_rows[name]].reset_index(drop=True)

    # `.iloc` makes the positional lookup explicit; plain `row[1]` on a
    # string-labelled row relies on a deprecated integer-position fallback.
    df['Norm'] = df.apply(
        lambda row: norm(float(row.iloc[1]), float(row.iloc[2]), float(row.iloc[3])),
        axis=1)
    dataset['Data'] = df
  return datasets

In [4]:
def extract_features(data):
    """Summarise one signal with time- and frequency-domain statistics.

    Args:
        data: 1-D numeric container exposing `mean`, `std`, `min`, `max` and
            `sum`, and accepted by `np.fft.fft` (the notebook passes a
            DataFrame column).

    Returns:
        dict with, in order: 'mean', 'std', 'min', 'max', 'range',
        'power_spectrum_mean', 'power_spectrum_std', 'custom_feature'.
    """
    lowest = data.min()
    highest = data.max()
    spread = data.std()

    # Power spectrum: squared magnitude of the discrete Fourier transform.
    spectrum_power = np.abs(np.fft.fft(data)) ** 2

    return {
        # Statistical features
        'mean': data.mean(),
        'std': spread,
        'min': lowest,
        'max': highest,
        'range': highest - lowest,
        # Frequency-domain features
        'power_spectrum_mean': spectrum_power.mean(),
        'power_spectrum_std': spectrum_power.std(),
        # Product of the signal's total and its standard deviation
        'custom_feature': data.sum() * spread,
    }

In [6]:
def combine_and_extract(datasets):
    """Build the feature matrix and label list for a collection of recordings.

    Args:
        datasets: Mapping whose values are dicts with a 'Data' DataFrame
            (castable to float), an 'Activity' flag and a 'Steps' count.

    Returns:
        (combined_df, y): `combined_df` holds one feature row per recording
        with no missing values; `y` is the list of matching step counts.
        Recordings whose features contain NaN are removed from both, so the
        two always have the same length. Surviving rows keep their original
        index labels.
    """
    feature_rows = []
    y = []
    for dataset in datasets.values():
        df = dataset['Data'].astype(float)

        row = {}
        # Columns at positions 1-4 are summarised (presumably the three
        # accelerometer axes plus 'Norm' -- TODO confirm against the CSV
        # layout). The position prefix keeps the per-column features apart.
        for axis in [1, 2, 3, 4]:
            for feature, value in extract_features(df.iloc[:, axis]).items():
                row[f'{axis}_{feature}'] = value

        row['Act'] = dataset['Activity']
        feature_rows.append(row)
        y.append(dataset['Steps'])

    combined_df = pd.DataFrame(feature_rows)

    # Filter the labels with the same mask that dropna() applies to the rows.
    # This handles zero, one or many incomplete rows; deleting only the first
    # NaN index fails when there is none and desynchronises X and y when
    # there are several.
    complete = combined_df.notna().all(axis=1)
    y = [label for label, keep in zip(y, complete) if keep]
    combined_df = combined_df.dropna()
    return combined_df, y

In [7]:
def single_combine_and_extract(dataset):
    """Build the one-row feature matrix and label list for one recording.

    Args:
        dataset: dict with a 'Data' DataFrame (castable to float), an
            'Activity' flag and a 'Steps' count.

    Returns:
        (combined_df, y): a single-row feature DataFrame and a one-element
        list holding the step count. If any feature is NaN the row is
        dropped and `y` is returned empty, so both always have equal length.
    """
    df = dataset['Data'].astype(float)

    row = {}
    # Same per-column summary and naming scheme as the multi-recording path:
    # columns at positions 1-4, each feature prefixed with its position.
    for axis in [1, 2, 3, 4]:
        for feature, value in extract_features(df.iloc[:, axis]).items():
            row[f'{axis}_{feature}'] = value
    row['Act'] = dataset['Activity']

    combined_df = pd.DataFrame([row]).dropna()
    # Keep the label only if its feature row survived dropna().
    y = [dataset['Steps']] if len(combined_df) else []
    return combined_df, y

In [18]:
def train_and_predict(train_data, train_labels, test_data, test_labels):
  """Fit an XGBoost regressor and print its predictions and error metrics.

  Args:
    train_data: Feature matrix used for fitting.
    train_labels: Target values aligned with `train_data`.
    test_data: Feature matrix to predict on.
    test_labels: True target values aligned with `test_data`.

  Returns:
    None. One "True/Predicted" line per test sample is printed, followed by
    the mean squared error and mean absolute error.
  """
  # Features are passed to the model unscaled. A StandardScaler used to be
  # fitted here, but its output was never given to the model, so the dead
  # computation was removed without changing the predictions.
  # NOTE(review): hyperparameters look like the result of a tuning run --
  # confirm their provenance.
  model = xgb.XGBRegressor(learning_rate=0.16743239807751675, max_depth=9, n_estimators=781)
  model.fit(train_data, train_labels)

  y_pred = model.predict(test_data)
  # zip pairs values by position, so this also works when `test_labels` is a
  # label-indexed container rather than a plain list.
  for actual, predicted in zip(test_labels, y_pred):
    print(f'True: {actual}, Predicted: {predicted}')

  mse = mean_squared_error(test_labels, y_pred)
  mae = mean_absolute_error(test_labels, y_pred)

  print("")
  print("Mean Squared Error:", mse)
  print("Mean Absolute Error:", mae)

Train and Predict

In [17]:
# Load every recording from the archive, then build the training feature
# matrix and step-count labels from all of them.
zip_path = 'data_set.zip' # change accordingly, please ensure a zipfile is passed containing csv files in the correct format
data = process_raw(zip_path)
train_data, train_labels = combine_and_extract(data)


There are 243 files in the dataset


In [20]:
# NOTE(review): data[130] comes from the same `data` dict that the training set
# was built from, so this sample appears to have been seen during fitting. The
# reported error would then measure memorisation rather than generalisation --
# consider holding the test recording out of training.
test_data, test_labels = single_combine_and_extract(data[130]) # change accordingly, please ensure correct format

In [21]:
# Fit the regressor on the training features and print the prediction and
# error metrics for the selected test sample.
train_and_predict(train_data, train_labels, test_data, test_labels)

True: 130, Predicted: 130.0008087158203

Mean Squared Error: 6.540212780237198e-07
Mean Absolute Error: 0.0008087158203125
