In [8]:
import os
import sys
from pathlib import Path
sys.path.append(os.path.dirname(os.path.abspath(Path.cwd())))
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from imblearn.over_sampling import SMOTE 
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, accuracy_score, make_scorer
# from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.metrics import plot_confusion_matrix
from scipy import stats
import joblib
import shap
import seaborn as sns
import src.data_prep.split_data as sd
import src.data_prep.preprocess as pp
import src.data_prep.eda as eda

## Create raw time series dataset

In the (train_basic_data)[./train_basic_data.ipynb] notebook, we have divide our accelerometer data into different intervals with 50% overlap, calculating the basic statistics of the time series (such as std, max, min, mean, ...) and fit some classical machine learning model to classify different activities. However, since our dataset is properly clean and collected in an experiment setting, with the transition period removed, if we were to use such model in real life, we might not be able to get a good performance.

In this notebook, we will feed the raw time series data collected from the sensors to some deep learning model, and test their accuracy in classifying different labeled activities. First, we need to prepare our dataset:

In [9]:
# load data
subject_df = pp.load_dataset(1)
# split data
intervals = sd.split_data_by_interval(subject_df)
intervals.pop()
# get intervals activities
y = [list(activity["activity"])[0] for activity in intervals]
# get intervals matrix
X = [activity[["x", 'y', 'z']].T.to_numpy() for activity in intervals]

## Baseline model
We will set up a simple multi-layer perceptron as our baseline model for this task. Since the MLP is sensitive to feature scaling, we will scale our data using the standard scaler:

In [12]:
# scale the data
X = StandardScaler.transform(X) 

[[1502 1667 1611 1601 1643 1604 1640 1607 1546 1529 1637 1596 1590 1601
  1542 1598 1511 1555 1508 1580 1627 1592 1634 1638 1593 1542 1601 1613
  1644 1642 1605 1586 1577 1598 1561 1628 1694 1627 1598 1612 1630 1609
  1600 1608 1612 1605 1640 1610 1633 1573 1568 1576 1599 1620 1654 1637
  1603 1605 1620 1616 1611 1597 1587 1566 1497 1455 1499 1556 1567 1568
  1533 1519 1586 1618 1630 1646 1638 1595 1625 1645 1673 1607 1609 1613
  1670 1663 1635 1649 1726 1778 1718 1724 1630 1664 1706 1691 1682 1690
  1735 1665 1643 1624 1641 1681 1651 1632 1632 1648 1643 1638 1586 1578
  1559 1635 1671 1687 1704 1757 1806 1814 1839 1844 1929 1806 1782 1775
  1855 1893 1876 1950 1811 1876 1889 1883 1902 1937 1978 2026 2066 2106
  2123 2052 2006 2028 2034 1985 1997 2069 2081 2105 2131 2098 2127 2175
  2029 2191 2182 2139 2151 2168 2156 2189 2238 2183 2100 1985 2001 2061
  2102 2085 2224 2227 2227 2195 2209 2092 2185 2275 2289 2356 2190 2092
  2112 2143 2176 2053 2274 2117 2131 2076 2115 1979 2107 2148 20

TypeError: transform() missing 1 required positional argument: 'X'