# MotionSense

dataset  
https://github.com/mmalekzadeh/motion-sense

reference  
https://www.kaggle.com/caspitush/activity-classification-gbc-fourier-transform

6 activities collected from 24 participants  
using smartphone sensors  
sampling rate : 50Hz

In [1]:
import os
import pickle
import glob
import pydot
from sklearn import preprocessing
from sklearn.metrics import f1_score
import pandas as pd
import itertools
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import figure
#plt.rcParams['figure.figsize'] = (6,4) # Make the figures a bit bigger

Matplotlib created a temporary config/cache directory at /tmp/matplotlib-436nawvk because the default path (/.config/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


In [2]:
import socket
import warnings

# Silence every warning category for the remainder of the session.
warnings.filterwarnings("ignore")

# Record and print the name of the machine this run executes on.
hostname = socket.gethostname()
print(hostname)

f1b2c6893cf4


In [3]:
def convert_to_one_hot(class_number, total_classes):
    """Return a float vector of length ``total_classes`` with a 1 at ``class_number``.

    All other positions are 0. ``class_number`` is used directly as a NumPy
    index into the vector.
    """
    encoded = np.zeros(total_classes)
    encoded[class_number] = 1
    return encoded

In [4]:
# Output directory used by this notebook.  exist_ok=True replaces the
# check-then-create sequence, so re-running the cell (or a concurrent run)
# cannot fail because the directory appeared in between.
dirout = 'out'
os.makedirs(dirout, exist_ok=True)

## Data load

In [5]:
# Root folder of the MotionSense CSV files (the trailing slash is kept).
data_loc = "../Dataset/03_MotionSense_Dataset/"

In [6]:
# Human-readable activity names, listed in class-index order.
class_name = [
    "down_stairs",
    "up_stairs",
    "walking",
    "jogging",
    "standing",
    "sitting",
]

In [7]:
def get_ds_infos():
    """Load ``data_subjects_info.csv`` from ``data_loc`` into a DataFrame.

    Prints an informational message after the file has been read and
    returns the resulting DataFrame.
    """
    info_path = f"{data_loc}/data_subjects_info.csv"
    subjects = pd.read_csv(info_path)
    print("[INFO] -- Data subjects' information is imported.")
    return subjects

In [8]:
def set_data_types(data_types=("userAcceleration",)):
    """Expand sensor data-type names into their per-axis column names.

    Args:
        data_types: Iterable of sensor names.  ``"attitude"`` expands to
            ``.roll/.pitch/.yaw``; every other name expands to ``.x/.y/.z``.
            The default is an immutable tuple so the shared default object
            can never be modified between calls.

    Returns:
        A list holding one three-element list of column names per entry of
        ``data_types``, in the same order.
    """
    dt_list = []
    for name in data_types:
        suffixes = ("roll", "pitch", "yaw") if name == "attitude" else ("x", "y", "z")
        dt_list.append([f"{name}.{suffix}" for suffix in suffixes])
    return dt_list

In [9]:
# Short activity codes; a code's position is its numeric class index.
ACT_LABELS = ["dws", "ups", "wlk", "jog", "std", "sit"]

# Trial numbers available for each activity code, keyed in ACT_LABELS order.
TRIAL_CODES = dict(zip(
    ACT_LABELS,
    (
        [1, 2, 11],
        [3, 4, 12],
        [7, 8, 15],
        [9, 16],
        [6, 14],
        [5, 13],
    ),
))

# Short names for the six feature channels.
feature_name = ['acc_x', 'acc_y', 'acc_z', 'ang_x', 'ang_y', 'ang_z']

In [10]:
# Sensor streams to load; set_data_types expands each into three columns.
sdt = ["userAcceleration", "rotationRate"]
print(f"[INFO] -- Selected sensor data types: {sdt}")

# Keep the first six activity codes.
act_labels = ACT_LABELS[:6]
print(f"[INFO] -- Selected activites: {act_labels}")

# Trial numbers per selected activity, aligned with act_labels.
trial_codes = list(map(TRIAL_CODES.__getitem__, act_labels))
dt_list = set_data_types(sdt)

[INFO] -- Selected sensor data types: ['userAcceleration', 'rotationRate']
[INFO] -- Selected activites: ['dws', 'ups', 'wlk', 'jog', 'std', 'sit']


In [11]:
def creat_time_series(dt_list, act_labels, trial_codes, mode="mag", labeled=True):
    """Assemble every subject/activity/trial recording into one DataFrame.

    Files are read from ``<data_loc>/<act>_<trial>/sub_<code>.csv``, i.e. from
    the same ``data_loc`` root that ``get_ds_infos`` uses.

    Args:
        dt_list: List of per-sensor column-name triples, as produced by
            ``set_data_types``.
        act_labels: Activity codes; the position of a code in this list is
            stored as the numeric ``act`` label.
        trial_codes: For each entry of ``act_labels``, the trial numbers to
            read.
        mode: ``"mag"`` stores one Euclidean-magnitude column per sensor;
            any other value stores the three raw axis columns per sensor.
        labeled: When true, every row additionally carries the columns
            ``act, id, weight, height, age, gender, trial``.

    Returns:
        A float DataFrame with one row per sample.  ``id`` is the subject
        code minus one.
    """
    use_magnitude = mode == "mag"
    num_data_cols = len(dt_list) if use_magnitude else len(dt_list) * 3

    ds_list = get_ds_infos()

    # Read each subject's attributes from that subject's own row, so the
    # result does not depend on the info file being sorted by code.
    subject_rows = zip(
        ds_list["code"],
        ds_list["weight"],
        ds_list["height"],
        ds_list["age"],
        ds_list["gender"],
    )

    print("[INFO] -- Creating Time-Series")
    # Collect per-file arrays and concatenate once at the end; appending to a
    # growing array would re-copy everything read so far on every file.
    chunks = []
    for sub_id, weight, height, age, gender in subject_rows:
        for act_id, act in enumerate(act_labels):
            for trial in trial_codes[act_id]:
                fname = os.path.join(
                    data_loc,
                    "{}_{}".format(act, trial),
                    "sub_{}.csv".format(int(sub_id)),
                )
                # Only the columns named in dt_list are used, so any extra
                # column in the CSV (such as a saved index) is ignored.
                raw_data = pd.read_csv(fname)
                n_rows = len(raw_data)

                vals = np.zeros((n_rows, num_data_cols))
                for x_id, axes in enumerate(dt_list):
                    if use_magnitude:
                        vals[:, x_id] = (raw_data[axes] ** 2).sum(axis=1) ** 0.5
                    else:
                        vals[:, x_id * 3:(x_id + 1) * 3] = raw_data[axes].values

                if labeled:
                    lbl_row = np.array(
                        [act_id, sub_id - 1, weight, height, age, gender, trial],
                        dtype=float,
                    )
                    # Repeat the label row for every sample of this file.
                    lbls = np.tile(lbl_row, (n_rows, 1))
                    vals = np.concatenate((vals, lbls), axis=1)
                chunks.append(vals)

    cols = []
    for axes in dt_list:
        if use_magnitude:
            # Strip the axis suffix whatever its length
            # ("attitude.roll" -> "attitude", "rotationRate.x" -> "rotationRate").
            cols.append(str(axes[0]).rsplit(".", 1)[0])
        else:
            cols += axes

    if labeled:
        cols += ["act", "id", "weight", "height", "age", "gender", "trial"]

    if chunks:
        data = np.concatenate(chunks, axis=0)
    else:
        # Nothing was read: keep the column layout with zero rows.
        data = np.zeros((0, len(cols)))

    return pd.DataFrame(data=data, columns=cols)

In [12]:
# Build the labelled table holding the raw per-axis sensor values
# (mode="raw") for the selected sensors, activities and trials.
dataset = creat_time_series(dt_list, act_labels, trial_codes, mode="raw", labeled=True)

[INFO] -- Data subjects' information is imported.
[INFO] -- Creating Time-Series


`act` is the activity label and `id` is the participant (subject) index.

In [13]:
# Display the assembled table (notebook echo of the last expression).
dataset

Unnamed: 0,userAcceleration.x,userAcceleration.y,userAcceleration.z,rotationRate.x,rotationRate.y,rotationRate.z,act,id,weight,height,age,gender,trial
0,0.294894,-0.184493,0.377542,0.316738,0.778180,1.082764,0.0,0.0,102.0,188.0,46.0,1.0,1.0
1,0.219405,0.035846,0.114866,0.842032,0.424446,0.643574,0.0,0.0,102.0,188.0,46.0,1.0,1.0
2,0.010714,0.134701,-0.167808,-0.138143,-0.040741,0.343563,0.0,0.0,102.0,188.0,46.0,1.0,1.0
3,-0.008389,0.136788,0.094958,-0.025005,-1.048717,0.035860,0.0,0.0,102.0,188.0,46.0,1.0,1.0
4,0.199441,0.353996,-0.044299,0.114253,-0.912890,0.047341,0.0,0.0,102.0,188.0,46.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1412860,0.000789,-0.005937,-0.004355,-0.001312,-0.011512,0.001284,5.0,23.0,74.0,173.0,18.0,0.0,13.0
1412861,-0.000409,-0.000608,0.000098,-0.000293,-0.022169,0.001305,5.0,23.0,74.0,173.0,18.0,0.0,13.0
1412862,-0.000486,0.000711,0.002045,0.007208,-0.012616,0.003482,5.0,23.0,74.0,173.0,18.0,0.0,13.0
1412863,0.000311,-0.003395,0.004746,0.006180,-0.003029,0.004531,5.0,23.0,74.0,173.0,18.0,0.0,13.0


In [14]:
#dataset.isna().sum()

In [15]:
# Distinct subject indices and activity labels found in the table, ascending.
subject_id = sorted(dataset['id'].unique())
activity_id = sorted(dataset['act'].unique())

# One class per distinct activity label.
n_classes = len(activity_id)

print('subject_id :', subject_id)
print('activity_id :', activity_id)
print('n_classes :', n_classes)

subject_id : [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0]
activity_id : [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]
n_classes : 6


In [16]:
# Group the rows by subject and, within each subject, by activity.
# fdata receives one array per (subject, activity) pair in subject-major
# order; a pair without rows still contributes an (empty) array, so the
# position of every pair in fdata is fixed.
fdata = []
for sid in subject_id:
    data2 = dataset[dataset['id'] == sid]
    for aid in activity_id:
        # Build the mask from data2 itself.  A mask taken from the full
        # dataset has a different index, which makes pandas re-align it to
        # data2 (with a UserWarning) before it can be applied.
        data3 = data2[data2['act'] == aid]
        fdata.append(data3.to_numpy())

In [17]:
# Shapes of the first group and of group index 143 (the last one when there
# are 24 subjects x 6 activities).
np.shape(fdata[0]), np.shape(fdata[143])

((5105, 13), (14074, 13))

In [18]:
# Number of consecutive samples per window.
nsample_crop = 100

# Per-class containers: vdatac[c] collects the windows of class c and
# vlabelc[c] the matching one-hot labels.
vdatac = [[] for _ in range(n_classes)]
vlabelc = [[] for _ in range(n_classes)]

for group in fdata:
    # Cut the group into back-to-back, non-overlapping windows.  A trailing
    # remainder shorter than nsample_crop is dropped; a group of exactly
    # nsample_crop rows yields one window (it used to be skipped).
    for k in range(len(group) // nsample_crop):
        start = nsample_crop * k
        end = start + nsample_crop
        vdata_temp = group[start:end, :6]   # the six sensor columns
        vlabel_temp = group[start, 6]       # 'act' value of the window's first row
        # The label is stored as a float; the membership test accepts only
        # whole numbers in [0, n_classes), so it can index the containers
        # directly instead of being compared against every class in turn.
        if vlabel_temp in range(n_classes):
            cls = int(vlabel_temp)
            vdatac[cls].append(vdata_temp)
            vlabelc[cls].append(convert_to_one_hot(cls, n_classes))

In [19]:
# Turn each class's list of windows into one array and report its shape
# next to the shape of the class's label list.
for cls in range(n_classes):
    stacked = np.array(vdatac[cls])
    vdatac[cls] = stacked
    print(cls, class_name[cls], '--->', np.shape(stacked), np.shape(vlabelc[cls]))

0 down_stairs ---> (1307, 100, 6) (1307, 6)
1 up_stairs ---> (1562, 100, 6) (1562, 6)
2 walking ---> (3433, 100, 6) (3433, 6)
3 jogging ---> (1331, 100, 6) (1331, 6)
4 standing ---> (3051, 100, 6) (3051, 6)
5 sitting ---> (3375, 100, 6) (3375, 6)


In [20]:
# Recorded time per class; each sample is counted as 20 ms (50 Hz sampling).
for cls in range(n_classes):
    n_windows = np.shape(vdatac[cls])[0]
    minutes = n_windows*nsample_crop*20/1000/60
    print(cls, class_name[cls],'-->', round(minutes, 1), 'min / ', round(minutes/60, 2), 'hrs')

0 down_stairs --> 43.6 min /  0.73 hrs
1 up_stairs --> 52.1 min /  0.87 hrs
2 walking --> 114.4 min /  1.91 hrs
3 jogging --> 44.4 min /  0.74 hrs
4 standing --> 101.7 min /  1.7 hrs
5 sitting --> 112.5 min /  1.88 hrs


In [21]:
from sklearn.preprocessing import StandardScaler
import numpy as np

def scale_time_series_data(datas):
    """Standardise all classes with a single scaler fitted on their union.

    Args:
        datas: Sequence of arrays, one per class, each indexed as
            (window, time step, channel).  The window length and channel
            count are taken from the first entry.

    Returns:
        A tuple ``(scaled_datas, scaler)``: the scaled arrays, split back
        into the same per-class sizes and order as ``datas``, and the
        fitted ``StandardScaler``.
    """
    first_shape = np.shape(datas[0])
    length, dim = first_shape[1], first_shape[2]

    # Flatten every window of every class into rows of `dim` channel values.
    flat = np.concatenate(datas, axis=0).reshape(-1, dim)

    # One scaler for everything: fitted on, and applied to, all rows.
    scaler = StandardScaler()
    windows = scaler.fit_transform(flat).reshape(-1, length, dim)

    # Running totals of the class sizes give each class's end offset.
    counts = [len(block) for block in datas]
    stops = np.cumsum(counts)
    scaled_datas = [windows[stop - count:stop] for count, stop in zip(counts, stops)]

    return scaled_datas, scaler

In [22]:
# Normalize the data: standardise all classes with one shared scaler
datac_all_norm, scaler = scale_time_series_data(vdatac)

In [23]:
print('Shuffled data...')
# Shuffle the windows of every class in place.  A dedicated, seeded generator
# makes the order identical between runs, so the fixed-seed train/test split
# that follows always selects the same windows, and NumPy's global random
# state is left untouched.
shuffle_rng = np.random.default_rng(42)
for i in range(n_classes):
    shuffle_rng.shuffle(datac_all_norm[i])

Shuffled data...


In [24]:
from sklearn.model_selection import train_test_split

def split_data_and_labels(datas, labels, test_size=0.2):
    """Split every class separately into a train part and a test part.

    Args:
        datas: Sequence with one collection of samples per class.
        labels: Sequence with one collection of labels per class, paired
            with ``datas`` by position.
        test_size: Forwarded to ``train_test_split`` for every class.

    Returns:
        Four lists ``(data_train, data_test, labels_train, labels_test)``,
        each holding one entry per class in input order.  Every class is
        split with ``random_state=42``.
    """
    per_class = [
        train_test_split(samples, targets, test_size=test_size, random_state=42)
        for samples, targets in zip(datas, labels)
    ]

    # Each split is (data_train, data_test, labels_train, labels_test).
    data_train = [parts[0] for parts in per_class]
    data_test = [parts[1] for parts in per_class]
    labels_train = [parts[2] for parts in per_class]
    labels_test = [parts[3] for parts in per_class]

    return data_train, data_test, labels_train, labels_test

In [25]:
# Split each class of scaled windows (and its labels) with the default
# test_size of split_data_and_labels.
data_train, data_test, labels_train, labels_test = split_data_and_labels(datac_all_norm, vlabelc)

In [26]:
sz_train = []
sz_test = []

# Report and record how many train/test windows each class contributes.
for i in range(n_classes):
    print(np.shape(data_train[i]), np.shape(data_test[i]))
    sz_train.append(len(data_train[i]))
    sz_test.append(len(data_test[i]))

# Stack the classes in class order with one call per output.  This avoids
# re-copying the growing result for every class and always yields arrays,
# also when there is only a single class.
trainX = np.vstack(data_train[:n_classes])
trainy = np.vstack(labels_train[:n_classes])

testX = np.vstack(data_test[:n_classes])
testy = np.vstack(labels_test[:n_classes])

(1045, 100, 6) (262, 100, 6)
(1249, 100, 6) (313, 100, 6)
(2746, 100, 6) (687, 100, 6)
(1064, 100, 6) (267, 100, 6)
(2440, 100, 6) (611, 100, 6)
(2700, 100, 6) (675, 100, 6)


In [27]:
# Shapes of the stacked train/test data and of their label arrays,
# one per line.
print(*(np.shape(a) for a in (trainX, trainy, testX, testy)), sep='\n')

(11244, 100, 6)
(11244, 6)
(2815, 100, 6)
(2815, 6)


In [28]:
# True: reload the arrays from an existing pickle file.
# False: write the arrays currently in memory to the pickle file.
loadFromPickle = False  #True #False

In [29]:
# Target folder and file name; the name encodes the class count and the
# window length.
outdir_pickle = '../pickle/MotionSense'
fn_pickle_pub = '{}/MotionSense_class{}_len{}_pub_23.pickle'.format(outdir_pickle, n_classes, nsample_crop)
print(fn_pickle_pub)

# makedirs also creates a missing parent folder ('../pickle') and does not
# fail when the folder already exists.
os.makedirs(outdir_pickle, exist_ok=True)

# Fall back to saving when loading was requested but the file is missing.
if loadFromPickle:
    if os.path.exists(fn_pickle_pub):
        print('{} exists....OK'.format(fn_pickle_pub))
    else:
        print('{} does NOT exists....'.format(fn_pickle_pub))
        loadFromPickle = False
        print('loadFromPickle....{}'.format(loadFromPickle))


if not loadFromPickle:
    # Fixed layout of the saved list; the load branch reads the same order.
    datasave = [
        trainX,
        trainy,
        testX,
        testy,
        class_name,
        feature_name,
        scaler,
        sz_train,
        sz_test,
    ]

    # The context manager closes the file even if pickling raises.
    with open(fn_pickle_pub, 'wb') as file:
        pickle.dump(datasave, file)

    print('-------------------------------------------')
    print('# Saved files for publication\n ---> \n{}'.format(fn_pickle_pub))
    print('-------------------------------------------')

else:
    print('load from pickle files')
    # NOTE: unpickling can execute arbitrary code; only load files that were
    # written by this notebook or come from a trusted source.
    with open(fn_pickle_pub, 'rb') as file:
        data = pickle.load(file)

    (trainX, trainy, testX, testy, class_name_pub,
     feature_name, scaler, sz_train, sz_test) = data[:9]

../pickle/MotionSense/MotionSense_class6_len100_pub_23.pickle
-------------------------------------------
# Saved files for publication
 ---> 
../pickle/MotionSense/MotionSense_class6_len100_pub_23.pickle
-------------------------------------------


In [30]:
# # 데이터 시각화
# f, ax = plt.subplots(n_classes, 1, figsize=(8,8), squeeze=False, sharex=True)
# for i in range(n_classes):
#     ax[i//1, i%1].plot(data_train[i][200])
#     ax[i//1, i%1].set_title('{}'.format(class_name[i]))
# plt.tight_layout(pad=0.3)
# plt.show()