# Load Data

In [13]:
import pandas as pd
import numpy as np
import os, pickle
from tqdm import tqdm_notebook as tqdm

In [15]:
dataset_dir = '../ExtraSensory.per_uuid_features_labels'
label_dir = '../ExtraSensory.per_uuid_original_labels'

# Read every per-user feature file, attach that user's original labels, and
# stack everything into one frame indexed by (filename, <per-file index>).
#
# Frames are collected in a list and concatenated once at the end: growing a
# frame with DataFrame.append inside the loop re-copies all previous rows on
# every iteration, and DataFrame.append was removed in pandas 2.0.
keys, frames = [], []
# Sorted so the row order does not depend on the order os.listdir returns.
for filename in tqdm(sorted(os.listdir(dataset_dir))):
    # The key is everything before the first '.' of the feature file name; the
    # matching label file is '<key>.original_labels.csv.gz'.
    key = filename[:filename.find('.')]
    csv = pd.read_csv(os.path.join(dataset_dir, filename), index_col=0)
    label = pd.read_csv(os.path.join(label_dir, key+'.original_labels.csv.gz'), index_col=0)
    # 'label_source' is dropped from the label file before the index-aligned join.
    keys.append(key)
    frames.append(csv.join(label.drop('label_source', axis=1)))

# `keys` becomes the outer index level, named 'filename'.
data = pd.concat(frames, keys=keys, names=['filename']) if frames else pd.DataFrame()

HBox(children=(IntProgress(value=0, max=60), HTML(value='')))




In [16]:
# Turn the flat, colon-separated column names into a three-level column index
# (levels named '1', '2', '3') by splitting each name on ':'.
split_names = [tuple(name.split(':')) for name in data.columns]
data.columns = pd.MultiIndex.from_tuples(split_names, names=['1', '2', '3'])

In [17]:
# Every column whose top-level name starts with 'original_label'.
labels_col = [col for col in data.columns if col[0].startswith('original_label')]
# The first seven of those columns, re-keyed under a 'main_label' top level
# while keeping their second- and third-level names.
main_labels_col = [('main_label', level2, level3) for _, level2, level3 in labels_col[:7]]

In [18]:
# Copy the values of the first seven original-label columns into a new frame
# whose columns live under 'main_label', sharing the row index of `data`.
main_label_columns = pd.MultiIndex.from_tuples(main_labels_col, names=['1', '2', '3'])
df = pd.DataFrame(data[labels_col[:7]].values, columns=main_label_columns, index=data.index)

# Attach the copied columns to the full table.
data = data.join(df)

In [19]:
# Add a 'missing' indicator under 'main_label': one minus the row-wise sum of
# the other main_label columns.
#
# The 'missing' column itself is excluded from the sum so that re-running this
# cell gives the same result; summing data['main_label'] directly would include
# the previously written indicator on a second run.
main = data['main_label']
is_activity = main.columns.get_level_values(0) != 'missing'
data[('main_label', 'missing', np.nan)] = 1 - main.loc[:, is_activity].sum(axis=1)

In [20]:
# Sanity check: (rows, columns) of the combined table after the joins above.
data.shape

(377346, 401)

In [21]:
# Preview the first rows to confirm the three-level columns and the new
# 'main_label' group look as expected.
data.head()

Unnamed: 0_level_0,1,raw_acc,raw_acc,raw_acc,raw_acc,raw_acc,raw_acc,raw_acc,raw_acc,raw_acc,raw_acc,...,original_label,original_label,main_label,main_label,main_label,main_label,main_label,main_label,main_label,main_label
Unnamed: 0_level_1,2,magnitude_stats,magnitude_stats,magnitude_stats,magnitude_stats,magnitude_stats,magnitude_stats,magnitude_stats,magnitude_stats,magnitude_stats,magnitude_spectrum,...,TAKING_CARE_OF_KIDS,WITH_A_PET,LYING_DOWN,SITTING,STANDING_IN_PLACE,STANDING_AND_MOVING,WALKING,RUNNING,BICYCLING,missing
Unnamed: 0_level_2,3,mean,std,moment3,moment4,percentile25,percentile50,percentile75,value_entropy,time_entropy,log_energy_band0,...,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN
filename,timestamp,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3,Unnamed: 22_level_3
11B5EC4D-4133-4289-B475-4E737182A406,1440000455,1.004012,0.04049,0.100981,0.178834,1.000016,1.003875,1.006938,0.327604,6.683913,5.04369,...,0,0,0,1,0,0,0,0,0,0
11B5EC4D-4133-4289-B475-4E737182A406,1440000515,1.002203,0.011955,-0.004731,0.025712,0.999446,1.002202,1.004927,1.209564,6.684541,5.043621,...,0,0,0,1,0,0,0,0,0,0
11B5EC4D-4133-4289-B475-4E737182A406,1440000575,1.002147,0.00593,0.006193,0.014327,0.999737,1.002325,1.004686,1.214844,6.684594,5.043792,...,0,0,0,1,0,0,0,0,0,0
11B5EC4D-4133-4289-B475-4E737182A406,1440000635,1.001937,0.005161,-0.007264,0.015998,0.99973,1.001946,1.00422,0.9192,6.684598,5.043255,...,0,0,0,1,0,0,0,0,0,0
11B5EC4D-4133-4289-B475-4E737182A406,1440000695,1.005683,0.066262,0.166331,0.29131,0.999031,1.003126,1.008375,0.353075,6.682873,5.036964,...,0,0,0,1,0,0,0,0,0,0


# Train/test split

In [22]:
# Split by user (outer index level) so no user appears in more than one split.
#
# Users are sorted before shuffling and the shuffle uses a fixed-seed local
# RNG: iteration order of a set of strings can differ between interpreter
# runs, and an unseeded shuffle makes the split impossible to reproduce.
SPLIT_SEED = 0
users = sorted(set(data.index.get_level_values(0)))
np.random.RandomState(SPLIT_SEED).shuffle(users)

# 80% / 10% of the users (rounded down) go to train / validation; the test
# split receives whatever remains.
train_user_num = int(len(users)*0.8)
valid_user_num = int(len(users)*0.1)
test_user_num = len(users) - train_user_num - valid_user_num

In [23]:
# Slice the shuffled user list into three consecutive groups and select each
# group's rows via the outer index level.
train_users = users[:train_user_num]
valid_users = users[train_user_num:train_user_num+valid_user_num]
test_users = users[-test_user_num:]

train = data.loc[train_users]
valid = data.loc[valid_users]
test = data.loc[test_users]

In [24]:
# Row counts of the train / validation / test splits, in that order.
tuple(len(split) for split in (train, valid, test))

(311408, 37383, 28555)

In [25]:
# Persist the three splits as a single (train, valid, test) tuple.
# The context manager guarantees the file is flushed and closed even if
# pickling fails; passing a bare open(...) to pickle.dump leaves the handle
# open until garbage collection.
with open('dataset.pickle', 'wb') as f:
    pickle.dump((train, valid, test), f)

In [12]:
# Inspect the 'main_label' column group of the training split.
# NOTE(review): this cell's execution count (12) is lower than the cells above,
# and its recorded output shows empty label columns, unlike data.head() —
# presumably a stale output from an earlier run; re-run after Restart & Run All.
train['main_label']

Unnamed: 0_level_0,2,LYING_DOWN,SITTING,STANDING_IN_PLACE,STANDING_AND_MOVING,WALKING,RUNNING,BICYCLING,missing
Unnamed: 0_level_1,3,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN
filename,timestamp,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
11B5EC4D-4133-4289-B475-4E737182A406,1440000455,,,,,,,,0
11B5EC4D-4133-4289-B475-4E737182A406,1440000515,,,,,,,,0
11B5EC4D-4133-4289-B475-4E737182A406,1440000575,,,,,,,,0
11B5EC4D-4133-4289-B475-4E737182A406,1440000635,,,,,,,,0
11B5EC4D-4133-4289-B475-4E737182A406,1440000695,,,,,,,,0
11B5EC4D-4133-4289-B475-4E737182A406,1440000756,,,,,,,,0
11B5EC4D-4133-4289-B475-4E737182A406,1440000816,,,,,,,,0
11B5EC4D-4133-4289-B475-4E737182A406,1440000876,,,,,,,,0
11B5EC4D-4133-4289-B475-4E737182A406,1440000936,,,,,,,,0
11B5EC4D-4133-4289-B475-4E737182A406,1440000996,,,,,,,,0


In [138]:
# Number of missing values in each row of the training split.
train.isna().sum(axis=1)

filename                              timestamp 
11B5EC4D-4133-4289-B475-4E737182A406  1440000455     84
                                      1440000515     84
                                      1440000575     84
                                      1440000635     84
                                      1440000695     84
                                      1440000756     84
                                      1440000816     84
                                      1440000876     84
                                      1440000936     84
                                      1440000996     84
                                      1440001056     84
                                      1440001116     84
                                      1440001175     38
                                      1440001235     38
                                      1440001295     38
                                      1440001355     38
                                      1440001415     38