### Imports

In [211]:
from __future__ import print_function

# For number crunching
import numpy as np
import pandas as pd
import random
from collections import OrderedDict

# For visualisation
import matplotlib.pyplot as plt 
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns 

### Get data function

**Accelerometer and Video data contain column 't' for time in seconds**

**PIR sensor data, Annotation files, and Location files contain columns 'start' and 'stop' for time in seconds**

**Annotations also contain 'name' of activity and 'index' number corresponding to activity**

**Locations also contain 'name' of room in house and 'index' number corresponding to room**

**Only Target files contain 'start' and 'stop' in one-second intervals**

In [212]:
# sometimes annot and loc and _0 and _1; sometimes just _0
# reset index when concatting dfs

def get_data(data_type, list_of_participants):
    path_template = 'train/000{}/{}.csv'
    dfs = []
    for num in list_of_participants:
        path = path_template.format(num, data_type)
        df = pd.read_csv(path)
        df['participant'] = [num] * (int(df.shape[0]))
        dfs.append(df)
    return pd.concat(dfs, ignore_index=True)
        
participants = ['01','02','03','04','05','06','07','08','09','10']
file_names = ['acceleration','pir','targets','video_hallway','video_kitchen','video_living_room']

### Train Test Split (divide participants to avoid contaminating validation data)

In [213]:
# random selection: 30% participants for holdout 

np.random.seed(42)

test_participants = np.random.choice(participants, size=int(len(participants) * .30), replace=False)
test_participants


array(['09', '02', '06'], dtype='<U2')

In [214]:
train_participants = np.setdiff1d(np.array(participants), test_participants)
train_participants

array(['01', '03', '04', '05', '07', '08', '10'], dtype='<U2')

### Pre-processing

*Start with accelerometer data only:*
    * generate datetime seconds, minutes, and hours from column 't'
    * create column 'sec' listing total time in seconds in datetime format
    * ensure seconds are rounded to lowest integer (so we can groupby whole seconds)

In [215]:
# get acceleration data 
# later we will also get video data in this same step
#for name in file_names:

df_acc = get_data('acceleration', participants)

In [216]:
df_acc.tail()

Unnamed: 0,t,x,y,z,Kitchen_AP,Lounge_AP,Upstairs_AP,Study_AP,participant
301161,1798.325952,-0.89,0.352,0.12,-81.0,,-88.0,-78.0,10
301162,1798.376,-0.872,0.364,0.118,-81.0,,-88.0,-78.0,10
301163,1798.425856,-0.866,0.344,0.126,-82.0,,-89.0,-78.0,10
301164,1798.475904,-0.874,0.358,0.112,-82.0,,-89.0,-78.0,10
301165,1798.525952,-0.872,0.366,0.126,-82.0,,-89.0,-78.0,10


#### Format acceleration data into one-second intervals

In [217]:
# format accleration data into one-second intervals

seconds = pd.to_datetime(df_acc['t'], unit='s').dt.second
minutes = pd.to_datetime(df_acc['t'], unit='s').dt.minute
hours = pd.to_datetime(df_acc['t'], unit='s').dt.hour
df_acc['sec'] = seconds + (minutes * 60) + (hours * 60 * 60)
# seconds rounded to lowest int

In [218]:
df_acc.tail()

Unnamed: 0,t,x,y,z,Kitchen_AP,Lounge_AP,Upstairs_AP,Study_AP,participant,sec
301161,1798.325952,-0.89,0.352,0.12,-81.0,,-88.0,-78.0,10,1798
301162,1798.376,-0.872,0.364,0.118,-81.0,,-88.0,-78.0,10,1798
301163,1798.425856,-0.866,0.344,0.126,-82.0,,-89.0,-78.0,10,1798
301164,1798.475904,-0.874,0.358,0.112,-82.0,,-89.0,-78.0,10,1798
301165,1798.525952,-0.872,0.366,0.126,-82.0,,-89.0,-78.0,10,1798


**Investigate NaNs**

In [219]:
df_acc[(df_acc['sec'] == 105) & (df_acc['participant'] == '06')]

Unnamed: 0,t,x,y,z,Kitchen_AP,Lounge_AP,Upstairs_AP,Study_AP,participant,sec
163465,105.003,-0.932,0.288,0.088,-103.0,,,,6,105


In [220]:
df_acc.loc[163460:163470,:]

Unnamed: 0,t,x,y,z,Kitchen_AP,Lounge_AP,Upstairs_AP,Study_AP,participant,sec
163460,104.752952,-0.876,0.252,0.158,-96.0,,,,6,104
163461,104.803,-0.9,0.232,0.122,-96.0,,,,6,104
163462,104.852856,-0.946,0.246,0.156,-103.0,,,,6,104
163463,104.902904,-1.06,0.28,0.286,-103.0,,,,6,104
163464,104.952952,-0.974,0.24,0.214,-103.0,,,,6,104
163465,105.003,-0.932,0.288,0.088,-103.0,,,,6,105
163466,107.255856,-0.824,0.32,0.128,-97.0,,,,6,107
163467,107.305904,-0.848,0.316,0.198,-97.0,,,,6,107
163468,107.355952,-0.904,0.294,0.182,-97.0,,,,6,107
163469,107.406,-0.914,0.296,0.136,-97.0,,,,6,107


In [143]:
df_acc[(df_acc['sec'] > 100) & (df_acc['participant'] == '06')]

Unnamed: 0,t,x,y,z,Kitchen_AP,Lounge_AP,Upstairs_AP,Study_AP,participant,sec
163450,101.649856,-0.976,0.184,0.112,-97.0,,,,06,101
163451,101.699904,-0.930,0.226,0.132,-97.0,,,,06,101
163452,101.749952,-0.850,0.126,0.070,-97.0,,,,06,101
163453,101.800000,-1.344,0.172,-0.024,-97.0,,,,06,101
163454,103.851856,-0.912,0.248,0.172,-102.0,,,,06,103
163455,103.901904,-0.906,0.254,0.142,-102.0,,,,06,103
163456,103.951952,-0.904,0.248,0.124,-102.0,,,,06,103
163457,104.002000,-0.912,0.244,0.126,-102.0,,,,06,104
163458,104.652856,-0.910,0.216,0.184,-96.0,,,,06,104
163459,104.702904,-0.880,0.280,0.152,-96.0,,,,06,104


#### Groupby

In [221]:
# groupby one-second intervals
# create columns for x,y,z features
df_interval_acc = (df_acc.groupby(['sec','participant'], as_index=False)
                   .agg(OrderedDict(
                                   [('x',['mean','median','std','min','max']),
                                   ('y',['mean','median','std','min','max']),
                                   ('z',['mean','median','std','min','max']),
                                   ('Kitchen_AP','mean'),
                                   ('Lounge_AP','mean'),
                                   ('Upstairs_AP','mean'),
                                   ('Study_AP','mean')])))

In [222]:
df_interval_acc.head()

Unnamed: 0_level_0,sec,participant,x,x,x,x,x,y,y,y,y,y,z,z,z,z,z,Kitchen_AP,Lounge_AP,Upstairs_AP,Study_AP
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,median,std,min,max,mean,median,std,...,max,mean,median,std,min,max,mean,mean,mean,mean
0,0,1,0.9392,0.944,0.012556,0.896,0.952,-0.2797,-0.28,0.007205,...,-0.266,0.1461,0.144,0.008271,0.13,0.16,-91.6,-89.8,-79.2,
1,0,2,0.8619,0.863,0.015993,0.812,0.888,-0.2961,-0.304,0.024516,...,-0.24,0.4472,0.446,0.015686,0.416,0.48,-93.466667,-79.15,-82.65,
2,0,3,0.893263,0.896,0.007781,0.88,0.904,-0.372842,-0.376,0.012496,...,-0.338,0.250526,0.252,0.010389,0.216,0.262,-88.157895,-75.789474,-81.105263,
3,0,4,0.9061,0.906,0.005251,0.896,0.918,-0.3866,-0.386,0.006621,...,-0.372,0.0915,0.092,0.007222,0.078,0.104,-82.0,-79.0,-81.6,
4,0,5,-0.8459,-0.848,0.007772,-0.858,-0.832,0.3158,0.316,0.006354,...,0.326,0.3869,0.384,0.008397,0.376,0.406,-84.65,-79.0,-92.5625,


In [223]:
df_interval_acc.columns = ['_'.join(col).rstrip('_') for col in df_interval_acc.columns.values]

In [224]:
df_interval_acc.columns

Index(['sec', 'participant', 'x_mean', 'x_median', 'x_std', 'x_min', 'x_max',
       'y_mean', 'y_median', 'y_std', 'y_min', 'y_max', 'z_mean', 'z_median',
       'z_std', 'z_min', 'z_max', 'Kitchen_AP_mean', 'Lounge_AP_mean',
       'Upstairs_AP_mean', 'Study_AP_mean'],
      dtype='object')

In [225]:
df_interval_acc.participant.value_counts()

01    1824
10    1767
08    1699
02    1690
09    1676
03    1597
04    1574
05    1564
07    1521
06    1014
Name: participant, dtype: int64

In [226]:
df_interval_acc.head()

Unnamed: 0,sec,participant,x_mean,x_median,x_std,x_min,x_max,y_mean,y_median,y_std,...,y_max,z_mean,z_median,z_std,z_min,z_max,Kitchen_AP_mean,Lounge_AP_mean,Upstairs_AP_mean,Study_AP_mean
0,0,1,0.9392,0.944,0.012556,0.896,0.952,-0.2797,-0.28,0.007205,...,-0.266,0.1461,0.144,0.008271,0.13,0.16,-91.6,-89.8,-79.2,
1,0,2,0.8619,0.863,0.015993,0.812,0.888,-0.2961,-0.304,0.024516,...,-0.24,0.4472,0.446,0.015686,0.416,0.48,-93.466667,-79.15,-82.65,
2,0,3,0.893263,0.896,0.007781,0.88,0.904,-0.372842,-0.376,0.012496,...,-0.338,0.250526,0.252,0.010389,0.216,0.262,-88.157895,-75.789474,-81.105263,
3,0,4,0.9061,0.906,0.005251,0.896,0.918,-0.3866,-0.386,0.006621,...,-0.372,0.0915,0.092,0.007222,0.078,0.104,-82.0,-79.0,-81.6,
4,0,5,-0.8459,-0.848,0.007772,-0.858,-0.832,0.3158,0.316,0.006354,...,0.326,0.3869,0.384,0.008397,0.376,0.406,-84.65,-79.0,-92.5625,


**Rename column 'sec' as 'start' to match with Target data**

In [227]:
df_interval_acc.rename(columns={'sec':'start'},inplace=True)

In [228]:
df_interval_acc.head()

Unnamed: 0,start,participant,x_mean,x_median,x_std,x_min,x_max,y_mean,y_median,y_std,...,y_max,z_mean,z_median,z_std,z_min,z_max,Kitchen_AP_mean,Lounge_AP_mean,Upstairs_AP_mean,Study_AP_mean
0,0,1,0.9392,0.944,0.012556,0.896,0.952,-0.2797,-0.28,0.007205,...,-0.266,0.1461,0.144,0.008271,0.13,0.16,-91.6,-89.8,-79.2,
1,0,2,0.8619,0.863,0.015993,0.812,0.888,-0.2961,-0.304,0.024516,...,-0.24,0.4472,0.446,0.015686,0.416,0.48,-93.466667,-79.15,-82.65,
2,0,3,0.893263,0.896,0.007781,0.88,0.904,-0.372842,-0.376,0.012496,...,-0.338,0.250526,0.252,0.010389,0.216,0.262,-88.157895,-75.789474,-81.105263,
3,0,4,0.9061,0.906,0.005251,0.896,0.918,-0.3866,-0.386,0.006621,...,-0.372,0.0915,0.092,0.007222,0.078,0.104,-82.0,-79.0,-81.6,
4,0,5,-0.8459,-0.848,0.007772,-0.858,-0.832,0.3158,0.316,0.006354,...,0.326,0.3869,0.384,0.008397,0.376,0.406,-84.65,-79.0,-92.5625,


**Handle missing values**

In [229]:
df_interval_acc.isnull().sum()

start                   0
participant             0
x_mean                  0
x_median                0
x_std                  32
x_min                   0
x_max                   0
y_mean                  0
y_median                0
y_std                  32
y_min                   0
y_max                   0
z_mean                  0
z_median                0
z_std                  32
z_min                   0
z_max                   0
Kitchen_AP_mean      9781
Lounge_AP_mean      10399
Upstairs_AP_mean     6846
Study_AP_mean       11968
dtype: int64

In [230]:
df_interval_acc.shape

(15926, 21)

In [231]:
df_interval_acc[df_interval_acc['x_std'].isnull()]

Unnamed: 0,start,participant,x_mean,x_median,x_std,x_min,x_max,y_mean,y_median,y_std,...,y_max,z_mean,z_median,z_std,z_min,z_max,Kitchen_AP_mean,Lounge_AP_mean,Upstairs_AP_mean,Study_AP_mean
1016,105,6,-0.932,-0.932,,-0.932,-0.932,0.288,0.288,,...,0.288,0.088,0.088,,0.088,0.088,-103.0,,,
1672,178,7,-0.866,-0.866,,-0.866,-0.866,0.39,0.39,,...,0.39,0.176,0.176,,0.176,0.176,,,,-87.0
1689,180,10,-0.738,-0.738,,-0.738,-0.738,0.318,0.318,,...,0.318,0.632,0.632,,0.632,0.632,,,,-87.0
2348,252,9,0.936,0.936,,0.936,0.936,-0.318,-0.318,,...,-0.318,0.066,0.066,,0.066,0.066,,,,-101.0
2449,263,8,-0.81,-0.81,,-0.81,-0.81,-0.01,-0.01,,...,-0.01,-0.634,-0.634,,-0.634,-0.634,,,,-95.0
3200,344,9,0.18,0.18,,0.18,0.18,-0.858,-0.858,,...,-0.858,0.52,0.52,,0.52,0.52,,,-102.0,
3299,356,8,-0.096,-0.096,,-0.096,-0.096,0.846,0.846,,...,0.846,-0.148,-0.148,,-0.148,-0.148,,,,-105.0
4943,525,6,0.176,0.176,,0.176,0.176,0.758,0.758,,...,0.758,0.348,0.348,,0.348,0.348,,,-99.0,
5048,536,6,0.646,0.646,,0.646,0.646,0.616,0.616,,...,0.616,0.536,0.536,,0.536,0.536,,,-102.0,
5179,550,6,0.834,0.834,,0.834,0.834,0.428,0.428,,...,0.428,0.432,0.432,,0.432,0.432,,,-102.0,


In [232]:
df_interval_acc.iloc[1010:1020,:]

Unnamed: 0,start,participant,x_mean,x_median,x_std,x_min,x_max,y_mean,y_median,y_std,...,y_max,z_mean,z_median,z_std,z_min,z_max,Kitchen_AP_mean,Lounge_AP_mean,Upstairs_AP_mean,Study_AP_mean
1010,104,10,-0.8922,-0.897,0.013137,-0.912,-0.856,0.2342,0.228,0.033934,...,0.374,0.2773,0.279,0.010588,0.262,0.312,,,,-89.6
1011,105,1,0.9396,0.94,0.027565,0.884,0.976,-0.1635,-0.22,0.137962,...,0.158,0.1116,0.111,0.014848,0.088,0.134,,-73.3,,
1012,105,2,0.085294,0.102,0.195817,-0.212,0.404,0.152471,0.064,0.293981,...,0.822,0.991412,1.03,0.303008,0.572,1.504,,-72.823529,-99.0,
1013,105,3,0.8811,0.97,0.333445,0.142,1.388,-0.3864,-0.392,0.115692,...,-0.208,0.3936,0.416,0.087932,0.21,0.542,-104.0,-74.0,-100.0,
1014,105,4,-0.8621,-0.863,0.006138,-0.872,-0.848,-0.3709,-0.371,0.012061,...,-0.346,0.2298,0.23,0.011533,0.214,0.254,-100.0,-66.9,,
1015,105,5,-0.8673,-0.866,0.022396,-0.912,-0.828,0.296,0.302,0.022544,...,0.352,0.3268,0.32,0.037576,0.264,0.4,,-67.2,,
1016,105,6,-0.932,-0.932,,-0.932,-0.932,0.288,0.288,,...,0.288,0.088,0.088,,0.088,0.088,-103.0,,,
1017,105,7,-0.4675,-0.445,0.441115,-0.896,-0.084,-0.1105,-0.109,0.085047,...,-0.034,0.7745,0.78,0.174687,0.576,0.962,,,,-92.5
1018,105,8,-0.805875,-0.818,0.026102,-0.848,-0.758,0.492,0.478,0.034695,...,0.556,0.1755,0.175,0.013074,0.158,0.196,,,,-87.5
1019,105,10,-0.8607,-0.861,0.091178,-1.07,-0.702,0.388,0.371,0.083613,...,0.56,0.1278,0.061,0.149444,-0.05,0.37,,,,-88.0


In [233]:
df_interval_acc[['x_std','y_std','z_std']] = df_interval_acc[['x_std','y_std','z_std']].fillna(0)

In [234]:
df_interval_acc.isnull().sum()

start                   0
participant             0
x_mean                  0
x_median                0
x_std                   0
x_min                   0
x_max                   0
y_mean                  0
y_median                0
y_std                   0
y_min                   0
y_max                   0
z_mean                  0
z_median                0
z_std                   0
z_min                   0
z_max                   0
Kitchen_AP_mean      9781
Lounge_AP_mean      10399
Upstairs_AP_mean     6846
Study_AP_mean       11968
dtype: int64

In [235]:
df_interval_acc.head()

Unnamed: 0,start,participant,x_mean,x_median,x_std,x_min,x_max,y_mean,y_median,y_std,...,y_max,z_mean,z_median,z_std,z_min,z_max,Kitchen_AP_mean,Lounge_AP_mean,Upstairs_AP_mean,Study_AP_mean
0,0,1,0.9392,0.944,0.012556,0.896,0.952,-0.2797,-0.28,0.007205,...,-0.266,0.1461,0.144,0.008271,0.13,0.16,-91.6,-89.8,-79.2,
1,0,2,0.8619,0.863,0.015993,0.812,0.888,-0.2961,-0.304,0.024516,...,-0.24,0.4472,0.446,0.015686,0.416,0.48,-93.466667,-79.15,-82.65,
2,0,3,0.893263,0.896,0.007781,0.88,0.904,-0.372842,-0.376,0.012496,...,-0.338,0.250526,0.252,0.010389,0.216,0.262,-88.157895,-75.789474,-81.105263,
3,0,4,0.9061,0.906,0.005251,0.896,0.918,-0.3866,-0.386,0.006621,...,-0.372,0.0915,0.092,0.007222,0.078,0.104,-82.0,-79.0,-81.6,
4,0,5,-0.8459,-0.848,0.007772,-0.858,-0.832,0.3158,0.316,0.006354,...,0.326,0.3869,0.384,0.008397,0.376,0.406,-84.65,-79.0,-92.5625,


**Impute values for missing environmental sensor data**

Fill with 0 since nulls indicated participant was not close enough to register, meaning they were not in the room

In [236]:
"""
Kitchen_AP_mean      9781
Lounge_AP_mean      10399
Upstairs_AP_mean     6846
Study_AP_mean       11968
"""

df_interval_acc[['Kitchen_AP_mean','Lounge_AP_mean','Upstairs_AP_mean','Study_AP_mean']] = \
    df_interval_acc[['Kitchen_AP_mean','Lounge_AP_mean','Upstairs_AP_mean','Study_AP_mean']].fillna(0)

In [237]:
df_interval_acc.isnull().sum()

start               0
participant         0
x_mean              0
x_median            0
x_std               0
x_min               0
x_max               0
y_mean              0
y_median            0
y_std               0
y_min               0
y_max               0
z_mean              0
z_median            0
z_std               0
z_min               0
z_max               0
Kitchen_AP_mean     0
Lounge_AP_mean      0
Upstairs_AP_mean    0
Study_AP_mean       0
dtype: int64

In [238]:
df_interval_acc.shape

(15926, 21)

In [239]:
df_interval_acc.head()

Unnamed: 0,start,participant,x_mean,x_median,x_std,x_min,x_max,y_mean,y_median,y_std,...,y_max,z_mean,z_median,z_std,z_min,z_max,Kitchen_AP_mean,Lounge_AP_mean,Upstairs_AP_mean,Study_AP_mean
0,0,1,0.9392,0.944,0.012556,0.896,0.952,-0.2797,-0.28,0.007205,...,-0.266,0.1461,0.144,0.008271,0.13,0.16,-91.6,-89.8,-79.2,0.0
1,0,2,0.8619,0.863,0.015993,0.812,0.888,-0.2961,-0.304,0.024516,...,-0.24,0.4472,0.446,0.015686,0.416,0.48,-93.466667,-79.15,-82.65,0.0
2,0,3,0.893263,0.896,0.007781,0.88,0.904,-0.372842,-0.376,0.012496,...,-0.338,0.250526,0.252,0.010389,0.216,0.262,-88.157895,-75.789474,-81.105263,0.0
3,0,4,0.9061,0.906,0.005251,0.896,0.918,-0.3866,-0.386,0.006621,...,-0.372,0.0915,0.092,0.007222,0.078,0.104,-82.0,-79.0,-81.6,0.0
4,0,5,-0.8459,-0.848,0.007772,-0.858,-0.832,0.3158,0.316,0.006354,...,0.326,0.3869,0.384,0.008397,0.376,0.406,-84.65,-79.0,-92.5625,0.0


### Additional data

**Target files were generated from annotation and location files; don't worry abou these for now**

In [31]:
def reviewer_averages(files, participants):
    path = 'train/000{}/{}.csv'
    #path_1 = 'train/000{}/annotations_1.csv'
    dfs = []
    for num in participants:
        df_0 = pd.read_csv(path.format(num,files[0]))
        try:
            df_1 = pd.read_csv(path.format(num,files[1]))
            print('found df_1')
            df = (df_0 + df_1)/2
            print(df[0])
            print('averaged reviews')
        except:
            df = df_0
            print('only df_0 is available')
        dfs.append(df)
    
    return pd.concat(dfs)

annot_files = ['annotations_0','annotations_1']
location_files = ['location_0','location_1']

In [32]:
annot_dfs = reviewer_averages(annot_files, participant_nums)

found df_1
only df_0 is available
found df_1
only df_0 is available
only df_0 is available
found df_1
only df_0 is available
only df_0 is available
only df_0 is available
found df_1
only df_0 is available
only df_0 is available
only df_0 is available
only df_0 is available


In [9]:
annot_dfs.shape

(4562, 4)

In [14]:
loc_dfs = reviewer_averages(location_files, participant_nums)

only df_0 is available
only df_0 is available
only df_0 is available
only df_0 is available
only df_0 is available
only df_0 is available
only df_0 is available
only df_0 is available
only df_0 is available
only df_0 is available


In [11]:
loc_dfs.shape

(194, 4)

In [34]:
df1 = pd.read_csv('train/00001/annotations_0.csv')
df2 = pd.read_csv('train/00001/annotations_1.csv')
print(df1.head())
print(df2.head())

    start     end     name  index
0  45.930  49.730   a_walk      4
1  49.843  53.886   t_turn     19
2  53.958  58.727  p_stand     10
3  58.821  60.821   t_turn     19
4  60.892  61.240  p_stand     10
   start    end     name  index
0  45.93  47.93   a_walk      4
1  47.93  75.33  p_stand     10
2  75.33  80.33   a_walk      4
3  80.33  85.01  p_stand     10
4  85.01  87.01   t_turn     19


In [43]:
check1 = [i for i in df1['index']]
check2 = [i for i in df2['index']]
check1 == check2

False

In [36]:
df = pd.DataFrame()
for idx in df1['start']:
    if df1['start'][idx] > 0 and df2['start'][idx] > 0:
        df['start'] = (df1['start'][idx] + df2['start'][idx])/2
    elif 

TypeError: Could not operate 2 with block values unsupported operand type(s) for /: 'str' and 'int'

### Questions for Joe

* Dealing with missing Target Data
    * entire rows missing -- drop altogether? YES
        * if time later, build model to fill these values
    * rows where standard deviation could not be calculated b/c only one reading over that interval?
        * doesn't make sense to ffill/backfill b/c would be taking value from a different participant, likely in a totally different movement; FILL WITH ZERO
        * also question about row 1086 -- using mask resets index? NO; MAKE SURE TO RESET INDEX WHEN APPENDING DATAFRAMES
    * environmental sensor data only appears when sensor registered, so impute nulls with 0 - YES

### Align with Target data

**Drop rows with missing values in target data**

**In later development stages, consider modeling these values instead of dropping them**

In [252]:
# get target data

df_targets = get_data('targets', participants)

In [253]:
df_targets.isnull().sum()

start              0
end                0
a_ascend         507
a_descend        507
a_jump           507
a_loadwalk       507
a_walk           507
p_bent           507
p_kneel          507
p_lie            507
p_sit            507
p_squat          507
p_stand          507
t_bend           507
t_kneel_stand    507
t_lie_sit        507
t_sit_lie        507
t_sit_stand      507
t_stand_kneel    507
t_stand_sit      507
t_straighten     507
t_turn           507
participant        0
dtype: int64

In [254]:
df_targets.shape

(16611, 23)

In [255]:
df_targets.head()

Unnamed: 0,start,end,a_ascend,a_descend,a_jump,a_loadwalk,a_walk,p_bent,p_kneel,p_lie,...,t_bend,t_kneel_stand,t_lie_sit,t_sit_lie,t_sit_stand,t_stand_kneel,t_stand_sit,t_straighten,t_turn,participant
0,0.0,1.0,,,,,,,,,...,,,,,,,,,,1
1,1.0,2.0,,,,,,,,,...,,,,,,,,,,1
2,2.0,3.0,,,,,,,,,...,,,,,,,,,,1
3,3.0,4.0,,,,,,,,,...,,,,,,,,,,1
4,4.0,5.0,,,,,,,,,...,,,,,,,,,,1


In [256]:
df_targets = df_targets.dropna().reset_index(drop=True)

In [257]:
df_targets.head()

Unnamed: 0,start,end,a_ascend,a_descend,a_jump,a_loadwalk,a_walk,p_bent,p_kneel,p_lie,...,t_bend,t_kneel_stand,t_lie_sit,t_sit_lie,t_sit_stand,t_stand_kneel,t_stand_sit,t_straighten,t_turn,participant
0,46.0,47.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,47.0,48.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,48.0,49.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,49.0,50.0,0.0,0.0,0.0,0.0,0.4,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,1
4,50.0,51.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,1


In [258]:
df_targets.isnull().sum()

start            0
end              0
a_ascend         0
a_descend        0
a_jump           0
a_loadwalk       0
a_walk           0
p_bent           0
p_kneel          0
p_lie            0
p_sit            0
p_squat          0
p_stand          0
t_bend           0
t_kneel_stand    0
t_lie_sit        0
t_sit_lie        0
t_sit_stand      0
t_stand_kneel    0
t_stand_sit      0
t_straighten     0
t_turn           0
participant      0
dtype: int64

In [259]:
df_targets.shape

(16104, 23)

In [260]:
df_targets.head()

Unnamed: 0,start,end,a_ascend,a_descend,a_jump,a_loadwalk,a_walk,p_bent,p_kneel,p_lie,...,t_bend,t_kneel_stand,t_lie_sit,t_sit_lie,t_sit_stand,t_stand_kneel,t_stand_sit,t_straighten,t_turn,participant
0,46.0,47.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,47.0,48.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,48.0,49.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,49.0,50.0,0.0,0.0,0.0,0.0,0.4,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,1
4,50.0,51.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,1


**Join features with target**

In [261]:
# can't join on right b/c somehow there are more target values than train values...???
#df_xy = df_interval_acc.merge(df_targets, on=['participant','start'], how='right')

df_xy = df_interval_acc.merge(df_targets, on=['participant','start'], how='left')

In [262]:
df_xy.head()

Unnamed: 0,start,participant,x_mean,x_median,x_std,x_min,x_max,y_mean,y_median,y_std,...,p_stand,t_bend,t_kneel_stand,t_lie_sit,t_sit_lie,t_sit_stand,t_stand_kneel,t_stand_sit,t_straighten,t_turn
0,0,1,0.9392,0.944,0.012556,0.896,0.952,-0.2797,-0.28,0.007205,...,,,,,,,,,,
1,0,2,0.8619,0.863,0.015993,0.812,0.888,-0.2961,-0.304,0.024516,...,,,,,,,,,,
2,0,3,0.893263,0.896,0.007781,0.88,0.904,-0.372842,-0.376,0.012496,...,,,,,,,,,,
3,0,4,0.9061,0.906,0.005251,0.896,0.918,-0.3866,-0.386,0.006621,...,,,,,,,,,,
4,0,5,-0.8459,-0.848,0.007772,-0.858,-0.832,0.3158,0.316,0.006354,...,,,,,,,,,,


In [263]:
df_xy['participant'].value_counts()

01    1824
10    1767
08    1699
02    1690
09    1676
03    1597
04    1574
05    1564
07    1521
06    1014
Name: participant, dtype: int64

In [264]:
df_xy.columns

Index(['start', 'participant', 'x_mean', 'x_median', 'x_std', 'x_min', 'x_max',
       'y_mean', 'y_median', 'y_std', 'y_min', 'y_max', 'z_mean', 'z_median',
       'z_std', 'z_min', 'z_max', 'Kitchen_AP_mean', 'Lounge_AP_mean',
       'Upstairs_AP_mean', 'Study_AP_mean', 'end', 'a_ascend', 'a_descend',
       'a_jump', 'a_loadwalk', 'a_walk', 'p_bent', 'p_kneel', 'p_lie', 'p_sit',
       'p_squat', 'p_stand', 't_bend', 't_kneel_stand', 't_lie_sit',
       't_sit_lie', 't_sit_stand', 't_stand_kneel', 't_stand_sit',
       't_straighten', 't_turn'],
      dtype='object')

In [265]:
df_xy.shape

(15926, 42)

In [266]:
df_xy.isnull().sum()

start                 0
participant           0
x_mean                0
x_median              0
x_std                 0
x_min                 0
x_max                 0
y_mean                0
y_median              0
y_std                 0
y_min                 0
y_max                 0
z_mean                0
z_median              0
z_std                 0
z_min                 0
z_max                 0
Kitchen_AP_mean       0
Lounge_AP_mean        0
Upstairs_AP_mean      0
Study_AP_mean         0
end                 505
a_ascend            505
a_descend           505
a_jump              505
a_loadwalk          505
a_walk              505
p_bent              505
p_kneel             505
p_lie               505
p_sit               505
p_squat             505
p_stand             505
t_bend              505
t_kneel_stand       505
t_lie_sit           505
t_sit_lie           505
t_sit_stand         505
t_stand_kneel       505
t_stand_sit         505
t_straighten        505
t_turn          

In [269]:
df_xy = df_xy.dropna().reset_index(drop=True)

In [270]:
df_xy.shape

(15421, 42)

In [271]:
df_xy.head()

Unnamed: 0,start,participant,x_mean,x_median,x_std,x_min,x_max,y_mean,y_median,y_std,...,p_stand,t_bend,t_kneel_stand,t_lie_sit,t_sit_lie,t_sit_stand,t_stand_kneel,t_stand_sit,t_straighten,t_turn
0,7,7,-0.7077,-0.707,0.011337,-0.728,-0.69,0.3843,0.396,0.027791,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,8,7,-0.7125,-0.712,0.005871,-0.722,-0.706,0.4024,0.404,0.007007,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,9,7,-0.7109,-0.712,0.00533,-0.72,-0.704,0.4068,0.408,0.005745,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10,7,-0.7097,-0.709,0.005243,-0.72,-0.698,0.4073,0.408,0.007116,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10,9,0.944,0.944,0.010583,0.92,0.958,-0.2858,-0.284,0.032078,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [272]:
df_xy.isnull().sum()

start               0
participant         0
x_mean              0
x_median            0
x_std               0
x_min               0
x_max               0
y_mean              0
y_median            0
y_std               0
y_min               0
y_max               0
z_mean              0
z_median            0
z_std               0
z_min               0
z_max               0
Kitchen_AP_mean     0
Lounge_AP_mean      0
Upstairs_AP_mean    0
Study_AP_mean       0
end                 0
a_ascend            0
a_descend           0
a_jump              0
a_loadwalk          0
a_walk              0
p_bent              0
p_kneel             0
p_lie               0
p_sit               0
p_squat             0
p_stand             0
t_bend              0
t_kneel_stand       0
t_lie_sit           0
t_sit_lie           0
t_sit_stand         0
t_stand_kneel       0
t_stand_sit         0
t_straighten        0
t_turn              0
dtype: int64

In [276]:
df_xy.to_csv('sphere_xy.csv', index=False)


In [289]:
import csv
df_xy.to_csv('sphere_xy_2.csv', index=False, quoting=csv.QUOTE_NONNUMERIC)

In [277]:
df_xy.head()

Unnamed: 0,start,participant,x_mean,x_median,x_std,x_min,x_max,y_mean,y_median,y_std,...,p_stand,t_bend,t_kneel_stand,t_lie_sit,t_sit_lie,t_sit_stand,t_stand_kneel,t_stand_sit,t_straighten,t_turn
0,7,7,-0.7077,-0.707,0.011337,-0.728,-0.69,0.3843,0.396,0.027791,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,8,7,-0.7125,-0.712,0.005871,-0.722,-0.706,0.4024,0.404,0.007007,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,9,7,-0.7109,-0.712,0.00533,-0.72,-0.704,0.4068,0.408,0.005745,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10,7,-0.7097,-0.709,0.005243,-0.72,-0.698,0.4073,0.408,0.007116,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10,9,0.944,0.944,0.010583,0.92,0.958,-0.2858,-0.284,0.032078,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [281]:
df_xy.dtypes

start                object
participant          object
x_mean              float64
x_median            float64
x_std               float64
x_min               float64
x_max               float64
y_mean              float64
y_median            float64
y_std               float64
y_min               float64
y_max               float64
z_mean              float64
z_median            float64
z_std               float64
z_min               float64
z_max               float64
Kitchen_AP_mean     float64
Lounge_AP_mean      float64
Upstairs_AP_mean    float64
Study_AP_mean       float64
end                 float64
a_ascend            float64
a_descend           float64
a_jump              float64
a_loadwalk          float64
a_walk              float64
p_bent              float64
p_kneel             float64
p_lie               float64
p_sit               float64
p_squat             float64
p_stand             float64
t_bend              float64
t_kneel_stand       float64
t_lie_sit           

### Train Test Split df_xy

In [200]:
# populate training data with users not in the test set
# populate holdout test data with users from the test set

df_xy_train = df_xy[~df_xy['participant'].isin(test_participants)]

df_xy_test = df_xy[df_xy['participant'].isin(test_participants)] 

X_train = df_xy_train[['x_mean', 'x_median', 'x_std', 'x_min', 'x_max',
       'y_mean', 'y_median', 'y_std', 'y_min', 'y_max', 'z_mean', 'z_median',
       'z_std', 'z_min', 'z_max', 'Kitchen_AP_mean', 'Lounge_AP_mean',
       'Upstairs_AP_mean', 'Study_AP_mean']]

y_train = df_xy_train[['a_ascend', 'a_descend',
       'a_jump', 'a_loadwalk', 'a_walk', 'p_bent', 'p_kneel', 'p_lie', 'p_sit',
       'p_squat', 'p_stand', 't_bend', 't_kneel_stand', 't_lie_sit',
       't_sit_lie', 't_sit_stand', 't_stand_kneel', 't_stand_sit',
       't_straighten', 't_turn']]

X_test = df_xy_test[['x_mean', 'x_median', 'x_std', 'x_min', 'x_max',
       'y_mean', 'y_median', 'y_std', 'y_min', 'y_max', 'z_mean', 'z_median',
       'z_std', 'z_min', 'z_max', 'Kitchen_AP_mean', 'Lounge_AP_mean',
       'Upstairs_AP_mean', 'Study_AP_mean']]

y_test = df_xy_test[['a_ascend', 'a_descend',
       'a_jump', 'a_loadwalk', 'a_walk', 'p_bent', 'p_kneel', 'p_lie', 'p_sit',
       'p_squat', 'p_stand', 't_bend', 't_kneel_stand', 't_lie_sit',
       't_sit_lie', 't_sit_stand', 't_stand_kneel', 't_stand_sit',
       't_straighten', 't_turn']]

In [201]:
X_train.head(3)

Unnamed: 0,x_mean,x_median,x_std,x_min,x_max,y_mean,y_median,y_std,y_min,y_max,z_mean,z_median,z_std,z_min,z_max,Kitchen_AP_mean,Lounge_AP_mean,Upstairs_AP_mean,Study_AP_mean
76,-0.7077,-0.707,0.011337,-0.728,-0.69,0.3843,0.396,0.027791,0.326,0.414,0.5921,0.585,0.023328,0.566,0.658,-91.6,0.0,-83.5,-83.0
86,-0.7125,-0.712,0.005871,-0.722,-0.706,0.4024,0.404,0.007007,0.386,0.414,0.5735,0.574,0.006985,0.562,0.588,-91.8,0.0,-80.0,-83.0
96,-0.7109,-0.712,0.00533,-0.72,-0.704,0.4068,0.408,0.005745,0.398,0.414,0.5736,0.574,0.00791,0.556,0.59,-91.2,0.0,-80.0,-83.4


In [202]:
y_train.head(3)

Unnamed: 0,a_ascend,a_descend,a_jump,a_loadwalk,a_walk,p_bent,p_kneel,p_lie,p_sit,p_squat,p_stand,t_bend,t_kneel_stand,t_lie_sit,t_sit_lie,t_sit_stand,t_stand_kneel,t_stand_sit,t_straighten,t_turn
76,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
86,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [203]:
X_test.head(3)

Unnamed: 0,x_mean,x_median,x_std,x_min,x_max,y_mean,y_median,y_std,y_min,y_max,z_mean,z_median,z_std,z_min,z_max,Kitchen_AP_mean,Lounge_AP_mean,Upstairs_AP_mean,Study_AP_mean
108,0.944,0.944,0.010583,0.92,0.958,-0.2858,-0.284,0.032078,-0.356,-0.218,0.0838,0.083,0.017167,0.048,0.12,-82.6,0.0,-81.0,-79.0
118,0.9361,0.937,0.013572,0.914,0.962,-0.303,-0.302,0.031224,-0.378,-0.218,0.0977,0.103,0.022926,0.024,0.128,-81.1,0.0,-81.4,-82.8
128,0.9343,0.934,0.010448,0.912,0.952,-0.3161,-0.32,0.033255,-0.376,-0.216,0.1101,0.11,0.01615,0.088,0.146,-81.7,0.0,-81.9,-84.0


In [204]:
y_test.head(3)

Unnamed: 0,a_ascend,a_descend,a_jump,a_loadwalk,a_walk,p_bent,p_kneel,p_lie,p_sit,p_squat,p_stand,t_bend,t_kneel_stand,t_lie_sit,t_sit_lie,t_sit_stand,t_stand_kneel,t_stand_sit,t_straighten,t_turn
108,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
118,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
128,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [205]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(11158, 19)
(11158, 20)
(4263, 19)
(4263, 20)
