In [1]:
import numpy as np
import arff
#from scipy.io import arff
import pandas as pd
import random
import math
import pickle

In [2]:
#An accelerometer is an electromechanical device used to measure acceleration forces. 
#Such forces may be static, like the continuous force of gravity 
#or, as is the case with many mobile devices, dynamic to sense movement or vibrations.

#For each axis (x,y,z), the accelerometer produces an electrical signal proportional to the acceleration
#the device experiences along that axis.


#A gyroscope is a device that uses Earth's gravity to help determine orientation. 
#Its design consists of a freely-rotating disk called a rotor, mounted onto a spinning axis in the center
#of a larger and more stable wheel. As the axis turns, the rotor remains stationary to indicate the 
#central gravitational pull, and thus which way is "down."

#For each axis (x,y,z), the gyroscope measures the angular velocity (rate of rotation) about that axis.
#Imagine mounting a gyroscope on a wheel with its x-axis aligned with the wheel's axis of rotation.
#As the wheel turns, x reports a non-zero angular velocity because the device is rotating about the x-axis,
#whereas y and z report (approximately) zero.

In [3]:
def parse_data(first_index, last_index, input_directory):

    '''Parse the raw watch recordings, merge accel & gyro, and create labels.

    For every subject id from first_index to last_index (both inclusive) this reads
    <input_directory>/accel/data_<id>_accel_watch.txt and
    <input_directory>/gyro/data_<id>_gyro_watch.txt as header-less, comma-separated
    files and left-joins the gyroscope readings onto the accelerometer readings by
    subject, timestamp and activity.

    Rows with any missing value are dropped after merging and labelling. That removes
    accelerometer samples without a matching gyroscope sample as well as samples whose
    activity_class appears in neither the exercise nor the non-exercise list below.

    Args:
        first_index: id of the first subject to load.
        last_index: id of the last subject to load (inclusive).
        input_directory: directory that contains the accel/ and gyro/ sub-directories.

    Returns:
        DataFrame with the columns subject_id, activity_class, timestamp,
        accel_x, accel_y, accel_z, gyro_x, gyro_y, gyro_z, is_exercise
        (1 = exercise, 0 = non-exercise) and time_period (1-based rank of the
        timestamp within each subject). Rows are sorted by subject_id and
        time_period and carry a unique 0..n-1 index.

    Raises:
        ValueError: from pd.concat when the id range is empty.
        Any error raised by pd.read_csv (e.g. a missing subject file) propagates.
    '''

    join_key = ['subject_id', 'timestamp', 'activity_class']
    subject_frames = []

    for i in range(first_index, last_index + 1):

        # accelerometer data
        path = input_directory + '/accel/data_' + str(i) + '_accel_watch.txt'
        accel = pd.read_csv(path, sep = ',', header = None)
        accel.columns = ['subject_id', 'activity_class', 'timestamp', 'accel_x', 'accel_y', 'accel_z']

        # gyroscope data
        path = input_directory + '/gyro/data_' + str(i) + '_gyro_watch.txt'
        gyro = pd.read_csv(path, sep = ',', header = None)
        gyro.columns = ['subject_id', 'activity_class', 'timestamp', 'gyro_x', 'gyro_y', 'gyro_z']

        # Keep every accelerometer sample; unmatched gyroscope values become NaN
        merged = pd.merge(accel, gyro, how = 'left', left_on = join_key, right_on = join_key)
        subject_frames.append(merged)

        print('Completed subject: ', i)

    # Concat once; ignore_index avoids row labels that repeat for every subject
    output_df = pd.concat(subject_frames, ignore_index = True)

    # Adjust z-axis columns to float: strip the ';' characters the raw values carry
    # (presumably a record terminator in the source files) with a vectorised replace
    for z_column in ('accel_z', 'gyro_z'):
        output_df[z_column] = (
            output_df[z_column]
            .astype('str')
            .str.replace(';', '', regex = False)
            .astype(float)
        )

    # Create binary class; activity codes in neither list stay NaN and are dropped below
    exercise = ['A', 'B', 'M', 'O', 'P']
    non_exercise = ['C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'Q', 'R', 'S']
    output_df.loc[output_df['activity_class'].isin(exercise), 'is_exercise'] = 1
    output_df.loc[output_df['activity_class'].isin(non_exercise), 'is_exercise'] = 0

    # Drop rows with missing gyroscope readings or without a label
    output_df = output_df.dropna()

    # Per-subject time period: rank of the timestamp, ties broken by order of appearance
    time_period = output_df.groupby("subject_id")["timestamp"].rank(method="first", ascending=True)

    # Sort by subject and time, then hand back a clean positional index
    output_df = (
        output_df
        .assign(time_period = time_period)
        .sort_values(['subject_id', 'time_period'], ascending = True)
        .reset_index(drop = True)
    )

    return(output_df)

def train_test_split_on_users(output_df, train_pct, seed=None):

    '''Split by users for train-testing.

    All rows of a subject end up on the same side of the split, so no subject
    contributes to both the training and the test frame.

    Args:
        output_df: DataFrame with a 'subject_id' column.
        train_pct: fraction of the distinct subjects assigned to the training
            frame, between 0 and 1. The count is rounded up.
        seed: optional seed for a private random generator, making the split
            reproducible. When None, the global `random` module state is used.

    Returns:
        Tuple (train_df, test_df) of row selections from output_df.

    Raises:
        ValueError: if train_pct lies outside [0, 1].
    '''

    if not 0 <= train_pct <= 1:
        raise ValueError('train_pct must be between 0 and 1, got ' + repr(train_pct))

    # Distinct subjects in order of first appearance, so the shuffle input is deterministic
    users = output_df['subject_id'].unique().tolist()

    rng = random if seed is None else random.Random(seed)
    rng.shuffle(users)

    # Round away float noise first: 10 * 0.7 == 7.000000000000001 would otherwise ceil to 8
    number_of_train_users = math.ceil(round(len(users) * train_pct, 9))

    train_users = users[:number_of_train_users]
    test_users = users[number_of_train_users:]

    train_df = output_df[output_df['subject_id'].isin(train_users)]
    test_df = output_df[output_df['subject_id'].isin(test_users)]

    return(train_df, test_df)
    

In [4]:
# Load subjects 1600 through 1650 (inclusive) from the watch recordings
output_df = parse_data(
    first_index = 1600,
    last_index = 1650,
    input_directory = '../data/watch',
)

Completed subject:  1600
Completed subject:  1601
Completed subject:  1602
Completed subject:  1603
Completed subject:  1604
Completed subject:  1605
Completed subject:  1606
Completed subject:  1607
Completed subject:  1608
Completed subject:  1609
Completed subject:  1610
Completed subject:  1611
Completed subject:  1612
Completed subject:  1613
Completed subject:  1614
Completed subject:  1615
Completed subject:  1616
Completed subject:  1617
Completed subject:  1618
Completed subject:  1619
Completed subject:  1620
Completed subject:  1621
Completed subject:  1622
Completed subject:  1623
Completed subject:  1624
Completed subject:  1625
Completed subject:  1626
Completed subject:  1627
Completed subject:  1628
Completed subject:  1629
Completed subject:  1630
Completed subject:  1631
Completed subject:  1632
Completed subject:  1633
Completed subject:  1634
Completed subject:  1635
Completed subject:  1636
Completed subject:  1637
Completed subject:  1638
Completed subject:  1639


In [5]:
# Seed the global RNG first so the user-level split is identical after Restart & Run All
random.seed(42)
train_df, test_df = train_test_split_on_users(output_df, 0.7)

In [6]:
def save_pickle(variable, path):
    '''Serialise `variable` with pickle and write it to `path`.

    Args:
        variable: any picklable object.
        path: destination file; it is opened in binary write mode, so an
            existing file is overwritten.

    Raises:
        OSError: if the file cannot be opened (e.g. the parent directory is missing).
        pickle.PicklingError (or TypeError/AttributeError): if `variable` cannot be pickled.
    '''
    # The context manager closes the handle even when pickle.dump raises
    with open(path, 'wb') as output_file:
        pickle.dump(variable, output_file)

In [7]:
# Persist both splits so later notebooks can load them without re-parsing the raw files
save_pickle(variable = train_df, path = '../data/watch/processed_data/train_df.pkl')
save_pickle(variable = test_df, path = '../data/watch/processed_data/test_df.pkl')