## Assemble Holdout Dataset

This notebook unzips and assembles the data for subjects 11 to 15 to be used as a holdout dataset for testing the final models.

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import zipfile

In [8]:
# unzip data files for gyroscope
sensor = "gyr_"
activities =["climbingdown","climbingup","jumping","lying","running","sitting","standing","walking"]
for subject in range(11,16):  # extract subjects
    zipfilepath="./data/proband"+str(subject)+"/data/"
    for activity in activities:
        zipfilename = sensor+activity+"_csv.zip"
        if subject==13 and activity=="walking":
            filename = "Gyroscope_walking_2_thigh.csv"
        else:
            filename = "Gyroscope_"+activity+"_thigh.csv"
        savepath = "./data/"+str(subject)+"/"
        with zipfile.ZipFile(zipfilepath+zipfilename,"r") as zip_ref:
            zip_ref.extract(filename, path=savepath)

In [11]:
# unzip data files for accelerometer
sensor = "acc_"
activities =["climbingdown","climbingup","jumping","lying","running","sitting","standing","walking"]
for subject in range(11,16):  # extract subjects
    zipfilepath="./data/proband"+str(subject)+"/data/"
    for activity in activities:
        zipfilename = sensor+activity+"_csv.zip"
        if subject==13 and activity=="walking":
            filename = "acc_walking_2_thigh.csv"
        else:
            filename = "acc_"+activity+"_thigh.csv"
        savepath = "./data/"+str(subject)+"/"
        with zipfile.ZipFile(zipfilepath+zipfilename,"r") as zip_ref:
            zip_ref.extract(filename, path=savepath)

In [12]:
# rename _2 files
import os
os.rename("./data/13/Gyroscope_walking_2_thigh.csv", "./data/13/Gyroscope_walking_thigh.csv")
os.rename("./data/13/acc_walking_2_thigh.csv", "./data/13/acc_walking_thigh.csv")

In [3]:
# merge all activities into 1 df, for subjects 11-15, with gyroscope & accelerometer sensors combined
thigh_validate = pd.DataFrame()  # initialize empty df
windowsize = 100 # use 2 sec windows
last_sample = 0
activities =["climbingdown","climbingup","jumping","lying","running","sitting","standing","walking"]
# iterate over subjects
for subject in range(11,16):
    # iterate over the separate activity files
    for i, activity in enumerate(activities):
        g_filename = "./data/"+str(subject)+"/Gyroscope_"+activity+"_thigh.csv"
        a_filename = "./data/"+str(subject)+"/acc_"+activity+"_thigh.csv"
        gyr = pd.read_csv(g_filename, index_col=0)
        acc = pd.read_csv(a_filename, index_col=0)
        # make sure the different sensor files have the same number of readings
        if len(gyr)>len(acc):
            extra_records=len(gyr)-len(acc)
            acc = acc[:-extra_records]
        if len(gyr)<len(acc):
            extra_records=len(acc)-len(gyr)
            gyr = gyr[:-extra_records]
        df = pd.merge(gyr,acc,right_index=True,left_index=True,suffixes=('_gyr','_acc'))
        df['activity']=activity
        df['label']=i
        df['subject']=subject
        df['sample_num']=-1  # just to create the column   
        remainder=len(df)%windowsize
        df=df[:-remainder]  # delete rows so that all samples are the same size
        num_samples=int(len(df)/windowsize)  # the number of samples of size windowsize for the activity
        row_counter=0
    # label the samples within each activity file    
        for j in range(0,num_samples):   
            df.sample_num[row_counter:row_counter+windowsize]=last_sample+j # label samples
            row_counter=row_counter+windowsize
        last_sample+=num_samples # label the samples consecutively across activities
        thigh_validate = thigh_validate.append(df, ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [4]:
thigh_validate.to_pickle('./thigh_validate100.pkl')