In [1]:
# Attach the tidyverse; the cells below rely on its readers, string helpers,
# map_* functions and data-frame verbs.
library(tidyverse)

# Session-wide display settings.
# NOTE: warn = -1 silences every warning for the remainder of the session.
options(digits = 5, warn = -1, width = 100)

# Record the R build and the installed versions of the key packages.
R.Version()$version.string
print(
  map_chr(
    c('tidyverse', 'caret', 'randomForest'),
    function(pkg) paste0(pkg, ': ', packageVersion(pkg), ', ')
  ),
  quote = FALSE
)

Registered S3 methods overwritten by 'ggplot2':
  method         from 
  [.quosures     rlang
  c.quosures     rlang
  print.quosures rlang
-- [1mAttaching packages[22m --------------------------------------- tidyverse 1.2.1 --
[32m√[39m [34mggplot2[39m 3.1.1     [32m√[39m [34mpurrr  [39m 0.3.2
[32m√[39m [34mtibble [39m 2.1.1     [32m√[39m [34mdplyr  [39m 0.8.1
[32m√[39m [34mtidyr  [39m 0.8.3     [32m√[39m [34mstringr[39m 1.4.0
[32m√[39m [34mreadr  [39m 1.3.1     [32m√[39m [34mforcats[39m 0.4.0
-- [1mConflicts[22m ------------------------------------------ tidyverse_conflicts() --
[31mx[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31mx[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


[1] tidyverse: 1.2.1,      caret: 6.0.84,         randomForest: 4.6.14, 


In [2]:
#============================
# Step 1. Read the data and merge the train and test sets
#

# Feature matrices: 561 numeric columns each, no header row in the files.
train <- 'UCI HAR Dataset/train/X_train.txt' %>%
  read_table(col_types = str_dup('d', 561), col_names = FALSE)
test <- 'UCI HAR Dataset/test/X_test.txt' %>%
  read_table(col_types = str_dup('d', 561), col_names = FALSE)

# Sanity checks: dimensions and a tally of missing values per set.
train %>% dim() %>% print()
test %>% dim() %>% print()
table(is.na(train))
table(is.na(test))

# Activity ids: one integer per observation, stored in column `actid`.
ytrain <- 'UCI HAR Dataset/train/y_train.txt' %>%
  read_table(col_types = 'i', col_names = 'actid')
ytest <- 'UCI HAR Dataset/test/y_test.txt' %>%
  read_table(col_types = 'i', col_names = 'actid')

# Distribution of activity ids in each set.
table(ytrain)
table(ytest)

# Append the activity id as the last column of each feature matrix.
train <- train %>% bind_cols(ytrain)
test <- test %>% bind_cols(ytest)

train %>% dim() %>% print()
test %>% dim() %>% print()

# Stack the two sets, train rows first and test rows after.
ds1 <- train %>% bind_rows(test)
ds1 %>% dim() %>% print()

[1] 7352  561
[1] 2947  561



  FALSE 
4124472 


  FALSE 
1653267 

ytrain
   1    2    3    4    5    6 
1226 1073  986 1286 1374 1407 

ytest
  1   2   3   4   5   6 
496 471 420 491 532 537 

[1] 7352  562
[1] 2947  562
[1] 10299   562


In [3]:
#============================
# Step 2. Extract mean and std
#
# Keep only the columns whose feature name contains 'mean' or 'std'.
# The pattern is a plain substring match, so 'meanFreq()' features are
# kept as well.

# features.txt is space-delimited with no header; read_delim names the
# second column X2, which holds the feature names.
features <- read_delim('UCI HAR Dataset/features.txt', delim = ' ', col_names = FALSE)
features %>% dim() %>% print()
features <- features[['X2']]

# Logical mask over the feature names; its sum is the number of matches.
extract <- str_detect(features, 'mean|std')
print(sum(extract))

# The trailing TRUE keeps the last column of ds1 (the activity id that was
# appended after the features).
keep_cols <- c(extract, TRUE)
ds1 <- ds1[, keep_cols]
ds1 %>% dim() %>% print()

head(ds1)

Parsed with column specification:
cols(
  X1 = [32mcol_double()[39m,
  X2 = [31mcol_character()[39m
)


[1] 561   2
[1] 79
[1] 10299    80


X1,X2,X3,X4,X5,X6,X41,X42,X43,X44,...,X516,X517,X526,X529,X530,X539,X542,X543,X552,actid
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,...,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>
0.28858,-0.020294,-0.13291,-0.99528,-0.98311,-0.91353,0.9634,-0.14084,0.115375,-0.98525,...,-0.99373,-0.99375,0.34699,-0.98013,-0.96131,-0.128989,-0.99199,-0.9907,-0.074323,5
0.27842,-0.016411,-0.12352,-0.99825,-0.9753,-0.96032,0.96656,-0.14155,0.109379,-0.99741,...,-0.99034,-0.99196,0.53206,-0.9883,-0.98332,-0.271958,-0.99585,-0.9964,0.158075,5
0.27965,-0.019467,-0.11346,-0.99538,-0.96719,-0.97894,0.96688,-0.14201,0.101884,-0.99957,...,-0.98928,-0.99087,0.6608,-0.98925,-0.98603,-0.212728,-0.99503,-0.99513,0.414503,5
0.27917,-0.026201,-0.12328,-0.99609,-0.9834,-0.99068,0.96762,-0.14398,0.09985,-0.99665,...,-0.99277,-0.9917,0.67892,-0.98941,-0.98784,-0.035684,-0.99522,-0.99524,0.404573,5
0.27663,-0.01657,-0.11536,-0.99814,-0.98082,-0.99048,0.96822,-0.14875,0.094486,-0.99843,...,-0.99552,-0.99439,0.55906,-0.99143,-0.98906,-0.273582,-0.99509,-0.99546,0.087753,5
0.2772,-0.010098,-0.10514,-0.99733,-0.99049,-0.99542,0.96795,-0.14821,0.09191,-0.99898,...,-0.99473,-0.99516,0.24691,-0.9905,-0.98586,-0.297329,-0.99514,-0.99524,0.019953,5


In [4]:
#============================
# Step 3. Name the activities
#

# Lookup table: integer activity id -> activity label.
act <- read_table(
  'UCI HAR Dataset/activity_labels.txt',
  col_types = 'ic',
  col_names = c('actid', 'activity')
)

# Turn the label into a factor whose levels follow the order of the rows
# in the lookup table rather than alphabetical order.
act$activity <- factor(act$activity, levels = act$activity)

# Swap the numeric id for its label: join on actid, then drop the id.
ds1 <- select(inner_join(ds1, act, by = 'actid'), -actid)

head(ds1)

X1,X2,X3,X4,X5,X6,X41,X42,X43,X44,...,X516,X517,X526,X529,X530,X539,X542,X543,X552,activity
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,...,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>
0.28858,-0.020294,-0.13291,-0.99528,-0.98311,-0.91353,0.9634,-0.14084,0.115375,-0.98525,...,-0.99373,-0.99375,0.34699,-0.98013,-0.96131,-0.128989,-0.99199,-0.9907,-0.074323,STANDING
0.27842,-0.016411,-0.12352,-0.99825,-0.9753,-0.96032,0.96656,-0.14155,0.109379,-0.99741,...,-0.99034,-0.99196,0.53206,-0.9883,-0.98332,-0.271958,-0.99585,-0.9964,0.158075,STANDING
0.27965,-0.019467,-0.11346,-0.99538,-0.96719,-0.97894,0.96688,-0.14201,0.101884,-0.99957,...,-0.98928,-0.99087,0.6608,-0.98925,-0.98603,-0.212728,-0.99503,-0.99513,0.414503,STANDING
0.27917,-0.026201,-0.12328,-0.99609,-0.9834,-0.99068,0.96762,-0.14398,0.09985,-0.99665,...,-0.99277,-0.9917,0.67892,-0.98941,-0.98784,-0.035684,-0.99522,-0.99524,0.404573,STANDING
0.27663,-0.01657,-0.11536,-0.99814,-0.98082,-0.99048,0.96822,-0.14875,0.094486,-0.99843,...,-0.99552,-0.99439,0.55906,-0.99143,-0.98906,-0.273582,-0.99509,-0.99546,0.087753,STANDING
0.2772,-0.010098,-0.10514,-0.99733,-0.99049,-0.99542,0.96795,-0.14821,0.09191,-0.99898,...,-0.99473,-0.99516,0.24691,-0.9905,-0.98586,-0.297329,-0.99514,-0.99524,0.019953,STANDING


In [5]:
#============================
# Step 4.  Label the data set with descriptive variable names
#

# Replace the positional names with the selected feature names, keeping the
# current name of the last column unchanged.
names(ds1) <- c(features[extract], tail(names(ds1), 1))
head(ds1)

# Persist the tidy data set: quoted strings, no row-name column.
write.csv(ds1, file = 'DataSet1.csv', quote = TRUE, row.names = FALSE)


tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tGravityAcc-mean()-X,tGravityAcc-mean()-Y,tGravityAcc-mean()-Z,tGravityAcc-std()-X,...,fBodyBodyAccJerkMag-mean(),fBodyBodyAccJerkMag-std(),fBodyBodyAccJerkMag-meanFreq(),fBodyBodyGyroMag-mean(),fBodyBodyGyroMag-std(),fBodyBodyGyroMag-meanFreq(),fBodyBodyGyroJerkMag-mean(),fBodyBodyGyroJerkMag-std(),fBodyBodyGyroJerkMag-meanFreq(),activity
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,...,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>
0.28858,-0.020294,-0.13291,-0.99528,-0.98311,-0.91353,0.9634,-0.14084,0.115375,-0.98525,...,-0.99373,-0.99375,0.34699,-0.98013,-0.96131,-0.128989,-0.99199,-0.9907,-0.074323,STANDING
0.27842,-0.016411,-0.12352,-0.99825,-0.9753,-0.96032,0.96656,-0.14155,0.109379,-0.99741,...,-0.99034,-0.99196,0.53206,-0.9883,-0.98332,-0.271958,-0.99585,-0.9964,0.158075,STANDING
0.27965,-0.019467,-0.11346,-0.99538,-0.96719,-0.97894,0.96688,-0.14201,0.101884,-0.99957,...,-0.98928,-0.99087,0.6608,-0.98925,-0.98603,-0.212728,-0.99503,-0.99513,0.414503,STANDING
0.27917,-0.026201,-0.12328,-0.99609,-0.9834,-0.99068,0.96762,-0.14398,0.09985,-0.99665,...,-0.99277,-0.9917,0.67892,-0.98941,-0.98784,-0.035684,-0.99522,-0.99524,0.404573,STANDING
0.27663,-0.01657,-0.11536,-0.99814,-0.98082,-0.99048,0.96822,-0.14875,0.094486,-0.99843,...,-0.99552,-0.99439,0.55906,-0.99143,-0.98906,-0.273582,-0.99509,-0.99546,0.087753,STANDING
0.2772,-0.010098,-0.10514,-0.99733,-0.99049,-0.99542,0.96795,-0.14821,0.09191,-0.99898,...,-0.99473,-0.99516,0.24691,-0.9905,-0.98586,-0.297329,-0.99514,-0.99524,0.019953,STANDING


In [6]:
#============================
# Step 5.  Summarize by Subject and Activity
#

# Subject ids: one integer per observation, stored in column `subid`.
strain <- read_table('UCI HAR Dataset/train/subject_train.txt', col_types='i', col_names='subid')
stest  <- read_table('UCI HAR Dataset/test/subject_test.txt',   col_types='i', col_names='subid')

print(dim(strain))
print(dim(stest))
table(strain)
table(stest)

# Attach the subject id to every observation. The ids are stacked train
# first, then test, which is the same order used to build ds1.
ds2 <- ds1 %>% bind_cols(bind_rows(strain, stest))

print(dim(ds2))

# Average every measurement within each (subject, activity) pair.
# The function is passed directly instead of through the deprecated funs()
# wrapper; with a single function the output column names are unchanged.
# ungroup() drops the residual grouping by subid that summarize leaves
# behind, so later operations on ds2 are not silently performed per subject.
ds2 <- ds2 %>%
  group_by(subid, activity) %>%
  summarize_all(mean) %>%
  ungroup()

print(dim(ds2))
head(ds2)

# Persist the summary data set: quoted strings, no row-name column.
write.csv(ds2, 'DataSet2.csv', quote=TRUE, row.names=FALSE)


[1] 7352    1
[1] 2947    1


strain
  1   3   5   6   7   8  11  14  15  16  17  19  21  22  23  25  26  27  28  29  30 
347 341 302 325 308 281 316 323 328 366 368 360 408 321 372 409 392 376 382 344 383 

stest
  2   4   9  10  12  13  18  20  24 
302 317 288 294 320 327 364 354 381 

[1] 10299    81
[1] 180  81


subid,activity,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tGravityAcc-mean()-X,tGravityAcc-mean()-Y,...,fBodyAccMag-meanFreq(),fBodyBodyAccJerkMag-mean(),fBodyBodyAccJerkMag-std(),fBodyBodyAccJerkMag-meanFreq(),fBodyBodyGyroMag-mean(),fBodyBodyGyroMag-std(),fBodyBodyGyroMag-meanFreq(),fBodyBodyGyroJerkMag-mean(),fBodyBodyGyroJerkMag-std(),fBodyBodyGyroJerkMag-meanFreq()
<int>,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,...,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,WALKING,0.27733,-0.0173838,-0.111148,-0.28374,0.1144613,-0.260028,0.93522,-0.28217,...,0.190644,-0.057119,-0.10349,0.093822,-0.19925,-0.32102,0.26884437,-0.31931,-0.3816,0.19066
1,WALKING_UPSTAIRS,0.25546,-0.0239531,-0.097302,-0.354708,-0.0023203,-0.019479,0.89335,-0.36215,...,-0.097743,-0.442652,-0.53306,0.085352,-0.32596,-0.18299,-0.21930338,-0.63467,-0.69393,0.11428
1,WALKING_DOWNSTAIRS,0.28919,-0.0099185,-0.107566,0.030035,-0.0319359,-0.230434,0.93187,-0.26661,...,0.119187,0.026218,-0.10405,0.076492,-0.18572,-0.39835,0.3496139,-0.28196,-0.39192,0.19
1,SITTING,0.26124,-0.0013083,-0.104544,-0.977229,-0.9226186,-0.939586,0.83151,0.20441,...,0.236655,-0.985262,-0.98161,0.351852,-0.95844,-0.9322,-0.00026219,-0.9898,-0.98705,0.18478
1,STANDING,0.27892,-0.0161376,-0.110602,-0.99576,-0.9731901,-0.979776,0.94295,-0.27298,...,0.284555,-0.992542,-0.99254,0.42222,-0.98462,-0.97847,-0.02860577,-0.99482,-0.99467,0.3345
1,LAYING,0.2216,-0.040514,-0.113204,-0.928056,-0.8368274,-0.826061,-0.24888,0.70555,...,0.086409,-0.9333,-0.9218,0.266391,-0.86219,-0.82432,-0.13977501,-0.94237,-0.93266,0.17649
