-
Notifications
You must be signed in to change notification settings - Fork 3
/
data-cleane.R
112 lines (89 loc) · 3.24 KB
/
data-cleane.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# Getting and Cleaning Data Course Project
# Part 2: Cleaning and tidying the UCI HAR Dataset
# This code is licensed under a MIT License Copyright (c) 2017 Juan C. López-Tavera
### Run Part 1 first; the objects `train` and `test` used below are
### expected to be left in the environment by that script
source("~/github/human-activity/R/data-gette.R")

## Merge the training and the test sets to create one data set.
raw_data <- train %>%
  bind_rows(test)
rm(train, test) # the separate splits are no longer needed
## Extract only the measurements on the mean and standard deviation
## for each measurement

### feature names provided with the data set, read into the columns
### `id` and `feature`
feature_names <- fread(
  input = "~/github/human-activity/data/features.txt",
  sep = "auto",
  col.names = c("id", "feature")
)

### logical vector flagging the features whose name contains "mean" or "std"
keep <- grepl(pattern = "(mean|std)", x = feature_names$feature)

### split the identifier columns (subject, activity) from the raw features
id_cols <- select(raw_data, subject, activity)
measurements <- select(raw_data, -subject, -activity)

### keep the flagged feature columns and assign their variable names;
### the flags are applied by position, so this assumes the feature columns
### are in the same order as the rows of features.txt -- TODO confirm
measurements <- measurements[, keep]
names(measurements) <- feature_names$feature[keep]

### merge the identifier and feature columns back into one data set
raw_data <- bind_cols(id_cols, measurements)
rm(id_cols, measurements, keep, feature_names) # tidy environment
## Appropriately label the data set with descriptive variable names

### 1. reshape to long format: one row per subject/activity/feature value
### 2. split each feature name into signal, parameter and axis
### 3. pull domain, instrument and acceleration out of the signal name, then
###    reduce `signal` itself to its Jerk/Mag suffix (NA when there is none)
### 4. spell out the abbreviated domain and instrument codes
raw_data <- raw_data %>%
  gather(key = variable, value = value, -subject, -activity) %>%
  separate(
    col = variable,
    into = c("signal", "parameter", "axis"),
    convert = TRUE
  ) %>%
  extract(col = signal, into = "domain", regex = "^(t|f)", remove = FALSE) %>%
  extract(
    col = signal,
    into = "instrument",
    regex = "(Acc|Gyro)",
    remove = FALSE
  ) %>%
  extract(
    col = signal,
    into = "acceleration",
    regex = "(Body|Gravity)",
    remove = FALSE
  ) %>%
  extract(col = signal, into = "signal", regex = "(JerkMag|Jerk|Mag)") %>%
  mutate(
    # the extract() patterns above only ever yield these codes (or NA),
    # so a named lookup gives the same result as replacing them one by one
    domain = unname(c(t = "time", f = "frequency")[domain]),
    instrument = unname(
      c(Acc = "accelerometer", Gyro = "gyroscope")[instrument]
    )
  )
## Using descriptive activity names to name the activities in the data set
activity_labels <-
  read_table("~/github/human-activity/data/activity_labels.txt",
             col_names = c("id", "label"))

### Pair every activity id with its label explicitly. Without `levels`,
### factor() matches the labels to the sorted values that happen to be
### present, which mislabels the data if an id is missing or the label
### file is not sorted by id.
### NOTE(review): assumes `activity` holds the ids used in
### activity_labels.txt -- confirm against Part 1.
tidy_data <- raw_data %>%
  mutate(
    activity = factor(
      activity,
      levels = activity_labels$id,
      labels = activity_labels$label
    )
  )

### final details
### Lower-case the character columns *before* normalising `parameter`:
### the value is still camel case ("meanFreq") at this point, so comparing
### against "meanfreq" first would never match and the replacement would
### silently do nothing.
### NOTE(review): relabelling "meanfreq" as "mean" makes mean-frequency rows
### indistinguishable from plain mean rows -- confirm this is intended.
tidy_data <- tidy_data %>%
  mutate_if(is.character, tolower) %>%
  mutate(
    axis = replace(axis, which(axis == ""), NA),
    parameter = replace(parameter, which(parameter == "meanfreq"), "mean")
  )
rm(activity_labels, raw_data) # tidy environment

## Export the tidy data-set
### make sure the output directory exists so the write cannot fail on it
dir.create(
  "~/github/human-activity/data/tidy",
  recursive = TRUE,
  showWarnings = FALSE
)
### the destination is passed by position: the argument is called `path` in
### older readr releases and `file` in newer ones (`path` is deprecated)
write_csv(tidy_data, "~/github/human-activity/data/tidy/tidy_data.csv")