/
dataframe_str.R
231 lines (211 loc) · 7.29 KB
/
dataframe_str.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
####################################################################
#' Dataset columns and rows structure
#'
#' This function lets the user to check quickly the structure of a
#' dataset (data.frame). It returns multiple counters for useful metrics,
#' a plot, and a list of column names for each of the column metrics.
#'
#' @family Exploratory
#' @param df Dataframe
#' @param return Character. Return "skimr" for skim report, "numbers" for
#' stats and numbers, "names" for a list with the column names of each of
#' the class types, "plot" for a nice plot with "numbers" output, "distr"
#' for an overall summary plot showing categorical, numeric, and missing
#' values by using \code{plot_df}
#' distributions
#' @param subtitle Character. Add subtitle to plot
#' @param quiet Boolean. Keep quiet or show other options available?
#' @return Depending on \code{return} input and based on your \code{df} structure:
#' \itemize{
#' \item \code{list} with the names of the columns classified by class
#' \item \code{data.frame} with numbers: total values, row, columns,
#' complete rows
#' \item \code{plot} with visualizations
#' }
#' @examples
#' Sys.unsetenv("LARES_FONT") # Temporal
#' data(dft) # Titanic dataset
#' df_str(dft, "names")
#' df_str(dft, "numbers", quiet = TRUE)
#' df_str(dft, "plot", quiet = TRUE)
#' @export
df_str <- function(df,
return = "plot",
subtitle = NA,
quiet = FALSE) {
if (!quiet) {
rets <- c("skimr", "numbers", "names", "distr", "plot")
message(paste("Other available 'return' options:", vector2text(rets[rets != return])))
}
df <- data.frame(df)
if (return == "skimr") {
try_require("skimr")
return(skim(df))
}
if (return == "distr") {
p <- plot_df(df)
return(p)
}
names <- list(
cols = colnames(df),
nums = colnames(df)[unlist(lapply(df, is.numeric))],
char = colnames(df)[unlist(lapply(df, is.character))],
factor = colnames(df)[unlist(lapply(df, is.factor))],
logic = colnames(df)[unlist(lapply(df, is.logical))]
)
names[["time"]] <- names$cols[!colnames(df) %in% c(
names$nums, names$char, names$factor, names$logic
)]
names[["allnas"]] <- names$cols[unlist(lapply(df, function(x) all(is.na(x))))]
if (return == "names") {
return(names)
}
numbers <- data.frame(
"Total Values" = nrow(df) * ncol(df),
"Total Rows" = nrow(df),
"Total Columns" = ncol(df),
"Numeric Columns" = length(names$nums),
"Character Columns" = length(names$char),
"Factor Columns" = length(names$factor),
"Logical Columns" = length(names$logic),
"Time/Date Columns" = length(names$time),
"All Missing Columns" = length(names$allnas),
"Missing Values" = sum(is.na(df)),
"Complete Rows" = sum(complete.cases(df)),
"Memory Usage" = as.numeric(object.size(df))
)
intro2 <- data.frame(counter = t(numbers)) %>%
mutate(
metric = row.names(.),
type = ifelse(grepl("Column", colnames(numbers)), "Columns",
ifelse(grepl("Rows", colnames(numbers)), "Rows", "Values")
),
p = ifelse(.data$type == "Columns", 100 * .data$counter / numbers$Total.Columns,
ifelse(.data$type == "Rows", 100 * .data$counter / numbers$Total.Rows,
100 * .data$counter / numbers$Total.Values
)
),
p = round(.data$p, 2),
type = factor(.data$type, levels = c("Values", "Columns", "Rows"))
) %>%
select(.data$metric, .data$counter, .data$type, .data$p)
if (return == "numbers") {
return(select(intro2, -.data$type))
}
if (return == "plot") {
p <- intro2 %>%
filter(!.data$metric %in% "Memory.Usage") %>%
mutate(x = ifelse(.data$p < 75, -0.15, 1.15)) %>%
ggplot(aes(
x = reorder(.data$metric, as.integer(.data$counter)),
y = .data$p, fill = .data$type,
label = formatNum(.data$counter, 0)
)) +
geom_col() +
coord_flip() +
ylim(0, 100) +
theme_minimal() +
guides(fill = "none") +
labs(
title = "Dataset overall structure",
x = "", y = "% of total", fill = "",
caption = paste("Memory Usage:", formatNum(numbers$Memory.Usage / (1024 * 1024)), "Mb")
) +
facet_grid(type ~ ., scales = "free", space = "free") +
geom_text(aes(hjust = .data$x), size = 3) +
theme_lares(pal = 1)
if (!is.na(subtitle)) p <- p + labs(subtitle = subtitle)
return(p)
}
}
####################################################################
#' Plot All Numerical Features (Boxplots)
#'
#' This function filters numerical columns and plots boxplots.
#'
#' @family Exploratory
#' @param df Dataframe
#' @return Plot. Result of \code{df} numerical features.
#' @examples
#' Sys.unsetenv("LARES_FONT") # Temporal
#' data(dft) # Titanic dataset
#' plot_nums(dft)
#' @export
plot_nums <- function(df) {
which <- df %>% select_if(is.numeric)
if (length(which) > 0) {
p <- gather(which) %>%
filter(!is.na(.data$value)) %>%
ggplot(aes(x = .data$key, y = .data$value)) +
geom_jitter(alpha = 0.2, size = 0.8) +
geom_boxplot(alpha = 0.8, outlier.shape = NA, width = 1) +
facet_wrap(.data$key ~ ., scales = "free") +
labs(title = "Numerical Features Boxplots", x = NULL, y = NULL) +
theme_lares() +
theme(
axis.text.y = element_blank(),
axis.text.x = element_text(vjust = 2, size = 8),
panel.spacing.y = unit(-.5, "lines"),
strip.text = element_text(size = 10, vjust = -1.3)
) +
coord_flip()
return(p)
} else {
message("No numerical variables found!")
}
}
####################################################################
#' Plot All Categorical Features (Frequencies)
#'
#' This function filters categorical columns and plots the frequency
#' for each value on every feature.
#'
#' @family Exploratory
#' @param df Dataframe
#' @return Plot. Result of \code{df} categorical features.
#' @export
plot_cats <- function(df) {
plot <- df %>% select_if(Negate(is.numeric))
if (length(plot) > 0) {
p <- plot %>% freqs(plot = TRUE) +
labs(title = "Categorical Features Frequencies")
return(p)
} else {
message("No categorical variables found!")
}
}
####################################################################
#' Plot Summary of Numerical and Categorical Features
#'
#' This function plots all columns frequencies and boxplots, for
#' categorical and numerical respectively.
#'
#' @family Exploratory
#' @param df Dataframe
#' @return Plot. Result of \code{df} categorical and numerical features.
#' @export
plot_df <- function(df) {
plots <- list()
cats <- plot_cats(df)
if (length(cats) != 0) {
plots[["cats"]] <- cats +
theme(plot.title = element_text(size = 12))
}
nums <- plot_nums(df)
if (length(nums) != 0) {
plots[["nums"]] <- nums +
theme(plot.title = element_text(size = 12))
}
mis <- missingness(df, plot = TRUE, summary = FALSE)
if (length(mis) != 0) {
plots[["miss"]] <- mis +
theme(plot.title = element_text(size = 12)) + guides(fill = "none")
}
if (length(plots) == 3) heights <- c(4 / 12, 1 / 2, 3 / 12)
if (length(plots) == 2) heights <- c(0.5, 0.5)
if (length(plots) == 1) heights <- NULL
margin <- theme(plot.margin = unit(c(0.1, 0.5, 0.1, 0.5), "cm"))
plots <- lapply(plots, "+", margin)
p <- wrap_plots(plots, heights = heights)
return(p)
}