-
Notifications
You must be signed in to change notification settings - Fork 1
/
04-ggplot2.R
288 lines (239 loc) · 14.1 KB
/
04-ggplot2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
# Data visualisation with ggplot2
####################################################################################
## Preparing data ##
####################################################################################
library(tidyverse)
interviews <- read_csv("data/SAFI_clean.csv", na = "NULL")
interviews_plotting <- interviews %>%
## spread data by items_owned
mutate(split_items = strsplit(items_owned, ";")) %>%
unnest() %>%
mutate(items_owned_logical = TRUE) %>%
spread(key = split_items, value = items_owned_logical, fill = FALSE) %>%
rename(no_listed_items = `<NA>`) %>%
## spread data by months_lack_food
mutate(split_months = strsplit(months_lack_food, ";")) %>%
unnest() %>%
mutate(months_lack_food_logical = TRUE) %>%
spread(key = split_months, value = months_lack_food_logical, fill = FALSE) %>%
## add some summary columns
mutate(number_months_lack_food = rowSums(select(., Apr:Sept))) %>%
mutate(number_items = rowSums(select(., bicycle:television)))
####################################################################################
## Plotting with ggplot2 ##
####################################################################################
# To build a ggplot, we will use the following basic template:
# ggplot(data = <DATA>, mapping = aes(<MAPPINGS>)) + <GEOM_FUNCTION>()
# - use the `ggplot()` function and bind the plot to a specific data frame using
# the `data` argument
ggplot(data = interviews_plotting)
# - define a mapping (using the aesthetic (`aes`) function), by selecting the variables to be plotted and specifying how to present them in the graph, e.g. as x/y positions or characteristics such as size, shape, color, etc.
ggplot(data = interviews_plotting, aes(x = no_membrs, y = number_items))
# - add 'geoms' – graphical representations of the data in the plot (points,
# lines, bars). **`ggplot2`** offers many different geoms; we will use some
# common ones today, including:
#
# * `geom_point()` for scatter plots, dot plots, etc.
# * `geom_boxplot()` for, well, boxplots!
# * `geom_line()` for trend lines, time series, etc.
#
# To add a geom to the plot use the `+` operator. Because we have two continuous variables,
# let's use `geom_point()` first:
ggplot(data = interviews_plotting, aes(x = no_membrs, y = number_items)) +
geom_point()
# The `+` in the **`ggplot2`** package is particularly useful because it allows
# you to modify existing `ggplot` objects. This means you can easily set up plot
# templates and conveniently explore different types of plots, so the above plot
# can also be generated with code like this:
# Assign plot to a variable
interviews_plot <- ggplot(data = interviews_plotting, aes(x = no_membrs, y = number_items))
# Draw the plot
interviews_plot +
geom_point()
####################################################################################
## Building your plots iteratively ##
####################################################################################
# Building plots with **`ggplot2`** is typically an iterative process. We start by
# defining the dataset we'll use, lay out the axes, and choose a geom:
ggplot(data = interviews_plotting, aes(x = no_membrs, y = number_items)) +
geom_point()
# Then, we start modifying this plot to extract more information from it. For
# instance, we can add transparency (`alpha`) to avoid overplotting:
ggplot(data = interviews_plotting, aes(x = no_membrs, y = number_items)) +
geom_point(alpha = 0.5)
# That only helped a little bit with the overplotting problem. We can also
# introduce a little bit of randomness into the position of our points
# using the `geom_jitter()` function.
ggplot(data = interviews_plotting, aes(x = no_membrs, y = number_items)) +
geom_jitter(alpha = 0.5)
# We can also add colors for all the points:
ggplot(data = interviews_plotting, aes(x = no_membrs, y = number_items)) +
geom_jitter(alpha = 0.5, color = "blue")
# Or to color each village in the plot differently, you could use a vector as an input to the argument **`color`**.
# Because we are now mapping features of the data to a color, instead of setting one color for all points, the color now needs to be set inside a call to the **`aes`** function. **`ggplot2`** will provide a different color corresponding to different values in the vector. We set the value of **`alpha`** outside of the **`aes`** function call because we are using the same value for all points. Here is an example where we color by **`village`**:
ggplot(data = interviews_plotting, aes(x = no_membrs, y = number_items)) +
geom_jitter(aes(color = village), alpha = 0.5)
# There appears to be a positive trend between number of household
# members and number of items owned (from the list provided). This trend
# does not appear to be different by village.
########## Exercise ##########
# Use what you just learned to create a scatter plot of `rooms` by `village`
# with the `respondent_wall_type` showing in different colors. Is this a good
# way to show this type of data?
##############################
####################################################################################
## Boxplot ##
####################################################################################
# We can use boxplots to visualize the distribution of rooms for each
# wall type:
ggplot(data = interviews_plotting, aes(x = respondent_wall_type, y = rooms)) +
geom_boxplot()
# By adding points to a boxplot, we can have a better idea of the number of
# measurements and of their distribution:
ggplot(data = interviews_plotting, aes(x = respondent_wall_type, y = rooms)) +
geom_boxplot(alpha = 0) +
geom_jitter(alpha = 0.5, color = "tomato")
# We can see that muddaub houses and sunbrick houses tend to be smaller than
# burntbrick houses.
########## Exercise ##########
# 1. Boxplots are useful summaries, but hide the *shape* of the distribution. For
# example, if the distribution is bimodal, we would not see it in a
# boxplot. An alternative to the boxplot is the violin plot, where the shape
# (of the density of points) is drawn.
# - Replace the box plot with a violin plot; see `geom_violin()`.
# 2. So far, we've looked at the distribution of room number within wall type. Try
# making a new plot to explore the distribution of another variable within wall
# type.
# - Create a boxplot for `liv_count` for each wall type. Overlay the boxplot
# layer on a jitter layer to show actual measurements.
# - Add color to the data points on your boxplot according to whether the
# respondent is a member of an irrigation association (`memb_assoc`).
##############################
####################################################################################
## Barplots ##
####################################################################################
# Barplots are also useful for visualizing categorical data. By default,
# `geom_bar` accepts a variable for x, and plots the number of instances each
# value of x (in this case, wall type) appears in the dataset.
ggplot(data = interviews_plotting, aes(x = respondent_wall_type)) +
geom_bar()
# We can use the `fill` aesthetic for the `geom_bar()` geom to color bars by
# the portion of each count that is from each village.
ggplot(data = interviews_plotting, aes(x = respondent_wall_type)) +
geom_bar(aes(fill = village))
# This creates a stacked bar chart. These are generally more difficult to read
# than side-by-side bars. We can separate the portions of the stacked bar that
# correspond to each village and put them side-by-side by using the `position`
# argument for `geom_bar()` and setting it to "dodge".
ggplot(data = interviews_plotting, aes(x = respondent_wall_type)) +
geom_bar(aes(fill = village), position = "dodge")
# This is a nicer graphic, but we're more likely to be interested in the
# proportion of each housing type in each village than in the actual count of
# number of houses of each type (because we might have sampled different numbers
# of households in each village). To compare proportions, we will first create a
# new data frame (`percent_wall_type`) with a new column named "percent"
# representing the percent of each house type in each village. We will remove
# houses with cement walls, as there was only one in the dataset.
percent_wall_type <- interviews_plotting %>%
filter(respondent_wall_type != "cement") %>%
count(village, respondent_wall_type) %>%
group_by(village) %>%
mutate(percent = n / sum(n)) %>%
ungroup()
# Now we can use this new data frame to create our plot showing the
# percentage of each house type in each village.
ggplot(percent_wall_type, aes(x = village, y = percent, fill = respondent_wall_type)) +
geom_bar(stat = "identity", position = "dodge")
########## Exercise ##########
# Create a bar plot showing the proportion of respondents in each
# village who are or are not part of an irrigation association
# (`memb_assoc`). Include only respondents who answered that question
# in the calculations and plot. Which village had the lowest proportion of
# respondents in an irrigation association?
##############################
####################################################################################
## Adding Labels and Titles ##
####################################################################################
# By default, the axes labels on a plot are determined by the name of the variable
# being plotted. However, ggplot2 offers lots of customization options,
# like specifying the axes labels, and adding a title to the plot with
# relatively few lines of code. We will add more informative x and y axis
# labels to our plot of proportion of house type by village and also add
# a title.
ggplot(percent_wall_type, aes(x = village, y = percent, fill = respondent_wall_type)) +
geom_bar(stat = "identity", position = "dodge") +
ylab("Percent") +
xlab("Wall Type") +
ggtitle("Proportion of wall type by village")
####################################################################################
## Faceting ##
####################################################################################
# Rather than creating a single plot with side-by-side bars for each
# village, we may want to create multiple plot, where each plot shows the
# data for a single village. This would be especially useful if we had
# a large number of villages that we had sampled, as a large number of
# side-by-side bars will become more difficult to read.
# ggplot2 has a special technique called *faceting* that allows the user to split one
# plot into multiple plots based on a factor included in the dataset. We
# will use it to split our barplot of housing type proportion by village
# so that each village has it's own panel in a multi-panel plot:
ggplot(percent_wall_type, aes(x = respondent_wall_type, y = percent)) +
geom_bar(stat = "identity", position = "dodge") +
ylab("Percent") +
xlab("Wall Type") +
ggtitle("Proportion of wall type by village") +
facet_wrap(~ village)
# Usually plots with white background look more readable when printed. We can set
# the background to white using the function `theme_bw()`. Additionally, you can remove
# the grid:
ggplot(percent_wall_type, aes(x = respondent_wall_type, y = percent)) +
geom_bar(stat = "identity", position = "dodge") +
ylab("Percent") +
xlab("Wall Type") +
ggtitle("Proportion of wall type by village") +
facet_wrap(~ village) +
theme_bw() +
theme(panel.grid = element_blank())
# What if we wanted to see the proportion of respondents in each village
# who owned a particular item? We can calculate the percent of people
# in each village who own each item and then create a faceted series of
# bar plots where each plot is a particular item. First we need to
# calculate the percentage of people in each village who own each item:
percent_items <- interviews_plotting %>%
gather(items, items_owned_logical, bicycle:no_listed_items) %>%
filter(items_owned_logical) %>%
count(items, village) %>%
## add a column with the number of people in each village
mutate(people_in_village = case_when(village == "Chirodzo" ~ 39,
village == "God" ~ 43,
village == "Ruaca" ~ 49)) %>%
mutate(percent = n / people_in_village)
# To calculate this percentage data frame, we needed to use the `case_when()`
# parameter within `mutate()`. In our earlier examples, we knew that each house
# was one and only one of the types specified. However, people can (and do) own
# more than one item, so we can't use the sum of the count column to give us the
# denominator in our percentage calculation. Instead, we need to specify the
# number of respondents in each village. Using this data frame, we can now create
# a multi-paneled bar plot.
ggplot(percent_items, aes(x = village, y = percent)) +
geom_bar(stat = "identity", position = "dodge") +
facet_wrap(~ items) +
theme_bw() +
theme(panel.grid = element_blank())
####################################################################################
## Saving ggplot ##
####################################################################################
# Define the plot into an object
my_plot <- ggplot(percent_items, aes(x = village, y = percent)) +
geom_bar(stat = "identity", position = "dodge") +
facet_wrap(~ items) +
labs(title = "Percent of respondents in each village \n who owned each item",
x = "Village",
y = "Percent of Respondents") +
theme_bw() +
theme(axis.text.x = element_text(colour = "grey20", size = 12, angle = 45, hjust = 0.5, vjust = 0.5),
axis.text.y = element_text(colour = "grey20", size = 12),
text = element_text(size = 16),
plot.title = element_text(hjust = 0.5))
# Use ggsave()
ggsave("fig_output/name_of_file.png", my_plot, width = 15, height = 10)