/
web_post_compare.R
112 lines (92 loc) · 3.57 KB
/
web_post_compare.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# Packages.
library(readr)
library(dplyr)
library(tidyr)
library(lubridate)
library(RColorBrewer)
library(stringr)
library(forcats)
library(pbapply)
library(ggplot2)
# Helper operator: the complement of `%in%`.
# Returns TRUE for each element of `x` that has no match in `table`.
`%nin%` <- function(x, table) {
  !(x %in% table)
}
# Folder date range of interest. Used both as the zip archive name (without
# ".zip") and as the name of the directory the csv files are listed from.
scrape_folder <- "10jan2024_09feb2024"

# Unzip the archive of scrapes into "scrapes/". `overwrite = FALSE` leaves any
# previously extracted files untouched.
# NOTE(review): assumes the archive contains a top-level folder named after
# `scrape_folder` - confirm against the zip layout.
unzip(
  zipfile = file.path("scrapes", paste0(scrape_folder, ".zip")),
  exdir = "scrapes",
  overwrite = FALSE
)

# Relative paths to every csv file in the scrape folder.
# `pattern` is a regular expression, not a glob, so match a literal ".csv"
# suffix rather than "csv" anywhere in the file name. `full.names = TRUE`
# prepends the directory and yields an empty vector (not a bare directory
# path) when the folder holds no csv files.
scrapes_list <- list.files(
  path = file.path("scrapes", scrape_folder),
  pattern = "\\.csv$",
  full.names = TRUE
)
# Read every csv into its own data frame (pblapply shows a progress bar).
scrapes_df_list <- pblapply(X = scrapes_list, FUN = read.csv)
# Stack the per-file data frames into a single data frame.
scrapes_df <- scrapes_df_list %>% bind_rows()
# Total number of missing values across the combined data.
scrapes_df %>% is.na() %>% sum()
# Tidy the combined scrapes: convert to a tibble, drop the url column, give
# the `value` column a descriptive name, and parse the scrape timestamp into
# a date-time.
scrapes_clean_df <- as_tibble(scrapes_df) %>%
  select(-url) %>%
  rename(article_title = value) %>%
  mutate(scrape_date = ymd_hms(scrape_date))
# Keyword patterns for each topic. Each is a regular expression whose
# alternatives are separated by "|"; it is matched against lower-cased article
# titles and also used verbatim as the topic's label.
topic1 <- "ukraine|russia|putin|zelensky"
topic2 <- "israel|gaza|hamas|netanyahu"
# Combined pattern covering the keywords of every topic.
topics <- paste(topic1, topic2, sep = "|")

# Label each story with the first topic whose keywords appear in its title.
# Anything that matches no topic falls through to "Unrelated"; the catch-all
# TRUE branch also covers titles for which str_detect() returns NA.
scrapes_clean_df <- scrapes_clean_df %>%
  mutate(
    article_title_lower = str_to_lower(article_title),
    interest_label = case_when(
      str_detect(article_title_lower, topic1) ~ topic1,
      str_detect(article_title_lower, topic2) ~ topic2,
      TRUE ~ "Unrelated"
    )
  )

# What have we got?
count(scrapes_clean_df, interest_label)
# Change factor levels for the visuals so that "Unrelated" is always the first
# level and therefore always takes the first (grey) colour below.
scrapes_clean_df <- scrapes_clean_df %>%
  mutate(interest_label = fct_relevel(interest_label, "Unrelated"))

# Number of topic labels present, i.e. every level other than "Unrelated".
n_stories_vec <- length(
  setdiff(levels(scrapes_clean_df$interest_label), "Unrelated")
)

# Colour scheme: grey for "Unrelated", then one Set2 colour per topic.
# brewer.pal() warns (and returns three colours) when asked for fewer than
# three, so always request at least three and keep only as many as needed.
col_scheme <- c(
  "grey95",
  head(brewer.pal(n = max(3L, n_stories_vec), name = "Set2"), n_stories_vec)
)
# Tile plot: one tile per story per scrape, placed by scrape time (x) and
# `order_var` (y), filled and outlined by topic label.
# NOTE(review): `order_var` presumably holds the story's 1-10 rank in the
# 'most read' list - confirm against the scraped csv columns.
tile_gg <- ggplot(data = scrapes_clean_df) +
  geom_tile(
    mapping = aes(
      x = scrape_date,
      y = order_var,
      fill = interest_label,
      colour = interest_label
    )
  ) +
  scale_fill_manual(values = col_scheme) +
  scale_colour_manual(values = col_scheme) +
  # Reverse the axis so that 1 is at the top.
  scale_y_reverse(breaks = 1:10) +
  theme_bw() +
  labs(
    title = "BBC Top 10 'most read': comparing topics",
    subtitle = paste(
      "Time range between", min(scrapes_clean_df$scrape_date),
      "and", max(scrapes_clean_df$scrape_date)
    ),
    fill = NULL, x = NULL, y = "Top 10 ranking"
  ) +
  # Single-row legend for fill; the colour legend would duplicate it.
  guides(fill = guide_legend(nrow = 1), colour = "none") +
  theme(legend.position = "bottom")

# Save. `units` is spelled out in full rather than relying on partial
# argument matching.
ggsave(
  plot = tile_gg,
  filename = "visuals/interest_tile_compare.png",
  width = 20, height = 12, units = "cm", dpi = 300
)
# Proportional visual (not finished).
# Count stories per label within each scrape, add explicit zero rows for
# labels absent from a scrape, and scale the counts by 10.
# NOTE(review): n * 10 reads as a percentage if each scrape holds 10 stories,
# although the column name says "prop" - confirm the intended scale.
props_breakdown_df <- scrapes_clean_df %>%
  count(scrape_date, interest_label) %>%
  complete(scrape_date, interest_label, fill = list(n = 0)) %>%
  mutate(prop_of_top_10 = n * 10) %>%
  arrange(scrape_date)
# Stacked area plot of each label's share per scrape. position = "fill"
# rescales every stack to a total of 1, so only the relative sizes of
# `prop_of_top_10` matter.
prop_gg <- ggplot(data = props_breakdown_df) +
  geom_area(
    mapping = aes(
      x = scrape_date,
      y = prop_of_top_10,
      fill = interest_label
    ),
    position = "fill"
  ) +
  theme(legend.position = "bottom")

# Display the plot.
prop_gg

# Save. The plot is passed explicitly instead of relying on last_plot(), and
# `units` is spelled out in full rather than relying on partial argument
# matching.
ggsave(
  plot = prop_gg,
  filename = "visuals/interest_tile_compare_prop.png",
  width = 20, height = 12, units = "cm", dpi = 300
)