-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathweb_post_single_topic.R
More file actions
136 lines (111 loc) · 4.49 KB
/
web_post_single_topic.R
File metadata and controls
136 lines (111 loc) · 4.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# Packages.
library(dplyr)
library(lubridate)
library(RColorBrewer)
library(stringr)
library(forcats)
library(pbapply)
library(ggplot2)
# Folder date range of interest. The zip archive and the extracted folder
# under "scrapes/" both carry this name.
scrape_folder <- "10jan2024_09feb2024"

# Location of the extracted scrapes for this date range.
scrape_dir <- file.path("scrapes", scrape_folder)

# Unzip folder containing scrapes. Files that already exist are not
# overwritten, so re-running the script is safe.
unzip(
  zipfile = paste0(scrape_dir, ".zip"),
  exdir = "scrapes",
  overwrite = FALSE
)

# Full paths of every csv in the folder. `pattern` is a regular expression
# (not a glob), so anchor on a literal ".csv" extension.
scrapes_list <- list.files(scrape_dir, pattern = "\\.csv$", full.names = TRUE)

# Fail early with a clear message rather than an obscure read error later.
if (length(scrapes_list) == 0) {
  stop("No csv files found in '", scrape_dir, "'.", call. = FALSE)
}

# Load them all in (with a progress bar).
scrapes_df_list <- pblapply(scrapes_list, read.csv)

# Row bind them together.
scrapes_df <- bind_rows(scrapes_df_list)

# Any missings? Counts NA cells across the whole data frame.
sum(is.na(scrapes_df))
# Tidy the raw scrapes: drop the url column, give the headline column a
# descriptive name, and parse the scrape timestamp into a date-time.
scrapes_clean_df <- as_tibble(scrapes_df) %>%
  select(-url) %>%
  rename(article_title = value) %>%
  mutate(scrape_date = ymd_hms(scrape_date))
# Minutes represented by one scraped row. Assumes the scraper ran every
# 5 minutes -- adjust here if the schedule changes.
scrape_interval_mins <- 5

# Make sure the output folder exists before writing to it.
dir.create("output", showWarnings = FALSE, recursive = TRUE)

# Time spent in top 10 during the period: one row per article, with the
# number of scrapes it appeared in (n) converted to minutes and hours.
top10_time_df <- scrapes_clean_df %>%
  count(article_title) %>%
  mutate(
    mins = n * scrape_interval_mins,
    hrs = mins / 60
  ) %>%
  arrange(desc(n))

# Save.
write.csv(
  top10_time_df,
  file = paste0("output/top10_time_", scrape_folder, ".csv")
)

# Time spent in each ranking position during the period: one row per
# (ranking position, article) pair.
ranking_time_df <- scrapes_clean_df %>%
  count(order_var, article_title) %>%
  mutate(
    mins = n * scrape_interval_mins,
    hrs = mins / 60
  ) %>%
  arrange(desc(n))

# Save.
write.csv(
  ranking_time_df,
  file = paste0("output/ranking_time_", scrape_folder, ".csv")
)

# What about only stories that made number 1?
no1_time_df <- ranking_time_df %>%
  filter(order_var == 1)

# Save.
write.csv(
  no1_time_df,
  file = paste0("output/no1_time_", scrape_folder, ".csv")
)
# # Select a 24-hour time period (if wanted).
# scrapes_clean_df <- scrapes_clean_df %>%
# filter(date(scrape_date) == "2024-01-31")
# Keyword(s) of interest. Matched against lower-cased titles, so write the
# keyword in lower case.
interest_words <- "taylor swift"

# Flag the stories of interest: rows whose lower-cased title matches the
# keyword keep their title as the label, everything else is "Unrelated".
scrapes_clean_df <- scrapes_clean_df %>%
  mutate(article_title_lower = str_to_lower(article_title)) %>%
  mutate(
    interest_label = if_else(
      str_detect(article_title_lower, pattern = interest_words),
      article_title,
      "Unrelated"
    )
  )

# Tabulate how many rows fall under each label.
scrapes_clean_df %>%
  count(interest_label)
# Too many distinct labels make the legend unreadable and exhaust the
# palette, so keep the 7 most frequent labels and lump the rest into "Other".
# ties.method = "first" guarantees exactly 7 labels are kept; the default
# ("min") keeps every label tied at the cut-off, which can leave more
# levels than there are colours available.
if (length(unique(scrapes_clean_df$interest_label)) > 8) {
  scrapes_clean_df <- scrapes_clean_df %>%
    mutate(
      interest_label = fct_lump_n(
        interest_label,
        7,
        other_level = "Other",
        ties.method = "first"
      )
    )
} else {
  message("8 or fewer unique stories. No recode done.")
}

# Change factor levels for visual. Unrelated should always be first.
scrapes_clean_df <- scrapes_clean_df %>%
  mutate(interest_label = fct_relevel(interest_label, "Unrelated"))

# How many stories on the interest flag do we have? Count the levels other
# than "Unrelated" directly, so the number is right even when no
# "Unrelated" rows are present.
story_levels <- setdiff(levels(scrapes_clean_df$interest_label), "Unrelated")
n_stories_vec <- length(story_levels)

# Colour scheme define. 'Unrelated' is grey; stories take Set2 colours in
# level order. Indexing the full 8-colour palette avoids the brewer.pal()
# warning for n < 3, and naming the colours ties each one to its label
# rather than relying on position.
col_scheme <- c(
  "grey95",
  brewer.pal(n = 8, name = "Set2")[seq_len(n_stories_vec)]
)
names(col_scheme) <- c("Unrelated", story_levels)
# Tile plot: scrape time on the x axis, ranking position on a reversed
# y axis (rank 1 at the top), each tile filled and outlined by its
# interest label using the colour scheme defined earlier.
tile_gg <- ggplot(
  data = scrapes_clean_df,
  mapping = aes(
    x = scrape_date,
    y = order_var,
    fill = interest_label,
    colour = interest_label
  )
) +
  geom_tile() +
  scale_colour_manual(values = col_scheme) +
  scale_fill_manual(values = col_scheme) +
  scale_y_reverse(breaks = 1:10) +
  labs(
    # "|" separators in the keyword are shown as commas in the title.
    title = paste(
      "BBC Top 10 'most read' involving the term(s):",
      str_replace_all(interest_words, "\\|", ", ")
    ),
    # First and last scrape timestamps in the data.
    subtitle = paste(
      "Time range between",
      format(min(scrapes_clean_df$scrape_date), "%d-%m-%Y %H:%M:%S"),
      "and",
      format(max(scrapes_clean_df$scrape_date), "%d-%m-%Y %H:%M:%S")
    ),
    x = NULL,
    y = "Top 10 ranking",
    fill = NULL
  ) +
  theme_bw() +
  theme(
    legend.text = element_text(size = 8),
    legend.position = "bottom"
  ) +
  # Only the fill legend is shown; the outline colour duplicates it.
  guides(
    colour = "none",
    fill = guide_legend(nrow = 4)
  )
# Create format for saving interest word file. We replace spaces and 'or'
# statements ("|") with underscores.
interest_words_file <- str_replace_all(interest_words, " |\\|", "_")

# Make sure the visuals folder exists before saving into it.
dir.create("visuals", showWarnings = FALSE, recursive = TRUE)

# Save. The argument is spelled `units` in full rather than relying on
# partial argument matching.
ggsave(
  plot = tile_gg,
  filename = paste0("visuals/interest_tile_", interest_words_file, ".png"),
  width = 20,
  height = 12,
  units = "cm",
  dpi = 300
)
# End.