Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
122 lines (90 sloc) 4.83 KB
title: "Loopover Backend P2 and P3"
output: html_document
#Script overview: for every match link, download the page, pull the minute,
#team, player and event of each commentary entry out of the raw HTML text,
#and write the result to one CSV file per match.
#NOTE(review): several statements below are missing identifiers or arguments
#(empty slots such as `substr(,`, a bare `<-`, `paste0([j]`), and the closing
#braces of the for loop and of the if/else are not present in this view.
#Restore them from the original script before running.
#
#Link suffixes, each of the form "<id>&league=<id>"; they are meant to be
#pasted onto link.prefix to form the page URLs.
#NOTE(review): link.prefix is an empty string here and the second `<-` has no
#variable name on its left -- confirm both against the original.
link.prefix <- "" <- c("291172&league=289234", "292573&league=289234", "291170&league=289234", "291183&league=289234",
"291182&league=289234", "291184&league=289234", "292295&league=289234", "282895&league=282881",
"289758&league=289234", "292286&league=289234", "292941&league=289234", "292736&league=289234",
#remove the two matches with no commentary data: <-[-c(7,10)]
#NOTE(review): the line above ends with the tail of an assignment that drops
#elements 7 and 10; the name of the vector being subset is missing.
links <- paste0(link.prefix,
#Loop opens: one iteration per element of links
#NOTE(review): 1:length(links) yields c(1, 0) when links is empty;
#seq_along(links) would avoid that.
for (j in 1:length(links)){
#Read the page as a character vector, one element per line of HTML
html.string.vector <- readLines(links[j])
#Index of the first line that contains "Start of first half"
start.of.1stH.element.index <- grep("Start of first half", html.string.vector)[1]
#first string was selected because the "0-0" was found easily unlike on the second string <- html.string.vector[start.of.1stH.element.index]
#Take the substring between "End of second half" and "Start of first half"
#NOTE(review): both gregexpr() calls below are missing their text argument.
start.position <- gregexpr("End of second half",
end.position <- gregexpr("Start of first half", <- substr(, start.position, end.position)
#"&#x27;" (the HTML entity for ') comes straight after the minute number
#test1 counts the matches of "&#x27;" found by gregexpr()
test1 <- length(gregexpr("&#x27;",[[1]])
#test is passed (i.e. length is and should be 50) for
#because there are 67 ' (apostrophes) on the page including:
#15 on the scoreboard, 1 for the "End of Second Half" and 1 on the blue bubble speech on the top right <- strsplit(, "&#x27;")[[1]][-c(test1+1)]
#split the string into a vector of strings
#remove the last element because this just indicates the "Start of First Half"
#GET COLUMN ONE: minutes
#Get the minutes, ignore the last element's minute because it's just the Start of First Half time <- nchar(
minutes <- substr(,,[-c(test1)]
#In this case, this step is unnecessary, but is necessary if there are events during the 1st-9th minute
#Any entry that contains a non-digit character is replaced by its second
#character alone, which leaves just the single-digit minute.
if(length(grep("\\D", minutes)) > 0){
minutes[grep("\\D", minutes)] <- substr(minutes[grep("\\D", minutes)], 2,2)
print("events happened during the 1st-9th minute")
} else {
print("no events during the 1st-9th minute")
#Convert the minute strings to integers
minutes <- as.integer(minutes)
#GET COLUMN TWO, THREE AND FOUR: team, players and events
#Firstly, get the commentary quotes
#"</td></tr><tr data-reactid"
#The code above always comes straight after the commentary quote
#The "- 1" below moves the end position to the character just before that marker
commentary.end.pos <- unlist(gregexpr("</td></tr><tr data-reactid", - 1 <- substr(, 1,commentary.end.pos)[-c(1)]
#First element is just "End of Second half"
#"game-details" always comes at some point before the commentary quote
game.details.start.pos <- unlist(gregexpr("game-details", <- substr(, game.details.start.pos, nchar(
#An upper case letter comes at the very start of commentary quote
#regexpr() returns the position of the first upper-case letter in each string
commentary.start.pos <- unlist(regexpr("[[:upper:]]",
commentary.quotes <- substr(, commentary.start.pos, nchar(
#GET COLUMN TWO, THREE AND FOUR: team, players and events
#Now get team, players and events with removal of
#"End of first half" and "Start of second half" from minutes object and the commentary quotes
#Notice how the commentary quotes are templated by:
#"[event] - [player] , [team]"
#Split every quote on " , " and flatten the pieces into one vector; the
#recycled mask c(FALSE, TRUE) keeps the even positions and c(TRUE, FALSE)
#keeps the odd positions.
#NOTE(review): presumably the two half-time marker quotes contain no " , "
#and sit next to each other, so one lands in each vector -- verify.
team.with.EndOfFirstHalf <- unlist(strsplit(commentary.quotes, " , "))[c(FALSE, TRUE)]
event.player.with.StartOfSecondHalf <- unlist(strsplit(commentary.quotes, " , "))[c(TRUE,FALSE)]
#Remove EndOfFirstHalf and StartOfSecondHalf
end.of.first.half.index <- which(team.with.EndOfFirstHalf == "End of first half")
start.of.second.half.index <- which(event.player.with.StartOfSecondHalf == "Start of second half")
#NOTE(review): if which() finds no match it returns integer(0), and
#x[-integer(0)] selects nothing, so the vectors below would come out empty.
#clean-up team
team <- team.with.EndOfFirstHalf[-end.of.first.half.index]
#clean-up event.players
event.player <- event.player.with.StartOfSecondHalf[-start.of.second.half.index]
#clean-up minutes
#Locate "Start of second half" in the flattened split vector, turn that
#position into an index with (position - 1) / 2, then drop that entry of
#minutes and the one after it.
#NOTE(review): presumably these two entries are the half-time markers -- confirm.
x <- unlist(strsplit(commentary.quotes, " , "))
start.of.second.half.index <- (which(x == "Start of second half")-1) / 2
minutes <- minutes[-c(start.of.second.half.index, start.of.second.half.index+1)]
#check if it matches the website info
#NOTE(review): inside a for loop these results are not auto-printed; wrap
#them in print() if the check is meant to be visible.
head(cbind(minutes, event.player))
tail(cbind(minutes, event.player))
#Split "[event] - [player]" on " - ": odd pieces are events, even pieces are players
event <- unlist(strsplit(event.player, " - "))[c(TRUE,FALSE)]
player <- unlist(strsplit(event.player, " - "))[c(FALSE,TRUE)]
#Assemble the four columns into one table
#NOTE(review): the variable is named data.frame, the same name as the base
#constructor it is built with; a distinct name would be clearer.
data.frame <- data.frame(as.integer(minutes), team, player, event)
#Write the table for this match to "<name>.csv"
#NOTE(review): the vector indexed by [j] to form the file name is missing.
write.csv(data.frame, paste0([j], ".csv"))