In [118]:
library(XML)

In [119]:
menURLs = 
  c("results/1999/cb99m.html","results/2012/2012cucb10m-m.htm")
ubase = "http://www.cherryblossom.org/"
murls = paste(ubase, menURLs, sep = "")

murls[1:2]

In [120]:
murls==urls

In [121]:
womenURLs = 
  c("results/1999/cb99f.html","results/2012/2012cucb10m-f.htm")
ubase = "http://www.cherryblossom.org/"
furls = paste(ubase, womenURLs, sep = "")

furls[1:2]

In [122]:
extractResTable =
  #
  # Retrieve data from web site, 
  # find the preformatted text,
  # and write lines or return as a character vector.
  #
  function(url = "http://www.cherryblossom.org/results/2009/09cucb-F.htm",
           year = 1999, sex = "male", file = NULL)
  {
    doc = htmlParse(url, encoding= 'UTF-8')

    if (year == 2000) {

      # Get preformatted text from 4th font element
      # The top file is ill formed so the <pre> search doesn't work.
      ff = getNodeSet(doc, "//font")
      txt = xmlValue(ff[[4]])
      els = strsplit(txt, "\r\n")[[1]]
    }
    else if (year == 2009 & sex == "male") {
      # Get preformatted text from <div class="Section1"> element
      # Each line of results is in a <pre> element
      div1 = getNodeSet(doc, "//div[@class='Section1']")
      pres = getNodeSet(div1[[1]], "//pre")
      els = sapply(pres, xmlValue)
    }         
    else if (year == 1999 & sex == "male") {
      # Get preformatted text from <pre> elements
      pres = getNodeSet(doc, "//pre")
      txt = xmlValue(pres[[1]])
      els = strsplit(txt, "\n")[[1]]   
    } 
  
    
    else {
      # Get preformatted text from <pre> elements
      pres = getNodeSet(doc, "//pre")
      txt = xmlValue(pres[[1]])
      els = strsplit(txt, "\r\n")[[1]]   
    } 
    
    if (is.null(file)) return(els)
    # Write the lines as a text file.
    writeLines(els, con = file)
  }



In [123]:
menTables = mapply(extractResTable, url = urls, year = years)
names(menTables) = years
sapply(menTables, length)



In [137]:
womenTables = mapply(extractResTable, url = furls, year = years)
names(menTables) = years
sapply(menTables, length)



In [125]:
#m2012 = read.table(file="MenTxt/2012.txt", skip = 8)
m2012 = menTables$'2012'[-1:-8]

#els = readLines("MenTxt/2012.txt")
els = menTables$'2012'

els[1:10]



In [126]:
eqIndex = grep("^===", els)
eqIndex

first3 = substr(els, 1, 3)
which(first3 == "===")

spacerRow = els[eqIndex]
headerRow = els[eqIndex - 1]
body = els[ -(1:eqIndex) ]

headerRow = tolower(headerRow)

ageStart = regexpr("ag", headerRow)
ageStart

age = substr(body, start = ageStart, stop = ageStart + 1)
head(age)

summary(as.numeric(age))

blankLocs = gregexpr(" ", spacerRow)
blankLocs

searchLocs = c(0, blankLocs[[1]])

Values = mapply(substr, list(body), 
                start = searchLocs[ -length(searchLocs)] + 1, 
                stop = searchLocs[ -1 ] - 1)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   9.00   29.00   35.00   37.75   45.00   89.00       1 

In [127]:
findColLocs = function(spacerRow) {

  spaceLocs = gregexpr(" ", spacerRow)[[1]]
  rowLength = nchar(spacerRow)

  if (substring(spacerRow, rowLength, rowLength) != " ")
    return( c(0, spaceLocs, rowLength + 1))
  else return(c(0, spaceLocs))
}

selectCols = 
function(colNames, headerRow, searchLocs) 
{
  sapply(colNames, 
         function(name, headerRow, searchLocs)
         {
           startPos = regexpr(name, headerRow)[[1]]
           if (startPos == -1) 
             return( c(NA, NA) )
    
           index = sum(startPos >= searchLocs)
           c(searchLocs[index] + 1, searchLocs[index + 1] - 1)
         },
         headerRow = headerRow, searchLocs = searchLocs )
}

searchLocs = findColLocs(spacerRow)
ageLoc = selectCols("ag", headerRow, searchLocs) 
ages = mapply(substr, list(body), 
              start = ageLoc[1,], stop = ageLoc[2, ])

summary(as.numeric(ages))

shortColNames = c("name", "home", "ag", "gun", "net", "time")

locCols = selectCols(shortColNames, headerRow, searchLocs)

Values = mapply(substr, list(body), start = locCols[1, ], 
                stop = locCols[2, ])

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   9.00   29.00   35.00   37.75   45.00   89.00       1 

In [128]:
class(Values)

colnames(Values) = shortColNames
head(Values)

tail(Values)[ , 1:3]

extractVariables = 
  function(file, varNames =c("name", "home", "ag", "gun",
                             "net", "time"))
{
       # Find the index of the row with =s
  eqIndex = grep("^===", file)
       # Extract the two key rows and the data
  spacerRow = file[eqIndex] 
  headerRow = tolower(file[ eqIndex - 1 ])
  body = file[ -(1 : eqIndex) ]
       
       # Obtain the starting and ending positions of variables
  searchLocs = findColLocs(spacerRow)
  locCols = selectCols(varNames, headerRow, searchLocs)

  Values = mapply(substr, list(body), start = locCols[1, ], 
                  stop = locCols[2, ])
  colnames(Values) = varNames
  
  invisible(Values)
}



name,home,ag,gun,net,time
Allan Kiprono,Kenya,22,,,45:15
Lani Kiplagat,Kenya,23,,,46:28
John Korir,Kenya,36,,,47:33
Ian Burrell,Tucson AZ,27,,,47:34
Jesse Cherry,Blowing Rock NC,24,,,47:40
Ketema Nugusse,Ethiopia,31,,,47:50


Unnamed: 0,name,home,ag
"[7188,]",Dana Brown,Randallstown MD,41.0
"[7189,]",Jurek Grabowski,Fairfax VA,39.0
"[7190,]",Larry Hume,Arlington VA,56.0
"[7191,]",Sean-Patrick Alexander,Alexandria VA,35.0
"[7192,]",Joseph White,Forestville MD,
"[7193,]",Lee Jordan,Herndon VA,48.0


In [140]:
names(womenTables) = years

In [138]:
mfilenames = paste("data/MenTxt/", years, ".txt", sep = "")
write(menTables$'1999',mfilenames[1])
write(menTables$'2012',mfilenames[2])


In [130]:
mfilenames = paste("data/MenTxt/", readYears, ".txt", sep = "")
menFiles = lapply(mfilenames, readLines)
names(menFiles) = readYears

menResMat = lapply(menFiles, extractVariables)
length(menResMat)

sapply(menResMat, nrow)

age = sapply(menResMat,
             function(x) as.numeric(x[ , 'ag']))

In [141]:
wfilenames = paste("data/WomenTxt/", years, ".txt", sep = "")
write(womenTables$'1999',wfilenames[1])
write(womenTables$'2012',wfilenames[2])

In [144]:
wfilenames = paste("data/WomenTxt/", readYears, ".txt", sep = "")
womenFiles = lapply(mfilenames, readLines)
names(womenFiles) = years

womenResMat = lapply(womenFiles, extractVariables)
length(womenResMat)

sapply(womenResMat, nrow)


age = sapply(womenResMat,
             function(x) as.numeric(x[ , 'ag']))

In [167]:
shortColNames = c("name", "home", "ag", "gun", "net", "time","sex","year")

In [168]:
w99 = womenResMat['1999']
w99 = as.data.frame(w99)
w99['sex']= 'female'
w99['year']= '1999'

names(w99)= shortColNames

w12 = womenResMat['2012']
w12 = as.data.frame(w12)
w12['sex']= 'female'
w12['year']= '2012'
names(w12)= shortColNames


m99 = menResMat['1999']
m99 = as.data.frame(m99)
m99['sex']= 'male'
m99['year']= '1999'
names(m99)= shortColNames


m12 = menResMat['1999']
m12 = as.data.frame(m12)
m12['sex']= 'male'
m12['year']= '2012'
names(m12)= shortColNames





In [171]:
raceDf = rbind(w99,w12,m99,w12)

In [172]:
raceDf

name,home,ag,gun,net,time,sex,year
Worku Bikila,Ethiopia,28,,,46:59,female,1999
Lazarus Nyakeraka,Kenya,24,,,47:01,female,1999
James Kariuki,Kenya,27,,,47:03,female,1999
William Kiptum,Kenya,28,,,47:07,female,1999
Joseph Kimani,Kenya,26,,,47:31,female,1999
Josphat Machuka,Kenya,25,,,47:33,female,1999
Julius Randich,Kenya,26,,,47:37,female,1999
Leonid Shvetsov,Russia,30,,,49:10,female,1999
Reuben Chesang,Kenya,38,,,49:22,female,1999
Daniel Kihara,Kenya,30,,,49:23,female,1999
