Permalink
Browse files

First pass of downloading, parsing, cleaning and exploring

  • Loading branch information...
0 parents commit c4945c9f8fb88cd83b83e03e18a097686dcb1e2f @hadley committed May 15, 2009
Showing with 71 additions and 0 deletions.
  1. +2 −0 .gitignore
  2. +13 −0 1-download.r
  3. +17 −0 2-parse.rb
  4. +24 −0 3-clean.r
  5. +15 −0 4-explore.r
@@ -0,0 +1,2 @@
+original
+raw
@@ -0,0 +1,13 @@
+library(RCurl)
+library(plyr)
+library(XML)
+
+# Download the SSA popular-names page for one year and save the raw HTML.
+#
+# Posts a form to the SSA popular-names CGI with "number" = "p",
+# "top" = "1000" and the requested year, then writes the response body to
+# original/<year>.html.
+#
+# year: the year to request; sent as the form's "year" field and used as the
+#       base name of the output file.
+save_year <- function(year) {
+  url <- "http://www.ssa.gov/cgi-bin/popularnames.cgi"
+  # Send the year as a string, like the other form fields.
+  data <- postForm(url, style = "post",
+    "number" = "p", "top" = "1000", "year" = as.character(year))
+
+  # "original" is listed in .gitignore, so it does not exist in a fresh
+  # checkout; create it on demand instead of failing inside writeLines().
+  dir.create("original", showWarnings = FALSE)
+  writeLines(data, file.path("original", paste0(year, ".html")))
+}
+
+# Fetch and save one page per year, 1880 through 2008 inclusive.
+# l_ply() is used because only the side effect (the saved file) matters.
+years <- seq(1880, 2008)
+l_ply(.data = years, .fun = save_year)
@@ -0,0 +1,17 @@
+require 'nokogiri'
+require "FasterCSV"
+
+def parse_year(year)
+ doc = Nokogiri::HTML(open("original/#{year}.html"))
+ rows = (doc/"body/table[2]/tr/td[2]/table/tr")
+
+ FasterCSV.open("raw/#{year}.csv", "w") do |csv|
+ rows.each {|row| csv << parse_row(row)}
+ end
+end
+
+def parse_row(row)
+ (row/"td").map{|e| e.content}
+end
+
+# Convert every saved page, 1880 through 2008 inclusive, to CSV.
+1880.upto(2008) { |year| parse_year(year) }
@@ -0,0 +1,24 @@
+library(plyr)
+
+# Every file in raw/, named by its base name without the .csv extension.
+# ldply() carries these names into the first column of its result, which is
+# labelled "year" below.
+files <- dir("raw", full.names = TRUE)
+names(files) <- sub("\\.csv$", "", basename(files))
+
+# For each file, skip the first line and read at most the next 1000 rows.
+bnames <- ldply(files, read.csv, header = FALSE, skip = 1, nrows = 1000,
+  stringsAsFactors = FALSE)
+names(bnames) <- c("year", "rank", "boy_name", "boy_percent",
+  "girl_name", "girl_percent")
+
+# Change from wide to long form: one row per (year, name, sex)
+boys <- bnames[c("year", "boy_name", "boy_percent")]
+girls <- bnames[c("year", "girl_name", "girl_percent")]
+
+names(boys) <- names(girls) <- c("year", "name", "percent")
+boys$sex <- "boy"
+girls$sex <- "girl"
+
+# NOTE: `all` masks base::all() for the rest of the session; the name is
+# kept because the exploration script reads this object.
+all <- rbind(boys, girls)
+
+# Turn percent string (e.g. "1.5%") into a proportion
+all$percent <- as.numeric(gsub("%", "", all$percent)) / 100
+all$year <- as.numeric(as.character(all$year))
+
+# NOTE(review): the bare `write` below is an unfinished statement. It only
+# evaluates to the function object and saves nothing; the intended output
+# file is not shown here, so it is left as found.
+write
@@ -0,0 +1,15 @@
+
+
+# Total of `percent` for each year and sex. ddply() labels the unnamed
+# summary column V1.
+ofall <- ddply(
+  all, .(year, sex),
+  function(piece) sum(piece[["percent"]])
+)
+library(ggplot2)
+qplot(year, V1, data = ofall, colour = sex, geom = "line") +
+  ylim(0, 1)
+
+
+# Final character of every name
+all$lastletter <- substr(all$name, nchar(all$name), nchar(all$name))
+
+# Same total as above, additionally split by the last letter of the name
+ll <- ddply(
+  all, .(year, sex, lastletter),
+  function(piece) sum(piece[["percent"]])
+)
+
+# One panel per last letter, with a sparse x axis so the labels stay legible
+qplot(year, V1, data = ll, colour = sex, geom = "line") +
+  facet_wrap(~ lastletter) +
+  scale_x_continuous(breaks = c(1900, 1950, 2000))
+
+# Redraw the previous plot with a log-scaled y axis
+last_plot() + scale_y_log10()

0 comments on commit c4945c9

Please sign in to comment.