Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

Combine both sex related explorations into one

  • Loading branch information...
commit 2f3abf2df5fd15f54dbd62a428c23518ec3b91bf 1 parent aa195cf
@hadley authored
Showing with 133 additions and 88 deletions.
  1. +0 −45 6-sex-dist.r
  2. +133 −0 6-sex-exploration.r
  3. +0 −43 8-sex-errors.r
View
45 6-sex-dist.r
@@ -1,45 +0,0 @@
-
-bnames <- read.csv("baby-names.csv")
-library(reshape)
-
-sexdist <- cast(bnames, name ~ sex, sum, value = "percent")
-sexdist <- transform(sexdist,
- total = boy + girl,
- pmale = boy / (boy + girl),
- pfemale = girl / (boy + girl))
-
-
-
-sexdist <- ddply(bnames, .(name), function(df) {
- boys <- df$sex == "boy"
- data.frame(
- boy = sum(df$percent[boys]),
- girl = sum(df$percent[!boys]),
- total = sum(df$percent)
- )
-}, .progress = "text")
-sexdist <- sexdist[order(sexdist$total), ]
-
-sexdist <- subset(sexdist, boy != 0 & girl != 0)
-dim(sexdist)
-
-top <- subset(sexdist, boy > 0.02 & girl > 0.02)
-
-
-top_dist <- ddply(bnames[bnames$name %in% top$name, ], .(name, year), function(df) {
- boys <- df$sex == "boy"
- data.frame(
- boy = sum(df$percent[boys]),
- girl = sum(df$percent[!boys]),
- total = sum(df$percent)
- )
-}, .progress = "text")
-
-top_dist$mperc <- top_dist$boy / top_dist$total
-
-qplot(year, mperc, data = top_dist) + facet_wrap(~ name)
-
-qplot(year, mperc, data = subset(top_dist, boy > 0.0001 & girl > 0.0001), geom="line") + facet_wrap(~ name)
-
-# Would be interesting mturk experiment - what five words does this name
-# evoke.
View
133 6-sex-exploration.r
@@ -0,0 +1,133 @@
+options(stringsAsFactors = FALSE)
+library(plyr)
+library(ggplot2)
+
+# Two basic tasks:
+# * for names that are used for both boys and girls, how has usage changed?
+# * can we use names that clearly have the incorrect sex to estimate error
+# rates in this data?
+
+bnames <- read.csv("baby-names.csv")
+
+# The complete data set takes a long time to work with, so we want to limit it
+# to names that plenty of data: i.e. that they've been in the top 1000 for
+# both boys and girls, and there are a decent number of babies with those
+# names.
+
+# Your turn: create a summary that for each name, gives:
+# * total proportion of babies with that name
+# * the total proportion of boys
+# * the total proportion of girls
+# * the number of years the name was in the top 1000 as a girls name
+# * the number of years the name was in the top 1000 as a boys name
+#
+# Hint: Start with a single name and figure out how to solve the problem
+# Hint: Use summarise
+times <- ddply(bnames, c("name"), summarise,
+ boys = sum(prop[sex == "boy"]),
+ boys_n = sum(sex == "boy"),
+ girls = sum(prop[sex == "girl"]),
+ girls_n = sum(sex == "girl"),
+ .progress = "text"
+)
+
+# ~7000 names in total - want to limit to under 200. In real-life would
+# probably use more, but starting with a subset for easier comprehension isn't
+# a bad idea.
+nrow(times)
+
+# At absolute minimum need at least 1 year each in top 1000. This cuts it
+# down to 582 names.
+times <- subset(times, boys_n > 1 & girls_n > 1)
+
+# Use basic graphics to figure out good cut-offs
+qplot(boys_n, girls_n, data = times)
+qplot(log10(boys), log10(girls), data = times)
+
+qplot(pmin(boys_n, girls_n), data = times, binwidth = 1)
+times$both <- with(times, boys_n > 10 & girls_n > 10)
+
+# Still a few too many names. Lets focus on names that have managed a certain
+# level of popularity.
+qplot(pmin(boys, girls), data = subset(times, both), binwidth = 0.01)
+qplot(pmax(boys, girls), data = subset(times, both), binwidth = 0.1)
+qplot(boys + girls, data = subset(times, both), binwidth = 0.1)
+
+both_sexes <- subset(times, both & boys + girls > 0.4)
+selected_names <- both_sexes$name
+
+# Now that we have figured out which names we should focus on (at least to
+# begin with), we need to calculate yearly summaries for each of those names.
+
+selected <- subset(bnames, name %in% selected_names)
+nrow(selected) / nrow(bnames)
+
+# For each name in each year, figure out total boys and total girls
+bysex <- ddply(selected, c("name", "year"), summarise,
+ boys = sum(prop[sex == "boy"]),
+ girls = sum(prop[sex == "girl"]),
+ .progress = "text"
+)
+
+# It's useful to have a symmetric means of comparing the relative abundance
+# of boys and girls - the log ratio is good for this.
+bysex$lratio <- log10(bysex$boys / bysex$girls)
+bysex$lratio[!is.finite(bysex$lratio)] <- NA
+
+# Explore the distribution of lratio for each name. This is why a smaller
+# number of names helps - we can more easily see the name associated with the
+# pattern and use our knowledge of names to determine whether or not it's
+# plausible that the name is used for both sexes.
+qplot(lratio, reorder(name, lratio, na.rm = T), data = bysex)
+qplot(abs(lratio), reorder(name, lratio, na.rm = T), data = bysex)
+
+qplot(abs(lratio), reorder(name, lratio, na.rm = T), data = bysex) +
+ geom_point(data = both_sexes, colour = "red")
+
+qplot(year, lratio, data = bysex, group = name, geom = "line")
+
+# Two things seem to be helpful in differentiating true dual-sex names from
+# errors: the average position and the amount of spread.
+#
+# Brainstorm: how could we summarise these things quantitatively?
+# Your turn: Compute the mean and range of lratio for each name
+rng <- ddply(bysex, "name", summarise,
+ diff = diff(range(lratio, na.rm = T)),
+ mean = mean(lratio, na.rm = T)
+)
+
+qplot(diff, abs(mean), data = rng)
+qplot(diff, abs(mean), data = rng, colour = abs(mean) < 1.75 | diff > 0.9)
+
+shared_names <- subset(rng, abs(mean) < 1.75 | diff > 0.9)$name
+
+qplot(abs(lratio), reorder(name, lratio, na.rm=T),
+ data = subset(bysex, name %in% shared_names))
+
+# Dual-sex names -------------------------------------------------------------
+
+shared <- subset(bysex, name %in% shared_names)
+
+qplot(year, lratio, data = shared, geom = "line") + facet_wrap(~ name)
+
+# lratio useful because it's symmetric, but probably easier to switch back to
+# something more familiar.
+qplot(year, boys / (boys + girls), data = shared, geom = "line") +
+ facet_wrap(~ name)
+
+# What's going on with Carol? Let's go back to the raw data
+qplot(year, prop, data = subset(bnames, name == "Carol"), colour = sex)
+# How about Shirley?
+qplot(year, prop, data = subset(bnames, name == "Shirley"), colour = sex)
+
+# Your turn: what is alike about these patterns?
+
+# Sex encoding errors --------------------------------------------------------
+
+errors <- subset(bysex, !(name %in% shared_names))
+qplot(year, lratio, data = errors, group = name, geom = "line")
+qplot(year, abs(lratio), data = errors, group = name, geom = "line",
+ colour = lratio > 0)
+
+# Your turn: What do you see in this plot? Do you think we can use this data
+# to estimate sex-encoding errors? Why/Why not?
View
43 8-sex-errors.r
@@ -1,43 +0,0 @@
-library(plyr)
-# Hypothesis: constant percentage of sex recording errors
-# Only see for very popular names because otherwise the
-# error never makes it into the top thousand.
-
-bnames <- read.csv("baby-names.csv", stringsAsFactors = FALSE)
-
-# For each year, find names used for both boys and girls
-one <- subset(bnames, year == 2008)
-
-both <- ddply(bnames, "year", subset,
- name %in% intersect(name[sex == "boy"], name[sex == "girl"]))
-
-both <- both[with(both, order(name, year, sex)), ]
-# both$freq <- 1 / both$percent
-
-library(reshape)
-bysex <- cast(both, name + year ~ sex, value = "percent")
-bysex <- add.all.combinations(bysex, c("name", "year"))
-bysex$ratio <- log(bysex$boy / bysex$girl)
-
-# Work out years each appears in dataset
-years <- ddply(bysex, "name", function(df) sum(!is.na(df$ratio)))
-years10 <- subset(years, V1 > 10)
-
-common <- subset(bysex, name %in% years10$name)
-
-library(ggplot2)
-qplot(year, ratio, data = common, geom = "line", group = name)
-
-pop <- subset(common, (boy > 0.01 | girl > 0.01 | is.na(boy) | is.na(girl))
-qplot(year, abs(ratio), data = pop, geom = "line", group = name)
-
-pop <- ddply(pop, "name", transform,
- sex = ifelse(max(boy, na.rm = T) > max(girl, na.rm = T), "Boy", "Girl"))
-
-ggplot(pop, aes(year, 1 / exp(abs(ratio)) * 1000)) +
- geom_line(aes(group = name)) +
- ylim(0, 15) + xlim(1880, 2000) +
- ylab("Errors per thousand") +
- facet_grid(sex ~ .) +
- geom_smooth(se = FALSE, size = 1)
-ggsave("sex-errors.png")
Please sign in to comment.
Something went wrong with that request. Please try again.