# gapminder-20180301

- preliminary analysis of the data using `purrr`, `broom` etc. 
- create subsets of data
- create repo for preliminary data
- should we slice by continent or slice by year? maybe let's do both  
   - filter
   - save in arrow format

tools:  
1. tidyverse and the dslabs gapminder dataset

analysis example credits to Hadley Wickham
- video: https://www.youtube.com/watch?v=rz3_FDVt9eg&index=2&list=PLNtpLD4WiWbw9Cgcg6IU75u-44TrrN3A4
- speaker deck: https://speakerdeck.com/hadley/managing-many-models
- github gist: https://gist.github.com/hadley/056cf4074acedc164161d6abb751cb35


In [None]:
library(tidyverse)
library(gapminder)

In [None]:
# Peek at the first rows of the gapminder data to check its columns.
gapminder %>% head()

In [None]:
# Add `year1960`, the year expressed as an offset from 1960.
gapminder <- mutate(gapminder, year1960 = year - 1960)

# purrr and list columns

In [None]:
# Nest the data so that each (continent, country) pair becomes one row whose
# `data` list-column holds that country's observations.
by_country <- gapminder %>%
  group_by(continent, country) %>%
  nest() %>%
  # Newer tidyr versions keep the grouping after nest(); drop it so that later
  # mutate()/filter() calls operate on the whole table rather than per group.
  ungroup()

In [None]:
# Inspect the structure of the nested table, including the `data` list-column.
by_country %>% str()

In [None]:
# Show the first five rows of the first nested data frame.
pluck(by_country, "data", 1)[1:5, ]

In [None]:
#' Fit a linear trend of life expectancy over time for one data frame.
#'
#' @param df A data frame with numeric columns `lifeExp` and `year1960`.
#' @return The fitted `lm` object for `lifeExp ~ year1960`.
country_model <- function(df) {
  fit <- lm(lifeExp ~ year1960, data = df)
  fit
}

In [None]:
# Fit one linear model per row of `by_country` and keep the fits in the `mod`
# list-column.
models <- mutate(by_country, mod = map(data, country_model))

In [None]:
# Print the first fitted model.
pluck(models, "mod", 1)

In [None]:
# Inspect the structure of the nested per-country data frames.
models %>%
  pull(data) %>%
  str()

In [None]:
# `rsq` is not a column of `models` at this point, so derive R-squared from
# each fitted model here before plotting the countries ordered by fit quality.
models %>%
  mutate(rsq = map_dbl(mod, function(fit) summary(fit)$r.squared)) %>%
  ggplot(aes(rsq, reorder(country, rsq))) +
    geom_point(aes(colour = continent))

# broom

In [None]:
# Keep only the African countries; this overwrites `models`.
models <- filter(models, continent == "Africa")

In [None]:
# Attach broom summaries of every fitted model as list-columns:
#   glance  - one-row model-level statistics
#   rsq     - R-squared pulled out of `glance` as a plain numeric column
#   tidy    - one row per model coefficient
#   augment - one row per observation, with fitted values and residuals
# The fits live in the `mod` column, so that is what each map() iterates over.
models <- models %>%
  mutate(
    glance  = map(mod, broom::glance),
    rsq     = map_dbl(glance, "r.squared"),
    tidy    = map(mod, broom::tidy),
    augment = map(mod, broom::augment)
  )
models

In [None]:
# Countries ordered from best to worst linear fit.
arrange(models, desc(rsq))
# Rows belonging to Africa only.
filter(models, continent == "Africa")

In [None]:
# Dot plot of R-squared per country, with countries ordered by R-squared and
# points coloured by continent.
ggplot(models, aes(x = rsq, y = reorder(country, rsq))) +
  geom_point(aes(colour = continent))

In [None]:
models
unnest(models, data) # back to where we started
# `unnest(.drop = TRUE)` is deprecated in tidyr; select the non-list columns
# plus `glance` explicitly so the other list-columns are dropped before
# unnesting.
models %>%
  select(continent, country, rsq, glance) %>%
  unnest(glance) %>%
  View()
unnest(models, tidy)

In [None]:
# Plot each country's intercept against its slope. The predictor is
# `year1960` (year - 1960), so after spreading the coefficient table the slope
# column is `year1960` and the intercept is the fitted life expectancy in 1960.
models %>%
  unnest(tidy) %>%
  select(continent, country, term, estimate, rsq) %>%
  spread(term, estimate) %>%
  ggplot(aes(`(Intercept)`, year1960)) +
    geom_point(aes(colour = continent, size = rsq)) +
    geom_smooth(se = FALSE) +
    xlab("Life Expectancy (1960)") +
    ylab("Yearly improvement") +
    scale_size_area()

In [None]:
# Residuals over time, one line per country, faceted by continent. The
# augmented data carries the model's predictor, which is `year1960`.
models %>%
  unnest(augment) %>%
  ggplot(aes(year1960, .resid)) +
    geom_line(aes(group = country), alpha = 1/3) +
    # White reference line at zero residual.
    geom_hline(yintercept = 0, colour = 'white', size = 2) +
    geom_smooth(se = FALSE) +
    facet_wrap(~continent)

In [None]:
# One row per country with the fitted slope and R-squared. The fitted models
# are stored in the `mod` column; the slope is the second coefficient of each.
# NOTE(review): the variable name `summary` is the same as base R's summary()
# function; function calls still resolve, but a different name would be clearer.
summary <- models %>%
  transmute(
    continent,
    country,
    slope = mod %>% map(coef) %>% map_dbl(2),
    rsq = glance %>% map_dbl("r.squared")
  )

# Slope against R-squared, coloured by continent.
summary %>%
  ggplot(aes(rsq, slope)) +
  geom_point(aes(colour = continent)) +
  xlab(quote(R ^ 2)) +
  ylab("Estimated yearly increase in life expectancy") +
  theme(legend.position = "top", aspect.ratio = 1)