Skip to content
This repository
Browse code

finished adding JSS recommended changes and debugged stand alone R sc…

…ript (currently works)
  • Loading branch information...
commit 031da02dbecc53f5079e02ced4394af2bbb121c6 1 parent 1943672
Garrett Grolemund garrettgman authored
8 R/util.r
@@ -114,13 +114,13 @@ with_tz <- function (time, tzone = ""){
114 114 #' x <- as.POSIXct("2009-08-07 00:00:01", tz = "America/New_York")
115 115 #' force_tz(x, "GMT")
116 116 #' # "2009-08-07 00:00:01 GMT"
117   -force_tz <- function(time, tz = ""){
  117 +force_tz <- function(time, tzone = ""){
118 118 x <- as.POSIXlt(time)
119 119
120   - if(is.null(tz)) tz <- ""
  120 + if(is.null(tzone)) tzone <- ""
121 121 new <- ISOdatetime(year(x), month(x), mday(x), hour(x),
122   - minute(x), second(x), tz)
123   - new[hour(with_tz(new, tz)) != hour(time)] <- NA
  122 + minute(x), second(x), tzone)
  123 + new[hour(with_tz(new, tzone)) != hour(time)] <- NA
124 124
125 125 reclass_date(new, time)
126 126 }
34,625 data/lakers.csv
34,625 additions, 0 deletions not shown
BIN  data/lakers.rda
Binary file not shown
BIN  paper/dates-points.png
43 paper/lubridate.r
@@ -15,19 +15,15 @@ library(lubridate)
15 15 date <- as.POSIXct("01-01-2010", format = "%d-%m-%Y", tz = "UTC")
16 16 as.numeric(format(date, "%m"))
17 17 as.POSIXlt(date)$month + 1
18   -date <- as.POSIXct(format(date, "%Y-2-%d"))
  18 +date <- as.POSIXct(format(date, "%Y-2-%d"), tz = "UTC")
  19 +date <- seq(date, length = 2, by = "-1 day")[2]
  20 +as.POSIXct(format(as.POSIXct(date), tz = "UTC"), tz = "GMT")
19 21
20 22 # lubridate examples
21 23 date <- dmy("01-01-2010")
22 24 month(date)
23 25 month(date) <- 2
24   -
25   -# additional base R examples
26   -seq(date, length = 2, by = "-1 day")[2]
27   -as.POSIXct(format(as.POSIXct(date). tz = "UTC"), tz = "GMT")
28   -
29   -# additional lubridate examples
30   -date - days(1)
  26 +date <- date - days(1)
31 27 with_tz(date, "GMT")
32 28
33 29
@@ -158,8 +154,8 @@ as.POSIXct(strptime(z, "%m/%d/%Y"))
158 154 # Parse z = 15101970
159 155 z <- 15101970
160 156 dmy(z)
161   -as.Date(as.character(z), format = "%d/%m/%Y")
162   -as.POSIXct(as.character(z), tz = "UTC", format = "%d/%m/%Y")
  157 +as.Date(as.character(z), format = "%d%m%Y")
  158 +as.POSIXct(as.character(z), tz = "UTC", format = "%d%m%Y")
163 159
164 160 # 1 second
165 161 seconds(1)
@@ -197,7 +193,7 @@ month(date, label = TRUE, abbr = FALSE)
197 193 wday(date, label = TRUE, abbr = FALSE)
198 194
199 195 day(date) <- 5
200   -dates <- ymd_hms("2010-01-01 01:00:00", "2010-01-01- 01:30:00")
  196 +dates <- ymd_hms("2010-01-01 01:00:00", "2010-01-01 01:30:00")
201 197 minute(dates) <- mean(minute(dates))
202 198 day(date) <- 30
203 199 day(date) <- 1
@@ -216,7 +212,7 @@ date + hours(3)
216 212 start_2012 <- ymd_hms("2012-01-01 12:00:00")
217 213 is.instant(364)
218 214 is.instant(start_2012)
219   -round_date(date, "day")
  215 +round_date(start_2012, "day")
220 216 now()
221 217 today()
222 218
@@ -257,9 +253,9 @@ force_tz(date, "UTC")
257 253 # 7. Daylight savings time
258 254 # _______________________________________________________________
259 255 dst_time <- ymd_hms("2010-03-14 01:59:59")
260   -dst_time <- force_tz(date, "America/Chicago") # Note: Time zone
261   -# names are operating system dependent and are not standard across
262   -# all operating systems. See ?timezone for details.
  256 +dst_time <- force_tz(dst_time, "America/Chicago") # Note: Time
  257 +# zone names are operating system dependent and are not standard
  258 +# across all operating systems. See ?timezone for details.
263 259
264 260 dst_time + eseconds(1)
265 261 dst_time + hours(2)
@@ -284,24 +280,29 @@ wday(date, label = T, abbr = F)
284 280
285 281 # 9. Case study 2
286 282 # _______________________________________________________________
287   -head(lakers)
288   -str(lakers$date[1])
  283 +str(lakers$date)
289 284 lakers$date <- ymd(lakers$date)
  285 +str(lakers$date)
290 286
291 287 # to use qplot we must first install and load the ggplot2 package
292 288 install.packages("ggplot2")
293 289 library(ggplot2)
294   -qplot(date, 0, data = lakers, colour = lakers$home == "LAL") +
295   - scale_colour_discrete(name= "Venue", labels = c("home game",
296   - "away game"))
  290 +qplot(date, 0, data = lakers, colour = game_type)
  291 +
297 292 qplot(wday(date, label = T), data = lakers, geom = "histogram")
  293 +
298 294 lakers$time <- ms(lakers$time)
299 295 lakers$time <- as.duration(lakers$time)
300 296 lakers$time <- eminutes(12) * lakers$period - lakers$time
  297 +lakers <- subset(lakers, period != 5)
  298 +
  299 +
  300 +
301 301 qplot(as.integer(time), data = lakers, geom = "histogram",
302 302 binwidth = 60)
303 303 lakers$demo <- ymd("2008-01-01") + lakers$time
304 304 qplot(demo, data = lakers, geom = "histogram", binwidth = 60)
  305 +
305 306 game1 <- lakers[lakers$date == ymd("20081028"),]
306 307 attempts <- game1[game1$etype == "shot",]
307 308 attempts$wait <- c(attempts$time[1], diff(attempts$time))
@@ -310,5 +311,5 @@ qplot(as.integer(wait), data = attempts, geom = "histogram",
310 311 game1_scores <- ddply(game1, "team", transform, score =
311 312 cumsum(points))
312 313 game1_scores <- game1_scores[game1_scores$team != "OFF",]
313   -qplot(ymd("2008-01-01") + time, score, data = game1_scores, geom =
  314 +qplot(demo, score, data = game1_scores, geom =
314 315 "line", colour = team)
119 paper/lubridate.tex
@@ -74,7 +74,7 @@ \section{Motivation}
74 74 \code{as.POSIXlt(date)$month + 1} &\\
75 75 & \\
76 76 \code{date <- as.POSIXct(format(date, } & \code{month(date) <- 2} \\
77   - \indent \code{ "\%Y-2-\%d"))} & \\
  77 + \indent \code{ "\%Y-2-\%d"), tz = "UTC")} & \\
78 78
79 79 \end{tabular}
80 80 \end{center}
@@ -118,23 +118,23 @@ \section{Motivation}
118 118 \section{Parsing date-times}
119 119 \label{sec:parsing}
120 120
121   -We can read dates into R using the \code{ymd()} series of functions provided by \pkg{lubridate}. The letters y, m, and d correspond to the year, month, and day elements of a date-time. To read in a date, choose the function name that matches the order of elements in your date-time object. For example,\\
  121 +We can read dates into R using the \code{ymd()} series of functions provided by \pkg{lubridate}. The letters y, m, and d correspond to the year, month, and day elements of a date-time. To read in a date, choose the function name that matches the order of elements in your date-time object. For example, in the following date the month element comes first, followed by the day and then the year. So we'd use the \code{mdy()} function:\\
122 122
123 123 \code{R> mdy("12-01-2010")}\\
124 124 \code{[1] "2010-12-01 UTC"}\\
125 125
126   -or
  126 +The same character string can be parsed as January 12, 2010 by reversing the month and day elements with \code{dmy()}.\\
127 127
128 128 \code{R> dmy("12-01-2010")}\\
129 129 \code{[1] "2010-01-12 UTC"}\\
130 130
131   -or
  131 +The \code{ymd()} series of functions can also parse vectors of dates.\\
132 132
133 133 \code{R> dmy(c("31.12.2010", "01.01.2011"))}\\
134 134 \code{[1] "2010-12-31 UTC" "2011-01-01 UTC"}\\
135 135
136 136
137   -These functions create a POSIXct date-time object that matches the date described by the character string. The functions automatically recognize the following separators: ``-", ``/", ``.", and ``" (i.e., no separator). When a \code{ymd()} function is applied to a vector of dates, \pkg{lubridate} will assume that all of the dates have the same order and the same separators. It will also print a message that tells the user which format was used to parse the dates. \code{ymd()} type functions also exist for times recorded with hours, minutes, and seconds. These functions make it simple to parse any date-time object that can be converted to a character string. See Table~\ref{tbl:parsers} for a complete list of \code{ymd()} type parsing functions.
  137 +These functions create a POSIXct date-time object that matches the date described by the character string. The functions automatically recognize the separators commonly used to record dates. These include: ``-", ``/", ``.", and ``" (i.e., no separator). When a \code{ymd()} function is applied to a vector of dates, \pkg{lubridate} will assume that all of the dates have the same order and the same separators. \code{ymd()} type functions also exist for times recorded with hours, minutes, and seconds. These functions make it simple to parse any date-time object that can be converted to a character string. See Table~\ref{tbl:parsers} for a complete list of \code{ymd()} type parsing functions.
138 138
139 139 \begin{table}
140 140 \begin{center}
@@ -160,12 +160,12 @@ \section{Parsing date-times}
160 160 \section{Manipulating date-times}
161 161 \label{sec:accessors}
162 162
163   -Every date-time is a combination of different elements, each with its own value. For example, most date-times include a year value, a month value, a day value, etc. Together, these elements specify the exact moment that the date-time refers to. We can easily extract each element of a date-time with the accessor function that has its name, as shown in Table~\ref{tbl:accessors}. For example, if we save the current system time\\
  163 +Every date-time is a combination of different elements, each with its own value. For example, most date-times include a year value, a month value, a day value and so on. Together these elements specify the exact moment that the date-time refers to. We can easily extract each element of a date-time with the accessor function that has its name, as shown in Table~\ref{tbl:accessors}. For example, if we save the current system time\\
164 164
165 165 \code{R> date <- now()}\\
166 166 \code{[1] "2010-02-25 09:51:48 CST"}\\
167 167
168   -we can extract each of its elements.\\
  168 +we can extract each of its elements. Note that this was the system time when this example was written. \code{now()} will return a different date-time each time it is used.\\
169 169
170 170 \code{R> year(date)}\\
171 171 \code{[1] 2010}\\
@@ -218,7 +218,7 @@ \section{Manipulating date-times}
218 218
219 219 changes our date to the fifth day of the month. We can also set the elements to more complicated values, e.g.\\
220 220
221   -\code{R> dates <- ymd_hms("2010-01-01 01:00:00", "2010-01-01- 01:30:00")}\\
  221 +\code{R> dates <- ymd_hms("2010-01-01 01:00:00", "2010-01-01 01:30:00")}\\
222 222 \code{R> minute(dates) <- mean(minute(dates))}\\
223 223 \code{[1] "2010-01-01 01:15:00 UTC" "2010-01-01 01:15:00 UTC"}\\
224 224
@@ -288,8 +288,8 @@ \subsection{Instants}
288 288
289 289 We can easily round instants to the nearest minute, hour, month, etc. using \code{floor_date()}, \code{ceiling_date()}, and \code{round_date()}. For example,
290 290
291   -\code{R> round_date(date, "day")}\\
292   -\code{[1] "2012-01-02 00:00:00 CST"}\\
  291 +\code{R> round_date(start_2012, "day")}\\
  292 +\code{[1] "2012-01-02 UTC"}\\
293 293
294 294 We can also capture the current time as an instant with \code{now()}, and the current day with \code{today()}.
295 295
@@ -305,12 +305,12 @@ \subsection{Intervals}
305 305 \code{R> start_2011 <- ymd_hms("2011-01-01 12:00:00")}\\
306 306 \code{R> start_2010 <- ymd_hms("2010-01-01 12:00:00")}\\
307 307 \code{R> span <- start_2011 - start_2010}\\
308   -\code{[1] 365 days beginning at 2010-01-01}\\
  308 +\code{[1] 365 days beginning at 2010-01-01 12:00:00}\\
309 309
310 310 Unfortunately, since intervals are anchored to their start and end dates, they are not very useful for date-time math. It only makes sense to add an interval to its start date or to subtract it from its end date.\\
311 311
312 312 \code{R> start_2010 + span}\\
313   -\code{[1] "2011-01-01 12:00 UTC"}\\
  313 +\code{[1] "2011-01-01 12:00:00 UTC"}\\
314 314
315 315
316 316 \subsection{Durations}
@@ -328,16 +328,16 @@ \subsection{Durations}
328 328 \code{eyears()}, \code{eweeks()}, \code{edays()}, \code{eminutes()}, and \code{eseconds()}. The e in the title stands for estimated. Each object creates a duration in seconds using the estimated relationships given above. The argument of each function is the number of estimated units we wish to include in the duration. For example,\\
329 329
330 330 \code{R> eminutes(1)}\\
331   -\code{Duration of 1 mins}\\
  331 +\code{Time difference of 1 mins}\\
332 332
333 333 \code{R> eseconds(60)}\\
334   -\code{Duration of 1 mins \# 60 seconds = 1 estimated minute}\\
  334 +\code{Time difference of 1 mins \# 60 seconds = 1 estimated minute}\\
335 335
336 336 \code{R> eminutes(2)}\\
337   -\code{Duration of 2 mins}\\
  337 +\code{Time difference of 2 mins}\\
338 338
339 339 \code{R> c(1:3) * ehours(1) }\\
340   -\code{Durations in hours}\\
  340 +\code{Time difference in hours}\\
341 341 \code{[1] 1 2 3}\\
342 342
343 343 Durations can be added and subtracted to any instant object. For example,\\
@@ -351,12 +351,12 @@ \subsection{Durations}
351 351 Durations can also be added to or subtracted from intervals and other durations. For example,\\
352 352
353 353 \code{R> eweeks(1) + edays(6) + ehours(2) + eminutes(1.5) + eseconds(3)}\\
354   -\code{Duration of 1.869201 weeks}\\
  354 +\code{Time difference of 1.869201 weeks}\\
355 355
356 356 We can also create durations from intervals using \code{as.duration()}.
357 357
358 358 \code{R> as.duration(span)}\\
359   -\code{Duration of 1 year}\\
  359 +\code{Time difference of 52.14286 weeks \# 1 year}\\
360 360
361 361
362 362 \subsection{Periods}
@@ -405,23 +405,23 @@ \subsection{Periods}
405 405
406 406 \end{tabular}
407 407 \end{center}
408   - \caption{Object that results from adding two date-time objects.}
  408 + \caption{Adding two date-time objects will create the above type of object.}
409 409 \label{tbl:date-math}
410 410 \end{table}
411 411
412 412 \section{Time zones}
413 413 \label{sec:tz}
414 414
415   -Time zones give multiple names to the same instant. For example, ``2010-03-26 11:53:24 CDT" and ``2010-03-26 12:53:24 EDT" both describe the same instant. The first shows how the instant is labeled in the United States' central time zone (CDT). The second shows how the same instant is labelled in the United States' eastern time zone (EDT). Time zones complicate date-time data, but are useful for mapping clock time to local daylight conditions. When working with instants, it is standard to give the clock time as it appears in the Coordinated Universal time zone (UTC). This saves calculations, but can be annoying if your computer insists on translating times to your current time zone. It may also be inconvenient to discuss clock times that occur in a place unrelated to the data.
  415 +Time zones give multiple names to the same instant. For example, ``2010-03-26 11:53:24 CDT" and ``2010-03-26 12:53:24 EDT" both describe the same instant. The first shows how the instant is labeled in the United States' central time zone (CDT). The second shows how the same instant is labeled in the United States' eastern time zone (EDT). Time zones complicate date-time data but are useful for mapping clock time to local daylight conditions. When working with instants, it is standard to give the clock time as it appears in Coordinated Universal Time (UTC). This saves calculations but can be annoying if your computer insists on translating times to your current time zone. It may also be inconvenient to discuss clock times that occur in a place unrelated to the data.
416 416
417   -\pkg{lubridate} eases the frustration caused by time zones in two ways. We can change the the time zone in which an instant is displayed by using the function \code{with_tz()}. This changes how the clock time is displayed, but not the instant that is referred to. For example,\\
  417 +\pkg{lubridate} eases the frustration caused by time zones in two ways. We can change the time zone in which an instant is displayed by using the function \code{with_tz()}. This changes how the clock time is displayed, but not the instant that is referred to. For example,\\
418 418
419 419 \code{R> date}\\
420 420 \code{[1] "2010-01-01 09:51:48 CST"}\\
421 421 \code{R> with_tz(date, "UTC")}\\
422 422 \code{[1] "2010-01-01 15:51:48 UTC"}\\
423 423
424   -Occasionally, it is useful to keep the same clock time and change the time zone it is assigned to. This switch is accomplished with the \code{force_tz()} function. \code{force_tz()} does the opposite of \code{with_tz()}: it changes the instant that is displayed, but the clock time remains the same. For example, the code below moves us to a new instant that occurs 6 hours earlier.\\
  424 +\code{force_tz()} does the opposite of \code{with_tz()}: it changes the instant that is displayed, but the clock time remains the same. For example, the code below moves us to a new instant that occurs 6 hours earlier.\\
425 425
426 426 \code{R> date}\\
427 427 \code{[1] "2010-01-01 09:51:48 CST"}\\
@@ -429,16 +429,16 @@ \section{Time zones}
429 429 \code{[1] "2010-01-01 09:51:48 UTC"}\\
430 430
431 431
432   -\section{Daylight Savings Time}
  432 +\section{Daylight savings time}
433 433 \label{sec:DST}
434 434
435   -In many parts of the world, the official clock time springs forward by one hour in the spring and falls back one hour in the fall. For example, in Houston, Texas a change in daylight savings time occurred at 2:00AM on March 14, 2010. The last instant before this change was 2010-03-14 01:59:59 CST.\\
  435 +In many parts of the world, the official clock time springs forward by one hour in the spring and falls back one hour in the fall. For example, in Chicago, Illinois, a change in daylight savings time occurred at 2:00AM on March 14, 2010. The last instant to occur before this change was 2010-03-14 01:59:59 CST.\\
436 436
437   -\code{R> dst_time <- ymd.hms("2010-03-14 01:59:59")}\\
438   -\code{R> dst_time <- force_tz(date, "CST")}\\
  437 +\code{R> dst_time <- ymd_hms("2010-03-14 01:59:59")}\\
  438 +\code{R> dst_time <- force_tz(dst_time, "America/Chicago")}\\
439 439 \code{[1] "2010-03-14 01:59:59 CST"}\\
440 440
441   -One second later, Houston clock times read\\
  441 +One second later, Chicago clock times read\\
442 442
443 443 \code{R> dst_time + eseconds(1)}\\
444 444 \code{[1] "2010-03-14 03:00:00 CDT"}\\
@@ -448,7 +448,7 @@ \section{Daylight Savings Time}
448 448 \code{R> dst_time + hours(2)}\\
449 449 \code{[1] "2010-03-14 03:59:59 CDT"}\\
450 450
451   -displays the clock time that usually occurs two hours after 1:59:59 AM. When using periods, we do not have to track dst changes because they will not affect our calculations. Adding a duration would give us the actual clock time that appeared two hours later on March 14, 2010.
  451 +displays the clock time that usually occurs two hours after 1:59:59 AM. When using periods, we do not have to track DST changes because they will not affect our calculations. Adding a duration would give us the actual clock time that appeared exactly two hours later on March 14, 2010.
452 452
453 453 \code{R> dst_time + ehours(2)}\\
454 454 \code{[1] "2010-03-14 04:59:59 CDT"}\\
@@ -457,7 +457,7 @@ \section{Daylight Savings Time}
457 457
458 458 We can also avoid the complications created by daylight savings time by keeping our date-times in a time zone such as ``UTC", which does not adopt daylight savings hours.
459 459
460   -\section{Case Study 1}
  460 +\section{Case study 1}
461 461
462 462 The next two sections will work through some techniques using \pkg{lubridate}. First, we will use \pkg{lubridate} to calculate the dates of holidays. Then we'll use \pkg{lubridate} to explore an example data set (\code{lakers}).
463 463
@@ -500,89 +500,96 @@ \subsection{Memorial Day}
500 500 \code{R> date <- date + months(5) - days(1)}\\
501 501 \code{[1] "2010-05-31 UTC"}\\
502 502
503   -We can then check which day of the week May 31st falls on. It happens to be a Monday, so we are done. If May 31st had been another day of the week, we could've subtract an appropriate number of days to get the last monday of May.\\
  503 +We can then check which day of the week May 31st falls on. It happens to be a Monday, so we are done. If May 31st had been another day of the week, we could've subtracted an appropriate number of days to get the last Monday of May.\\
504 504
505 505 \code{R> wday(date, label = T, abbr = F)}\\
506 506 \code{[1] Monday}\\
507 507
508 508
509   -\section{Case Study 2}
510   -Now let's explore the \code{lakers} data set. The \code{lakers} data set contains play by play statistics of every major league basketball game played by the Los Angeles Lakers during the 2008-2009 season. This data is from \url{http://www.basketballgeek.com/downloads/} \citep{bball}. \\
  509 +\section{Case study 2}
  510 +The \code{lakers} data set contains play-by-play statistics of every major league basketball game played by the Los Angeles Lakers during the 2008-2009 season. This data is from \url{http://www.basketballgeek.com/downloads/} \citep{bball} and comes with the \pkg{lubridate} package. We will explore the distribution of Lakers' games throughout the year as well as the distribution of plays within Lakers' games. We choose to use the \pkg{ggplot2}~\citep{ggplot2} package to create our graphs.\\
511 511
512   -\code{R> head(lakers)}\\
  512 +The lakers data set comes with a \code{date} variable which records the date of each game. Using the \code{str()} command, we see that \proglang{R} recognizes the dates as integers.\\
513 513
514   -First we'll examine when during the year the Lakers have games. We choose to use the \pkg{ggplot2} ~\citep{ggplot2} package to create our graphs.\\
  514 +\code{R> str(lakers$date)}\\
  515 +\code{int [1:34624] 20081028 20081028 20081028 ...}\\
515 516
516   -\code{str(lakers$date[1])}\\
517   -\code{int 20081028}\\
518   -
519   -\proglang{R} recognizes the dates in the \code{lakers} data set as integers. So our first task is to parse the dates, or read them into \proglang{R} as date-time objects. We recognize that the dates include the year element first, followed by the month element, and then the day element. Hence, we should use the \code{ymd()} parsing function.\\
  517 +Before we can work with them as dates, we must parse them into \proglang{R} as date-time objects. The dates appear to be arranged with their year element first, followed by the month element, and then the day element. Hence, we should use the \code{ymd()} parsing function.\\
520 518
521 519 \code{R> lakers$date <- ymd(lakers$date)}\\
522   -\code{R> qplot(date, 0, data = lakers, geom = "point", colour = lakers$home == "LAL") + scale_colour_discrete(name = "Venue", labels = c("home game", "away game"))}\\
  520 +\code{R> str(lakers$date)}\\
  521 +\code{POSIXct[1:34624], format: "2008-10-28" "2008-10-28" ...}\\
  522 +
  523 +\proglang{R} now recognizes the dates as POSIXct date-time objects. It will now treat them as date-times in any functions that have POSIXct specific methods. For example, if we plot the occurrences of home and away games throughout the season, our x axis will display date-time information for the tick marks (Figure \ref{fig:games-date}).\\
  524 +
  525 +\code{R> qplot(date, 0, data = lakers, colour = game_type)}\\
523 526
524 527 \begin{figure}[htpb]
525 528 \centering
526 529 \includegraphics[width=\textwidth]{dates-points.png}
527   - \caption{Dates of Lakers games for 2008-2009 season.}
  530 + \caption{Occurrences of home and away games for the 2008-2009 season.}
528 531 \label{fig:games-date}
529 532 \end{figure}
530 533
531   -Figure~\ref{fig:games-date} shows that games are played continuously throughout the season with a few short breaks. The frequency of games seems lower at the start of the season and games appear to be grouped into clusters of home games and away games. Notice the tick marks on the x axis; the labels and breaks are automatically generated by \code{pretty.date()}, which is in the \pkg{lubridate} package.
  534 +Figure~\ref{fig:games-date} shows that games are played continuously throughout the season with a few short breaks. The frequency of games seems lower at the start of the season and games appear to be grouped into clusters of home games and away games. The tick marks and breaks on the x axis are automatically generated by the \pkg{lubridate} method \code{pretty.date()}.\\
532 535
533   -Next we'll examine how Lakers games are distributed throughout the week.\\
  536 +Next we'll examine how Lakers games are distributed throughout the week. We use the \code{wday()} command to extract the day of the week of each date.\\
534 537
535 538 \code{R> qplot(wday(date, label = T), data = lakers, geom = "histogram")}\\
536 539
537 540 \begin{figure}[htpb]
538 541 \centering
539 542 \includegraphics[width=\textwidth]{weekdays-histogram.png}
540   - \caption{Number of games played per weekday.}
  543 + \caption{The number of games played per weekday varies.}
541 544 \label{fig:games-days}
542 545 \end{figure}
543 546
544 547
545   -The frequency of basketball games appears to vary throughout the week, figure~\ref{fig:games-days}. Surprisingly, the highest number of games are played on Tuesdays.
  548 +The frequency of basketball games varies throughout the week (Figure~\ref{fig:games-days}). Surprisingly, the highest number of games are played on Tuesdays.
546 549
547 550 Now let's look at the games themselves. In particular, let's look at the distribution of plays throughout the game. The \code{lakers} data set lists the time that appeared on the game clock for each play. These times begin at 12:00 at the beginning of each period and then count down to 00:00, which marks the end of the period. The first two digits refer to the number of minutes left in the period. The second two digits refer to the number of seconds.
548 551
549   -The times have not been parsed as date-time data to \proglang{R}. In fact, it would be difficult to record the time data as a date-time object because the data is incomplete: a minutes and seconds element are not sufficient to identify a unique date-time. However, we can store the minutes and seconds information as a \emph{period} object, as defined in Section~\ref{sec:periods}, using the \code{ms()} parse function.\\
  552 +The times have not been parsed as date-time data to \proglang{R}. It would be difficult to record the time data as a date-time object because the data is incomplete: a minutes and seconds element are not sufficient to identify a unique date-time. However, we can store the minutes and seconds information as a \emph{period} object, as defined in Section~\ref{sec:periods}, using the \code{ms()} parse function.\\
550 553
551 554 \code{R> lakers$time <- ms(lakers$time)}\\
552 555
553   -Recall that periods have relative lengths. Since we'd like to compare times against each other, we should first convert our periods to \emph{durations}, which have exact lengths.\\
  556 +Since periods have relative lengths, it is dangerous to compare them to each other. So we should next convert our periods to \emph{durations}, which have exact lengths.\\
554 557
555 558 \code{R> lakers$time <- as.duration(lakers$time)}\\
556 559
557   -This allows us to directly compare different durations. It would also allow us to determine exactly when each play occurred by adding the duration to the \emph{instant} the game began. (Unfortunately, the starting time for each game is not available in the data set). We can now subtract our time information from a duration of 12, 24, 36, or 48 minutes (depending on the period of play) to create a new duration that records exactly how far into the game each play occurred.\\
  560 +This allows us to directly compare different durations. It would also allow us to determine exactly when each play occurred by adding the duration to the \emph{instant} the game began. (Unfortunately, the starting time for each game is not available in the data set). However, we can still calculate when in each game each play occurred. Each period of play is 12 minutes long. At the start of each period, the game clock begins counting down from 12:00. So to calculate how much play time elapses before each play, we subtract the time that appears on the game clock from a duration of 12, 24, 36, or 48 minutes (depending on the period of play). This creates a new duration that records exactly how far into the game each play occurred.\\
558 561
559   -\code{lakers$time <- eminutes(12) * lakers$period - lakers$time}\\
  562 +\code{R> lakers$time <- eminutes(12) * lakers$period - lakers$time}\\
560 563
561   -Unfortunately, \pkg{ggplot2} does not support plotting durations, or difftimes, the class used by durations. To plot our data, we can extract the integer value of our durations, which will equal the number of seconds that occurred in each duration.\\
  564 +One complication is that some games went into overtime. These games have a fifth period that is only five minutes long. To keep things simple, we will ignore plays that occur in overtime.\\
  565 +
  566 +\code{R> lakers <- subset(lakers, period != 5)}\\
  567 +
  568 +\pkg{ggplot2} does not support plotting difftimes, which is the class used by durations. To plot our data, we can extract the integer value of our durations, which will equal the number of seconds that occurred in each duration (Figure \ref{fig:plays}).\\
562 569
563 570 \code{R> qplot(as.integer(time), data = lakers, geom = "histogram", binwidth = 60)}\\
564 571
565 572 \begin{figure}[htpb]
566 573 \centering
567 574 \includegraphics[width=\textwidth]{play-time-histogram.png}
568   - \caption{Distribution of plays within game.}
  575 + \caption{By default, the time of each play is recorded in seconds on the x axis.}
569 576 \label{fig:plays}
570 577 \end{figure}
571 578
572   -Alternatively, we can create date-times, which \pkg{ggplot2} does support, by adding each of our durations to the same starting instant. This creates a plot whose tick marks are determined by \code{pretty.date()}. This helper function recognizes the most intuitive binning and labeling of date-time data, which further enhances our graph.\\
  579 +Or we can create date-times, which \pkg{ggplot2} does support, by adding each of our durations to the same starting instant. This creates a plot whose tick marks are determined by \code{pretty.date()}. This helper function recognizes the most intuitive binning and labeling of date-time data, which further enhances our graph (Figure \ref{fig:plays2}).\\
573 580
574   -\code{R> lakers$demo <- ymd("2008-01-01") + lakers$time}\\
575   -\code{R> qplot(demo, data = lakers, geom = "histogram", binwidth = 60)}\\
  581 +\code{R> lakers$minutes <- ymd("2008-01-01") + lakers$time}\\
  582 +\code{R> qplot(minutes, data = lakers, geom = "histogram", binwidth = 60)}\\
576 583
577 584 \begin{figure}[htpb]
578 585 \centering
579 586 \includegraphics[width=\textwidth]{play-time-histogram2.png}
580   - \caption{Distribution of plays within game.}
581   - \label{fig:plays}
  587 + \caption{It makes more sense to show the time of each play in minutes.}
  588 + \label{fig:plays2}
582 589 \end{figure}
583 590
584 591
585   -We see that the number of plays peaks within each of the four periods and then plummets at the beginning of the next period, figure~\ref{fig:plays}. We also see a small number of plays that occur in overtime. Observations that occur after 48 minutes suggest games that were decided in overtime.
  592 +The number of plays peaks within each of the four periods and then plummets at the beginning of the next period (Figure~\ref{fig:plays2}).
586 593
587 594 Now let's look more closely at just one basketball game: the first game of the season. This game was played on October 28, 2008. For this game, we can easily model the amounts of time that occurred between each shot attempt.\\
588 595
BIN  paper/play-time-histogram.png
BIN  paper/play-time-histogram2.png
BIN  paper/score-comparison.png

0 comments on commit 031da02

Please sign in to comment.
Something went wrong with that request. Please try again.