Skip to content

Commit

Permalink
Add some basic exploration of city level data
Browse files Browse the repository at this point in the history
  • Loading branch information
hadley committed Nov 21, 2008
1 parent 4d407ef commit 40a602f
Show file tree
Hide file tree
Showing 6 changed files with 105 additions and 24 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -1 +1,2 @@
temp.xml
temp.xml
*.rdata
40 changes: 40 additions & 0 deletions date.r
@@ -0,0 +1,40 @@
# Date functions for parsing like: mdy, ymd, ydm, dmy

# The first day of each month of 2000, used to generate month names.
month_seq <- seq(
  from = as.Date("2000-01-01"),
  to = as.Date("2000-12-31"),
  by = "month"
)

# Full month names as a factor whose levels are in calendar order.
months <- local({
  full_names <- months(month_seq)
  factor(full_names, levels = full_names)
})

# Abbreviated month names, likewise in calendar order.
months_abbr <- local({
  short_names <- months(month_seq, TRUE)
  factor(short_names, levels = short_names)
})

# Seven consecutive days starting 2000-01-02, used to generate weekday names.
wday_seq <- seq(
  from = as.Date("2000-01-02"),
  to = as.Date("2000-01-08"),
  by = "day"
)

# Full weekday names as a factor whose levels follow the order of wday_seq.
wdays <- local({
  full_names <- weekdays(wday_seq)
  factor(full_names, levels = full_names)
})

# Abbreviated weekday names, in the same order.
wdays_abbr <- local({
  short_names <- weekdays(wday_seq, TRUE)
  factor(short_names, levels = short_names)
})

# Seconds component of a date-time.
second <- function(x) {
  parts <- as.POSIXlt(x)
  parts$sec
}
# Minutes component of a date-time.
minute <- function(x) {
  parts <- as.POSIXlt(x)
  parts$min
}
# Hour-of-day component of a date-time.
hour <- function(x) {
  parts <- as.POSIXlt(x)
  parts$hour
}
# Day of the year, counted from 1 (the POSIXlt field itself starts at 0).
yday <- function(x) {
  parts <- as.POSIXlt(x)
  parts$yday + 1
}
# Day of the week, counted from 1 (the POSIXlt field itself starts at 0,
# which is Sunday).
wday <- function(x) {
  parts <- as.POSIXlt(x)
  parts$wday + 1
}
# Day of the month.
mday <- function(x) {
  parts <- as.POSIXlt(x)
  parts$mday
}
# Week of the year: days 1-7 of the year are week 1, days 8-14 are week 2,
# and so on. The integer division must act on a zero-based day of year,
# which is what POSIXlt's yday field already is; dividing the one-based day
# of year instead pushes every seventh day (Jan 7, Jan 14, ...) into the
# following week and leaves week 1 with only six days.
week <- function(x) as.POSIXlt(x)$yday %/% 7 + 1
# Month of the year, counted from 1 (the POSIXlt field itself starts at 0).
month <- function(x) {
  parts <- as.POSIXlt(x)
  parts$mon + 1
}
# Calendar year (the POSIXlt field itself counts years since 1900).
year <- function(x) {
  parts <- as.POSIXlt(x)
  parts$year + 1900
}
# Time zone name of a date-time, suitable for passing as the `tz` argument
# of ISOdatetime().
#
# After conversion to POSIXlt the "tzone" attribute may hold up to three
# strings: the zone name followed by the standard and daylight-saving
# abbreviations (e.g. "America/New_York", "EST", "EDT"). Only the first is a
# zone name; the last is an abbreviation that is not generally accepted as a
# time zone specification, so the first element is returned. A missing
# attribute is reported as "", which R treats as the current time zone.
tz <- function(x) {
  tzs <- attr(as.POSIXlt(x), "tzone")
  if (is.null(tzs)) {
    return("")
  }
  tzs[1]
}

"second<-" <- function(x, value) as.POSIXlt(x) - (second(x) - value)
"minute<-" <- function(x, value) as.POSIXlt(x) - (minute(x) - value) * 60
"hour<-" <- function(x, value) as.POSIXlt(x) - (hour(x) - value) * 3600
"yday<-" <- function(x, value) as.POSIXlt(x) - (yday(x) - value) * 3600 * 24
"wday<-" <- function(x, value) as.POSIXlt(x) - (wday(x) - value) * 3600 * 24
"mday<-" <- function(x, value) as.POSIXlt(x) - (mday(x) - value) * 3600 * 24
"week<-" <- function(x, value) as.POSIXlt(x) - (week(x) - value) * 3600 * 24 * 7
"month<-" <- function(x, value) {
ISOdatetime(year(x) + (value - 1) %/% 12, (value - 1) %% 12 + 1, mday(x), hour(x), minute(x), second(x), tz(x))
}
"year<-" <- function(x, value) {
ISOdatetime(value, month(x), mday(x), hour(x), minute(x), second(x), tz(x))
}
"tz<-" <- function(x, value) {
ISOdatetime(year(x), month(x), mday(x), hour(x), minute(x), second(x), value)
}
34 changes: 34 additions & 0 deletions explore-city.r
@@ -0,0 +1,34 @@
library(ggplot2)
source("date.r")
source("explore-data.r")


# Count sales per city and keep the cities with more than 5000 sales
cities <- setNames(as.data.frame(table(geo$city)), c("city", "freq"))
big_cities <- subset(cities, freq > 5000)

# Sales count per big city, with cities ordered by their count
qplot(x = freq, y = reorder(city, freq), data = big_cities)

# Restrict to houses in the big cities (cuts the data to ~ 200,000 records)
inbig <- geo[geo$city %in% big_cities$city, ]

# Collapse each sale date to the 15th of its month so that sales can be
# grouped by month. mday<- returns a date-time, hence the as.Date().
inbig$month <- local({
  mid_month <- inbig$date
  mday(mid_month) <- 15
  as.Date(mid_month)
})

# Keep only months after April 2003
inbig <- subset(inbig, month > as.Date("2003-04-15"))

# Summarise sales by month and city: number of sales plus the mean, standard
# deviation and median of price. Missing prices are ignored by all three
# statistics; without na.rm the median of any group containing a missing
# price would itself be NA while mean and sd were not.
bigsum <- ddply(inbig, .(city, month), function(df) {
  data.frame(
    n = nrow(df),
    avg = mean(df$price, na.rm = TRUE),
    sd = sd(df$price, na.rm = TRUE),
    med = median(df$price, na.rm = TRUE)
  )
})


# One line per city over time, each on a log-scaled y axis:
# number of sales
qplot(
  x = month, y = n,
  data = bigsum, geom = "line", group = city, log = "y"
)
# average sale price
qplot(
  x = month, y = avg,
  data = bigsum, geom = "line", group = city, log = "y"
)
# number of sales times average price
qplot(
  x = month, y = n * avg,
  data = bigsum, geom = "line", group = city, log = "y"
)
20 changes: 20 additions & 0 deletions explore-data.r
@@ -0,0 +1,20 @@
# Load the merged sales/address table from the geo.rdata cache, building and
# saving it from the CSV files the first time.
if (!file.exists("geo.rdata")) {
  ad <- read.csv("addresses.csv", stringsAsFactors = FALSE)
  sales <- read.csv("house-sales.csv", stringsAsFactors = FALSE)

  # Match each sale to its address record
  geo <- merge(sales, ad, by = c("street", "city", "zip"))

  # Both date columns are parsed as year-month-day
  geo[c("date", "datesold")] <- lapply(
    geo[c("date", "datesold")],
    function(raw) as.Date(strptime(raw, "%Y-%m-%d"))
  )
  geo$price <- as.numeric(geo$price)

  save(geo, file = "geo.rdata")
} else {
  load("geo.rdata")
}

# good <- subset(ad, quality == "QUALITY_ADDRESS_RANGE_INTERPOLATION")
# good$quality <- NULL
# good$error <- NULL
# good$success <- NULL

# qplot(long, lat, data=good, shape=I("."))
9 changes: 9 additions & 0 deletions explore-geo.r
@@ -0,0 +1,9 @@
library(ggplot2)
source("date.r")
source("explore-data.r")



# Look at geocoded addresses in SF
# Can we see how the city has grown?
# NOTE(review): `year` is also a function defined in date.r; this presumably
# refers to a `year` column of geo - confirm that column exists.
qplot(
  x = long, y = lat,
  data = subset(geo, city == "San Francisco"),
  colour = year,
  xlim = c(-122.51, -122.38),
  ylim = c(37.7, 37.8),
  size = I(0.5)
)
23 changes: 0 additions & 23 deletions explore.r

This file was deleted.

0 comments on commit 40a602f

Please sign in to comment.