# A Script for Scraping Weather Data from Environment Canada

2018-01-08 version 1.1

In [1]:
#-----------------
  # Dependencies:
  #-----------------
  #install.packages(c("lubridate","XML"),repos="http://cran.rstudio.com")
  
  library("rvest")
  library(dplyr)
  library(lubridate)
  library(XML)
  

Loading required package: xml2

Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union


Attaching package: 'lubridate'

The following object is masked from 'package:base':

    date


Attaching package: 'XML'

The following object is masked from 'package:rvest':

    xml



In [48]:
# Example inputs used by the exploratory cells below:
# the first and last day to download plus the station's ID and name.
day1 <- "2010-11-25"
day2 <- "2010-11-26"
stationID <- 31408
stationName <- "Barnwell AGDM"

In [49]:
  ############################
  # HOURLY WEATHER DOWNLOADS #
  ############################

  # Example of the hourly-data page URL that the cells below reproduce:
  # http://climate.weather.gc.ca/climate_data/hourly_data_e.html?hlyRange=2004-06-16%7C2017-04-20&dlyRange=2004-06-01%7C2017-04-20&mlyRange=2004-06-01%7C2007-10-01&StationID=42729&Prov=AB&urlExtension=_e.html&searchType=stnName&optLimit=yearRange&StartYear=1840&EndYear=2017&selRowPerPage=25&Line=1&searchMethod=contains&Month=4&Day=20&txtStationName=Raymond&timeframe=1&Year=2017

  # Inputs that change per station / per run: day1, day2, stationID and
  # stationName (the dates end up inside hlyRange=...%7C<date> of the URL).

  # Parse the two boundary days and expand them into one timestamp per day.
  start <- strptime(day1, format = "%Y-%m-%d")
  end <- strptime(day2, format = "%Y-%m-%d")
  days <- seq(start, end, by = "days")

  


In [50]:
# Display the generated date vector to check the requested range.
days

[1] "2010-11-25 PST" "2010-11-26 PST"

In [51]:
# Build the request URL for a single day (the first entry of `days`).
    i <- 1
    year <- year(days[i])
    month <- month(days[i])
    day <- day(days[i])
    # First 10 characters of the timestamp, i.e. the "YYYY-MM-DD" part.
    dly <- strtrim(days[1], 10)

    # Target layout (taken from a real page URL); the pieces below are
    # concatenated in exactly this order:
    #   ...hourly_data_e.html?hlyRange=2004-06-16%7C<dly>
    #   &dlyRange=2004-06-01%7C<dly>
    #   &mlyRange=2004-06-01%7C2007-10-01&StationID=<stationID>
    #   &Prov=AB&urlExtension=_e.html&searchType=stnName
    #   &optLimit=yearRange&StartYear=1840&EndYear=<year>
    #   &selRowPerPage=25&Line=1&searchMethod=contains&Month=<month>
    #   &Day=<day>&txtStationName=<stationName>&timeframe=1&Year=<year>

    # page address plus the start of the hourly range
    preamb <- "http://climate.weather.gc.ca/climate_data/hourly_data_e.html?hlyRange=2004-06-16%7C"
    # start of the daily range
    dlyRange <- "&dlyRange=2004-06-01%7C"
    # monthly range, followed by the station ID field
    mlyRange <- "&mlyRange=2004-06-01%7C2007-10-01&StationID="
    # province and search type
    location_url <- "&Prov=AB&urlExtension=_e.html&searchType=stnName"
    # year range of the search, ending with the requested year
    year0_url <- "&optLimit=yearRange&StartYear=1840&EndYear="
    # paging options, followed by the requested month
    month_url <- "&selRowPerPage=25&Line=1&searchMethod=contains&Month="
    # requested day of month
    day_url <- "&Day="
    # station name (URL-encoded below so that spaces become %20)
    st_url <- "&txtStationName="
    # hourly timeframe and requested year
    year1_url <- "&timeframe=1&Year="

    # Assemble the full URL.
    url <- paste0(
      preamb, dly,
      dlyRange, dly,
      mlyRange, stationID,
      location_url,
      year0_url, year,
      month_url, month,
      day_url, day,
      st_url, URLencode(stationName),
      year1_url, year
    )

In [52]:
### Download the page and extract the hourly table(s)
# html_table() is applied to every node matched by the xpath, so `data`
# holds one table per matched node; the cleaning cell below picks the first.
    data <- html_table(
      html_nodes(
        read_html(url),
        xpath = '//*[@id="dynamicDataTable"]/table'
      )
    )

In [54]:
##  clean the data format for csv

    # `data` is the list of tables scraped in the previous cell.
    # The earlier `is.factor(data)` test could never be TRUE for a data
    # frame, so a day without data crashed at `data[[1]]` instead of being
    # filled in. Decide on the fallback *before* touching the table.
    if (length(data) == 0 || nrow(data[[1]]) < 2) {
      # No table (or no data rows), e.g. 2010/11/25: build a 24-hour
      # placeholder whose 11 columns are NA except for the hour.
      data <- data.frame(matrix(data = NA, nrow = 24, ncol = 11))
      data[, 1] <- 0:23
    } else {
      # Keep the first scraped table and drop its first row
      # (presumably a secondary header/units row -- confirm on a live page).
      data <- data[[1]] %>% slice(-1)
    }

    # The labels assigned below assume exactly 11 scraped columns;
    # fail with a clear message if the page layout differs.
    if (ncol(data) != 11) {
      stop("Expected 11 columns in the hourly table, found ", ncol(data),
           call. = FALSE)
    }

    # add last four columns
    data$Year <- year
    data$Month <- month
    data$Day <- day
    data$StationName <- stationName
    # final column names (11 scraped + 4 added)
    colnames(data) <- c("Time", "TempC", "DewTempC", "RelH", "WindDir",
                        "WindSpdkm.h", "Visibilitykm", "PresskPa", "Hmdx",
                        "WindChill", "Weather", "Year", "Month", "Day",
                        "StationName")
    

NULL

In [56]:
## Download every day in `days` and stack the hourly tables row-wise
  # Final column names: 11 scraped columns followed by the 4 added below.
  weather_cols <- c("Time", "TempC", "DewTempC", "RelH", "WindDir",
                    "WindSpdkm.h", "Visibilitykm", "PresskPa", "Hmdx",
                    "WindChill", "Weather", "Year", "Month", "Day",
                    "StationName")

  # One slot per day; filling a list and binding once avoids re-copying
  # the growing result on every iteration.
  daily_tables <- vector("list", length(days))

  for (i in seq_along(days)) {
    year <- year(days[i])
    month <- month(days[i])
    day <- day(days[i])
    # End of the hlyRange/dlyRange fields. It follows the requested day
    # (it used to be stuck on days[1]), matching the sample URL in which
    # the range end equals the requested date.
    dly <- format(days[i], "%Y-%m-%d")

    # URL layout, as copied from a real page:
    #   ...hourly_data_e.html?hlyRange=2004-06-16%7C<dly>
    #   &dlyRange=2004-06-01%7C<dly>
    #   &mlyRange=2004-06-01%7C2007-10-01&StationID=<stationID>
    #   &Prov=AB&urlExtension=_e.html&searchType=stnName
    #   &optLimit=yearRange&StartYear=1840&EndYear=<year>
    #   &selRowPerPage=25&Line=1&searchMethod=contains&Month=<month>
    #   &Day=<day>&txtStationName=<stationName>&timeframe=1&Year=<year>
    url <- paste0(
      "http://climate.weather.gc.ca/climate_data/hourly_data_e.html?hlyRange=2004-06-16%7C", dly,
      "&dlyRange=2004-06-01%7C", dly,
      "&mlyRange=2004-06-01%7C2007-10-01&StationID=", stationID,
      "&Prov=AB&urlExtension=_e.html&searchType=stnName",
      "&optLimit=yearRange&StartYear=1840&EndYear=", year,
      "&selRowPerPage=25&Line=1&searchMethod=contains&Month=", month,
      "&Day=", day,
      "&txtStationName=", URLencode(stationName), # spaces become %20
      "&timeframe=1&Year=", year
    )

    ### key part: read the table(s) from the website
    tables <- url %>%
      read_html() %>%
      html_nodes(xpath = '//*[@id="dynamicDataTable"]/table') %>%
      html_table()

    ### solve missing data problem
    # The earlier `is.factor(data)` test was never TRUE for a data frame,
    # so a day without a table crashed at `data[[1]]`.
    if (length(tables) == 0 || nrow(tables[[1]]) < 2) {
      # 24-hour placeholder: 11 columns, all NA except the hour
      data <- data.frame(matrix(data = NA, nrow = 24, ncol = 11))
      data[, 1] <- 0:23
    } else {
      # first table, minus its first row
      # (presumably a secondary header/units row -- confirm on a live page)
      data <- tables[[1]] %>% slice(-1)
    }

    # The column labels assume exactly 11 scraped columns.
    if (ncol(data) != 11) {
      stop("Expected 11 columns in the hourly table for ", dly,
           ", found ", ncol(data), call. = FALSE)
    }

    # add last four columns, then apply the final names
    data$Year <- year
    data$Month <- month
    data$Day <- day
    data$StationName <- stationName
    colnames(data) <- weather_cols

    daily_tables[[i]] <- data
  }

  # Collate data, stack by rows
  weather_data <- do.call(rbind, daily_tables)

In [58]:
# start off new function for data scraping
#' Download hourly weather observations from Environment Canada
#'
#' Requests the hourly-data page for every calendar day from `day1` to
#' `day2` (inclusive), stacks the daily tables row-wise and writes them to
#' "<stationName>_<year part of day1>_weather_data.csv" in the working
#' directory.
#'
#' The province (`Prov=AB`) and the start dates of the hlyRange / dlyRange /
#' mlyRange query fields are hard-coded in the URL.
#'
#' @param day1 First day to download, as a "YYYY-MM-DD" string.
#' @param day2 Last day to download, as a "YYYY-MM-DD" string.
#' @param stationID Value placed in the `StationID` query field.
#' @param stationName Station name; URL-encoded into `txtStationName` and
#'   also used for the `StationName` column and the output file name.
#' @return The combined data frame, invisibly (the CSV is written as a side
#'   effect).
getWindData <- function(day1 = "2009-10-01",
                        day2 = "2016-03-10",
                        stationID = 42729,
                        stationName = "Raymond") {
  # Package functions are namespace-qualified below, so the function neither
  # relies on nor changes the caller's search path (no library() calls).

  # ---- validate the date range -------------------------------------------
  first_day <- as.Date(day1, format = "%Y-%m-%d")
  last_day <- as.Date(day2, format = "%Y-%m-%d")
  if (is.na(first_day) || is.na(last_day)) {
    stop("day1 and day2 must be dates in 'YYYY-MM-DD' format", call. = FALSE)
  }
  if (first_day > last_day) {
    stop("day1 must not be later than day2", call. = FALSE)
  }
  days <- seq(first_day, last_day, by = "day")

  # Names of the 11 scraped columns and of the 4 columns appended per day.
  scraped_cols <- c("Time", "TempC", "DewTempC", "RelH", "WindDir",
                    "WindSpdkm.h", "Visibilitykm", "PresskPa", "Hmdx",
                    "WindChill", "Weather")
  added_cols <- c("Year", "Month", "Day", "StationName")

  # Build the hourly-data page URL for one day.
  build_url <- function(date) {
    # End of the hlyRange/dlyRange fields. It follows the requested day
    # (it used to be stuck on the first day of the run), matching the sample
    # page URL in which the range end equals the requested date.
    range_end <- format(date, "%Y-%m-%d")
    yr <- lubridate::year(date)
    paste0(
      "http://climate.weather.gc.ca/climate_data/hourly_data_e.html?hlyRange=2004-06-16%7C", range_end,
      "&dlyRange=2004-06-01%7C", range_end,
      "&mlyRange=2004-06-01%7C2007-10-01&StationID=", stationID,
      "&Prov=AB&urlExtension=_e.html&searchType=stnName",
      "&optLimit=yearRange&StartYear=1840&EndYear=", yr,
      "&selRowPerPage=25&Line=1&searchMethod=contains&Month=", lubridate::month(date),
      "&Day=", lubridate::day(date),
      "&txtStationName=", utils::URLencode(stationName), # spaces become %20
      "&timeframe=1&Year=", yr
    )
  }

  # Download and clean the hourly table for one day.
  fetch_day <- function(date) {
    page <- rvest::read_html(build_url(date))
    tables <- rvest::html_table(
      rvest::html_nodes(page, xpath = '//*[@id="dynamicDataTable"]/table')
    )

    # The earlier `is.factor(data)` test was never TRUE for a data frame, so
    # a day without a table crashed at `data[[1]]` instead of being filled in.
    if (length(tables) == 0L || nrow(tables[[1]]) < 2L) {
      # 24-hour placeholder: all NA except the hour in the first column
      obs <- data.frame(matrix(data = NA, nrow = 24, ncol = length(scraped_cols)))
      obs[, 1] <- 0:23
    } else {
      # first table, minus its first row
      # (presumably a secondary header/units row -- confirm on a live page)
      obs <- dplyr::slice(tables[[1]], -1)
    }

    # The labels below assume the page layout has not changed.
    if (ncol(obs) != length(scraped_cols)) {
      stop("Expected ", length(scraped_cols),
           " columns in the hourly table for ", format(date, "%Y-%m-%d"),
           ", found ", ncol(obs), call. = FALSE)
    }

    obs$Year <- lubridate::year(date)
    obs$Month <- lubridate::month(date)
    obs$Day <- lubridate::day(date)
    obs$StationName <- stationName
    colnames(obs) <- c(scraped_cols, added_cols)
    obs
  }

  # ---- download all days, then bind once ---------------------------------
  # Indexing (rather than iterating over `days` directly) keeps the Date class.
  daily_tables <- lapply(seq_along(days), function(i) fetch_day(days[i]))
  weather_data <- do.call(rbind, daily_tables)

  # ---- write the CSV ------------------------------------------------------
  # File name: <stationName>_<year part of day1>_weather_data.csv
  out_file <- paste0(stationName, "_",
                     strsplit(day1, "-", fixed = TRUE)[[1]][1],
                     "_weather_data.csv")
  write.csv(weather_data, out_file, row.names = FALSE)

  invisible(weather_data)
}

In [59]:
# Example run: download the first ten days of January 2015 for Barnwell AGDM.
getWindData(
  day1 = "2015-01-01",
  day2 = "2015-1-10",
  stationID = 31408,
  stationName = "Barnwell AGDM"
)

In [60]:
# 2018-01-08 working version 1.1