In [1]:
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from dateutil import rrule
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
import requests
import re
from fuzzywuzzy import fuzz



In [2]:
def getHourlyData(stationID, year, month):
    base_url = "http://climate.weather.gc.ca/climate_data/bulk_data_e.html?"
    query_url = "format=csv&stationID={}&Year={}&Month={}&timeframe=1".format(stationID, year, month)
    api_endpoint = base_url + query_url
    return pd.read_csv(api_endpoint, skiprows=0)

In [9]:
stationID = 51442
start_date = datetime.strptime('Jan2015', '%b%Y')
end_date = datetime.strptime('Jan2016', '%b%Y')

frames = []
for dt in rrule.rrule(rrule.MONTHLY, dtstart=start_date, until=end_date):
    df = getHourlyData(stationID, dt.year, dt.month)
    frames.append(df)

weather_data = pd.concat(frames)
# weather_data['Date/Time'] = pd.to_datetime(weather_data['Date/Time'])
# weather_data['Temp (°C)'] = pd.to_numeric(weather_data['Temp (°C)'])

In [11]:
# Specify Parameters
province = "BC"      # Which province to parse?
start_year = "2006"  # I want the results to go back to at least 2006 or earlier
max_pages = 5        # Number of maximum pages to parse, EC's limit is 100 rows per page, there are about 500 stations in BC with data going back to 2006

# Store each page in a list and parse them later
soup_frames = []

for i in range(max_pages):
    startRow = 1 + i*100
    print('Downloading Page: ', i)
    
    base_url = "http://climate.weather.gc.ca/historical_data/search_historic_data_stations_e.html?"
    queryProvince = "searchType=stnProv&timeframe=1&lstProvince={}&optLimit=yearRange&".format(province)
    queryYear = "StartYear={}&EndYear=2017&Year=2017&Month=5&Day=29&selRowPerPage=100&txtCentralLatMin=0&txtCentralLatSec=0&txtCentralLongMin=0&txtCentralLongSec=0&".format(start_year)
    queryStartRow = "startRow={}".format(startRow)

    response = requests.get(base_url + queryProvince + queryYear + queryStartRow) # Using requests to read the HTML source
    soup = BeautifulSoup(response.text, 'html.parser') # Parse with Beautiful Soup
    soup_frames.append(soup)

Downloading Page:  0
Downloading Page:  1
Downloading Page:  2
Downloading Page:  3
Downloading Page:  4


In [32]:
# Empty list to store the station data
station_data = []

for soup in soup_frames: # For each soup
    forms = soup.findAll("form", {"id" : re.compile('stnRequest*')}) # We find the forms with the stnRequest* ID using regex 
    for form in forms:
        try:
            # The stationID is a child of the form
            station = form.find("input", {"name" : "StationID"})['value']
            
            # The station name is a sibling of the input element named lstProvince
            name = form.find("input", {"name" : "lstProvince"}).find_next_siblings("div")[0].text
            
            # The intervals are listed as children in a 'select' tag named timeframe
            timeframes = form.find("select", {"name" : "timeframe"}).findChildren()
            intervals =[t.text for t in timeframes]
            
            # We can find the min and max year of this station using the first and last child
            years = form.find("select", {"name" : "Year"}).findChildren()            
            min_year = years[0].text
            max_year = years[-1].text
            
            # Store the data in an array
            data = [station, name, intervals, min_year, max_year]
            station_data.append(data)
        except:
            pass

# Create a pandas dataframe using the collected data and give it the appropriate column names
stations_df = pd.DataFrame(station_data, columns=['StationID', 'Name', 'Intervals', 'Year Start', 'Year End'])
stations_df['Year Start'] = pd.to_numeric(stations_df['Year Start'])
stations_df['Year End'] = pd.to_numeric(stations_df['Year End'])
stations_df = stations_df[stations_df['Year End'] >= 1999]
stations_df = stations_df[stations_df['Intervals'].apply(lambda x: 'Hourly' in x)]

  forms = soup.findAll("form", {"id" : re.compile('stnRequest*')}) # We find the forms with the stnRequest* ID using regex
  timeframes = form.find("select", {"name" : "timeframe"}).findChildren()
  years = form.find("select", {"name" : "Year"}).findChildren()


In [54]:
weather_data.columns

Index(['Longitude (x)', 'Latitude (y)', 'Station Name', 'Climate ID',
       'Date/Time (LST)', 'Year', 'Month', 'Day', 'Time (LST)', 'Flag',
       'Temp (°C)', 'Temp Flag', 'Dew Point Temp (°C)', 'Dew Point Temp Flag',
       'Rel Hum (%)', 'Rel Hum Flag', 'Precip. Amount (mm)',
       'Precip. Amount Flag', 'Wind Dir (10s deg)', 'Wind Dir Flag',
       'Wind Spd (km/h)', 'Wind Spd Flag', 'Visibility (km)',
       'Visibility Flag', 'Stn Press (kPa)', 'Stn Press Flag', 'Hmdx',
       'Hmdx Flag', 'Wind Chill', 'Wind Chill Flag', 'Weather'],
      dtype='object')

In [58]:
stationID = 706
start_date = datetime.strptime('Jun1992', '%b%Y')
end_date = datetime.strptime('Jun1995', '%b%Y')

frames = []
for dt in rrule.rrule(rrule.MONTHLY, dtstart=start_date, until=end_date):
    df = getHourlyData(stationID, dt.year, dt.month)
    frames.append(df)

weather_data = pd.concat(frames)
weather_data['Date/Time (LST)'] = pd.to_datetime(weather_data['Date/Time (LST)'])
weather_data['Temp (°C)'] = pd.to_numeric(weather_data['Temp (°C)'])

In [59]:
weather_data

Unnamed: 0,Longitude (x),Latitude (y),Station Name,Climate ID,Date/Time (LST),Year,Month,Day,Time (LST),Flag,...,Wind Spd Flag,Visibility (km),Visibility Flag,Stn Press (kPa),Stn Press Flag,Hmdx,Hmdx Flag,Wind Chill,Wind Chill Flag,Weather
0,-121.76,49.24,AGASSIZ RCS,1100119,1992-06-01 00:00:00,1992,6,1,00:00,,...,,,,,,,,,,
1,-121.76,49.24,AGASSIZ RCS,1100119,1992-06-01 01:00:00,1992,6,1,01:00,,...,,,,,,,,,,
2,-121.76,49.24,AGASSIZ RCS,1100119,1992-06-01 02:00:00,1992,6,1,02:00,,...,,,,,,,,,,
3,-121.76,49.24,AGASSIZ RCS,1100119,1992-06-01 03:00:00,1992,6,1,03:00,,...,,,,,,,,,,
4,-121.76,49.24,AGASSIZ RCS,1100119,1992-06-01 04:00:00,1992,6,1,04:00,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,-121.76,49.24,AGASSIZ RCS,1100119,1995-06-30 19:00:00,1995,6,30,19:00,,...,,,M,,M,33.0,,,,
716,-121.76,49.24,AGASSIZ RCS,1100119,1995-06-30 20:00:00,1995,6,30,20:00,,...,,,M,,M,30.0,,,,
717,-121.76,49.24,AGASSIZ RCS,1100119,1995-06-30 21:00:00,1995,6,30,21:00,,...,,,M,,M,26.0,,,,
718,-121.76,49.24,AGASSIZ RCS,1100119,1995-06-30 22:00:00,1995,6,30,22:00,,...,,,M,,M,,,,,


In [61]:
da = pd.read_csv('/users/rpayne/data/unproc/STN_ECCC/api/hourly/ec_hourly_id54238.csv', index_col=False)

In [62]:
da

Unnamed: 0,Longitude (x),Latitude (y),Station Name,Climate ID,Date/Time (LST),Year,Month,Day,Time (LST),Flag,...,Wind Spd Flag,Visibility (km),Visibility Flag,Stn Press (kPa),Stn Press Flag,Hmdx,Hmdx Flag,Wind Chill,Wind Chill Flag,Weather
0,-122.36,49.03,ABBOTSFORD A,1100032,2016-01-01 00:00:00,2016,1,1,00:00,,...,,,,,,,,,,
1,-122.36,49.03,ABBOTSFORD A,1100032,2016-01-01 01:00:00,2016,1,1,01:00,,...,,,,,,,,,,
2,-122.36,49.03,ABBOTSFORD A,1100032,2016-01-01 02:00:00,2016,1,1,02:00,,...,,,,,,,,,,
3,-122.36,49.03,ABBOTSFORD A,1100032,2016-01-01 03:00:00,2016,1,1,03:00,,...,,,,,,,,,,
4,-122.36,49.03,ABBOTSFORD A,1100032,2016-01-01 04:00:00,2016,1,1,04:00,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87667,-122.36,49.03,ABBOTSFORD A,1100032,2025-12-31 19:00:00,2025,12,31,19:00,,...,,,,,,,,,,
87668,-122.36,49.03,ABBOTSFORD A,1100032,2025-12-31 20:00:00,2025,12,31,20:00,,...,,,,,,,,,,
87669,-122.36,49.03,ABBOTSFORD A,1100032,2025-12-31 21:00:00,2025,12,31,21:00,,...,,,,,,,,,,
87670,-122.36,49.03,ABBOTSFORD A,1100032,2025-12-31 22:00:00,2025,12,31,22:00,,...,,,,,,,,,,


In [50]:
a = stations_df[stations_df['StationID']==384]['Year Start'].values[0]  # Example to get the Year Start for a specific station
end_date = datetime.strptime(f'Dec{a}', '%b%Y')
end_date

datetime.datetime(1994, 12, 1, 0, 0)

In [43]:
stations_df['Year End']

2      2025
3      2025
4      2012
5      2001
7      2025
       ... 
474    2010
476    2025
479    2025
480    2012
485    2025
Name: Year End, Length: 245, dtype: int64