# In [None]:
################################################################################
######## FINAL SCRIPT TO GET ALL weather stations from CANADA ##################
################################################################################

import math
import re

import pandas as pd
import requests
#import BeautifulSoup
from bs4 import BeautifulSoup



# Province/territory codes sent to the search page as ``lstProvince``.
provinces = ["AB", "BC", "MB", "NB", "NL", "NT", "NS", "NU", "ON", "PE", "QC", "SK", "YT"]

# Station search endpoint; the query string is built from a dict in
# _fetch_search_page so the two request variants share a single definition.
_SEARCH_URL = 'https://climate.weather.gc.ca/historical_data/search_historic_data_stations_e.html'

# Number of results requested per page (``selRowPerPage``); also the paging step.
_ROWS_PER_PAGE = 100

# Seconds to wait for the server; without a timeout a stalled connection
# would hang the script indefinitely.
_REQUEST_TIMEOUT = 30

# Column order of the resulting dataframes.
_STATION_COLUMNS = ['Station', 'Station ID', 'Province', 'Index_col', 'Monthly Range', 'Daily Range']


def _fetch_search_page(province, day, start_row=None):
    """Download one station-search page for *province* and return it parsed.

    Args:
        province: Code sent as the ``lstProvince`` query parameter.
        day: Value sent as the ``Day`` query parameter.
        start_row: Value sent as ``startRow`` (the script pages from 1 in
            steps of ``_ROWS_PER_PAGE``); left out of the query when ``None``.

    Returns:
        A ``BeautifulSoup`` document built with the ``html.parser`` backend.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            non-2xx HTTP status (so an error page is never parsed as results).
    """
    params = {
        'searchType': 'stnProv',
        'timeframe': 1,
        'lstProvince': province,
        'optLimit': 'yearRange',
        'StartYear': 1840,
        'EndYear': 2021,
        'Year': 2021,
        'Month': 2,
        'Day': day,
        'selRowPerPage': _ROWS_PER_PAGE,
    }
    if start_row is not None:
        params['startRow'] = start_row

    response = requests.get(_SEARCH_URL, params=params, timeout=_REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")


def _parse_station_count(soup):
    """Return the number of stations announced on a search result page.

    The paragraph holding the count is located through the "Data Interval"
    glossary link it contains; the count is the first whitespace-separated
    token of that paragraph's text.

    Raises:
        ValueError: If the link is not on the page or the paragraph does not
            start with an integer.
    """
    link_location = soup.find(
        'a',
        href="/glossary_e.html#dataInt",
        string=re.compile(".*Data Interval.*"),
    )
    if link_location is None:
        raise ValueError("could not find the 'Data Interval' link that marks the station count")

    p_text = link_location.parent.text
    try:
        # split() with no argument ignores leading whitespace/newlines; commas
        # are dropped in case the count is rendered with thousands separators.
        return int(p_text.split()[0].replace(",", ""))
    except (IndexError, ValueError) as err:
        raise ValueError(
            "could not read a station count from paragraph {!r}".format(p_text)
        ) from err


def _parse_station_form(form):
    """Extract one station record (a dict keyed by column name) from a result form."""

    def input_value(name):
        # Each field is carried by an <input name=...> element of the form.
        return form.find('input', {'name': name}).get('value')

    title = form.find("div", class_="col-md-10 col-sm-8 col-xs-8")
    return {
        'Station': title.text,
        'Station ID': input_value('StationID'),
        'Province': input_value('Prov'),
        'Index_col': input_value('Line'),
        'Monthly Range': input_value('mlyRange'),
        'Daily Range': input_value('dlyRange'),
    }


# One dataframe per province, in the order of ``provinces``.
province_stations = []

for province in provinces:
    # The first request is only used to learn how many stations the province
    # has. The Day values (25 here, 26 below) are kept as the original
    # requests sent them.
    number_of_stations = _parse_station_count(_fetch_search_page(province, day=25))

    # Walk the result pages: rows 1, 101, 201, ... up to the announced count.
    weather_stations = []
    for start_row in range(1, number_of_stations + 1, _ROWS_PER_PAGE):
        page = _fetch_search_page(province, day=26, start_row=start_row)

        # Every search result is rendered as a form whose id contains "-sm".
        weather_stations.extend(
            _parse_station_form(form) for form in page.select('form[id*="-sm"]')
        )

    # Build the province dataframe in one go; an empty record list still
    # yields a (zero-row) frame with the expected columns.
    all_province_stations = pd.DataFrame(weather_stations, columns=_STATION_COLUMNS)

    # append each province dataframe to a list
    province_stations.append(all_province_stations)

# final dataframe with all stations from each province
Canada_stations = pd.concat(province_stations, ignore_index=True)
    
