In [1]:
import pandas as pd
import numpy as np
import datetime
from tqdm import tqdm

In [3]:
def getMonthlyData(year, month):
    """
    Fetch the monthly climate summary CSV and average it per province/territory.

    Parameters:
    year (int): The year for which data is to be fetched.
    month (int): The month for which data is to be fetched.

    Returns:
    DataFrame: One row per province or territory (index 'Prov_or_Ter') holding
               the mean of every numeric column, plus 'year', 'month' and a
               'Prov_or_Ter' column that mirrors the index.

    Raises:
    KeyError: If the downloaded CSV lacks one of the station columns that are
              dropped, or the 'Prov_or_Ter' column.
    Any exception raised by ``pd.read_csv`` for the request itself is propagated.
    """
    # Base URL for the climate data API
    base_url = "https://climate.weather.gc.ca/prods_servs/cdn_climate_summary_report_e.html?"

    # Query parameters for the API call
    query_url = f"intYear={year}&intMonth={month}&prov=&dataFormat=csv&btnSubmit=Download+data"

    # Full API endpoint
    api_endpoint = base_url + query_url

    # Treat the '######' placeholder as missing *while parsing*. Replacing it
    # afterwards leaves the affected columns as object dtype (numeric strings),
    # which would silently exclude them from the numeric aggregation below.
    df = pd.read_csv(api_endpoint, na_values=['######'])

    # Station-level identifiers/coordinates are meaningless once averaged
    df = df.drop(columns=['Long', 'Lat', 'Stn_Name', 'Clim_ID'])

    # Aggregate every numeric column (integer-typed ones included, not only float64)
    numeric_columns = df.select_dtypes(include='number').columns

    # Calculate mean values by province or territory
    mean_by_prov_ter = df.groupby('Prov_or_Ter')[numeric_columns].mean()

    # Tag the aggregated rows with the period and expose the group key as a column
    mean_by_prov_ter['year'] = year
    mean_by_prov_ter['month'] = month
    mean_by_prov_ter['Prov_or_Ter'] = mean_by_prov_ter.index

    return mean_by_prov_ter

In [None]:
def interpolate_weather_date(weather_data, start='1980-01-01', end='2023-12-31'):
    """
    Expand weather records to one row per month and province/territory and
    linearly interpolate the gaps in selected columns.

    The input DataFrame is not modified.

    Parameters:
    weather_data (DataFrame): Weather data with columns 'year', 'month',
                              'Prov_or_Ter', 'Precipitation' and
                              'Snow on ground at EOM'. Each (year, month,
                              province) combination must appear at most once.
    start (str or datetime-like): First date of the monthly range. Defaults to
                                  January 1980.
    end (str or datetime-like): Last date of the monthly range. Defaults to
                                December 2023.

    Returns:
    DataFrame: A DataFrame with a default integer index containing one row for
               every month-start in [start, end] and every province found in
               the input. 'year' and 'month' are filled for every row
               (including rows that were missing from the input);
               'Precipitation' and 'Snow on ground at EOM' are interpolated
               per province. Other columns are NaN on the inserted rows.
               Input rows outside the date range, or with a missing
               'Prov_or_Ter', are dropped.
    """
    # Complete monthly (month-start) date range
    full_dates = pd.date_range(start=start, end=end, freq='MS')

    # Work on a copy so the caller's frame is left untouched, and drop rows
    # without a province up front: they can never match the target index and
    # duplicate (date, NaN) keys would make reindex() fail.
    data = weather_data[weather_data['Prov_or_Ter'].notna()].copy()

    provinces = data['Prov_or_Ter'].unique()

    # Every (month, province) combination that must exist in the result
    multi_index = pd.MultiIndex.from_product(
        [full_dates, provinces], names=['REF_DATE', 'Prov_or_Ter']
    )

    # Assemble the first day of each (year, month) as a datetime
    data['REF_DATE'] = pd.to_datetime(
        pd.DataFrame({'year': data['year'], 'month': data['month'], 'day': 1})
    )

    # Align to the full grid; combinations absent from the input become NaN rows
    data = (
        data.set_index(['REF_DATE', 'Prov_or_Ter'])
        .reindex(multi_index)
        .reset_index()
    )

    # Rows inserted by reindex() have NaN year/month; restore them from
    # REF_DATE so every row still carries its date once REF_DATE is dropped.
    data['year'] = data['REF_DATE'].dt.year.astype('int64')
    data['month'] = data['REF_DATE'].dt.month.astype('int64')

    # Interpolate each province separately; rows of a single province are in
    # chronological order because the grid is built date-major.
    columns = ['Precipitation', 'Snow on ground at EOM']

    for prov in provinces:
        in_prov = data['Prov_or_Ter'] == prov
        for col in columns:
            data.loc[in_prov, col] = data.loc[in_prov, col].interpolate()

    return data.drop('REF_DATE', axis=1)

In [None]:
def getAllMonthlyData(start_year=1980, end_year=2024):
    """
    Fetches and concatenates climate data from the start year to the end year,
    excluding the current month and any future months.

    Parameters:
    start_year (int): The starting year for the data fetch. Defaults to 1980.
    end_year (int): The ending year for the data fetch. Defaults to 2024.

    Returns:
    DataFrame: The monthly per-province climate data for the requested range,
               with readable column names and full province names, after being
               passed through ``interpolate_weather_date``.

    Raises:
    ValueError: If the requested range contains no month that has already ended.
    """
    # Read the clock once so the whole run uses a consistent cut-off
    now = datetime.datetime.now()
    cutoff = (now.year, now.month)

    # Only months strictly before the current one are requested. The comparison
    # is against the *current* year, not end_year, so a past end_year is fetched
    # in full and a future end_year stops at the last finished month.
    periods = [
        (year, month)
        for year in range(start_year, end_year + 1)
        for month in range(1, 13)
        if (year, month) < cutoff
    ]

    if not periods:
        raise ValueError(
            f"No completed months to fetch between {start_year} and {end_year}"
        )

    # Fetch each month with a progress bar
    all_dataframes = [
        getMonthlyData(year, month)
        for year, month in tqdm(periods, desc="Fetching monthly data")
    ]

    # Concatenate all dataframes into a single dataframe
    combined_df = pd.concat(all_dataframes, ignore_index=True)

    columns_to_keep = ['Prov_or_Ter', 'year', 'month', 'Tm', 'Tx', 'Tn', 'S', 'P', 'S_G', 'Pd', 'CDD']

    clean = combined_df[columns_to_keep].rename(columns={
        'Tm': 'Temperature',
        'Tx': 'Max Temperature',
        'Tn': 'Min Temperature',
        'S': 'Snowfall',
        'P': 'Precipitation',
        'S_G': 'Snow on ground at EOM',
        'Pd': '# precipitation days',
        'CDD': '# warm days (18C+)',
    })

    province_map = {
        'AB': 'Alberta',
        'BC': 'British Columbia',
        'MB': 'Manitoba',
        'NB': 'New Brunswick',
        'NL': 'Newfoundland and Labrador',
        'NS': 'Nova Scotia',
        'ON': 'Ontario',
        'PE': 'Prince Edward Island',
        'QC': 'Quebec',
        'SK': 'Saskatchewan',
        'NT': 'Northwest Territories',
        'NU': 'Nunavut',
        'YT': 'Yukon',
    }

    # Codes missing from the map become NaN
    clean['Prov_or_Ter'] = clean['Prov_or_Ter'].map(province_map)

    # NOTE: interpolate_weather_date aligns the data to its own default monthly
    # range, so months outside that range do not appear in the result.
    clean = interpolate_weather_date(clean)

    return clean