# Process Hourly Data

- Adjust some column labels and extract only the columns of interest
- Apply some processing to the weather conditions categories (assuming "N/A" indicates that the previous category persists until a new non-"N/A" value is listed)
- Merge hourly data with station info and timezones, and calculate UTC times

In [1]:
import os

import numpy as np
import pandas as pd

## Stations Metadata

In [2]:
# Station metadata table; the first CSV column is used as the index
stations_info = pd.read_csv('data/airport_stations.csv', index_col=0)
# Index labels of the metadata table, looped over later to process every station
stations = list(stations_info.index)
stations_info.head()

Unnamed: 0_level_0,Name,Province,Timezone,UTC Offset (hours),Env Canada ID,Latitude (deg),Longitude (deg),Elevation (m),First Year,Last Year
Airport Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
YEG,EDMONTON INTL A,ALBERTA,MST,-7.0,50149,53.31,-113.58,723.3,2012,2018
YFB,IQALUIT A,NUNAVUT,EST,-5.0,52079,63.76,-68.56,33.5,2014,2018
YFC,FREDERICTON,NEW BRUNSWICK,AST,-4.0,48568,45.87,-66.54,20.7,2010,2018
YHZ,HALIFAX INTL A,NOVA SCOTIA,AST,-4.0,50620,44.88,-63.51,145.4,2012,2018
YOW,OTTAWA INTL A,ONTARIO,EST,-5.0,49568,45.32,-75.67,114.9,2011,2018


## Data Processing Functions

In [3]:
def read_hourly_raw(csv_file, pre_process=True, skiprows=15):
    """Load one hourly weather CSV file into a DataFrame.

    The first column of the file becomes a parsed datetime index, and the
    first `skiprows` lines of the file are skipped before the header row.

    When `pre_process` is True, the frame is tidied before being returned:
    the 'Year', 'Month', 'Day' and 'Time' columns and every column whose
    label ends in "Flag" are dropped, the degree symbol in column labels is
    replaced with 'deg ', 'Weather' is renamed to 'Conditions', and the index
    is named 'Datetime (Local Standard)'. Otherwise the frame is returned
    exactly as read.
    """
    raw = pd.read_csv(csv_file, skiprows=skiprows, index_col=0, parse_dates=True)
    if not pre_process:
        return raw

    # The datetime index already carries the date/time information, and the
    # "... Flag" columns are not needed downstream
    unwanted = ['Year', 'Month', 'Day', 'Time']
    unwanted += [label for label in raw.columns if label.endswith('Flag')]
    trimmed = raw.drop(unwanted, axis=1)

    # Swap the non-ascii degree symbol for 'deg ' and relabel 'Weather'
    trimmed.columns = [
        label.replace('\xb0', 'deg ').replace('Weather', 'Conditions')
        for label in trimmed.columns
    ]

    trimmed.index.name = 'Datetime (Local Standard)'
    return trimmed

In [4]:
def load_hourly_data(csv_files, pre_process=True, skiprows=15, verbose=True):
    """Read every file in `csv_files` and stack the results into one DataFrame.

    Each file is read with `read_hourly_raw`, which receives `pre_process`
    and `skiprows` unchanged. The per-file frames are concatenated row-wise
    in the order the files are given. When `verbose` is True, a
    'Reading <file>' line is printed just before each file is read.
    """
    def _frames():
        # Generator keeps the "announce, then read" order for each file
        for path in csv_files:
            if verbose:
                print(f'Reading {path}')
            yield read_hourly_raw(path, pre_process=pre_process, skiprows=skiprows)

    return pd.concat(list(_frames()), axis=0)

In [5]:
def process_hourly_data(data, station, stations_info, verbose=True):
    """Process hourly weather data and station metadata, and return as a DataFrame.

    Parameters
    ----------
    data : DataFrame
        Hourly observations with a datetime index. Must contain the columns
        'Conditions' and 'Wind Dir (10s deg)' as well as the measurement
        columns selected for the output (see the `columns` list below).
    station : label
        Row label of the station in `stations_info`; also written to the
        'Station ID' column.
    stations_info : DataFrame
        Station metadata providing the 'Name', 'Timezone' and
        'UTC Offset (hours)' columns.
    verbose : bool, default True
        If True, print the number of rows where every value is missing.

    Returns
    -------
    DataFrame
        A new frame (the input is copied, not modified) with station
        metadata, a 'Datetime (UTC)' column, wind direction in degrees,
        forward-filled 'Conditions', and columns in a fixed order.
    """

    # Check for any rows where all measurements are missing
    all_missing = data.isnull().all(axis=1)
    if verbose:
        # Summing the boolean mask reports 0 when no such rows exist
        print(f'{int(all_missing.sum())} rows with all measurements missing')

    # Assume weather category persists until indicated otherwise by a new non-null value
    # so use forward filling, except if all other measurements are missing, then leave as null
    data_out = data.copy()
    data_out['Conditions'] = data_out['Conditions'].ffill()
    data_out.loc[all_missing, 'Conditions'] = np.nan

    # Convert wind direction from 10s of degrees to degrees
    data_out['Wind Dir (deg)'] = 10 * data_out['Wind Dir (10s deg)']

    # Add station metadata
    data_out['Station ID'] = station
    data_out['Station Name'] = stations_info.loc[station, 'Name']
    data_out['Timezone'] = stations_info.loc[station, 'Timezone']

    # Calculate UTC datetimes: the index is shifted by the negated UTC offset
    utc_offset_hours = stations_info.loc[station, 'UTC Offset (hours)']
    tdelta = pd.Timedelta(-utc_offset_hours, unit='h')
    data_out['Datetime (UTC)'] = data_out.index + tdelta

    # Reorder columns (this also drops 'Wind Dir (10s deg)')
    columns = ['Station ID', 'Station Name', 'Timezone', 'Datetime (UTC)',
               'Temp (deg C)', 'Dew Point Temp (deg C)', 'Rel Hum (%)',
               'Wind Dir (deg)', 'Wind Spd (km/h)', 'Visibility (km)',
               'Stn Press (kPa)', 'Hmdx', 'Wind Chill', 'Conditions']
    data_out = data_out[columns]

    return data_out

## Input Parameters

Let's process the 2018 year-to-date data (January through July).

In [6]:
# Input and output directories (kept with a trailing slash)
datadir, savedir = 'data/raw/', 'data/processed/'

# Period to process: months 1 through 7 of the given year (range end is exclusive)
year = 2018
months = range(1, 8)

# Label of the form YYYYMM-YYYYMM spanning the first and last month
months_str = '-'.join(f'{year}{month:02d}' for month in (min(months), max(months)))

## Demo - Single Station

Show the data processing steps for one station.

In [7]:
station = 'YEG'
# One raw CSV file per month for the chosen station
csv_files = [f'{datadir}weather_hourly_{station}_{year}{month:02d}.csv' for month in months]
# Read and concatenate the monthly files (default pre-processing applied)
data_in = load_hourly_data(csv_files)
print(data_in.shape)
data_in.head()

Reading data/raw/weather_hourly_YEG_201801.csv
Reading data/raw/weather_hourly_YEG_201802.csv
Reading data/raw/weather_hourly_YEG_201803.csv
Reading data/raw/weather_hourly_YEG_201804.csv
Reading data/raw/weather_hourly_YEG_201805.csv
Reading data/raw/weather_hourly_YEG_201806.csv
Reading data/raw/weather_hourly_YEG_201807.csv
(5088, 10)


Unnamed: 0_level_0,Temp (deg C),Dew Point Temp (deg C),Rel Hum (%),Wind Dir (10s deg),Wind Spd (km/h),Visibility (km),Stn Press (kPa),Hmdx,Wind Chill,Conditions
Datetime (Local Standard),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-01-01 00:00:00,-28.1,-32.3,68.0,18.0,18.0,24.1,94.58,,-40.0,
2018-01-01 01:00:00,-28.3,-32.5,68.0,18.0,18.0,24.1,94.52,,-40.0,
2018-01-01 02:00:00,-28.7,-32.8,68.0,18.0,14.0,24.1,94.48,,-39.0,Clear
2018-01-01 03:00:00,-28.5,-32.6,68.0,15.0,12.0,24.1,94.44,,-38.0,
2018-01-01 04:00:00,-28.3,-32.5,68.0,16.0,10.0,24.1,94.38,,-37.0,


In [8]:
# Fill in conditions, convert wind direction, and attach station metadata and UTC times
data_out = process_hourly_data(data_in, station, stations_info)
data_out.head()

3 rows with all measurements missing


Unnamed: 0_level_0,Station ID,Station Name,Timezone,Datetime (UTC),Temp (deg C),Dew Point Temp (deg C),Rel Hum (%),Wind Dir (deg),Wind Spd (km/h),Visibility (km),Stn Press (kPa),Hmdx,Wind Chill,Conditions
Datetime (Local Standard),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2018-01-01 00:00:00,YEG,EDMONTON INTL A,MST,2018-01-01 07:00:00,-28.1,-32.3,68.0,180.0,18.0,24.1,94.58,,-40.0,
2018-01-01 01:00:00,YEG,EDMONTON INTL A,MST,2018-01-01 08:00:00,-28.3,-32.5,68.0,180.0,18.0,24.1,94.52,,-40.0,
2018-01-01 02:00:00,YEG,EDMONTON INTL A,MST,2018-01-01 09:00:00,-28.7,-32.8,68.0,180.0,14.0,24.1,94.48,,-39.0,Clear
2018-01-01 03:00:00,YEG,EDMONTON INTL A,MST,2018-01-01 10:00:00,-28.5,-32.6,68.0,150.0,12.0,24.1,94.44,,-38.0,Clear
2018-01-01 04:00:00,YEG,EDMONTON INTL A,MST,2018-01-01 11:00:00,-28.3,-32.5,68.0,160.0,10.0,24.1,94.38,,-37.0,Clear


In [9]:
# dropna=False keeps NaN in the tally, so rows still lacking a category are visible
data_out['Conditions'].value_counts(dropna=False)

Mostly Cloudy                             1819
Mainly Clear                              1005
Clear                                      661
Cloudy                                     477
Snow                                       437
Fog                                        133
Snow,Blowing Snow                          104
Rain Showers                               100
Rain                                        78
Blowing Snow                                56
Thunderstorms                               35
Snow,Fog                                    33
Thunderstorms,Rain Showers                  31
Ice Crystals                                20
Freezing Fog                                18
Rain,Fog                                    15
Snow Showers                                11
Snow Showers,Blowing Snow                    5
NaN                                          5
Rain Showers,Fog                             4
Haze                                         4
Heavy Rain Sh

## Process All Stations and Save to CSV

In [10]:
# Make sure the output directory exists before writing any files into it
os.makedirs(savedir, exist_ok=True)

# Load, process and save the monthly files of every station in turn
for station in stations:
    csv_files = [f'{datadir}weather_hourly_{station}_{year}{month:02d}.csv' for month in months]
    savefile = f'{savedir}weather_hourly_{station}_{months_str}.csv'
    data_in = load_hourly_data(csv_files)
    data_out = process_hourly_data(data_in, station, stations_info)
    print(f'Saving to {savefile}')
    data_out.to_csv(savefile)
print('Done!')

Reading data/raw/weather_hourly_YEG_201801.csv
Reading data/raw/weather_hourly_YEG_201802.csv
Reading data/raw/weather_hourly_YEG_201803.csv
Reading data/raw/weather_hourly_YEG_201804.csv
Reading data/raw/weather_hourly_YEG_201805.csv
Reading data/raw/weather_hourly_YEG_201806.csv
Reading data/raw/weather_hourly_YEG_201807.csv
3 rows with all measurements missing
Saving to data/processed/weather_hourly_YEG_201801-201807.csv
Reading data/raw/weather_hourly_YFB_201801.csv
Reading data/raw/weather_hourly_YFB_201802.csv
Reading data/raw/weather_hourly_YFB_201803.csv
Reading data/raw/weather_hourly_YFB_201804.csv
Reading data/raw/weather_hourly_YFB_201805.csv
Reading data/raw/weather_hourly_YFB_201806.csv
Reading data/raw/weather_hourly_YFB_201807.csv
4 rows with all measurements missing
Saving to data/processed/weather_hourly_YFB_201801-201807.csv
Reading data/raw/weather_hourly_YFC_201801.csv
Reading data/raw/weather_hourly_YFC_201802.csv
Reading data/raw/weather_hourly_YFC_201803.csv
Rea