
# 02 Feature Engineering

This notebook transforms the raw daily weather dataset into a rich analytical file suitable for machine learning. We engineer additional temporal and aggregated features to capture short-term trends and seasonality, and we define a binary target variable indicating whether a heavy precipitation event (greater than 10 mm) occurs on the following day.

The engineered features include lagged variables (e.g., previous day's precipitation and temperature), rolling sums over 3-day and 7-day windows, and cyclical encodings of month and day-of-year to capture seasonality. The resulting analytical dataset is saved to `data/processed/analytical_data.csv`.

A random sample of 100 records is extracted from the analytical dataset with each column labelled as `Source`, `Derived`, or `Target` and saved to `data/processed/analytical_sample.csv` for quick inspection and modelling.


In [1]:

import os
import pandas as pd
import numpy as np

# Ensure processed data directory exists
os.makedirs('data/processed', exist_ok=True)

# Load raw weather data
df = pd.read_csv('data/raw/toronto_daily_weather_2019_2020.csv')

# Convert Date/Time to datetime (if exists) or construct from year-month-day columns
if 'Date/Time' in df.columns:
    df['Date'] = pd.to_datetime(df['Date/Time'])
else:
    df['Date'] = pd.to_datetime(df[['Year','Month','Day']])

# Sort by date
weather_df = df.sort_values('Date').reset_index(drop=True)


In [2]:

# Keep relevant source variables
df_fe = weather_df.copy()

# Fill precipitation NaN with 0 for rolling calculations
for col in ['Total Rain (mm)', 'Total Snow (cm)', 'Total Precip (mm)']:
    if col in df_fe.columns:
        df_fe[col] = df_fe[col].fillna(0)

# Remove flag and non-numeric indicator columns to reduce missing values
flag_cols = [c for c in df_fe.columns if 'Flag' in c or 'Quality' in c or 'Dir of Max Gust' in c or 'Spd of Max Gust' in c or 'Visibility' in c or 'Weather' in c]
df_fe = df_fe.drop(columns=flag_cols)

# Fill numeric columns with 0 to handle missing values
numeric_cols = df_fe.select_dtypes(include=['float', 'int']).columns
df_fe[numeric_cols] = df_fe[numeric_cols].fillna(0)

# Lagged features
df_fe['Precip_Last_Day'] = df_fe['Total Precip (mm)'].shift(1)
df_fe['Mean_Temp_Last_Day'] = df_fe['Mean Temp (°C)'].shift(1)

# Rolling sums
for window in [3, 7]:
    df_fe[f'Precip_{window}day_sum'] = df_fe['Total Precip (mm)'].rolling(window).sum().shift(1)
    df_fe[f'Total_Rain_{window}day_sum'] = df_fe['Total Rain (mm)'].rolling(window).sum().shift(1)
    df_fe[f'Total_Snow_{window}day_sum'] = df_fe['Total Snow (cm)'].rolling(window).sum().shift(1)

# Cyclical features
df_fe['Day_of_Year'] = df_fe['Date'].dt.dayofyear
df_fe['Month_of_Year'] = df_fe['Month']

df_fe['Day_of_Year_sin'] = np.sin(2 * np.pi * df_fe['Day_of_Year'] / 365.25)
df_fe['Day_of_Year_cos'] = np.cos(2 * np.pi * df_fe['Day_of_Year'] / 365.25)
df_fe['Month_of_Year_sin'] = np.sin(2 * np.pi * df_fe['Month_of_Year'] / 12)
df_fe['Month_of_Year_cos'] = np.cos(2 * np.pi * df_fe['Month_of_Year'] / 12)

# Target variable
df_fe['Next_Day_Precip'] = df_fe['Total Precip (mm)'].shift(-1)
threshold = 10.0
df_fe['Flood_Event_Imminent'] = (df_fe['Next_Day_Precip'] > threshold).astype(int)

# Drop rows with missing values after lags and rolling operations
analytical_df = df_fe.dropna().reset_index(drop=True)

# Display head
analytical_df.head()


Unnamed: 0,Longitude (x),Latitude (y),Station Name,Climate ID,Date/Time,Year,Month,Day,Max Temp (°C),Min Temp (°C),...,Total_Rain_7day_sum,Total_Snow_7day_sum,Day_of_Year,Month_of_Year,Day_of_Year_sin,Day_of_Year_cos,Month_of_Year_sin,Month_of_Year_cos,Next_Day_Precip,Flood_Event_Imminent
0,-79.4,43.67,TORONTO CITY,6158355,2019-01-08,2019,1,8,8.7,3.3,...,0.0,0.0,8,1,0.137185,0.990545,0.5,0.866025,0.9,0
1,-79.4,43.67,TORONTO CITY,6158355,2019-01-09,2019,1,9,3.4,-3.5,...,0.0,0.0,9,1,0.154204,0.988039,0.5,0.866025,0.0,0
2,-79.4,43.67,TORONTO CITY,6158355,2019-01-10,2019,1,10,-3.5,-10.3,...,0.0,0.0,10,1,0.171177,0.98524,0.5,0.866025,0.0,0
3,-79.4,43.67,TORONTO CITY,6158355,2019-01-11,2019,1,11,-6.5,-12.2,...,0.0,0.0,11,1,0.188099,0.98215,0.5,0.866025,0.0,0
4,-79.4,43.67,TORONTO CITY,6158355,2019-01-12,2019,1,12,-4.1,-7.4,...,0.0,0.0,12,1,0.204966,0.978769,0.5,0.866025,0.0,0


In [3]:

# Save full analytical dataset
analytical_path = 'data/processed/analytical_data.csv'
analytical_df.to_csv(analytical_path, index=False)
print(f'Saved analytical dataset to {analytical_path}')

# Create 100-record random sample
sample = analytical_df.sample(n=min(100, len(analytical_df)), random_state=42).copy()

# Create a mapping for variable types
variable_types = {}
source_cols = ['Total Rain (mm)', 'Total Snow (cm)', 'Total Precip (mm)', 'Mean Temp (°C)', 'Max Temp (°C)', 'Min Temp (°C)', 'Month', 'Day', 'Year']
derived_cols = [col for col in analytical_df.columns if col not in source_cols + ['Next_Day_Precip', 'Flood_Event_Imminent', 'Date']]

for col in analytical_df.columns:
    if col in source_cols:
        variable_types[col] = 'Source'
    elif col == 'Flood_Event_Imminent' or col == 'Next_Day_Precip':
        variable_types[col] = 'Target'
    elif col == 'Date':
        variable_types[col] = 'Source'
    else:
        variable_types[col] = 'Derived'

# Add variable type row to sample for each column
sample_with_labels = sample.copy()
sample_with_labels.columns = pd.MultiIndex.from_arrays([
    sample.columns,
    [variable_types.get(col, '') for col in sample.columns]
])

# Flatten MultiIndex for CSV output by joining with '__'
sample_with_labels.columns = [f"{col[0]}__{col[1]}" for col in sample_with_labels.columns]

sample_path = 'data/processed/analytical_sample.csv'
sample_with_labels.to_csv(sample_path, index=False)
print(f'Saved analytical sample to {sample_path}')

sample_with_labels.head()


Saved analytical dataset to data/processed/analytical_data.csv
Saved analytical sample to data/processed/analytical_sample.csv


Unnamed: 0,Longitude (x)__Derived,Latitude (y)__Derived,Station Name__Derived,Climate ID__Derived,Date/Time__Derived,Year__Source,Month__Source,Day__Source,Max Temp (°C)__Source,Min Temp (°C)__Source,...,Total_Rain_7day_sum__Derived,Total_Snow_7day_sum__Derived,Day_of_Year__Derived,Month_of_Year__Derived,Day_of_Year_sin__Derived,Day_of_Year_cos__Derived,Month_of_Year_sin__Derived,Month_of_Year_cos__Derived,Next_Day_Precip__Target,Flood_Event_Imminent__Target
336,-79.4,43.67,TORONTO CITY,6158355,2019-12-10,2019,12,10,10.7,-5.6,...,0.0,0.0,344,12,-0.357464,0.933927,-2.449294e-16,1.0,0.0,0
517,-79.4,43.67,TORONTO CITY,6158355,2020-06-08,2020,6,8,22.8,12.2,...,0.0,0.0,160,6,0.379453,-0.925211,1.224647e-16,-1.0,0.0,0
598,-79.4,43.67,TORONTO CITY,6158355,2020-08-28,2020,8,28,23.6,17.5,...,0.0,0.0,241,8,-0.843728,-0.536771,-0.8660254,-0.5,0.0,0
33,-79.4,43.67,TORONTO CITY,6158355,2019-02-10,2019,2,10,-2.5,-7.8,...,0.0,0.0,41,2,0.648262,0.761418,0.8660254,0.5,0.7,0
439,-79.4,43.67,TORONTO CITY,6158355,2020-03-22,2020,3,22,2.2,-4.7,...,0.0,0.0,82,3,0.987196,0.159513,1.0,6.123234000000001e-17,12.1,1
