# Using Machine Learning to Forecast Air Quality in Beijing

## 3 - Feature Engineering

### Import Python Packages

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller

import pmdarima as pm
from pmdarima import pipeline
from pmdarima import preprocessing as ppc
from pmdarima import arima
from stldecompose import decompose, forecast
from stldecompose.forecast_funcs import (naive, drift, mean, seasonal_naive)

from tqdm import tqdm as tqdm

The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.


### Load cleaned data set

In [2]:
df = pd.read_csv('data/dailypm25.csv',
                 index_col=0,
                 parse_dates=[0],
                 date_parser=pd.to_datetime,
                 infer_datetime_format=True)
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2190 entries, 2010-01-02 to 2015-12-31
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   wind_dir           2190 non-null   object 
 1   year               2190 non-null   int64  
 2   month              2190 non-null   int64  
 3   season             2190 non-null   int64  
 4   pm25               2190 non-null   float64
 5   dew_point          2190 non-null   float64
 6   humidity           2190 non-null   float64
 7   pressure           2190 non-null   float64
 8   temp               2190 non-null   float64
 9   wind_speed         2190 non-null   float64
 10  precipitation      2190 non-null   float64
 11  cum_precipitation  2190 non-null   float64
dtypes: float64(8), int64(3), object(1)
memory usage: 222.4+ KB


Unnamed: 0,wind_dir,year,month,season,pm25,dew_point,humidity,pressure,temp,wind_speed,precipitation,cum_precipitation
2010-01-02,SE,2010,1,4,144.333333,-8.5,77.9375,1024.75,-5.125,24.86,0.0,0.0
2010-01-03,SE,2010,1,4,78.375,-10.125,87.916667,1022.791667,-8.541667,70.937917,0.466667,11.2
2010-01-04,NW,2010,1,4,29.291667,-20.875,46.208333,1029.291667,-11.5,111.160833,0.0,0.0
2010-01-05,NW,2010,1,4,43.541667,-24.583333,42.041667,1033.625,-14.458333,56.92,0.0,0.0
2010-01-06,NE,2010,1,4,59.375,-23.708333,39.208333,1033.75,-12.541667,18.511667,0.0,0.0


In [3]:
df.tail()

Unnamed: 0,wind_dir,year,month,season,pm25,dew_point,humidity,pressure,temp,wind_speed,precipitation,cum_precipitation
2015-12-27,NE,2015,12,4,56.208333,-13.958333,53.541667,1038.625,-5.666667,3.950833,0.0,0.0
2015-12-28,NW,2015,12,4,112.416667,-11.458333,60.75,1035.041667,-4.291667,13.656667,0.0,0.0
2015-12-29,cv,2015,12,4,331.875,-6.625,76.125,1028.875,-2.791667,1.244583,0.0,0.0
2015-12-30,NW,2015,12,4,101.75,-8.75,58.458333,1030.375,-0.333333,26.5025,0.0,0.0
2015-12-31,NW,2015,12,4,70.875,-10.083333,59.416667,1032.458333,-2.833333,9.073333,0.0,0.0


In [4]:
from pmdarima.arima.stationarity import ADFTest

# Test whether we should difference at the alpha=0.05
# significance level
adf_test = ADFTest(alpha=0.05)
p_val, should_diff = adf_test.should_diff(df.pm25.values)  # (0.01, False)

In [5]:
# Generating a 7 day forecast for the first week of 2015

BURN_IN = "2012-01-01"
FORECAST_START = "2015-01-01"
FORECAST_END = "2015-12-31"
FORECAST_DAYS = 7

In [6]:
timestamps = pd.date_range(start=BURN_IN, end=FORECAST_END, freq='D')

In [7]:
def create_lag_features(series, lag_range, prefix):

    df = pd.DataFrame()

    for lag in lag_range:
        df[prefix + "_lag_" + str(lag - 1)] = series.shift(lag)

    return(df)

In [8]:
def fit_stl(history, steps):
    decomp = decompose(history, period=365)
    pred = forecast(decomp, steps=steps, fc_func=drift, seasonal=True)
    return(pred)

In [9]:
def fit_arima(history, steps):
    pipe = pipeline.Pipeline(
        [
            ("fourier", ppc.FourierFeaturizer(m=365.25, k=10)),
            ("arima", arima.AutoARIMA(
                stepwise=True,
                trace=1,
                error_action="ignore",
                seasonal=False,
                suppress_warnings=True
                ))])

    pipe.fit(history);
    pred = pipe.predict(n_periods=steps)
    return(pred)

In [10]:
def forecast_horizons(df, col, timestamps, forecast_start, horizons, lags):

    dfs_with_horizons = []
    
    for timestamp in tqdm(timestamps[:-horizons]):

        df_with_horizon = pd.DataFrame()

        df_with_horizon["horizon"] = list(range(1, horizons+1))
        df_with_horizon["date_origin"] = [(timestamp.date() - timedelta(days=1)).strftime('%Y-%m-%d')]*horizons
        df_with_horizon["date_target"] = pd.date_range(start=timestamp.date(), periods=horizons, freq='D').astype(str).to_list()
        df_with_horizon["target"] = df[timestamp.date():][col].head(horizons).values

        history = df[:timestamp.date()][col].head(-1)

        if timestamp.date().strftime('%Y-%m-%d') < forecast_start:

            pred_stl = np.empty((horizons, 1))
            pred_stl[:] = np.nan
            pred_arima = np.empty((horizons, 1))
            pred_arima[:] = np.nan

            df_stl_lags = create_lag_features(series=pd.Series(history), lag_range=range(2, lags + 1), prefix="stl").tail(horizons).reset_index(drop=True)
            df_arima_lags = create_lag_features(series=pd.Series(history), lag_range=range(2, lags + 1), prefix="arima").tail(horizons).reset_index(drop=True)

        else:

            pred_stl = fit_stl(history, steps=horizons).values
            pred_arima = fit_arima(history, steps=horizons)

            df_stl_lags = create_lag_features(series=pd.Series(np.append(history, pred_stl)), lag_range=range(2, lags + 2), prefix="stl").tail(horizons).reset_index(drop=True)
            df_arima_lags = create_lag_features(series=pd.Series(np.append(history, pred_arima)), lag_range=range(2, lags + 2), prefix="arima").tail(horizons).reset_index(drop=True)

        df_with_horizon["pred_stl"] = pred_stl
        df_with_horizon["pred_arima"] = pred_arima

        dfs_with_horizons.append(pd.concat([df_with_horizon, df_stl_lags, df_arima_lags], axis=1))

    return pd.concat(dfs_with_horizons).sort_values(["date_target", "date_origin"])

In [11]:
df_with_lags_and_horizons = forecast_horizons(df=df, col="pm25", timestamps=timestamps, forecast_start=FORECAST_START, horizons=FORECAST_DAYS, lags=14);

t circle: 1.000)
Fit ARIMA(2,1,1)x(0,0,0,0) [intercept=True]; AIC=24098.214, BIC=24240.334, Time=9.291 seconds
Near non-invertible roots for order (2, 1, 1)(0, 0, 0, 0); setting score to inf (at least one inverse root too close to the border of the unit circle: 1.000)
Fit ARIMA(3,1,2)x(0,0,0,0) [intercept=True]; AIC=24093.939, BIC=24247.428, Time=18.747 seconds
Near non-invertible roots for order (3, 1, 2)(0, 0, 0, 0); setting score to inf (at least one inverse root too close to the border of the unit circle: 1.000)
Fit ARIMA(2,1,3)x(0,0,0,0) [intercept=True]; AIC=24097.182, BIC=24250.671, Time=13.063 seconds
Near non-invertible roots for order (2, 1, 3)(0, 0, 0, 0); setting score to inf (at least one inverse root too close to the border of the unit circle: 1.000)
Fit ARIMA(1,1,1)x(0,0,0,0) [intercept=True]; AIC=24167.893, BIC=24304.327, Time=13.443 seconds
Near non-invertible roots for order (1, 1, 1)(0, 0, 0, 0); setting score to inf (at least one inverse root too close to the border

In [12]:
df_with_lags_and_horizons.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10178 entries, 0 to 6
Data columns (total 34 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   horizon       10178 non-null  int64  
 1   date_origin   10178 non-null  object 
 2   date_target   10178 non-null  object 
 3   target        10178 non-null  float64
 4   pred_stl      2506 non-null   float64
 5   pred_arima    2506 non-null   float64
 6   stl_lag_1     10178 non-null  float64
 7   stl_lag_2     10178 non-null  float64
 8   stl_lag_3     10178 non-null  float64
 9   stl_lag_4     10178 non-null  float64
 10  stl_lag_5     10178 non-null  float64
 11  stl_lag_6     10178 non-null  float64
 12  stl_lag_7     10178 non-null  float64
 13  stl_lag_8     10178 non-null  float64
 14  stl_lag_9     10178 non-null  float64
 15  stl_lag_10    10178 non-null  float64
 16  stl_lag_11    10178 non-null  float64
 17  stl_lag_12    10178 non-null  float64
 18  stl_lag_13    10178 non-null  

In [13]:
df_with_lags_and_horizons.head()

Unnamed: 0,horizon,date_origin,date_target,target,pred_stl,pred_arima,stl_lag_1,stl_lag_2,stl_lag_3,stl_lag_4,...,arima_lag_6,arima_lag_7,arima_lag_8,arima_lag_9,arima_lag_10,arima_lag_11,arima_lag_12,arima_lag_13,stl_lag_14,arima_lag_14
0,1,2011-12-31,2012-01-01,72.25,,,54.625,30.416667,20.854167,84.375,...,52.125,107.583333,26.416667,14.5,14.458333,156.916667,49.416667,83.708333,,
1,2,2011-12-31,2012-01-02,63.416667,,,43.333333,54.625,30.416667,20.854167,...,62.791667,52.125,107.583333,26.416667,14.5,14.458333,156.916667,49.416667,,
0,1,2012-01-01,2012-01-02,63.416667,,,43.333333,54.625,30.416667,20.854167,...,62.791667,52.125,107.583333,26.416667,14.5,14.458333,156.916667,49.416667,,
2,3,2011-12-31,2012-01-03,14.791667,,,85.625,43.333333,54.625,30.416667,...,84.375,62.791667,52.125,107.583333,26.416667,14.5,14.458333,156.916667,,
1,2,2012-01-01,2012-01-03,14.791667,,,85.625,43.333333,54.625,30.416667,...,84.375,62.791667,52.125,107.583333,26.416667,14.5,14.458333,156.916667,,


In [14]:
df_with_lags_and_horizons.tail()

Unnamed: 0,horizon,date_origin,date_target,target,pred_stl,pred_arima,stl_lag_1,stl_lag_2,stl_lag_3,stl_lag_4,...,arima_lag_6,arima_lag_7,arima_lag_8,arima_lag_9,arima_lag_10,arima_lag_11,arima_lag_12,arima_lag_13,stl_lag_14,arima_lag_14
5,6,2015-12-22,2015-12-28,112.416667,69.781876,69.926394,82.275822,60.966481,69.81547,34.673479,...,241.125,238.208333,169.0625,74.291667,59.958333,7.666667,6.083333,144.375,188.125,188.125
4,5,2015-12-23,2015-12-28,112.416667,69.917847,79.246681,82.411146,61.101478,69.950149,254.541667,...,241.125,238.208333,169.0625,74.291667,59.958333,7.666667,6.083333,144.375,188.125,188.125
6,7,2015-12-22,2015-12-29,331.875,40.833797,73.957458,91.076831,82.275822,60.966481,69.81547,...,336.958333,241.125,238.208333,169.0625,74.291667,59.958333,7.666667,6.083333,144.375,144.375
5,6,2015-12-23,2015-12-29,331.875,40.970091,83.133738,91.212478,82.411146,61.101478,69.950149,...,336.958333,241.125,238.208333,169.0625,74.291667,59.958333,7.666667,6.083333,144.375,144.375
6,7,2015-12-23,2015-12-30,101.75,20.176158,85.955393,69.917847,91.212478,82.411146,61.101478,...,254.541667,336.958333,241.125,238.208333,169.0625,74.291667,59.958333,7.666667,6.083333,6.083333


In [15]:
df_with_lags_and_horizons.to_csv("data/df_with_lags_and_horizons.csv")