In [None]:
import numpy as np
import pandas as pd



In [31]:
# Pollutant columns that the feature-engineering cells below iterate over.
pollutants = ['PM2.5', 'PM10', 'SO2', 'NO2', 'CO', 'O3']

# Working frame: read 'cleaned_interp.csv' and parse the 'datetime' column into timestamps.
# Disabled alternative input: df = pd.read_csv('cleaned.csv', parse_dates=['datetime'])
df_interp = pd.read_csv(
    'cleaned_interp.csv',
    parse_dates=['datetime'],
)

In [None]:
#ordinal encoding, decision based on the stats from this link: 
#https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_categorical.html#sphx-glr-auto-examples-ensemble-plot-gradient-boosting-categorical-py

from sklearn.preprocessing import OrdinalEncoder

# Encode the station name as an integer id. The encoder is kept in `enc` so the
# mapping stays available after this cell.
enc = OrdinalEncoder(dtype=int)

# The encoder is fed a single-column 2-D array, hence the reshape to (n_rows, 1).
station_column = df_interp['station'].values.reshape(-1, 1)
station_codes = enc.fit_transform(station_column)

# Shift the encoded values up by one before storing them.
df_interp['station_id'] = station_codes + 1

# The raw 'station' column is now redundant; 'wd' is dropped without being encoded.
df_interp.drop(columns=['station', 'wd'], inplace=True)

In [42]:
# Inspect the current column set of the working frame.
# NOTE(review): the saved output of this cell already lists the lag / rolling / calendar /
# target columns that are only created by cells further down, so it appears to come from an
# out-of-order run. Confirm with Restart Kernel -> Run All.
df_interp.columns

Index(['datetime', 'PM2.5', 'PM10', 'SO2', 'NO2', 'CO', 'O3', 'TEMP', 'PRES',
       'DEWP', 'RAIN', 'WSPM', 'station_id', 'PM2.5_lag1', 'PM2.5_lag2',
       'PM2.5_lag3', 'PM10_lag1', 'PM10_lag2', 'PM10_lag3', 'SO2_lag1',
       'SO2_lag2', 'SO2_lag3', 'NO2_lag1', 'NO2_lag2', 'NO2_lag3', 'CO_lag1',
       'CO_lag2', 'CO_lag3', 'O3_lag1', 'O3_lag2', 'O3_lag3',
       'PM2.5_rolling_6h', 'PM10_rolling_6h', 'SO2_rolling_6h',
       'NO2_rolling_6h', 'CO_rolling_6h', 'O3_rolling_6h', 'hour', 'dayofweek',
       'month', 'weekend', 'PM2.5_target', 'PM10_target', 'SO2_target',
       'NO2_target', 'CO_target', 'O3_target', 'day', 'year'],
      dtype='object')

In [33]:
# Lagged copies of every pollutant (1, 2 and 3 rows back).
# Rows are ordered by station and then by time, and the shift is applied per station
# group, so a lag value never comes from a different station.
# NOTE(review): the shift is row-based; it equals "N hours ago" only if each station has
# one row per hour with no missing timestamps - confirm against the cleaning step.
df_interp = df_interp.sort_values(['station_id', 'datetime'])

lagged_columns = {
    f'{pollutant}_lag{lag}': df_interp.groupby('station_id')[pollutant].shift(lag)
    for pollutant in pollutants
    for lag in range(1, 4)
}
df_interp = df_interp.assign(**lagged_columns)

In [34]:
# Rolling mean over a 6-row window for every pollutant, computed within each station
# (6 hours, assuming one row per hour).
for pollutant in pollutants:
    station_windows = df_interp.groupby('station_id')[pollutant].rolling(6)
    # The grouped rolling result is indexed by (station_id, original row label); remove the
    # station level so the assignment lines up with df_interp's own index.
    df_interp[f'{pollutant}_rolling_6h'] = station_windows.mean().reset_index(level=0, drop=True)

In [39]:
# Calendar features derived from the timestamp, for xgboost to consume as plain numbers.
stamp = df_interp['datetime'].dt

df_interp = df_interp.assign(
    hour=stamp.hour,
    dayofweek=stamp.day_of_week,
    day=stamp.day,
    year=stamp.year,
    month=stamp.month,
    # day_of_week is 0 for Monday, so values 5 and 6 are Saturday and Sunday.
    weekend=stamp.day_of_week.ge(5).astype(int),
)

In [36]:
# Prediction targets: for each pollutant, the value on the next row of the same station.
# The last row of every station has no successor and therefore gets NaN.
next_values = df_interp.groupby('station_id')[pollutants].shift(-1)

df_interp = df_interp.assign(
    **{f'{name}_target': next_values[name] for name in pollutants}
)

In [37]:
# The lag, rolling and target columns leave NaNs at the edges of each station's series.
# Now that they are all built, discard every row that still contains a NaN in any column.
df_interp = df_interp.dropna(axis=0, how='any')

In [44]:
def get_pollutant_X_y(pollutant, df):
    """Select the model inputs and the target column for one pollutant.

    Parameters
    ----------
    pollutant : str
        Column-name prefix of the pollutant, e.g. 'CO'.
    df : pandas.DataFrame
        Frame that contains the pollutant column, its '_lag1'..'_lag3',
        '_rolling_6h' and '_target' columns, plus the calendar and weather columns
        listed below.

    Returns
    -------
    tuple
        (X, y) where X is the 14-column feature frame and y is the
        '<pollutant>_target' Series.

    Raises
    ------
    KeyError
        If any of the required columns is missing from df.
    """
    base = f'{pollutant}'

    # Features specific to the requested pollutant: current value, three lags, rolling mean.
    own_history = [base] + [f'{base}_lag{step}' for step in (1, 2, 3)] + [f'{base}_rolling_6h']

    # Calendar and weather features shared by every pollutant model.
    calendar = ['hour', 'day', 'weekend', 'month', 'year']
    weather = ['TEMP', 'PRES', 'DEWP', 'WSPM']

    features = df[own_history + calendar + weather]
    target = df[f'{base}_target']
    return features, target

In [41]:
from sklearn.model_selection import TimeSeriesSplit

# Cross-validation splitter: 5 folds, 5000 rows per test fold, and 48 rows left out
# between the end of each training window and the start of its test window.
# NOTE(review): TimeSeriesSplit splits by row position. df_interp is sorted by station_id
# and then datetime, so if it is passed as-is the folds are contiguous row ranges that are
# not chronological across stations - confirm the ordering at the point of use.
cv_settings = {
    'n_splits': 5,
    'gap': 48,
    'test_size': 5000,
}
tscv = TimeSeriesSplit(**cv_settings)

In [43]:
# Spot-check the target construction: within a station, each row's CO_target should equal
# the CO value of the following row.
df_interp[['CO', 'CO_target']]

Unnamed: 0,CO,CO_target
5,400.0,500.0
6,500.0,500.0
7,500.0,500.0
8,500.0,400.0
9,400.0,400.0
...,...,...
420762,300.0,400.0
420763,400.0,500.0
420764,500.0,500.0
420765,500.0,400.0
