# Feature engineering

### As stated before, to account for imbalanced target, it is advised to perform good-quality feature engineering as we may not use sampling in Time Series.

In [None]:
import os
import pandas as pd
import numpy as np

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from sklearn.preprocessing import StandardScaler

In [10]:
def seed_everything(seed=42):
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

def perform_VIF(data_frame: pd.DataFrame, predictors: list, ascending: str = False):
    X = data_frame.dropna()[predictors].drop(columns=[TARGET])
    X = add_constant(X)

    vif_df = pd.DataFrame()
    vif_df["feature"] = X.columns
    vif_df["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    vif_df = vif_df[vif_df["feature"] != "const"]
    
    display(vif_df.sort_values(by="VIF", ascending=ascending))

In [11]:
data_folder_path = "data\DataHourlyChina"
TARGET = "PM2.5"
RANDOM_SEED = 42
seed_everything(seed=RANDOM_SEED)

df = pd.read_csv(os.path.join(data_folder_path, "POST_EDA_PRSA_Data_Aotizhongxin_20130301-20170228.csv"), index_col="datetime")
df.head()

Unnamed: 0_level_0,year,month,day,hour,PM2.5,PM10,SO2,NO2,CO,O3,...,wd_NW,wd_S,wd_SE,wd_SSE,wd_SSW,wd_SW,wd_W,wd_WNW,wd_WSW,dayofweek
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-03-01 00:00:00,2013,3,1,0,4.0,4.0,4.0,7.0,300.0,77.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
2013-03-01 01:00:00,2013,3,1,1,8.0,8.0,4.0,7.0,300.0,77.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
2013-03-01 02:00:00,2013,3,1,2,7.0,7.0,5.0,10.0,300.0,73.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
2013-03-01 03:00:00,2013,3,1,3,6.0,6.0,11.0,11.0,300.0,72.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
2013-03-01 04:00:00,2013,3,1,4,3.0,3.0,12.0,12.0,300.0,72.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4


#### Remove the circular unstable variables. We'll have to express them to the model differently

In [12]:
df = df.drop(columns=['day', 'hour', 'month'])
df.head()

Unnamed: 0_level_0,year,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,RAIN,WSPM,...,wd_NW,wd_S,wd_SE,wd_SSE,wd_SSW,wd_SW,wd_W,wd_WNW,wd_WSW,dayofweek
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-03-01 00:00:00,2013,4.0,4.0,4.0,7.0,300.0,77.0,-0.7,0.0,4.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
2013-03-01 01:00:00,2013,8.0,8.0,4.0,7.0,300.0,77.0,-1.1,0.0,4.7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
2013-03-01 02:00:00,2013,7.0,7.0,5.0,10.0,300.0,73.0,-1.1,0.0,5.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
2013-03-01 03:00:00,2013,6.0,6.0,11.0,11.0,300.0,72.0,-1.4,0.0,3.1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
2013-03-01 04:00:00,2013,3.0,3.0,12.0,12.0,300.0,72.0,-2.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4


### Feature selection

In [13]:
perform_VIF(df, df.columns.values)

Unnamed: 0,feature,VIF
4,NO2,3.401287
5,CO,2.726402
6,O3,2.598881
13,wd_NE,2.586915
2,PM10,2.366728
10,wd_ENE,2.243298
21,wd_SW,2.225798
7,TEMP,2.12652
14,wd_NNE,1.856159
20,wd_SSW,1.805009


In [14]:
df.index = pd.to_datetime(df.index)

df['is_weekend'] = df['dayofweek'].apply(lambda x: 1 if x >= 5 else 0)

df['sin_hour'] = np.sin(2 * np.pi * df.index.hour / 24)
df['cos_hour'] = np.cos(2 * np.pi * df.index.hour / 24)
df['month_sin'] = np.sin(2 * np.pi * df.index.month / 12)

df['dayofweek_sin'] = np.sin(2 * np.pi * df['dayofweek'] / 7)
df['dayofweek_cos'] = np.cos(2 * np.pi * df['dayofweek'] / 7)
df = df.drop(columns=['dayofweek']) # also cyclical; no longer needed

df["TEMP_x_CO"] = df["TEMP"] * df["CO"]
df["NO2_x_RAIN"] = df["NO2"] * df["RAIN"]
df["WSPM_X_SO2"] = df["WSPM"] * df['SO2']

df["PM10_diff1"] = df["PM10"].diff(1)
df["TEMP_diff1"] = df["TEMP"].diff(1)
df = df.dropna()

In [15]:
perform_VIF(df, df.columns.values)

Unnamed: 0,feature,VIF
3,SO2,4.927944
33,WSPM_X_SO2,4.340234
32,NO2_x_RAIN,4.076375
8,RAIN,4.076339
7,TEMP,3.82656
4,NO2,3.593632
6,O3,2.997808
5,CO,2.962115
25,is_weekend,2.858783
29,dayofweek_sin,2.767486


## Standardization

In [16]:
scaler = StandardScaler()
scaled = scaler.fit_transform(df.drop(columns=[TARGET]))
scaled_df = pd.DataFrame(scaled, columns=df.drop(columns=[TARGET]).columns, index=df.index)
df = pd.concat([scaled_df, df[TARGET]], axis=1)

df.head()

Unnamed: 0_level_0,year,PM10,SO2,NO2,CO,O3,TEMP,RAIN,WSPM,wd_ENE,...,cos_hour,month_sin,dayofweek_sin,dayofweek_cos,TEMP_x_CO,NO2_x_RAIN,WSPM_X_SO2,PM10_diff1,TEMP_diff1,PM2.5
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-03-01 01:00:00,-1.412466,-1.078452,-0.593658,-1.424646,-0.80909,0.37449,-1.288641,-0.073894,2.487723,-0.356598,...,1.366067,1.423946,-0.612319,-1.275284,-0.846123,-0.075799,-0.202244,0.110008,-0.303442,8.0
2013-03-01 02:00:00,-1.412466,-1.08907,-0.549144,-1.342647,-0.80909,0.304318,-1.288641,-0.073894,3.236048,-0.356598,...,1.224784,1.423946,-0.612319,-1.275284,-0.846123,-0.075799,0.037109,-0.027529,-0.000201,7.0
2013-03-01 03:00:00,-1.412466,-1.099689,-0.282056,-1.315314,-0.80909,0.286775,-1.314966,-0.073894,1.157367,-0.356598,...,1.000035,1.423946,-0.612319,-1.275284,-0.852138,-0.075799,0.19581,-0.027529,-0.227632,6.0
2013-03-01 04:00:00,-1.412466,-1.131543,-0.237542,-1.287982,-0.80909,0.286775,-1.367616,-0.073894,0.242748,-0.356598,...,0.707136,1.423946,-0.612319,-1.275284,-0.864169,-0.075799,-0.066958,-0.082543,-0.455063,3.0
2013-03-01 05:00:00,-1.412466,-1.110307,0.029546,-1.123985,-0.725066,0.181516,-1.385166,-0.073894,1.656251,-0.356598,...,0.366048,1.423946,-0.612319,-1.275284,-0.882884,-0.075799,1.041351,0.054994,-0.151822,5.0


In [17]:
df.to_csv(os.path.join(data_folder_path, "POST_EDA_POST_FEAT_ENG_STANDARDIZED.csv"), index=True, index_label='datetime')