In [98]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
"""
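Preprocessing techniques:
Missing-value handling:
   deletion approach for columns with only a few missing values
   median imputation for the sky level column (skyl1)

Standardisation of numerical features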
"""

# Load the NZWN observations; entries equal to 'M' are parsed as missing values (NaN).
apt_data = pd.read_csv("NZWN.csv", na_values=['M'], low_memory=False)
# convert valid to datetime
apt_data['valid'] = pd.to_datetime(apt_data['valid'])
# Missing-value count per column. This is the last expression in the cell, so it
# is what the cell displays (a bare expression earlier in the cell is discarded).
apt_data.isnull().sum()


station                   0
valid                     0
tmpf                     26
dwpf                     26
relh                     26
drct                      1
sknt                      1
p01i                      0
alti                      1
mslp                 104462
vsby                     69
gust                  91454
skyc1                   104
skyc2                 74449
skyc3                 92173
skyc4                104448
skyl1                 36734
skyl2                 74451
skyl3                 92180
skyl4                104462
wxcodes               91722
ice_accretion_1hr    104462
ice_accretion_3hr    104462
ice_accretion_6hr    104462
peak_wind_gust       104462
peak_wind_drct       104462
peak_wind_time       104462
feel                     26
metar                     0
snowdepth            104462
dtype: int64

In [99]:
## Split data into training, validation and test sets

# Chronological split on the observation timestamp:
#   Training:   observations up to and including 2023 (data spanning 2020 to 2023)
#   Validation: observations in 2024
#   Testing:    observations in 2025
observation_year = apt_data['valid'].dt.year
train = apt_data[observation_year <= 2023]
val = apt_data[observation_year == 2024]
test = apt_data[observation_year == 2025]

# Columns to delete. A single list is shared by all three splits so they can
# never end up with different columns.
#   station: all entries refer to NZWN
#   metar: raw report; its data is already converted into the other columns
#   mslp, ice_accretion_*, peak_wind_*, snowdepth: entirely missing
#   skyc2-skyc4, skyl2-skyl4: mostly missing; only the first sky layer is kept
#   gust, wxcodes: mostly missing
COLUMNS_TO_DROP = [
    'station',
    'skyc2', 'skyc3', 'skyc4',
    'skyl2', 'skyl3', 'skyl4',
    'wxcodes',
    'ice_accretion_1hr', 'ice_accretion_3hr', 'ice_accretion_6hr',
    'gust',
    'peak_wind_drct', 'peak_wind_gust', 'peak_wind_time',
    'snowdepth',
    'mslp',
    'metar',
]

# Deletion approach for columns with only a small number of missing values.
# drct, sknt and alti each have one missing value in the raw data; they are
# included so that no NaN feature or NaN target (sknt) survives in any split.
# Sky level (skyl1) is deliberately NOT listed: it has many missing values and
# is imputed later by the preprocessing pipeline instead.
REQUIRED_COLUMNS = ['tmpf', 'dwpf', 'relh', 'vsby', 'feel', 'skyc1', 'drct', 'sknt', 'alti']


def _drop_unused_and_incomplete(split):
    """Return a copy of `split` without the unused columns and without rows
    that have a missing value in any of REQUIRED_COLUMNS."""
    return split.drop(columns=COLUMNS_TO_DROP).dropna(subset=REQUIRED_COLUMNS)


train = _drop_unused_and_incomplete(train)
val = _drop_unused_and_incomplete(val)
test = _drop_unused_and_incomplete(test)


In [100]:
# Preview the first five rows of the cleaned training split.
train.head(n=5)

Unnamed: 0,valid,tmpf,dwpf,relh,drct,sknt,p01i,alti,vsby,skyc1,skyl1,feel
0,2020-01-01 00:00:00,66.2,57.2,72.74,10.0,23.0,0.0,29.97,6.21,BKN,1800.0,66.2
1,2020-01-01 00:30:00,68.0,57.2,68.35,360.0,28.0,0.0,29.94,6.21,BKN,1900.0,68.0
2,2020-01-01 01:00:00,68.0,57.2,68.35,10.0,26.0,0.0,29.94,6.21,SCT,2100.0,68.0
3,2020-01-01 01:30:00,68.0,59.0,72.92,10.0,24.0,0.0,29.94,6.21,FEW,2000.0,68.0
4,2020-01-01 02:00:00,68.0,59.0,72.92,10.0,26.0,0.0,29.94,6.21,SCT,1900.0,68.0


In [101]:
def train_target_selection(train, val, test, target_name):
    """Separate the target column from the features in each data split.

    Parameters
    ----------
    train, val, test : pandas.DataFrame
        The three splits; each must contain a column called `target_name`.
    target_name : str
        Name of the column to predict.

    Returns
    -------
    tuple
        `(x_train, y_train, x_val, y_val, x_test, y_test)`, where every `x_*`
        is the split without the target column and every `y_*` is the target
        column of that split.
    """
    features_and_targets = []
    for split in (train, val, test):
        features = split.drop(columns=[target_name])
        target = split[target_name]
        features_and_targets.extend((features, target))
    return tuple(features_and_targets)

# Wind speed target (sknt)
(
    x_train_wspeed, y_train_wspeed,
    x_val_wspeed, y_val_wspeed,
    x_test_wspeed, y_test_wspeed,
) = train_target_selection(train, val, test, target_name="sknt")

# Visibility target (vsby)
(
    x_train_vsby, y_train_vsby,
    x_val_vsby, y_val_vsby,
    x_test_vsby, y_test_vsby,
) = train_target_selection(train, val, test, target_name="vsby")

# Temperature target (tmpf)
(
    x_train_temp, y_train_temp,
    x_val_temp, y_val_temp,
    x_test_temp, y_test_temp,
) = train_target_selection(train, val, test, target_name="tmpf")

In [102]:
# Remaining missing values per column in the training split.
train.isna().sum()

valid        0
tmpf         0
dwpf         0
relh         0
drct         0
sknt         0
p01i         0
alti         0
vsby         0
skyc1        0
skyl1    24446
feel         0
dtype: int64

In [103]:
## Standardise data: fit only on training data
## Utilise column transformer for both categorical and numerical columns

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

def create_preprocessor(target_name):
    """Build an unfitted ColumnTransformer for the features of one target.

    Parameters
    ----------
    target_name : str
        Column being predicted. It is left out of the directly scaled numeric
        columns, because it is not part of the feature frame.

    Returns
    -------
    ColumnTransformer
        Three transformers, in this order:
        * 'numerical_missing'  - median imputation followed by standard
          scaling for 'skyl1';
        * 'numerical_complete' - standard scaling for the remaining numeric
          columns (minus the target);
        * 'ordinal_skyc'       - ordinal encoding of 'skyc1' in the listed
          sky-cover order; categories outside the list are encoded as -1.
        Columns not named in any transformer are left to ColumnTransformer's
        default `remainder` behaviour.
    """
    # Numeric columns that are scaled as-is, without the prediction target.
    scaled_directly = [
        name
        for name in ('tmpf', 'dwpf', 'relh', 'drct', 'sknt', 'p01i', 'alti', 'vsby', 'feel')
        if name != target_name
    ]

    # skyl1 is the column handled by imputation: fill with the median, then scale.
    impute_then_scale = Pipeline(steps=[
        ('impute', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])

    # The order of this list defines the integer codes (CLR -> 0 ... VV -> 5).
    sky_cover_encoder = OrdinalEncoder(
        categories=[['CLR', 'FEW', 'SCT', 'BKN', 'OVC', 'VV']],
        handle_unknown='use_encoded_value',
        unknown_value=-1,
    )

    return ColumnTransformer(transformers=[
        ('numerical_missing', impute_then_scale, ['skyl1']),
        ('numerical_complete', StandardScaler(), scaled_directly),
        ('ordinal_skyc', sky_cover_encoder, ['skyc1']),
    ])


In [104]:
## Standardise data using preprocessor for targets
## Each preprocessor is fitted on the training split ONLY (fit_transform) and
## then applied unchanged to the validation and test splits (transform).
## Calling fit_transform on val/test would re-estimate the imputation medians
## and scaling statistics from those splits, so the three splits would no
## longer share one scale and held-out information would leak into preprocessing.

## Wind target
wind_preprocesser = create_preprocessor('sknt')
x_train_processed_wind = wind_preprocesser.fit_transform(x_train_wspeed)
x_val_processed_wind = wind_preprocesser.transform(x_val_wspeed)
x_test_processed_wind = wind_preprocesser.transform(x_test_wspeed)


## Temperature target
temperature_processor = create_preprocessor('tmpf')
x_train_processed_temp = temperature_processor.fit_transform(x_train_temp)
x_val_processed_temp = temperature_processor.transform(x_val_temp)
x_test_processed_temp = temperature_processor.transform(x_test_temp)

## Visibility target
visibility_processor = create_preprocessor('vsby')
x_train_processed_vsby = visibility_processor.fit_transform(x_train_vsby)
x_val_processed_vsby = visibility_processor.transform(x_val_vsby)
x_test_processed_vsby = visibility_processor.transform(x_test_vsby)