# AutoML and Hyperparameter Training

Here I use Automatic Machine Learning (AutoML) tools to quickly and efficiently assess a range of potential models.

## Import Packages  

In [1]:
# plotting
import matplotlib.pyplot as plt

# general
import pandas as pd
import numpy as np
import calendar

# ml
from pycaret.regression import *

## Import data

This is my first time combining the PCs with the PWT data I'm trying to emulate, so the cells below do a bit of work to get the two datasets into the same format.

In [None]:
# Candidate coordinate boxes and time windows (taken from Prepare_AI_Ready_Data.py).
# Two coordinate boxes and one time window are defined for now; append more as needed.
# NOTE(review): each box is presumably [lon_min, lon_max, lat_min, lat_max] -- confirm
# against Prepare_AI_Ready_Data.py.
coords = [
    [180, 240, 45, 65],
    [130, 250, 20, 75],
]
times = [
    ['1970-01-01', '2023-12-31'],  # [start, end]; must be a valid range
]

# indices of the entries to load from the lists above
coords_num = 1
times_num = 0

# the selected coordinate box and time window
c = coords[coords_num]
t = times[times_num]

# Load the principal components for that selection. The file name encodes the four
# coordinate values and the start/end years (first four characters of each date).
pc_csv_path = f'../data/dimensionality_reduction/principal_components_{c[0]}-{c[1]}_{c[2]}-{c[3]}_{t[0][:4]}-{t[1][:4]}.csv'
pc_df_raw = pd.read_csv(pc_csv_path)

In [None]:
# Load the month-by-month target table.
target_csv_path = '../data/target/era5_monthbymonth_pwt.csv'
target_df_raw = pd.read_csv(target_csv_path)

# Keep only the columns whose name does not contain 'ann'
# (presumably the annual aggregates -- confirm against the CSV header).
non_ann_cols = [name for name in target_df_raw.columns if 'ann' not in name]
target_df_raw = target_df_raw[non_ann_cols]

In [4]:
# Reshape the target data to match the training data: one row per time step in
# pc_df_raw, with the target value for that row's year and month in month-agnostic
# columns (e.g. the value from '<name>_mar' goes into '<name>' for a March row).

# identify all column names that contain 'jan'; they serve as the column template
jan_cols = target_df_raw.columns[target_df_raw.columns.str.contains('jan')]

# remove '_jan' from column names to get the month-agnostic names
new_cols = [col.replace('_jan','') for col in jan_cols]

# add new_cols as empty columns to a copy of the pc_df_raw dataframe
master_df = pc_df_raw.copy()
for col in new_cols:
    master_df[col] = np.nan

# convert time (strings in 'YYYY-MM-DD' form) to datetime, then take year and month per row
master_df['time'] = pd.to_datetime(master_df['time'])
row_years = master_df['time'].dt.year
row_months = master_df['time'].dt.month

# One lookup row per year. Keeping the first row for a repeated 'Year' matches taking
# `.values[0]` of a per-year selection.
target_by_year = target_df_raw.drop_duplicates(subset='Year').set_index('Year')

# Fill the target columns one calendar month at a time instead of row by row.
# The loop deliberately avoids the name `c`: that name holds the coordinate box used
# to build the output file name at the bottom of this cell, and rebinding it here
# would corrupt that file name.
for month_num in range(1, 13):
    in_month = row_months == month_num
    if not in_month.any():
        continue

    # 3-character month string from month integer (i.e. 1 -> 'jan')
    month_str = calendar.month_abbr[month_num].lower()

    # the target columns holding this month's values, in the same order as new_cols
    month_cols = [col.replace('_jan', '_' + month_str) for col in jan_cols]

    # Look up each row's year. A year missing from the target table raises KeyError
    # here, so every year present in the PC data (including ones filtered out below)
    # must exist in the target table.
    month_values = target_by_year.loc[row_years[in_month].to_numpy(), month_cols]
    master_df.loc[in_month, new_cols] = month_values.to_numpy()

# add a month column to the df
master_df['month'] = row_months

# remove rows with year > 2022
master_df = master_df[master_df['time'] < '2023-01-01']

# print head to confirm
print(master_df.head())

# save dataframe in this format (for use in other scripts)
master_df.to_csv(f'../data/dimensionality_reduction/principal_components_{c[0]}-{c[1]}_{c[2]}-{c[3]}_{t[0][:4]}-{t[1][:4]}_target.csv', index=False)


            PC1           PC2           PC3           PC4          PC5  \
0  26011.582895  16179.864478  20148.257224  -1243.128472 -3203.919113   
1 -38540.276288 -26933.230459   7662.333731   -473.188801  -574.772963   
2 -27748.077675 -27043.209357   6201.702439 -12455.893354 -5782.296933   
3    914.655777  -1933.663131 -17384.504214  -8083.327042 -8493.886231   
4   4350.635969   8551.012038   3549.986502  -8621.343188  6588.165550   

           PC6          PC7          PC8          PC9         PC10  ...  \
0  2082.483822  1533.300644 -2073.035967  -403.748801 -1831.821897  ...   
1  -283.820441 -3347.927477  -806.090809  1080.197321 -1307.212841  ...   
2   890.853223   769.349049 -2303.218959  4509.428779  -212.349491  ...   
3  1760.217848   299.825646 -2445.984972  2386.216683  -413.029896  ...   
4  1714.238166 -3375.579228   267.801280  1445.144680   843.720135  ...   

   pwt_800hpa  pwt_825hpa  pwt_850hpa  pwt_875hpa  pwt_900hpa  pwt_925hpa  \
0  264.275124  265.928571  

## Apply autoML Frameworks

Here I apply PyCaret to automate machine learning model selection along with hyperparameter tuning. I used Copilot to set this up efficiently. Does this mean we can call this auto-AutoML?

In [5]:
# Select the modelling columns: PC1-PC10 and month as features, pwt_500hpa as target.
feature_cols = [f'PC{i}' for i in range(1, 11)] + ['month']
target_col = 'pwt_500hpa'
automl_data = master_df[feature_cols + [target_col]]

# report the number of missing values per column
print(automl_data.isnull().sum())

# report the dtype of each column
print(automl_data.dtypes)

# Min-max scale every column (features and target) to the range [0, 1].
# NOTE(review): the min and max are taken over the full dataset, i.e. before
# PyCaret's train/test split in the next cell.
col_min = automl_data.min()
col_span = automl_data.max() - col_min
automl_data = (automl_data - col_min) / col_span

# check the data
automl_data.head()

PC1           0
PC2           0
PC3           0
PC4           0
PC5           0
PC6           0
PC7           0
PC8           0
PC9           0
PC10          0
month         0
pwt_500hpa    0
dtype: int64
PC1           float64
PC2           float64
PC3           float64
PC4           float64
PC5           float64
PC6           float64
PC7           float64
PC8           float64
PC9           float64
PC10          float64
month           int32
pwt_500hpa    float64
dtype: object


Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,month,pwt_500hpa
0,0.600002,0.637487,0.638194,0.425833,0.337849,0.472255,0.540526,0.33168,0.424085,0.308266,0.0,0.165404
1,0.281566,0.220377,0.505401,0.438916,0.411108,0.392262,0.261215,0.424446,0.539133,0.35989,0.090909,0.335815
2,0.334804,0.219313,0.489866,0.235291,0.266006,0.431972,0.496812,0.314826,0.804997,0.467631,0.181818,0.231047
3,0.476198,0.462243,0.239016,0.309595,0.19045,0.46136,0.469945,0.304372,0.640387,0.447883,0.272727,0.223241
4,0.493148,0.56368,0.461664,0.300453,0.610695,0.459806,0.259632,0.503077,0.567427,0.571554,0.363636,0.343418


In [6]:
# Initialize PyCaret setup
# Configure the PyCaret regression experiment.
setup_options = {
    'data': automl_data,
    'target': 'pwt_500hpa',
    'session_id': 123,        # fixed seed so repeated runs are comparable
    'normalize': False,       # the data was already min-max scaled in the previous cell
    'transformation': True,
    'fold': 5,                # number of cross-validation folds
    'verbose': True,
}
setup(**setup_options)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,pwt_500hpa
2,Target type,Regression
3,Original data shape,"(636, 12)"
4,Transformed data shape,"(636, 12)"
5,Transformed train set shape,"(445, 12)"
6,Transformed test set shape,"(191, 12)"
7,Numeric features,11
8,Preprocess,True
9,Imputation type,simple


<pycaret.regression.oop.RegressionExperiment at 0x16a2aa080>

In [7]:
# Train and cross-validate the available regressors, skipping the excluded model IDs,
# and keep the top-ranked model.
excluded_models = ['ransac']
best = compare_models(exclude=excluded_models)


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,0.0653,0.0071,0.0839,0.8513,0.0614,0.242,0.032
rf,Random Forest Regressor,0.0665,0.0075,0.0864,0.8413,0.0634,0.246,0.048
gbr,Gradient Boosting Regressor,0.0665,0.0077,0.0878,0.8362,0.0644,0.25,0.046
catboost,CatBoost Regressor,0.0689,0.0081,0.0896,0.8295,0.0654,0.259,0.442
lightgbm,Light Gradient Boosting Machine,0.0681,0.0082,0.0903,0.8266,0.0666,0.2594,0.24
xgboost,Extreme Gradient Boosting,0.0718,0.0085,0.0922,0.8198,0.0676,0.2624,0.036
ada,AdaBoost Regressor,0.0739,0.0091,0.0953,0.8078,0.0691,0.2792,0.022
dt,Decision Tree Regressor,0.0897,0.0129,0.1135,0.7283,0.0841,0.3149,0.014
knn,K Neighbors Regressor,0.1087,0.0196,0.1393,0.5868,0.0998,0.4281,0.016
omp,Orthogonal Matching Pursuit,0.1768,0.0432,0.2077,0.0926,0.1419,0.5999,0.012


In [8]:
# open PyCaret's interactive evaluation widget for the best model
evaluate_model(best)

# print hyperparameters of best model
print(best.get_params())


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [11]:
# evaluate the 2nd best model
best2 = compare_models(exclude = ['ransac', best])
evaluate_model(best2)


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,0.0653,0.0071,0.0839,0.8513,0.0614,0.242,0.036
rf,Random Forest Regressor,0.0665,0.0075,0.0864,0.8413,0.0634,0.246,0.054
gbr,Gradient Boosting Regressor,0.0665,0.0077,0.0878,0.8362,0.0644,0.25,0.036
catboost,CatBoost Regressor,0.0689,0.0081,0.0896,0.8295,0.0654,0.259,0.564
lightgbm,Light Gradient Boosting Machine,0.0681,0.0082,0.0903,0.8266,0.0666,0.2594,0.24
xgboost,Extreme Gradient Boosting,0.0718,0.0085,0.0922,0.8198,0.0676,0.2624,0.04
ada,AdaBoost Regressor,0.0739,0.0091,0.0953,0.8078,0.0691,0.2792,0.022
dt,Decision Tree Regressor,0.0897,0.0129,0.1135,0.7283,0.0841,0.3149,0.012
knn,K Neighbors Regressor,0.1087,0.0196,0.1393,0.5868,0.0998,0.4281,0.016
omp,Orthogonal Matching Pursuit,0.1768,0.0432,0.2077,0.0926,0.1419,0.5999,0.012


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…