# Random Forest Model Development (with XGBoost imputation of missing wind gust)

The purpose of this notebook is to:

1. Read in the preprocessed (feature engineered) dataset
2. Declare a parameter grid
3. Perform the machine learning cross validation pipeline
4. Train and export a final model and test scores

## Import necessary packages

In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import ParameterGrid

from sklearn.metrics import mean_squared_error

## Read in the preprocessed (feature engineered) dataset

In [2]:
# Load the feature-engineered dataset produced by the preprocessing step.
preprocessed_csv = "../data/preprocessed.csv"
df = pd.read_csv(preprocessed_csv)

# Show the available columns and the (rows, columns) shape as the cell output.
(df.columns, df.shape)

(Index(['Name', 'Grade', 'Section', 'Class', 'School', 'Race', 'Date', 'Place',
        'Time (sec)', 'Speed Rating', 'SR', 'Gender', 'Race Section',
        'Latitude', 'Longitude', 'Temperature', 'Cloud Coverage', 'Wind Speed',
        'Precipitation', 'Dew Point', 'Humidity', 'Wind Chill', 'Wind Gust',
        'Heat Index', 'Visibility', 'Distance (mi)', 'Time-Place',
        'Name-School', 'Speed (mi/sec)', 'Average_Time',
        'Time_Difference_From_Avg', 'Average_Time_Class',
        'Time_Difference_From_Avg_Class', 'First_Place_Time',
        'Time_Difference_First_Place', 'First_Place_Time_Class',
        'Time_Difference_First_Place_Class', 'Average_Speed',
        'Speed_Difference_From_Avg', 'Temp_Humidity', 'WindSpeed_WindChill',
        'Temp_WindSpeed', 'Humidity_WindSpeed', 'HeatIndex_Humidity',
        'DewPoint_Temperature', 'DewPoint_Humidity', 'DewPoint_WindSpeed',
        'Year', 'Month', 'Day'],
       dtype='object'),
 (330108, 50))

In [3]:
# The model predicts "Speed Rating". "SR" is removed from the inputs together
# with the target (presumably a second form of the same rating -- confirm).
target_col = "Speed Rating"
y = df[target_col]
X = df.drop(columns=[target_col, "SR"])

In [4]:
# Columns removed from the feature frame before it reaches the preprocessor.
drop_ftrs = ['Date', 'Name', 'Name-School', 'Race Section', 'School', 'Race']

# Nominal columns: one-hot encoded.
cat_ftrs = ['Gender', 'Section']

# Ordered columns and, position by position, the category order for each.
ordinal_ftrs = ['Class', 'Grade', 'Year', 'Month', 'Day']
ordinal_cats = [
    ['D', 'C', 'B', 'CITY', 'A', 'AA'],  # Class
    list(range(7, 14)),                  # Grade: 7..13
    list(range(2014, 2020)),             # Year: 2014..2019
    list(range(1, 13)),                  # Month: 1..12
    list(range(1, 32)),                  # Day: 1..31
]

# Continuous columns: standardised. The order is kept fixed because it
# determines the column order of the transformed matrix.
num_ftrs = [
    'Place', 'Time (sec)', 'Temperature', 'Cloud Coverage', 'Wind Speed',
    'Precipitation', 'Dew Point', 'Humidity', 'Visibility', 'Time-Place',
    'Speed_Difference_From_Avg',
    'Distance (mi)', 'Speed (mi/sec)', 'Average_Time', 'Average_Speed',
    'Time_Difference_From_Avg', 'Average_Time_Class',
    'Time_Difference_From_Avg_Class', 'Temp_Humidity', 'WindSpeed_WindChill',
    'Temp_WindSpeed', 'Humidity_WindSpeed',
    'HeatIndex_Humidity', 'DewPoint_Temperature', 'DewPoint_Humidity',
    'DewPoint_WindSpeed', 'Latitude', 'Longitude',
    'Time_Difference_First_Place', 'Time_Difference_First_Place_Class',
    'First_Place_Time_Class', 'First_Place_Time',
    'Wind Chill', 'Wind Gust', 'Heat Index',
]

# One-hot branch: NaN is first replaced by the literal string 'missing' so it
# becomes its own category; categories unseen during fit are ignored.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Ordinal branch: NaN is replaced by the string 'NA' before encoding with the
# explicit category orders above.
# NOTE(review): 'NA' is not listed in ordinal_cats, so this branch presumably
# relies on the ordinal columns having no missing values -- confirm.
ordinal_steps = [
    ('imputer2', SimpleImputer(strategy='constant', fill_value='NA')),
    ('ordinal', OrdinalEncoder(categories=ordinal_cats)),
]
ordinal_transformer = Pipeline(steps=ordinal_steps)

# Numeric branch: standard scaling only (no imputation in this branch).
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Combine the three branches; output columns are prefixed 'num__', 'cat__'
# and 'ord__' according to the names given here.
prep = ColumnTransformer(transformers=[
    ('num', numeric_transformer, num_ftrs),
    ('cat', categorical_transformer, cat_ftrs),
    ('ord', ordinal_transformer, ordinal_ftrs),
])

## Declare a parameter grid

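*This cell also sets the number of random states, initialises the lists that collect the final models and test scores, and defines a helper that fills missing values of a column using a fitted model.*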

In [5]:
# Number of random seeds the whole cross-validation procedure is repeated for.
nr_states = 5

# Random-forest hyperparameter grid; every combination is evaluated.
param_grid = {
    "max_depth": list(range(1, 16, 2)),          # 1, 3, ..., 15
    "min_samples_split": list(range(2, 11, 2)),  # 2, 4, ..., 10
    "max_features": [1, 3, 6, 9],
}

# Distinct years present in the data, in ascending order (drives the splits).
year_values = df['Year'].unique()
unique_years = sorted(year_values.tolist())

# Filled by the cross-validation loop: one entry per random state.
final_models, test_scores = [], []

def fill_nan_col(data, model, column='num__Wind Gust'):
    """Fill the missing entries of ``column`` with predictions from ``model``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``column`` plus every feature ``model`` was fitted on.
        It is modified in place.
    model : object
        Fitted estimator exposing ``predict``; it receives the rows where
        ``column`` is null, with ``column`` itself removed.
    column : str, default 'num__Wind Gust'
        Name of the column to impute.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, returned for convenient reassignment.
    """
    missing_mask = data[column].isnull()

    # Nothing to impute: skip the predict call, since estimators may reject a
    # zero-row input.
    if not missing_mask.any():
        return data

    X_missing = data.loc[missing_mask].drop(columns=[column])
    data.loc[missing_mask, column] = model.predict(X_missing)
    return data

## Perform the machine learning cross validation pipeline

The pipeline is as follows:

1. Loop through random states
2. Loop through time series iterations (split by year)
3. Find best model by training on train sets and saving best val RMSE score
4. Save the model and test score for each random state

In [6]:
# Loop invariants: the earliest year in the data and the name the
# ColumnTransformer gives the wind-gust column.
first_year = min(unique_years)
gust_col = 'num__Wind Gust'

for rand_state in range(nr_states):
    print('\nRANDOM STATE', rand_state + 1, 'OF', nr_states)

    # One seed per random state, shared by the imputation model and every
    # candidate forest.
    seed = 42 * (rand_state + 1)

    state_best_model = None
    state_best_model_test_score = None
    state_best_val_score = float('inf')

    # Expanding-window splits: train on first_year..year, validate on the
    # following year, test on the one after that (the year arithmetic assumes
    # unique_years holds consecutive years).
    for i, year in enumerate(unique_years[:-2]):
        val_year, test_year = year + 1, year + 2
        print(f"  time series split {i+1}: training {first_year} to {year}, val {val_year}, test {test_year}")

        # split
        train_years_condition = (X['Year'] >= first_year) & (X['Year'] <= year)
        val_years_condition = X['Year'] == val_year
        test_years_condition = X['Year'] == test_year
        X_train, y_train = X[train_years_condition], y[train_years_condition]
        X_val, y_val = X[val_years_condition], y[val_years_condition]
        X_test, y_test = X[test_years_condition], y[test_years_condition]

        # fit the preprocessor on train only, then transform train-val-test
        prep.fit(X_train.drop(columns=drop_ftrs))
        feature_names = prep.get_feature_names_out()
        train_prep = pd.DataFrame(prep.transform(X_train.drop(columns=drop_ftrs)), columns=feature_names)
        val_prep = pd.DataFrame(prep.transform(X_val.drop(columns=drop_ftrs)), columns=feature_names)
        test_prep = pd.DataFrame(prep.transform(X_test.drop(columns=drop_ftrs)), columns=feature_names)
        print("  split: train", train_prep.shape, "val", val_prep.shape, "test", test_prep.shape)

        # xgb missing values: learn wind gust from the training rows where it
        # is present, then impute it in all three sets with that model
        train_with_gust = train_prep[train_prep[gust_col].notnull()]
        X_train_gust = train_with_gust.drop(columns=[gust_col])
        y_train_gust = train_with_gust[gust_col]

        xgb_gust_model = XGBRegressor(random_state=seed)
        xgb_gust_model.fit(X_train_gust, y_train_gust)

        train_prep = fill_nan_col(train_prep, xgb_gust_model)
        val_prep = fill_nan_col(val_prep, xgb_gust_model)
        test_prep = fill_nan_col(test_prep, xgb_gust_model)

        # Hyperparameter tuning and model selection.
        # Only the best candidate so far is kept, so memory does not grow with
        # the size of the grid. Ties keep the earliest candidate.
        best_iteration_model = None
        best_iteration_val_score = float('inf')

        for params in ParameterGrid(param_grid):
            print('    ', params)
            model = RandomForestRegressor(**params, random_state=seed)
            model.fit(train_prep, y_train)

            y_val_pred = model.predict(val_prep)
            val_score = np.sqrt(mean_squared_error(y_val, y_val_pred))

            if best_iteration_model is None or val_score < best_iteration_val_score:
                best_iteration_val_score = val_score
                best_iteration_model = model

        # Evaluate the best model on the test set
        y_test_pred = best_iteration_model.predict(test_prep)
        test_score = np.sqrt(mean_squared_error(y_test, y_test_pred))
        print('    test score:', test_score)

        # Update the best model for the state if necessary
        if best_iteration_val_score < state_best_val_score:
            state_best_val_score = best_iteration_val_score
            state_best_model = best_iteration_model
            state_best_model_test_score = test_score

    final_models.append(state_best_model)
    test_scores.append(state_best_model_test_score)


RANDOM STATE 1 OF 5
  time series split 1: training 2014 to 2014, val 2015, test 2016
  split: train (62353, 56) val (64070, 56) test (65411, 56)
     {'max_depth': 1, 'max_features': 1, 'min_samples_split': 2}
     {'max_depth': 1, 'max_features': 1, 'min_samples_split': 4}
     {'max_depth': 1, 'max_features': 1, 'min_samples_split': 6}
     {'max_depth': 1, 'max_features': 1, 'min_samples_split': 8}
     {'max_depth': 1, 'max_features': 1, 'min_samples_split': 10}
     {'max_depth': 1, 'max_features': 3, 'min_samples_split': 2}
     {'max_depth': 1, 'max_features': 3, 'min_samples_split': 4}
     {'max_depth': 1, 'max_features': 3, 'min_samples_split': 6}
     {'max_depth': 1, 'max_features': 3, 'min_samples_split': 8}
     {'max_depth': 1, 'max_features': 3, 'min_samples_split': 10}
     {'max_depth': 1, 'max_features': 6, 'min_samples_split': 2}
     {'max_depth': 1, 'max_features': 6, 'min_samples_split': 4}
     {'max_depth': 1, 'max_features': 6, 'min_samples_split': 6}
     {

In [7]:
# Summarise the per-random-state test RMSE collected by the CV loop.
test_score_mean = np.mean(test_scores)
test_score_std = np.std(test_scores)
print('mean test score:', test_score_mean, 'std:', test_score_std)

mean test score: 7.472247257501337 std: 0.019657121498841634


## Train and export a final model and test scores

In [18]:
# split: train on 2014-2018, hold out 2019 for the final test score
print("splitting")
train_years_condition = (X['Year'] >= 2014) & (X['Year'] <= 2018)
X_train, y_train = X[train_years_condition], y[train_years_condition]
X_test, y_test = X[(X['Year'] == 2019)], y[(X['Year'] == 2019)]

# fit the preprocessor on train only, then transform train-test sets
print("preprocessing")
prep.fit(X_train.drop(columns=drop_ftrs))
feature_names = prep.get_feature_names_out()
train_prep = pd.DataFrame(prep.transform(X_train.drop(columns=drop_ftrs)), columns=feature_names)
test_prep = pd.DataFrame(prep.transform(X_test.drop(columns=drop_ftrs)), columns=feature_names)

# Pick the winning configuration from the cross-validation results.
# NOTE(review): the winner is chosen by its *test* score; selecting on a
# validation score instead would avoid tuning on the test set -- confirm intent.
best_final_model = final_models[np.argmin(test_scores)]
best_params = best_final_model.get_params()

# xgb missing values
print("filling missing values")
train_with_gust = train_prep[train_prep['num__Wind Gust'].notnull()]

X_train_gust = train_with_gust.drop(columns=['num__Wind Gust'])
y_train_gust = train_with_gust['num__Wind Gust']

# Seed the imputation model with the winning model's seed instead of the loop
# variable `rand_state` left over from the cross-validation cell.
xgb_gust_model = XGBRegressor(random_state=best_params['random_state'])
xgb_gust_model.fit(X_train_gust, y_train_gust)

train_prep = fill_nan_col(train_prep, xgb_gust_model)
test_prep = fill_nan_col(test_prep, xgb_gust_model)

# best model :)
print("training best model")
best_model = RandomForestRegressor(**best_params)
best_model.fit(train_prep, y_train)

# Score the model that was just trained (not a model left over from the CV loop).
y_test_pred = best_model.predict(test_prep)
test_score = np.sqrt(mean_squared_error(y_test, y_test_pred))
test_score

splitting
preprocessing
filling missing values


Also export the preprocessed train and test sets here so they can be reused for evaluation (e.g. to calculate the baseline score).

In [1]:
from joblib import dump

# Preprocessed splits and their targets, saved next to the notebook.
prepared_splits = {
    'train_prep': train_prep,
    'test_prep': test_prep,
    'y_train': y_train,
    'y_test': y_test,
}
dump(prepared_splits, 'train_test_prep.joblib')

# Final fitted model together with the per-random-state test scores.
model_results = {'model': best_model, 'scores': test_scores}
dump(model_results, '../results/randforest_results.joblib')

NameError: name 'train_prep' is not defined