# Hypertuning with Cross-Validation Folds: Neural Networks
This code estimates neural network models and prints out our evaluation metric (f-beta, where beta=2). The code works with the cleaned cross-validation folds created from the 2015-2018 dataset. We do both a baseline assessment across 3 months of data and a full hypertuning with the 2015-2018 cross-validation folds. In the future, we plan to also try ensemble models. We adopted a Bayesian hypertuning strategy appropriate for big data called the Tree-structured Parzen Estimator (TPE), available within the hyperopt package. TPE learns good values for the hyperparameters (within a range we set) as it goes through multiple trials. The Bayesian approach is helpful for big data tuning because we do not have the compute resources to do a comprehensive grid search.

![Pipeline Image](https://i.imgur.com/wq62T0E.png)

### Project Description
This is a group project conducted for course w261: Machine Learning at Scale at the University of California Berkeley in Summer 2023. This project develops a machine learning model that predicts flight delays based on historical flight, airport station, and weather data spanning five years from 2015-2019 in the United States.

### Group members
Jessica Stockham, Chase Madison, Kisha Kim, Eric Danforth

Citation: Code written by Chase Madison

In [0]:
import numpy as np
import re
import pandas as pd
from collections import namedtuple
from datetime import datetime, timedelta, date
import holidays

from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import Window

from pyspark.sql.functions import udf, col,isnan,when,count
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler,StandardScaler, Imputer, Bucketizer
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
#from sparkxgb.xgboost import XGBoostClassifier

import xgboost as xgb
from xgboost.spark import SparkXGBClassifier

from sklearn.metrics import confusion_matrix

from hyperopt import fmin, tpe, Trials, SparkTrials, hp, space_eval
import mlflow
#import mlfow.spark

spark.sparkContext.setLogLevel('ERROR')



In [0]:
## Place this cell in any team notebook that needs access to the team cloud storage

# Mount point of the 261 course blob storage
mids261_mount_path = '/mnt/mids-w261'

# Databricks secret scope / key that Chase created in the Databricks CLI
secret_scope = 'sec5-team1-scope'
secret_key = 'sec5-team1-key'

# Azure Storage Account and container that Chase created
storage_account = 'sec5team1storage'
blob_container = 'sec5-team1-container'

# Host name shared by the team URL and the Spark SAS configuration key
storage_host = f'{storage_account}.blob.core.windows.net'

# Points to the root of the team storage bucket
team_blob_url = f'wasbs://{blob_container}@{storage_host}'

# SAS Token: grant the team limited access to Azure Storage resources.
# The secret is passed straight to the Spark conf and never bound to a notebook variable.
spark.conf.set(
    f'fs.azure.sas.{blob_container}.{storage_host}',
    dbutils.secrets.get(scope=secret_scope, key=secret_key),
)

In [0]:
# LOAD CLEANED CV FOLDS FROM BLOB

def load_folds_from_blob_and_cache(blob_url, fold_name):
    """
    Load every cross-validation fold stored under ``{blob_url}/{fold_name}``.

    The folder listing is scanned for entries named ``train_<i>_df``; the
    number of folds is the highest index found plus one. For each index the
    ``train_<i>_df`` and ``val_<i>_df`` parquet datasets are read,
    repartitioned and marked for caching.

    Parameters:
        blob_url: root URL of the storage location holding the fold folders
        fold_name: name of the folder (under blob_url) that contains the folds

    returns:
        list of (train_df, val_df) tuples, ordered by fold index

    raises:
        ValueError: if the folder contains no ``train_<i>_df`` entry
    """
    DEFAULT_PARTITION_COUNT = 50
    fold_root = f"{blob_url}/{fold_name}"

    # Compute the fold count from the highest *numeric* index. Sorting the
    # names as strings would rank "train_9_df" after "train_10_df" and
    # undercount the folds as soon as there are ten or more of them.
    train_pattern = re.compile(r"train_(\d+)_df")
    name_matches = (train_pattern.match(f.name) for f in dbutils.fs.ls(fold_root))
    fold_indices = [int(m.group(1)) for m in name_matches if m]
    if not fold_indices:
        raise ValueError(f"No train_<i>_df folds found under {fold_root}")
    fold_count = max(fold_indices) + 1
    print(f"Loading {fold_count} folds...")

    def read_and_cache(dataset_name):
        # Read one parquet dataset of a fold, repartition it and mark it for caching
        return (
            spark.read.parquet(f"{fold_root}/{dataset_name}")
            .repartition(DEFAULT_PARTITION_COUNT)
            .cache()
        )

    # Load folds
    return [
        (read_and_cache(f"train_{i}_df"), read_and_cache(f"val_{i}_df"))
        for i in range(fold_count)
    ]


# Train Cross Validation Folds
def trainPredictEval(estimator):

    """
    Fit the estimator on every cross-validation fold and score each fold's
    validation set with f-beta (beta=2) for label 1.0.

    The folds are read from the notebook-level variable ``folds`` (a list of
    (train_df, val_df) tuples). The function is called by the hyperopt
    objective function (objective_function_mlp()).

    Parameters:
        estimator: unfitted machine learning model defined in the objective function

    returns:
        the NEGATED average validation fscore across all folds
        (negated because hyperopt minimizes its objective)
    """
    from statistics import mean

    # The evaluator does not depend on the fold, so build it once.
    # metricLabel=1.0 is set here, so no per-call override is needed.
    evaluator = MulticlassClassificationEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName="fMeasureByLabel",
        beta=2.0,
        metricLabel=1.0,
    )

    metricsList = []

    # Iterate over the folds data
    for i, (train_df, val_df) in enumerate(folds):

        print(f'CV FOLD START: {i}: {datetime.now()}')

        # Train
        model = estimator.fit(train_df)

        print(f'Model built: {i}: {datetime.now()}')

        pred = model.transform(val_df).cache()

        print(f'Prediction Validation Set: {i}: {datetime.now()}')

        # Compute Metrics
        try:
            fmeasure = evaluator.evaluate(pred)
        finally:
            # Release the cached predictions even when evaluation fails
            pred.unpersist()
        print(fmeasure)

        metricsList.append(fmeasure)
        print(f'fold fscore: {fmeasure}')

    avgFscore = mean(metricsList)
    print(f'average fscore accross fold: {avgFscore}')

    # negate fscore because hyperopt minimizes
    loss = (-1)*avgFscore

    # mlflow logging
    # NOTE(review): the metric named "f2_score" stores the negated value (the loss);
    # kept as-is so existing runs stay comparable.
    mlflow.log_metric("f2_score", loss)

    return loss

### Hypertuning Cross-Validation with 60 Month Dataset

In [0]:
##### LOAD 60 MONTH DATASET ##########

# 60 MONTH DATA - 5 folds - (READY TO USE)
# Pulls the CV folds for the 60 month dataset into the variable "folds"
# (only brings in 2015-2018 as training set)
timeInterval = '60mo'
fold_name = "folds" + timeInterval
folds = load_folds_from_blob_and_cache(team_blob_url, fold_name)

# Filter to the most recent 2 folds (2017 and 2018)
fold_small = folds[3:5]

# Input columns for the slim feature vector: everything except ORIGIN_hot and
# DEST_hot (representing about 600 columns).
features_all = [
    'IS_FIRST_FLIGHT_OF_DAY_double_hot',
    'is_holiday_adjacent_double_hot',
    'OP_UNIQUE_CARRIER_hot',
    'is_holiday_double_hot',
    'CRS_DEP_BUCKET_hot',
    'DAY_OF_WEEK_hot',
    'origin_type_hot',
    'dest_type_hot',
    'MONTH_hot',
    'YEAR_hot',
    'scaled_numeric',
]

# The assembler is the same for every fold, so it is built once outside the loop
assembler = VectorAssembler(inputCols=features_all, outputCol="features")

# Create folds_slim: each fold re-assembled with the slim "features" column
# NOTE(review): the downstream cells visible in this notebook (trainPredictEval and the
# num_features computation) read `folds`, not `folds_slim` — confirm which one is intended.
folds_slim = []

for train_df, val_df in fold_small:

    # Drop the existing "features" column so the assembler can recreate it
    train_df_new = train_df.drop("features")
    val_df_new = val_df.drop("features")

    train_df_slim = assembler.transform(train_df_new).select(['features', 'label'])
    val_df_slim = assembler.transform(val_df_new).select(['features', 'label'])

    folds_slim.append((train_df_slim, val_df_slim))

folds_slim


Loading 5 folds...


[(DataFrame[features: vector, label: double],
  DataFrame[features: vector, label: double]),
 (DataFrame[features: vector, label: double],
  DataFrame[features: vector, label: double])]

In [0]:
# Schema has "features, label" + individual features + intermediate features used for processing.
# Jess changed this. Before, we just kept "features, label", but keeping the other columns gives us more flexibility:
# the "features" input could be changed on the fly if you wanted.
# NOTE(review): `train` is not assigned in any of the notebook cells visible here — presumably it is
# the training DataFrame of one fold (e.g. folds[0][0]); confirm it is defined before running this cell.
train.printSchema()

root
 |-- label: double (nullable = true)
 |-- DISTANCE: double (nullable = true)
 |-- ELEVATION: double (nullable = true)
 |-- FE_PRIOR_DAILY_AVG_DEP_DELAY: double (nullable = true)
 |-- FE_PRIOR_AVG_DURATION: double (nullable = true)
 |-- FE_AVG_DURATION: double (nullable = true)
 |-- FE_NUM_FLIGHT_SCHEDULED: long (nullable = true)
 |-- DEP_DELAY_LAG: double (nullable = true)
 |-- DAY_OF_WEEK: integer (nullable = true)
 |-- MONTH: integer (nullable = true)
 |-- YEAR: integer (nullable = true)
 |-- OP_UNIQUE_CARRIER: string (nullable = true)
 |-- origin_type: string (nullable = true)
 |-- dest_type: string (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- DEST: string (nullable = true)
 |-- is_holiday_double: double (nullable = true)
 |-- is_holiday_adjacent_double: double (nullable = true)
 |-- IS_FIRST_FLIGHT_OF_DAY_double: double (nullable = true)
 |-- CRS_DEP_TIME: integer (nullable = true)
 |-- DATE: timestamp (nullable = true)
 |-- FL_DATE: date (nullable = true)
 |--

# MLP: Multilayer Perceptron Neural Networks with Logistic Regression

In [0]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
# CANT REMEMBER IF THESE IMPORTS ARE NEEDED, BUT JUST DELETE IF THEY DONT THROW A DEPENDENCY ERROR
# from pyspark.ml.linalg import Vectors, VectorUDT
# from pyspark.sql.types import StructType, StructField, DoubleType

# GRID SEARCH MLP
def objective_function_mlp(params):

    """
    Hyperopt objective for the multilayer perceptron classifier.

    Builds the estimator inside a fresh mlflow run and hands it to
    trainPredictEval() for cross-validated scoring.

    Parameters:
        params: dictionary of sampled hyperparameters supplied by hyperopt;
            its "architecture" entry is used as the layer sizes of the network

    returns:
        the value returned by trainPredictEval(estimator)
    """

    # hyperparameter being tuned: the layer sizes of the network
    layer_sizes = params["architecture"]

    with mlflow.start_run():
        mlp = MultilayerPerceptronClassifier(seed=42, layers=layer_sizes)
        loss = trainPredictEval(mlp)

    return loss

In [0]:
# Get the number of features in the training data

# Size of the assembled "features" vector, read from the ML attribute metadata of the
# first fold's training set. Attributes are grouped by type in the metadata; counting
# every group present avoids a KeyError when a group is missing and does not skip
# attributes stored under other groups.
feature_attr_groups = folds[0][0].schema["features"].metadata["ml_attr"]["attrs"]
num_features = sum(len(group) for group in feature_attr_groups.values())
print(num_features)

# Defining the layer node sizes for the neural network:
# input layer = number of features, output layer = 2 nodes
nn_architecture = {
    #'one_layer': [num_features, 32, 2],     # One hidden layer with many nodes
    'two_layers': [num_features, 8, 4, 2]  # Two hidden layers with fewer nodes
}
search_space_mlp = {
    "architecture": hp.choice("architecture", list(nn_architecture.values())),
}

81


In [0]:
# RUN THIS: HYPERTUNING: NEURAL NETWORK 1

print(f'Job Start: {datetime.now()}')

# Close any mlflow run left open by an earlier cell
mlflow.end_run()

# Configure pyspark.ml autologging with model logging switched off (log_models=False)
mlflow.pyspark.ml.autolog(log_models=False)

# Number of hyperopt trials and the container that records them
num_evals = 2
trials = Trials()

# Search with the Tree-structured Parzen Estimator; rstate fixes the random seed
best_hyperparam_mlp = fmin(
    objective_function_mlp,
    search_space_mlp,
    algo=tpe.suggest,
    max_evals=num_evals,
    rstate=np.random.default_rng(42),
    trials=trials,
)

# BEST PARAMETERS: translate the hp.choice index returned by fmin back into the actual value
best_params = space_eval(search_space_mlp, best_hyperparam_mlp)
print(f'best parameters: {best_params}')

# LOG IT
# The logged value is the best trial's loss, i.e. what the objective function returned
# (trainPredictEval returns the negated average fscore).
best_loss = trials.best_trial['result']['loss']
with mlflow.start_run():
    mlflow.log_params(best_params)
    mlflow.log_metric("CV_2folds_fscore_nn", best_loss)

# Make sure no mlflow run stays active after this cell
mlflow.end_run()

Job Start: 2023-08-08 18:27:06.222221
  0%|          | 0/2 [00:00<?, ?trial/s, best loss=?]                                                     CV FOLD START: 0: 2023-08-08 18:27:07.040570
  0%|          | 0/2 [00:00<?, ?trial/s, best loss=?]




                                                     Model built: 0: 2023-08-08 19:06:48.133606
  0%|          | 0/2 [39:41<?, ?trial/s, best loss=?]                                                     Prediction Validation Set: 0: 2023-08-08 19:06:48.227344
  0%|          | 0/2 [39:41<?, ?trial/s, best loss=?]




                                                     0.4600264477658919
  0%|          | 0/2 [41:23<?, ?trial/s, best loss=?]                                                     fold fscore: 0.4600264477658919
  0%|          | 0/2 [41:23<?, ?trial/s, best loss=?]                                                     CV FOLD START: 1: 2023-08-08 19:08:29.600001
  0%|          | 0/2 [41:23<?, ?trial/s, best loss=?]




                                                     Model built: 1: 2023-08-08 20:05:11.669153
  0%|          | 0/2 [1:38:05<?, ?trial/s, best loss=?]                                                       Prediction Validation Set: 1: 2023-08-08 20:05:11.759502
  0%|          | 0/2 [1:38:05<?, ?trial/s, best loss=?]




                                                       0.40756826905311866
  0%|          | 0/2 [1:40:03<?, ?trial/s, best loss=?]                                                       fold fscore: 0.40756826905311866
  0%|          | 0/2 [1:40:03<?, ?trial/s, best loss=?]                                                       average fscore accross fold: 0.4337973584095053
  0%|          | 0/2 [1:40:03<?, ?trial/s, best loss=?] 50%|█████     | 1/2 [1:40:03<1:40:03, 6003.26s/trial, best loss: -0.4337973584095053]                                                                                      CV FOLD START: 0: 2023-08-08 20:07:10.175875
 50%|█████     | 1/2 [1:40:03<1:40:03, 6003.26s/trial, best loss: -0.4337973584095053]




In [0]:
# RUN THIS: HYPERTUNING: NEURAL NETWORK 2

print(f'Job Start: {datetime.now()}')

# Close any mlflow run left open by an earlier cell
mlflow.end_run()

# Configure pyspark.ml autologging with model logging switched off (log_models=False)
mlflow.pyspark.ml.autolog(log_models=False)

# Number of hyperopt trials and the container that records them
num_evals = 1
trials = Trials()

# Search with the Tree-structured Parzen Estimator; rstate fixes the random seed
best_hyperparam_mlp = fmin(
    objective_function_mlp,
    search_space_mlp,
    algo=tpe.suggest,
    max_evals=num_evals,
    rstate=np.random.default_rng(42),
    trials=trials,
)

# BEST PARAMETERS: translate the hp.choice index returned by fmin back into the actual value
best_params = space_eval(search_space_mlp, best_hyperparam_mlp)
print(f'best parameters: {best_params}')

# LOG IT
# Just logging the artifacts, not the data, to save compute time.
# To log the best CV parameters and fscore under a named experiment instead of the
# default one, rename and enable the lines below:
# experiment_name = "phase3_cv_rf_CV"
# experiment_id = mlflow.create_experiment(experiment_name)
# with mlflow.start_run(experiment_id=experiment_id):

# The logged value is the best trial's loss, i.e. what the objective function returned
# (trainPredictEval returns the negated average fscore).
best_loss = trials.best_trial['result']['loss']
with mlflow.start_run():
    mlflow.log_params(best_params)
    mlflow.log_metric("CV_2folds_fscore_nn", best_loss)

# Make sure no mlflow run stays active after this cell
mlflow.end_run()

Job Start: 2023-08-08 20:34:46.074184
  0%|          | 0/1 [00:00<?, ?trial/s, best loss=?]                                                     CV FOLD START: 0: 2023-08-08 20:34:46.876072
  0%|          | 0/1 [00:00<?, ?trial/s, best loss=?]




                                                     Model built: 0: 2023-08-08 21:05:20.077972
  0%|          | 0/1 [30:33<?, ?trial/s, best loss=?]                                                     Prediction Validation Set: 0: 2023-08-08 21:05:20.176718
  0%|          | 0/1 [30:34<?, ?trial/s, best loss=?]




                                                     0.41530879029981377
  0%|          | 0/1 [31:38<?, ?trial/s, best loss=?]                                                     fold fscore: 0.41530879029981377
  0%|          | 0/1 [31:38<?, ?trial/s, best loss=?]                                                     CV FOLD START: 1: 2023-08-08 21:06:25.126077
  0%|          | 0/1 [31:38<?, ?trial/s, best loss=?]




                                                     Model built: 1: 2023-08-08 21:46:37.434522
  0%|          | 0/1 [1:11:51<?, ?trial/s, best loss=?]                                                       Prediction Validation Set: 1: 2023-08-08 21:46:37.623358
  0%|          | 0/1 [1:11:51<?, ?trial/s, best loss=?]




                                                       0.4055715091543289
  0%|          | 0/1 [1:13:44<?, ?trial/s, best loss=?]                                                       fold fscore: 0.4055715091543289
  0%|          | 0/1 [1:13:44<?, ?trial/s, best loss=?]                                                       average fscore accross fold: 0.4104401497270713
  0%|          | 0/1 [1:13:44<?, ?trial/s, best loss=?]100%|██████████| 1/1 [1:13:45<00:00, 4425.02s/trial, best loss: -0.4104401497270713]100%|██████████| 1/1 [1:13:45<00:00, 4425.03s/trial, best loss: -0.4104401497270713]
best parameters: {'architecture': (81, 8, 4, 2)}
