In [0]:
## Place this cell in any team notebook that needs access to the team cloud storage.


# The following blob storage is accessible to team members only (read and write)
# access key is valid til TTL
# after that you will need to create a new SAS key and authenticate access again via DataBrick command line
blob_container  = "final-project-summer24-team3"       # The name of your container created in https://portal.azure.com
storage_account = "summer2024team3"  # The name of your Storage account created in https://portal.azure.com
secret_scope    = "summer24_team_3_2_scope"           # The name of the scope created in your local computer using the Databricks CLI
secret_key      = "final-project-summer24-team3"             # The name of the secret key created in your local computer using the Databricks CLI
team_blob_url   = f"wasbs://{blob_container}@{storage_account}.blob.core.windows.net"  #points to the root of your team storage bucket


# the 261 course blob storage is mounted here.
mids261_mount_path      = "/mnt/mids-w261"

# SAS Token: Grant the team limited access to Azure Storage resources
spark.conf.set(
  f"fs.azure.sas.{blob_container}.{storage_account}.blob.core.windows.net",
  dbutils.secrets.get(scope = secret_scope, key = secret_key)
)

In [0]:
from pyspark.sql.functions import col, concat_ws, lpad, expr, unix_timestamp, from_unixtime, when, count, lag, greatest, row_number,lit
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import IntegerType, StringType, DoubleType, FloatType, StructField, StructType
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time as t
from pyspark.ml.feature import Bucketizer, SQLTransformer, OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline
from itertools import chain
from pyspark.mllib.evaluation import MulticlassMetrics
from xgboost.spark import SparkXGBClassifier, SparkXGBRegressor

## Load All DataFrames

In [0]:
display(dbutils.fs.ls(f"{team_blob_url}/TP/"))

path,name,size,modificationTime
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_SUCCESS,_SUCCESS,0,1720561571000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_committed_1002670972388615845,_committed_1002670972388615845,625,1720560531000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_committed_5669257934384103852,_committed_5669257934384103852,221,1720561571000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_committed_6618439955609603938,_committed_6618439955609603938,419,1720561337000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_committed_9167039456723159873,_committed_9167039456723159873,318,1720559468000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_committed_vacuum825530481471543349,_committed_vacuum825530481471543349,95,1720561338000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_started_1002670972388615845,_started_1002670972388615845,0,1720560531000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_started_5669257934384103852,_started_5669257934384103852,0,1720561570000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_started_6618439955609603938,_started_6618439955609603938,0,1720561337000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/df_1y_cleaned_transformed/,df_1y_cleaned_transformed/,0,1722046865000


In [0]:
xgboost1011 = spark.read.parquet(f'{team_blob_url}/TP/xgboost_fbeta_recall')
xgboost1066 = spark.read.parquet(f'{team_blob_url}/TP/xgboost_precision')
log_base = spark.read.parquet(f'{team_blob_url}/TP/logbaseline_probabilities_2018_2019/')
log_variant = spark.read.parquet(f'{team_blob_url}/TP/logbaseline_0.03_probabilities_2018_2019/')
nnval2 = spark.read.parquet(f'{team_blob_url}/TP/nn_preds_val2')
nnval5 = spark.read.parquet(f'{team_blob_url}/TP/nn_preds_val5')
nntest2 = spark.read.parquet(f'{team_blob_url}/TP/nn_preds_test2')
nntest5 = spark.read.parquet(f'{team_blob_url}/TP/nn_preds_test5')

## Create Train Data

In [0]:
nnval2 = nnval2.withColumnRenamed("probability", "nnval2_prob")
merged_nn = nnval2.join(nnval5, "index", "left") \
    .select("index", "nnval2_prob", F.col("probability").alias("nnval5_prob"))
merged_log_base = merged_nn.join(log_base, "index", "left") \
    .select("index", "nnval2_prob", "nnval5_prob", 
            F.col("DEP_DEL15").alias("target"), F.col("probability_1").alias("log_base_prob"), "YEAR")
merged_log_all = merged_log_base.join(log_variant, "index", "left") \
    .select("index", "nnval2_prob", "nnval5_prob", "log_base_prob",
            F.col("probability_1").alias("log_variant_prob"), "target")
merged_boost11 = merged_log_all.join(xgboost1011, "index", "left")\
    .select("index", "nnval2_prob", "nnval5_prob", "log_base_prob", "log_variant_prob",
            F.col("probability_1").alias("xgboost1011_prob"), "target")
val_df = merged_boost11.join(xgboost1066, "index", "left")\
    .select("index", "nnval2_prob", "nnval5_prob", "log_base_prob", "log_variant_prob", "xgboost1011_prob",
            F.col("probability_1").alias("xgboost1066_prob"), "target")

In [0]:
# CheckSum for the train dataset

# Count total number of rows
total_rows = val_df.count()
print(f"Total number of rows: {total_rows}")

# Check for null data in any column
null_counts = val_df.select([F.sum(F.col(c).isNull().cast("int")).alias(c) for c in val_df.columns])
null_counts.show()

# Count rows with any null value
rows_with_nulls = val_df.filter(F.concat_ws("", *val_df.columns).isNull()).count()
print(f"Number of rows with null values: {rows_with_nulls}")

Total number of rows: 3606364
+-----+-----------+-----------+-------------+----------------+----------------+----------------+------+
|index|nnval2_prob|nnval5_prob|log_base_prob|log_variant_prob|xgboost1011_prob|xgboost1066_prob|target|
+-----+-----------+-----------+-------------+----------------+----------------+----------------+------+
|    0|          0|          0|            0|               0|               0|               0|     0|
+-----+-----------+-----------+-------------+----------------+----------------+----------------+------+

Number of rows with null values: 0


### Transform Train Data

In [0]:
feature_columns = ["nnval2_prob", "nnval5_prob", "log_base_prob", "log_variant_prob", "xgboost1011_prob", "xgboost1066_prob"]
assembler = VectorAssembler(
    inputCols=[x for x in feature_columns],
    outputCol="features",
    handleInvalid = "skip"
)

pipeline = Pipeline(stages=[assembler])
val_trf = pipeline \
    .fit(val_df) \
    .transform(val_df) \
    .select("index", "features", "target")

Downloading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

## Helper Functions

In [0]:
def block_splits(n_splits, df):
    """
    Split DataFrame into blocks for cross-validation.

    Args:
    n_splits (int): Number of splits.
    df (pyspark.sql.DataFrame): Input DataFrame.

    Returns:
    tuple: (blocks, df_with_row_num)
        - blocks: List of [start, mid, stop] indices for each split.
        - df_with_row_num: DataFrame with added row_num column.
    """
    
    
    w = Window.orderBy("index")
    df_with_row_num = df.withColumn("row_num", F.row_number().over(w))

    n_samples = df.count()
    k_fold_size = n_samples // n_splits

    blocks = []
    for i in range(n_splits):
        start = i * k_fold_size + 1  # +1 because row_number starts at 1
        stop = min(start + k_fold_size - 1, n_samples)  # Ensure we don't exceed n_samples
        mid = int(0.8 * (stop - start)) + start

        block = [start, mid, stop]
        blocks.append(block)
    # Add a row number column to the df and return the list of indexes and the new df
    return blocks, df_with_row_num


def evaluate_roc_curve(predictions, target_feature):
    evaluator = BinaryClassificationEvaluator(labelCol=target_feature)
    area_under_curve = evaluator.evaluate(predictions)
    return area_under_curve


def evaluate_multiclass_metrics(predictions, target_feature):
    # Convert DataFrame to RDD
    prediction_and_label = predictions\
                            .select(["prediction", target_feature])\
                            .withColumn(target_feature, col(target_feature).cast(FloatType()))\
                            .orderBy(target_feature)

    # Create MulticlassMetrics object
    metrics = MulticlassMetrics(prediction_and_label.rdd.map(tuple))

    # Collect metrics
    accuracy = metrics.accuracy
    precision = metrics.precision(1.0)
    recall = metrics.recall(1.0)
    f1_measure = metrics.fMeasure(1.0)
    
    return accuracy, precision, recall, f1_measure

In [0]:
# Cross Validation XG Boost for training
def xgboost_cv(blocks, df, max_depth=3, subsample=0.5, colsample_bytree=0.5):

    target_feature = "target"
    acc, prec, rec, f1, fb, res_frame = [], [], [], [], [], []

    error_flag = []


    xgb_regressor = SparkXGBClassifier(
        features_col="features",
        label_col=target_feature,
        num_workers=2,
        max_depth=max_depth,
        subsample=subsample,
        colsample_bytree=colsample_bytree
        )


    for block in blocks:
        start_index, end_index, end_val = block

        train_block = df.filter((col("row_num") >= start_index) & (col("row_num") < end_index))
        val_block = df.filter((col("row_num") >= end_index) & (col("row_num") < end_val))

        xgb_model = xgb_regressor.fit(train_block)

        val_preds = xgb_model.transform(val_block)

        

        try:
            res_frame.append(val_preds)#.select(['QUARTER','FL_DATE','CRS_DEP_TIME','DEP_DEL15', 'prediction']))

            accuracy, precision, recall, f1_measure = evaluate_multiclass_metrics(val_preds, target_feature)
            beta = 1/(2**0.5)
            fbeta = (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall)
        
            acc.append(accuracy)
            prec.append(precision)
            rec.append(recall)
            f1.append(f1_measure)
            fb.append(fbeta)
            error_flag.append(0)
        except:
            res_frame.append(val_preds)
            error_flag.append(1)
            res_frame.append(-1)

    def safe_mean(l):
        if len(l) == 0:
            return -1 
        else:
            return np.mean(l)
            
    avg_acc = safe_mean(acc)
    avg_prec = safe_mean(prec)
    avg_rec = safe_mean(rec)
    avg_f1 = safe_mean(f1)
    avg_fb = safe_mean(fb)

    print("\nParams for the Trial:")
    print(f"Max_depth: {max_depth}, Subsample: {subsample}, Colsample_bytree: {colsample_bytree}")

    print("\nAverage Metrics Across All Blocks:")
    print(f"Accuracy: {avg_acc:.4f}")
    print(f"Precision: {avg_prec:.4f}")
    print(f"Recall: {avg_rec:.4f}")
    print(f"F-Beta: {avg_fb:.4f}")
    print(f"F1 Score: {avg_f1:.4f}")
    #print(f"AUC: {avg_auc:.4f}")

    return {
        'max_depth': max_depth,
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'avg_accuracy': float(avg_acc),
        'avg_precision': float(avg_prec),
        'avg_recall': float(avg_rec),
        'avg_f1': float(avg_f1),
        'avg_f_beta': float(avg_fb)
    }

In [0]:
# Single Data Pass Through XG Boost for training the best model
# And evaluating the test dataset
def xgboost_model(train_block, test_block, max_depth=3, subsample=0.5, colsample_bytree=0.5):

    target_feature = "target"

    xgb_regressor = SparkXGBClassifier(
        features_col="features",
        label_col=target_feature,
        num_workers=2,
        max_depth=max_depth,
        subsample=subsample,
        colsample_bytree=colsample_bytree
        )

    xgb_model = xgb_regressor.fit(train_block)

    test_preds = xgb_model.transform(test_block)   

    try:
        res_frame = test_preds

        accuracy, precision, recall, f1_measure = evaluate_multiclass_metrics(test_preds, target_feature)
        beta = 1/(2**0.5)
        fbeta = (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall)
        error_flag = 0
    except:
        res_frame = test_preds
        error_flag = 1
        res_frame = -1

    def safe_value(val):
        if val is None:
            return -1 
        else:
            return val
        
    accuracy = safe_value(accuracy)
    recall = safe_value(recall)
    precision = safe_value(precision)
    f1_measure = safe_value(f1_measure)
    fbeta = safe_value(fbeta)     

    print("\nParams for the Trial:")
    print(f"Max_depth: {max_depth}, Subsample: {subsample}, Colsample_bytree: {colsample_bytree}")

    print("\nAverage Metrics:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F-Beta: {fbeta:.4f}")
    print(f"F1 Score: {f1_measure:.4f}")
    #print(f"AUC: {avg_auc:.4f}")

    return {
        'max_depth': max_depth,
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'avg_accuracy': float(accuracy),
        'avg_precision': float(precision),
        'avg_recall': float(recall),
        'avg_f1': float(f1_measure),
        'avg_f_beta': float(fbeta)
    }

## Cross Validation Training

In [0]:
# Set up the Grid Search
from itertools import product

blocks, val_df_with_row_num = block_splits(3, val_trf)
cv_results = []
param_grid = {
    'max_depth':[3,6,10],
    'subsample':[0.5,0.75,1.0],
    'colsample_bytree':[0.5,0.75,1.0]
}
keys, values = zip(*param_grid.items())
all_combinations = [dict(zip(keys, combination)) for combination in product(*values)]

In [0]:
# Main training loop  - Performs Grid Search 
#   - - Trains multiple models = len(all_combinations) 
# WARNING!!! ------> THIS TAKES OVER AN HOUR
for params in all_combinations:
    print(f"current params: {params}")
    cv_results.append(xgboost_cv(blocks, val_df_with_row_num, **params))
schema = StructType([
    StructField('max_depth', IntegerType(), True),
    StructField('subsample', FloatType(), True),
    StructField('colsample_bytree', FloatType(), True),
    StructField('avg_accuracy', FloatType(), True),
    StructField('avg_precision', FloatType(), True),
    StructField('avg_recall', FloatType(), True),
    StructField('avg_f1', FloatType(), True),
    StructField('avg_f_beta', FloatType(), True),
])

cv_results_df = spark.createDataFrame(cv_results, schema=schema)

# Print the best result after saving the df of all results
# Now you know your best hyperparameters
best_result = max(cv_results, key=lambda x: x['avg_f_beta'])
print(f"Best result: {best_result}")
print(f"Highest avg_f_beta: {best_result['avg_f_beta']}")

current params: {'max_depth': 3, 'subsample': 0.5, 'colsample_bytree': 0.5}


2024-08-07 15:28:18,518 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 0.5, 'device': 'cpu', 'max_depth': 3, 'subsample': 0.5, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 15:28:40,627 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 15:29:48,631 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 0.5, 'device': 'cpu', 'max_depth': 3, 'subsample': 0.5, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 15:30:04,193 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 15:30:41,848 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic


Params for the Trial:
Max_depth: 3, Subsample: 0.5, Colsample_bytree: 0.5

Average Metrics Across All Blocks:
Accuracy: 0.7482
Precision: 0.7441
Recall: 0.5461
F-Beta: 0.6630
F1 Score: 0.6291
current params: {'max_depth': 3, 'subsample': 0.5, 'colsample_bytree': 0.75}


2024-08-07 15:31:39,084 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 0.75, 'device': 'cpu', 'max_depth': 3, 'subsample': 0.5, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 15:31:49,233 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 15:32:24,144 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 0.75, 'device': 'cpu', 'max_depth': 3, 'subsample': 0.5, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 15:32:33,446 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 15:33:40,219 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logist


Params for the Trial:
Max_depth: 3, Subsample: 0.5, Colsample_bytree: 0.75

Average Metrics Across All Blocks:
Accuracy: 0.7484
Precision: 0.7441
Recall: 0.5469
F-Beta: 0.6633
F1 Score: 0.6296
current params: {'max_depth': 3, 'subsample': 0.5, 'colsample_bytree': 1.0}


2024-08-07 15:34:43,712 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 1.0, 'device': 'cpu', 'max_depth': 3, 'subsample': 0.5, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 15:35:03,743 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 15:35:47,190 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 1.0, 'device': 'cpu', 'max_depth': 3, 'subsample': 0.5, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 15:35:56,099 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 15:36:37,509 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic


Params for the Trial:
Max_depth: 3, Subsample: 0.5, Colsample_bytree: 1.0

Average Metrics Across All Blocks:
Accuracy: 0.7484
Precision: 0.7433
Recall: 0.5479
F-Beta: 0.6635
F1 Score: 0.6300
current params: {'max_depth': 3, 'subsample': 0.75, 'colsample_bytree': 0.5}


2024-08-07 15:37:25,710 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 0.5, 'device': 'cpu', 'max_depth': 3, 'subsample': 0.75, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 15:37:34,514 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 15:38:11,331 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 0.5, 'device': 'cpu', 'max_depth': 3, 'subsample': 0.75, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 15:38:21,446 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 15:38:57,180 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logist


Params for the Trial:
Max_depth: 3, Subsample: 0.75, Colsample_bytree: 0.5

Average Metrics Across All Blocks:
Accuracy: 0.7483
Precision: 0.7437
Recall: 0.5471
F-Beta: 0.6633
F1 Score: 0.6296
current params: {'max_depth': 3, 'subsample': 0.75, 'colsample_bytree': 0.75}


2024-08-07 15:39:43,869 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 0.75, 'device': 'cpu', 'max_depth': 3, 'subsample': 0.75, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 15:39:53,540 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 15:40:30,723 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 0.75, 'device': 'cpu', 'max_depth': 3, 'subsample': 0.75, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 15:40:40,304 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 15:41:16,958 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logi


Params for the Trial:
Max_depth: 3, Subsample: 0.75, Colsample_bytree: 0.75

Average Metrics Across All Blocks:
Accuracy: 0.7483
Precision: 0.7436
Recall: 0.5473
F-Beta: 0.6633
F1 Score: 0.6297
current params: {'max_depth': 3, 'subsample': 0.75, 'colsample_bytree': 1.0}


2024-08-07 15:42:02,206 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 1.0, 'device': 'cpu', 'max_depth': 3, 'subsample': 0.75, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 15:42:11,277 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 15:42:49,652 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 1.0, 'device': 'cpu', 'max_depth': 3, 'subsample': 0.75, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 15:42:58,375 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 15:43:35,120 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logist


Params for the Trial:
Max_depth: 3, Subsample: 0.75, Colsample_bytree: 1.0

Average Metrics Across All Blocks:
Accuracy: 0.7483
Precision: 0.7434
Recall: 0.5476
F-Beta: 0.6634
F1 Score: 0.6298
current params: {'max_depth': 3, 'subsample': 1.0, 'colsample_bytree': 0.5}


2024-08-07 15:44:22,286 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 0.5, 'device': 'cpu', 'max_depth': 3, 'subsample': 1.0, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 15:44:31,703 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 15:45:09,237 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 0.5, 'device': 'cpu', 'max_depth': 3, 'subsample': 1.0, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 15:45:18,086 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 15:45:55,810 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic


Params for the Trial:
Max_depth: 3, Subsample: 1.0, Colsample_bytree: 0.5

Average Metrics Across All Blocks:
Accuracy: 0.7483
Precision: 0.7442
Recall: 0.5464
F-Beta: 0.6631
F1 Score: 0.6293
current params: {'max_depth': 3, 'subsample': 1.0, 'colsample_bytree': 0.75}


2024-08-07 15:46:41,161 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 0.75, 'device': 'cpu', 'max_depth': 3, 'subsample': 1.0, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 15:46:49,418 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 15:47:49,664 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 0.75, 'device': 'cpu', 'max_depth': 3, 'subsample': 1.0, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 15:48:09,221 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 15:49:04,569 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logist


Params for the Trial:
Max_depth: 3, Subsample: 1.0, Colsample_bytree: 0.75

Average Metrics Across All Blocks:
Accuracy: 0.7483
Precision: 0.7435
Recall: 0.5474
F-Beta: 0.6633
F1 Score: 0.6297
current params: {'max_depth': 3, 'subsample': 1.0, 'colsample_bytree': 1.0}


2024-08-07 15:49:53,074 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 1.0, 'device': 'cpu', 'max_depth': 3, 'subsample': 1.0, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 15:50:02,624 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 15:50:38,867 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 1.0, 'device': 'cpu', 'max_depth': 3, 'subsample': 1.0, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 15:50:48,413 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 15:51:22,984 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic


Params for the Trial:
Max_depth: 3, Subsample: 1.0, Colsample_bytree: 1.0

Average Metrics Across All Blocks:
Accuracy: 0.7485
Precision: 0.7439
Recall: 0.5474
F-Beta: 0.6636
F1 Score: 0.6300
current params: {'max_depth': 6, 'subsample': 0.5, 'colsample_bytree': 0.5}


2024-08-07 15:52:08,654 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 0.5, 'device': 'cpu', 'max_depth': 6, 'subsample': 0.5, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 15:52:19,113 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 15:52:57,482 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 0.5, 'device': 'cpu', 'max_depth': 6, 'subsample': 0.5, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 15:53:08,062 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 15:53:46,335 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic


Params for the Trial:
Max_depth: 6, Subsample: 0.5, Colsample_bytree: 0.5

Average Metrics Across All Blocks:
Accuracy: 0.7476
Precision: 0.7426
Recall: 0.5461
F-Beta: 0.6622
F1 Score: 0.6285
current params: {'max_depth': 6, 'subsample': 0.5, 'colsample_bytree': 0.75}


2024-08-07 15:54:34,905 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 0.75, 'device': 'cpu', 'max_depth': 6, 'subsample': 0.5, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 15:54:45,301 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 15:55:28,167 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 0.75, 'device': 'cpu', 'max_depth': 6, 'subsample': 0.5, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 15:55:38,793 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 15:56:23,876 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logist


Params for the Trial:
Max_depth: 6, Subsample: 0.5, Colsample_bytree: 0.75

Average Metrics Across All Blocks:
Accuracy: 0.7476
Precision: 0.7426
Recall: 0.5459
F-Beta: 0.6620
F1 Score: 0.6283
current params: {'max_depth': 6, 'subsample': 0.5, 'colsample_bytree': 1.0}


2024-08-07 15:57:19,127 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 1.0, 'device': 'cpu', 'max_depth': 6, 'subsample': 0.5, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 15:57:29,251 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 15:58:13,694 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 1.0, 'device': 'cpu', 'max_depth': 6, 'subsample': 0.5, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 15:58:24,164 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 15:59:09,489 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic


Params for the Trial:
Max_depth: 6, Subsample: 0.5, Colsample_bytree: 1.0

Average Metrics Across All Blocks:
Accuracy: 0.7477
Precision: 0.7432
Recall: 0.5455
F-Beta: 0.6622
F1 Score: 0.6284
current params: {'max_depth': 6, 'subsample': 0.75, 'colsample_bytree': 0.5}


2024-08-07 16:00:04,329 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 0.5, 'device': 'cpu', 'max_depth': 6, 'subsample': 0.75, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 16:00:14,341 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 16:00:59,171 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 0.5, 'device': 'cpu', 'max_depth': 6, 'subsample': 0.75, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 16:01:09,310 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 16:01:53,815 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logist


Params for the Trial:
Max_depth: 6, Subsample: 0.75, Colsample_bytree: 0.5

Average Metrics Across All Blocks:
Accuracy: 0.7478
Precision: 0.7433
Recall: 0.5457
F-Beta: 0.6623
F1 Score: 0.6285
current params: {'max_depth': 6, 'subsample': 0.75, 'colsample_bytree': 0.75}


2024-08-07 16:02:48,608 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 0.75, 'device': 'cpu', 'max_depth': 6, 'subsample': 0.75, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 16:02:58,471 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 16:03:42,344 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 0.75, 'device': 'cpu', 'max_depth': 6, 'subsample': 0.75, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 16:03:52,402 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 16:04:37,657 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logi


Params for the Trial:
Max_depth: 6, Subsample: 0.75, Colsample_bytree: 0.75

Average Metrics Across All Blocks:
Accuracy: 0.7478
Precision: 0.7430
Recall: 0.5462
F-Beta: 0.6624
F1 Score: 0.6287
current params: {'max_depth': 6, 'subsample': 0.75, 'colsample_bytree': 1.0}


2024-08-07 16:05:33,262 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 1.0, 'device': 'cpu', 'max_depth': 6, 'subsample': 0.75, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 16:05:43,361 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 16:06:30,111 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 1.0, 'device': 'cpu', 'max_depth': 6, 'subsample': 0.75, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 16:06:40,249 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 16:07:26,877 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logist


Params for the Trial:
Max_depth: 6, Subsample: 0.75, Colsample_bytree: 1.0

Average Metrics Across All Blocks:
Accuracy: 0.7480
Precision: 0.7432
Recall: 0.5465
F-Beta: 0.6627
F1 Score: 0.6290
current params: {'max_depth': 6, 'subsample': 1.0, 'colsample_bytree': 0.5}


2024-08-07 16:08:24,764 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 0.5, 'device': 'cpu', 'max_depth': 6, 'subsample': 1.0, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 16:08:34,265 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 16:09:24,091 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 0.5, 'device': 'cpu', 'max_depth': 6, 'subsample': 1.0, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 16:09:33,815 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 16:10:21,186 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic


Params for the Trial:
Max_depth: 6, Subsample: 1.0, Colsample_bytree: 0.5

Average Metrics Across All Blocks:
Accuracy: 0.7481
Precision: 0.7436
Recall: 0.5465
F-Beta: 0.6629
F1 Score: 0.6291
current params: {'max_depth': 6, 'subsample': 1.0, 'colsample_bytree': 0.75}


2024-08-07 16:11:17,659 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 0.75, 'device': 'cpu', 'max_depth': 6, 'subsample': 1.0, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 16:11:27,390 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 16:12:13,528 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 0.75, 'device': 'cpu', 'max_depth': 6, 'subsample': 1.0, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 16:12:23,131 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 16:13:08,320 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logist


Params for the Trial:
Max_depth: 6, Subsample: 1.0, Colsample_bytree: 0.75

Average Metrics Across All Blocks:
Accuracy: 0.7483
Precision: 0.7439
Recall: 0.5465
F-Beta: 0.6631
F1 Score: 0.6293
current params: {'max_depth': 6, 'subsample': 1.0, 'colsample_bytree': 1.0}


2024-08-07 16:14:02,307 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 1.0, 'device': 'cpu', 'max_depth': 6, 'subsample': 1.0, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 16:14:11,634 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 16:14:58,811 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 1.0, 'device': 'cpu', 'max_depth': 6, 'subsample': 1.0, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 16:15:08,206 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 16:15:52,093 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic


Params for the Trial:
Max_depth: 6, Subsample: 1.0, Colsample_bytree: 1.0

Average Metrics Across All Blocks:
Accuracy: 0.7479
Precision: 0.7434
Recall: 0.5461
F-Beta: 0.6626
F1 Score: 0.6288
current params: {'max_depth': 10, 'subsample': 0.5, 'colsample_bytree': 0.5}


2024-08-07 16:16:46,833 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 0.5, 'device': 'cpu', 'max_depth': 10, 'subsample': 0.5, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 16:17:00,763 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 16:17:48,088 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 0.5, 'device': 'cpu', 'max_depth': 10, 'subsample': 0.5, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 16:18:02,071 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 16:18:48,311 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logist


Params for the Trial:
Max_depth: 10, Subsample: 0.5, Colsample_bytree: 0.5

Average Metrics Across All Blocks:
Accuracy: 0.7429
Precision: 0.7306
Recall: 0.5461
F-Beta: 0.6558
F1 Score: 0.6242
current params: {'max_depth': 10, 'subsample': 0.5, 'colsample_bytree': 0.75}


2024-08-07 16:19:51,449 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 0.75, 'device': 'cpu', 'max_depth': 10, 'subsample': 0.5, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 16:20:05,371 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 16:20:51,785 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 0.75, 'device': 'cpu', 'max_depth': 10, 'subsample': 0.5, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 16:21:06,391 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 16:21:55,542 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logi


Params for the Trial:
Max_depth: 10, Subsample: 0.5, Colsample_bytree: 0.75

Average Metrics Across All Blocks:
Accuracy: 0.7423
Precision: 0.7294
Recall: 0.5455
F-Beta: 0.6549
F1 Score: 0.6234
current params: {'max_depth': 10, 'subsample': 0.5, 'colsample_bytree': 1.0}


2024-08-07 16:22:57,881 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 1.0, 'device': 'cpu', 'max_depth': 10, 'subsample': 0.5, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 16:23:12,431 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 16:23:58,952 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 1.0, 'device': 'cpu', 'max_depth': 10, 'subsample': 0.5, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 16:24:13,413 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 16:25:00,604 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logist


Params for the Trial:
Max_depth: 10, Subsample: 0.5, Colsample_bytree: 1.0

Average Metrics Across All Blocks:
Accuracy: 0.7423
Precision: 0.7290
Recall: 0.5457
F-Beta: 0.6548
F1 Score: 0.6234
current params: {'max_depth': 10, 'subsample': 0.75, 'colsample_bytree': 0.5}


2024-08-07 16:26:02,232 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 0.5, 'device': 'cpu', 'max_depth': 10, 'subsample': 0.75, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 16:26:16,023 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 16:27:04,042 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 0.5, 'device': 'cpu', 'max_depth': 10, 'subsample': 0.75, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 16:27:18,110 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 16:28:04,115 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logi


Params for the Trial:
Max_depth: 10, Subsample: 0.75, Colsample_bytree: 0.5

Average Metrics Across All Blocks:
Accuracy: 0.7441
Precision: 0.7336
Recall: 0.5460
F-Beta: 0.6574
F1 Score: 0.6253
current params: {'max_depth': 10, 'subsample': 0.75, 'colsample_bytree': 0.75}


2024-08-07 16:29:05,428 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 0.75, 'device': 'cpu', 'max_depth': 10, 'subsample': 0.75, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 16:29:18,909 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 16:30:05,078 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 0.75, 'device': 'cpu', 'max_depth': 10, 'subsample': 0.75, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 16:30:18,733 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 16:31:06,586 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:lo


Params for the Trial:
Max_depth: 10, Subsample: 0.75, Colsample_bytree: 0.75

Average Metrics Across All Blocks:
Accuracy: 0.7441
Precision: 0.7337
Recall: 0.5459
F-Beta: 0.6573
F1 Score: 0.6252
current params: {'max_depth': 10, 'subsample': 0.75, 'colsample_bytree': 1.0}


2024-08-07 16:32:09,277 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 1.0, 'device': 'cpu', 'max_depth': 10, 'subsample': 0.75, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 16:32:23,208 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 16:33:10,171 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 1.0, 'device': 'cpu', 'max_depth': 10, 'subsample': 0.75, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 16:33:24,227 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 16:34:10,141 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logi


Params for the Trial:
Max_depth: 10, Subsample: 0.75, Colsample_bytree: 1.0

Average Metrics Across All Blocks:
Accuracy: 0.7437
Precision: 0.7329
Recall: 0.5456
F-Beta: 0.6567
F1 Score: 0.6247
current params: {'max_depth': 10, 'subsample': 1.0, 'colsample_bytree': 0.5}


2024-08-07 16:35:10,603 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 0.5, 'device': 'cpu', 'max_depth': 10, 'subsample': 1.0, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 16:35:23,392 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 16:36:35,950 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 0.5, 'device': 'cpu', 'max_depth': 10, 'subsample': 1.0, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 16:36:58,264 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 16:37:37,377 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logist


Params for the Trial:
Max_depth: 10, Subsample: 1.0, Colsample_bytree: 0.5

Average Metrics Across All Blocks:
Accuracy: 0.7452
Precision: 0.7363
Recall: 0.5466
F-Beta: 0.6590
F1 Score: 0.6265
current params: {'max_depth': 10, 'subsample': 1.0, 'colsample_bytree': 0.75}


2024-08-07 16:39:06,504 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 0.75, 'device': 'cpu', 'max_depth': 10, 'subsample': 1.0, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 16:39:20,400 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 16:40:03,006 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 0.75, 'device': 'cpu', 'max_depth': 10, 'subsample': 1.0, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 16:40:15,527 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 16:40:56,406 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logi


Params for the Trial:
Max_depth: 10, Subsample: 1.0, Colsample_bytree: 0.75

Average Metrics Across All Blocks:
Accuracy: 0.7456
Precision: 0.7374
Recall: 0.5461
F-Beta: 0.6594
F1 Score: 0.6267
current params: {'max_depth': 10, 'subsample': 1.0, 'colsample_bytree': 1.0}


2024-08-07 16:41:51,696 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 1.0, 'device': 'cpu', 'max_depth': 10, 'subsample': 1.0, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 16:42:03,771 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 16:42:45,058 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 1.0, 'device': 'cpu', 'max_depth': 10, 'subsample': 1.0, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 16:42:57,725 INFO XGBoost-PySpark: _fit Finished xgboost training!
2024-08-07 16:43:44,675 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logist


Params for the Trial:
Max_depth: 10, Subsample: 1.0, Colsample_bytree: 1.0

Average Metrics Across All Blocks:
Accuracy: 0.7454
Precision: 0.7370
Recall: 0.5459
F-Beta: 0.6590
F1 Score: 0.6263
Best result: {'max_depth': 3, 'subsample': 1.0, 'colsample_bytree': 1.0, 'avg_accuracy': 0.7484984860080525, 'avg_precision': 0.7439035905250303, 'avg_recall': 0.5474377881338233, 'avg_f1': 0.6299505464410227, 'avg_f_beta': 0.6636078207142596}
Highest avg_f_beta: 0.6636078207142596


# Final Train and Test

### Test Data Merge

In [0]:
# Similar to the train merge, we start with the NN files as those are already scoped to the year we are interested in - 2019
nntest2 = nntest2.withColumnRenamed("probability", "nnval2_prob") # keeping the col name val so I can recycle my pipeline
merged_nn = nntest2.join(nntest5, "index", "left") \
    .select("index", "nnval2_prob", F.col("probability").alias("nnval5_prob")) # keeping the col name val so I can recycle my pipeline 
	
merged_log_base = merged_nn.join(log_base, "index", "left") \
    .select("index", "nnval2_prob", "nnval5_prob", 
            F.col("DEP_DEL15").alias("target"), F.col("probability_1").alias("log_base_prob"), "YEAR")
			
merged_log_all = merged_log_base.join(log_variant, "index", "left") \
    .select("index", "nnval2_prob", "nnval5_prob", "log_base_prob",
            F.col("probability_1").alias("log_variant_prob"), "target")
			
merged_boost11 = merged_log_all.join(xgboost1011, "index", "left")\
    .select("index", "nnval2_prob", "nnval5_prob", "log_base_prob", "log_variant_prob",
            F.col("probability_1").alias("xgboost1011_prob"), "target")
			
test_df = merged_boost11.join(xgboost1066, "index", "left")\
    .select("index", "nnval2_prob", "nnval5_prob", "log_base_prob", "log_variant_prob", "xgboost1011_prob",
            F.col("probability_1").alias("xgboost1066_prob"), "target")

# Count total number of rows
total_rows = test_df.count()
print(f"Total number of rows: {total_rows}")

# Check for null data in any column
null_counts = test_df.select([F.sum(F.col(c).isNull().cast("int")).alias(c) for c in test_df.columns])
null_counts.show()

# Count rows with any null value
rows_with_nulls = test_df.filter(F.concat_ws("", *test_df.columns).isNull()).count()
print(f"Number of rows with null values: {rows_with_nulls}")

Total number of rows: 7270189
+-----+-----------+-----------+-------------+----------------+----------------+----------------+------+
|index|nnval2_prob|nnval5_prob|log_base_prob|log_variant_prob|xgboost1011_prob|xgboost1066_prob|target|
+-----+-----------+-----------+-------------+----------------+----------------+----------------+------+
|    0|          0|          0|            0|         6205149|               0|               0|     0|
+-----+-----------+-----------+-------------+----------------+----------------+----------------+------+

Number of rows with null values: 0


#### Transform

In [0]:
# Use the same pipeline from training on the train data
test_trf = pipeline \
    .fit(val_df) \
    .transform(test_df) \
    .select("index", "features", "target")

Downloading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

#### Train and Predict

In [0]:
test_results = xgboost_model(val_trf, test_trf, max_depth=3, subsample=1, colsample_bytree=1)

2024-08-07 16:46:20,335 INFO XGBoost-PySpark: _fit Running xgboost-2.0.3 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'colsample_bytree': 1, 'device': 'cpu', 'max_depth': 3, 'subsample': 1, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
2024-08-07 16:46:40,010 INFO XGBoost-PySpark: _fit Finished xgboost training!



Params for the Trial:
Max_depth: 3, Subsample: 1, Colsample_bytree: 1

Average Metrics:
Accuracy: 0.7549
Precision: 0.5260
Recall: 0.6632
F-Beta: 0.5650
F1 Score: 0.5867
