In [0]:
## Place this cell in any team notebook that needs access to the team cloud storage.


# The following blob storage is accessible to team members only (read and write)
# access key is valid til TTL
# after that you will need to create a new SAS key and authenticate access again via DataBrick command line
blob_container  = "final-project-summer24-team3"       # The name of your container created in https://portal.azure.com
storage_account = "summer2024team3"  # The name of your Storage account created in https://portal.azure.com
secret_scope    = "summer24_team_3_2_scope"           # The name of the scope created in your local computer using the Databricks CLI
secret_key      = "final-project-summer24-team3"             # The name of the secret key created in your local computer using the Databricks CLI
team_blob_url   = f"wasbs://{blob_container}@{storage_account}.blob.core.windows.net"  #points to the root of your team storage bucket


# the 261 course blob storage is mounted here.
mids261_mount_path      = "/mnt/mids-w261"

# SAS Token: Grant the team limited access to Azure Storage resources
spark.conf.set(
  f"fs.azure.sas.{blob_container}.{storage_account}.blob.core.windows.net",
  dbutils.secrets.get(scope = secret_scope, key = secret_key)
)

In [0]:
from pyspark.sql.functions import col, concat_ws, lpad, expr, unix_timestamp, from_unixtime, when, count, lag, greatest, row_number,lit
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType, StringType, DoubleType, FloatType, StructField, StructType
import numpy as np
import pandas as pd
from pyspark.sql import Row

## Load All DataFrames

In [0]:
display(dbutils.fs.ls(f"{team_blob_url}/TP/"))

path,name,size,modificationTime
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_SUCCESS,_SUCCESS,0,1720561571000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_committed_1002670972388615845,_committed_1002670972388615845,625,1720560531000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_committed_5669257934384103852,_committed_5669257934384103852,221,1720561571000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_committed_6618439955609603938,_committed_6618439955609603938,419,1720561337000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_committed_9167039456723159873,_committed_9167039456723159873,318,1720559468000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_committed_vacuum825530481471543349,_committed_vacuum825530481471543349,95,1720561338000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_started_1002670972388615845,_started_1002670972388615845,0,1720560531000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_started_5669257934384103852,_started_5669257934384103852,0,1720561570000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_started_6618439955609603938,_started_6618439955609603938,0,1720561337000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/df_1y_cleaned_transformed/,df_1y_cleaned_transformed/,0,1722046865000


In [0]:
xgboost1011 = spark.read.parquet(f'{team_blob_url}/TP/xgboost_fbeta_recall')
xgboost1066 = spark.read.parquet(f'{team_blob_url}/TP/xgboost_precision')
log_base = spark.read.parquet(f'{team_blob_url}/TP/logbaseline_probabilities_2018_2019/')
# log_variant = spark.read.parquet(f'{team_blob_url}/TP/logbaseline_0.03_probabilities_2018_2019/')
log_variant = spark.read.parquet(f'{team_blob_url}/TP/logbaseline_0.03_probabilities_2018_2019v2/')

nnval2 = spark.read.parquet(f'{team_blob_url}/TP/nn_preds_val2')
nnval5 = spark.read.parquet(f'{team_blob_url}/TP/nn_preds_val5')
nntest2 = spark.read.parquet(f'{team_blob_url}/TP/nn_preds_test2')
nntest5 = spark.read.parquet(f'{team_blob_url}/TP/nn_preds_test5')

In [0]:
display(nntest2.limit(10))

index,probability
16296400,0.6048686075183758
16296401,0.3355729967655831
16296402,0.3463531302089725
16296403,0.5621076769602723
16296404,0.4098567286621875
16296405,0.4458836408337053
16296406,0.3406940287018571
16296407,0.6211324349535869
16296408,0.3851916748209665
16296409,0.2942468212008801


## Create Train Data

In [0]:
nnval2 = nnval2.withColumnRenamed("probability", "nnval2_prob")
merged_nn = nnval2.join(nnval5, "index", "left") \
    .select("index", "nnval2_prob", F.col("probability").alias("nnval5_prob"))
merged_log_base = merged_nn.join(log_base, "index", "left") \
    .select("index", "nnval2_prob", "nnval5_prob", 
            F.col("DEP_DEL15").alias("target"), F.col("probability_1").alias("log_base_prob"), "YEAR")
merged_log_all = merged_log_base.join(log_variant, "index", "left") \
    .select("index", "nnval2_prob", "nnval5_prob", "log_base_prob",
            F.col("probability_1").alias("log_variant_prob"), "target")
merged_boost11 = merged_log_all.join(xgboost1011, "index", "left")\
    .select("index", "nnval2_prob", "nnval5_prob", "log_base_prob", "log_variant_prob",
            F.col("probability_1").alias("xgboost1011_prob"), "target")
val_df = merged_boost11.join(xgboost1066, "index", "left")\
    .select("index", "nnval2_prob", "nnval5_prob", "log_base_prob", "log_variant_prob", "xgboost1011_prob",
            F.col("probability_1").alias("xgboost1066_prob"), "target")

In [0]:
# CheckSum for the train dataset

# Count total number of rows
total_rows = val_df.count()
print(f"Total number of rows: {total_rows}")

# Check for null data in any column
null_counts = val_df.select([F.sum(F.col(c).isNull().cast("int")).alias(c) for c in val_df.columns])
null_counts.show()

# Count rows with any null value
rows_with_nulls = val_df.filter(F.concat_ws("", *val_df.columns).isNull()).count()
print(f"Number of rows with null values: {rows_with_nulls}")

Total number of rows: 3606364
+-----+-----------+-----------+-------------+----------------+----------------+----------------+------+
|index|nnval2_prob|nnval5_prob|log_base_prob|log_variant_prob|xgboost1011_prob|xgboost1066_prob|target|
+-----+-----------+-----------+-------------+----------------+----------------+----------------+------+
|    0|          0|          0|            0|               0|               0|               0|     0|
+-----+-----------+-----------+-------------+----------------+----------------+----------------+------+

Number of rows with null values: 0


In [0]:
display(val_df.limit(10))

index,nnval2_prob,nnval5_prob,log_base_prob,log_variant_prob,xgboost1011_prob,xgboost1066_prob,target
8445850,0.3740877207774964,0.3657552296179887,0.4616521096070315,0.4535365964512451,0.1644136309623718,0.2966469824314117,0.0
11556484,0.3476063907457776,0.3417001248693432,0.2207280720620729,0.2957501684137092,0.1592976301908493,0.149367555975914,0.0
11556485,0.4408351089344866,0.3410723047238974,0.2137383648902544,0.2957501684137092,0.1880872100591659,0.2034016698598861,0.0
11556488,0.6692957622704241,0.6250291671440961,0.5070722794013323,0.5124258272187046,0.25215944647789,0.3480943739414215,0.0
8445853,0.3371937677691113,0.3092135328938249,0.3955320183860062,0.4086396035501758,0.146016776561737,0.1465182304382324,0.0
8445854,0.3961651842954616,0.377170907256903,0.7284613161243112,0.6761881065184936,0.8245221972465515,0.7815264463424683,1.0
11556483,0.5842771002478089,0.6751280846460634,0.4445933388433368,0.4699780933333716,0.1231305450201034,0.2088167071342468,0.0
8445849,0.408488887291019,0.2640792484881383,0.3231392036725951,0.3563467487219979,0.1259733885526657,0.1454849988222122,0.0
11556487,0.5788756606085461,0.6788285132979043,0.4578196931783874,0.4699780933333716,0.4345680475234985,0.594561755657196,0.0
8445858,0.3980004066501542,0.3189986037373163,0.5398692118524435,0.5227492074276583,0.3748185932636261,0.3288916051387787,1.0


In [0]:
def simple_meta_model(df):
    """Input: dataframe with probabilities. Output: dataframe with predictions"""

    # ensemble model such that if any model predicts delayed, then output delayed

    # Define function for prediction
    def predict(row):
        prediction = 1 if any(x > 0.5 for x in row[1:]) else 0
        return Row(*row, prediction)

    column_names = df.columns
    column_names.append('predictions')

    # Apply the function to each row
    predictions_df = df.rdd.map(lambda row: predict(row)).toDF(column_names)

    return predictions_df


In [0]:
val_predictions = simple_meta_model(val_df)

display(val_predictions.limit(10))

index,nnval2_prob,nnval5_prob,log_base_prob,log_variant_prob,xgboost1011_prob,xgboost1066_prob,target,predictions
8445850,0.3740877207774964,0.3657552296179887,0.4616521096070315,0.4535365964512451,0.1644136309623718,0.2966469824314117,0.0,0
8445871,0.318800871272576,0.3256548744784978,0.1548655548313956,0.2957501684137092,0.2103431224822998,0.2847370207309723,1.0,1
8445894,0.3448788214608317,0.3165933127982639,0.1910945244353655,0.3064717705475707,0.1175583824515342,0.1019168198108673,1.0,1
8445899,0.295906043856487,0.2915445998019462,0.1719209294197938,0.2957501684137092,0.1462762057781219,0.0982941910624504,0.0,0
8445902,0.3377712500336106,0.4980410985027349,0.3099838529356438,0.4968592542127373,0.0969814881682396,0.2085460126399994,0.0,0
8445909,0.3368145497618807,0.320315415438434,0.3681596466478157,0.4463485800615537,0.3091604113578796,0.3015647530555725,0.0,0
8445914,0.4458744314071377,0.3696895501790981,0.5304779359872036,0.6316441428155444,0.1757322251796722,0.1574684828519821,0.0,1
8445938,0.4250428396054828,0.3897048317729652,0.173287933227793,0.2957501684137092,0.1805399954319,0.1203279346227645,0.0,0
8445952,0.4327347679128427,0.4047513092983122,0.356085955081986,0.4699780933333716,0.1331409811973571,0.1218594014644622,0.0,0
8445953,0.2381159657956752,0.2985381279526456,0.1651136724153293,0.2957501684137092,0.0910238474607467,0.0867304056882858,0.0,0


In [0]:
# compute evaluation metrics
def evaluate_metrics(df):
    """Compute evaluation metrics"""
    metrics = df.select(F.count(F.when((F.col("predictions") == 1 )& (F.col("target") == 1), 1)).alias('true_positives'),
                        F.count(F.when((F.col("predictions") == 0) & (F.col("target") == 0), 1)).alias('true_negatives'),
                        F.count(F.when((F.col("predictions") == 1) & (F.col("target") == 0), 1)).alias('false_positives'),
                        F.count(F.when((F.col("predictions") == 0) & (F.col("target") == 1), 1)).alias('false_negatives')).collect()[0]
    
    precision = metrics['true_positives'] / (metrics['true_positives'] + metrics['false_positives'])
    print('Precision:', precision)

    recall = metrics['true_positives'] / (metrics['true_positives'] + metrics['false_negatives'])
    print('Recall:', recall)

    beta = 1/(2**0.5)
    fbeta = (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall)
    print('F-0.5:', fbeta)

    display(metrics)
    

In [0]:
evaluate_metrics(val_predictions)

Precision: 0.7356412837065999
Recall: 1.0
F-0.5: 0.8067299847623204


Row(true_positives=1407757, true_negatives=1692718, false_positives=505889, false_negatives=0)

### TEST

In [0]:
# Similar to the train merge, we start with the NN files as those are already scoped to the year we are interested in - 2019
nntest2 = nntest2.withColumnRenamed("probability", "nnval2_prob") # keeping the col name val so I can recycle my pipeline
merged_nn = nntest2.join(nntest5, "index", "left") \
    .select("index", "nnval2_prob", F.col("probability").alias("nnval5_prob")) # keeping the col name val so I can recycle my pipeline 
	
merged_log_base = merged_nn.join(log_base, "index", "left") \
    .select("index", "nnval2_prob", "nnval5_prob", 
            F.col("DEP_DEL15").alias("target"), F.col("probability_1").alias("log_base_prob"), "YEAR")
			
merged_log_all = merged_log_base.join(log_variant, "index", "left") \
    .select("index", "nnval2_prob", "nnval5_prob", "log_base_prob",
            F.col("probability_1").alias("log_variant_prob"), "target")
			
merged_boost11 = merged_log_all.join(xgboost1011, "index", "left")\
    .select("index", "nnval2_prob", "nnval5_prob", "log_base_prob", "log_variant_prob",
            F.col("probability_1").alias("xgboost1011_prob"), "target")
			
test_df = merged_boost11.join(xgboost1066, "index", "left")\
    .select("index", "nnval2_prob", "nnval5_prob", "log_base_prob", "log_variant_prob", "xgboost1011_prob",
            F.col("probability_1").alias("xgboost1066_prob"), "target")

# Count total number of rows
total_rows = test_df.count()
print(f"Total number of rows: {total_rows}")

# Check for null data in any column
null_counts = test_df.select([F.sum(F.col(c).isNull().cast("int")).alias(c) for c in test_df.columns])
null_counts.show()

# Count rows with any null value
rows_with_nulls = test_df.filter(F.concat_ws("", *test_df.columns).isNull()).count()
print(f"Number of rows with null values: {rows_with_nulls}")

Total number of rows: 7270189
+-----+-----------+-----------+-------------+----------------+----------------+----------------+------+
|index|nnval2_prob|nnval5_prob|log_base_prob|log_variant_prob|xgboost1011_prob|xgboost1066_prob|target|
+-----+-----------+-----------+-------------+----------------+----------------+----------------+------+
|    0|          0|          0|            0|               0|               0|               0|     0|
+-----+-----------+-----------+-------------+----------------+----------------+----------------+------+

Number of rows with null values: 0


In [0]:
test_predictions = simple_meta_model(test_df)

In [0]:
evaluate_metrics(test_predictions)

Precision: 0.5214633593038044
Recall: 1.0
F-0.5: 0.620429431242401


Row(true_positives=1475746, true_negatives=4440180, false_positives=1354263, false_negatives=0)