In [0]:
# Team blob storage: readable and writable by team members only.
# The SAS key is valid only until its TTL; after that, create a new SAS key and
# authenticate access again through the Databricks command line.
blob_container = "final-project-summer24-team3"  # container created in https://portal.azure.com
storage_account = "summer2024team3"  # storage account created in https://portal.azure.com
secret_scope = "summer24_team_3_2_scope"  # scope created locally with the Databricks CLI
secret_key = "final-project-summer24-team3"  # secret key created locally with the Databricks CLI

# Root of the team storage bucket.
team_blob_url = f"wasbs://{blob_container}@{storage_account}.blob.core.windows.net"

# The 261 course blob storage is mounted here.
mids261_mount_path = "/mnt/mids-w261"

# SAS token: grants the team limited access to the Azure Storage resources.
# The secret is passed straight to the Spark conf so it never lands in a notebook variable.
sas_conf_key = f"fs.azure.sas.{blob_container}.{storage_account}.blob.core.windows.net"
spark.conf.set(sas_conf_key, dbutils.secrets.get(scope=secret_scope, key=secret_key))

# Show what is in the root folder of the blob storage.
display(dbutils.fs.ls(team_blob_url))

path,name,size,modificationTime
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/,TP/,0,1722707375000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/df_clean_transformed_ian/,df_clean_transformed_ian/,0,1722606719000


In [0]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
# NOTE: this import rebinds the built-in `min` to the Spark column function;
# use `F.min` / `builtins.min` explicitly if the plain Python one is needed.
from pyspark.sql.functions import col, isnan, count, when, split, concat, lit, min, row_number, lower, lpad, udf, first, countDistinct, coalesce, to_timestamp, monotonically_increasing_id, explode, array, concat_ws, unix_timestamp
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, FloatType, StructType, StructField, DoubleType
import numpy as np
from pyspark.sql.window import Window

import warnings
warnings.filterwarnings("ignore")

In [0]:
# Full cleaned flights + stations + weather dataset.
# Smaller 3-month and 1-year extracts live next to it under TP/
# (flight_stations_weather_3m_cleaned2, flight_stations_weather_1y_cleaned2) but are not loaded here.
merged_all_path = f"{team_blob_url}/TP/flight_stations_weather_all_cleaned3"
merged_all = spark.read.parquet(merged_all_path)

In [0]:
# Year-based splits of the full dataset. (Earlier experiments on the 3-month and
# 1-year extracts split on MONTH and QUARTER instead; they are not used below.)

# 2015-2018: training span for the baseline network.
df_train0 = merged_all.where(F.col("YEAR").isin(["2015", "2016", "2017", "2018"]))
# 2015-2017: training span for tuning and the final models.
df_train = merged_all.where(F.col("YEAR").isin(["2015", "2016", "2017"]))
# 2018 is held out for model blending.
df_val = merged_all.where(F.col("YEAR").isin(["2018"]))
# 2019 is the test year.
df_test = merged_all.where(F.col("YEAR").isin(["2019"]))

In [0]:
# Row count of the 2015-2018 training span (count() runs a Spark job).
df_train0.count()

12053218

In [0]:
# Row count of the 2015-2017 training span (count() runs a Spark job).
df_train.count()

8445849

In [0]:
# Row count of the 2018 hold-out used for model blending (count() runs a Spark job).
df_val.count()

3607369

In [0]:
# Row count of the 2019 test year (count() runs a Spark job).
df_test.count()

7270542

### Settings

In [0]:
import time 

# Functions
def block_splits(df, n_splits, train_frac=0.8):
    """Cut a row range into equal, contiguous (start, mid, stop) blocks.

    The row count of `df` is floor-divided by `n_splits` to get the block
    size, so up to `n_splits - 1` trailing rows belong to no block. Within each
    block, [start, mid) is the training part and [mid, stop) the test part.

    Args:
        df: Any object exposing ``count()`` that returns the number of rows.
        n_splits: Number of blocks; must be at least 1.
        train_frac: Share of every block given to training, strictly between
            0 and 1. Defaults to 0.8.

    Returns:
        A list of ``n_splits`` tuples ``(start, mid, stop)`` of integer offsets.

    Raises:
        ValueError: If `n_splits` is below 1 or `train_frac` is outside (0, 1).
    """
    if n_splits < 1:
        raise ValueError(f"n_splits must be >= 1, got {n_splits}")
    if not 0 < train_frac < 1:
        raise ValueError(f"train_frac must be in (0, 1), got {train_frac}")

    fold_size = df.count() // n_splits
    # The train length is the same for every block, so compute it once.
    train_size = int(train_frac * fold_size)

    blocks = []
    for fold_idx in range(n_splits):
        start = fold_idx * fold_size
        blocks.append((start, start + train_size, start + fold_size))
    return blocks

def evaluate_multiclass_metrics(predictions, target_feature):
    """Score predictions against a label column for the positive class 1.0.

    Args:
        predictions: Spark DataFrame with a "prediction" column and the
            `target_feature` label column.
        target_feature: Name of the label column; it is cast to float before
            being handed to MulticlassMetrics.

    Returns:
        Tuple ``(accuracy, precision, recall, fbeta)`` where precision and
        recall are for label 1.0 and fbeta uses beta = 1/sqrt(2), which weights
        precision more heavily than recall. fbeta is 0.0 when precision and
        recall are both 0 (for example, the model never predicts the positive
        class).
    """
    # MulticlassMetrics consumes an RDD of (prediction, label) pairs.
    prediction_and_label = (
        predictions
        .select(["prediction", target_feature])
        .withColumn(target_feature, col(target_feature).cast(FloatType()))
    )
    metrics = MulticlassMetrics(prediction_and_label.rdd.map(tuple))

    accuracy = metrics.accuracy
    precision = metrics.precision(1.0)
    recall = metrics.recall(1.0)

    beta = 1/(2**0.5)
    denominator = (beta**2 * precision) + recall
    if denominator == 0:
        # Both precision and recall are 0: define F-beta as 0 rather than dividing by zero.
        fbeta = 0.0
    else:
        fbeta = (1 + beta**2) * (precision * recall) / denominator
    return accuracy, precision, recall, fbeta

def nn_cv(df, assembler, target_feature, k, layers, max_iters = 10):
    """Cross-validate a multilayer perceptron over contiguous blocks of `df`.

    `block_splits` cuts the row range into `k` blocks. For every block a fresh
    MultilayerPerceptronClassifier is fitted on the rows whose "index" lies in
    [start, mid) and scored on [mid, stop) with `evaluate_multiclass_metrics`.
    Cumulative elapsed minutes are printed after each fold, and the fold-averaged
    accuracy, precision, recall and F-beta are printed at the end.

    Args:
        df: Spark DataFrame with an "index" column, the `target_feature` column
            and all input columns of `assembler`.
            NOTE(review): the block offsets are compared against "index", so
            this presumably expects index values running 0..count-1 - confirm.
        assembler: Transformer that adds a "features" vector column.
        target_feature: Name of the label column.
        k: Number of blocks (folds).
        layers: Layer sizes for the network, input layer first, output layer last.
        max_iters: Maximum number of training iterations per fold.

    Returns:
        None; results are printed.
    """
    def train_and_evaluate(fold_tuple):
        """Fit on the leading part of one block and score on its trailing part."""
        lo, mid, hi = fold_tuple
        keep_cols = ["features", "index", target_feature]
        train_rows = df.filter((col("index") >= lo) & (col("index") < mid))
        test_rows = df.filter((col("index") >= mid) & (col("index") < hi))
        train_vec = assembler.transform(train_rows).select(*keep_cols)
        test_vec = assembler.transform(test_rows).select(*keep_cols)

        # The label column follows `target_feature` so that training and
        # evaluation always use the same column.
        classifier = MultilayerPerceptronClassifier(layers=layers, featuresCol = "features",
                                                    labelCol = target_feature, maxIter = max_iters)
        model = classifier.fit(train_vec)

        predictions = model.transform(test_vec)
        return evaluate_multiclass_metrics(predictions, target_feature)

    folds = block_splits(df, k)
    accuracy_list = []
    precision_list = []
    recall_list = []
    fbeta_list = []
    start = time.time()
    for i, fold in enumerate(folds, start=1):
        acc, precision, recall, fbeta = train_and_evaluate(fold)
        accuracy_list.append(acc)
        precision_list.append(precision)
        recall_list.append(recall)
        fbeta_list.append(fbeta)
        print(f"Finished running {i} fold(s) in {round((time.time() - start)/60)} min")

    avg_accuracy = np.mean(accuracy_list)
    avg_precision = np.mean(precision_list)
    avg_recall = np.mean(recall_list)
    avg_fbeta = np.mean(fbeta_list)
    print(f"Accuracy: {avg_accuracy}\nPrecision: {avg_precision}\nRecall: {avg_recall}\nF-Beta: {avg_fbeta}")

In [0]:
# Input features
# Encoded vector columns; their widths are only known from the data itself.
vec_cols = ["MONTH_vec",
            "DAY_OF_WEEK_vec",
            "OP_CARRIER_vec",
            "HOUR_vec",
            "type_vec"]

# Read the vector widths from one row instead of attaching UDF-computed size
# columns to df_train: the training frame stays untouched and re-running this
# cell has no side effects.
first_vec_row = df_train.select(vec_cols).first()
if first_vec_row is None:
    raise ValueError("df_train is empty; cannot determine the vector column sizes")
n_vec_cols = sum(first_vec_row[column].size for column in vec_cols)

numeric_cols = ['DISTANCE',
            'ELEVATION',
            'HourlyAltimeterSetting',
            'HourlyDryBulbTemperature',
            'HourlyPrecipitation',
            'HourlyRelativeHumidity',
            'HourlySeaLevelPressure',
            'HourlyVisibility',
            'HourlyWindDirection',
            'HourlyWindSpeed',
            "Rain",
            "Snow",
            "Thunder",
            "View Obstruction",
            'DEP_12hr',
            'DEP_6hr',
            'DEP_4hr',
            'DISTANCE_LAG',
            'REALIZED_DELAY_MIN',
            'REALIZED_DEL15',
            'HOLIDAY',
            'OUTDEG_AIRPORT_6hr']

# Width of the assembled vector = scalar columns + total width of the vector columns.
n_features = len(numeric_cols) + n_vec_cols
print(f"Number of features: {n_features}")
features = numeric_cols + vec_cols
# handleInvalid="skip" makes the assembler drop rows that contain invalid values.
assembler = VectorAssembler(inputCols=features, outputCol="features", handleInvalid = "skip")


Number of features: 84


### Basic NN on 2015-2018

In [0]:
# Baseline: one hidden layer of 64 units, 3 blocked folds over 2015-2018.
layers0 = [n_features, 64, 2]
nn_cv(df=df_train0, assembler=assembler, target_feature="DEP_DEL15",
      k=3, layers=layers0, max_iters=50)


Finished running 1 fold(s) in 29 min
Finished running 2 fold(s) in 52 min
Finished running 3 fold(s) in 85 min
Accuracy: 0.6932785989078264
Precision: 0.6332504282585641
Recall: 0.3271850441581449
F-Beta: 0.4826902599174084


### Hyperparameter Tuning on 2015-2017

#### 1 Hidden Layer NN

In [0]:
# One hidden layer of 128 units, 3 blocked folds over 2015-2017.
layers1 = [n_features, 128, 2]
nn_cv(df=df_train, assembler=assembler, target_feature="DEP_DEL15",
      k=3, layers=layers1, max_iters=50)



Finished running 1 fold(s) in 37 min
Finished running 2 fold(s) in 68 min




Finished running 3 fold(s) in 90 min
Accuracy: 0.6801540756408363
Precision: 0.6051564069699442
Recall: 0.1589100586119373
F-Beta: 0.30016312448491045


In [0]:
# One hidden layer of 256 units, 3 blocked folds over 2015-2017.
layers2 = [n_features, 256, 2]
nn_cv(df=df_train, assembler=assembler, target_feature="DEP_DEL15",
      k=3, layers=layers2, max_iters=50)


Finished running 1 fold(s) in 67 min
Finished running 2 fold(s) in 136 min
Finished running 3 fold(s) in 184 min
Accuracy: 0.6939715727657685
Precision: 0.6337167141335804
Recall: 0.20243750220401482
F-Beta: 0.3686122812526964


#### 2 Hidden Layers NN 

In [0]:
# Two hidden layers (128, 64), 3 blocked folds over 2015-2017, up to 75 iterations.
layers3 = [n_features, 128, 64, 2]
nn_cv(df=df_train, assembler=assembler, target_feature="DEP_DEL15",
      k=3, layers=layers3, max_iters=75)


Finished running 1 fold(s) in 90 min
Finished running 2 fold(s) in 158 min
Finished running 3 fold(s) in 210 min
Accuracy: 0.7047499527931657
Precision: 0.6231293705080367
Recall: 0.29225635800781924
F-Beta: 0.4512546373659514


In [0]:
# Two hidden layers (256, 128), 3 blocked folds over 2015-2017, up to 75 iterations.
layers4 = [n_features, 256, 128, 2]
nn_cv(df=df_train, assembler=assembler, target_feature="DEP_DEL15",
      k=3, layers=layers4, max_iters=75)


Finished running 1 fold(s) in 184 min
Finished running 2 fold(s) in 348 min
Finished running 3 fold(s) in 464 min
Accuracy: 0.7089228182218811
Precision: 0.6301697784770429
Recall: 0.3079438511805255
F-Beta: 0.4671374362117941


#### 3 Hidden Layers NN 

In [0]:
# Three hidden layers (256, 128, 64), 3 blocked folds over 2015-2017, up to 100 iterations.
layers5 = [n_features, 256, 128, 64, 2]
nn_cv(df=df_train, assembler=assembler, target_feature="DEP_DEL15",
      k=3, layers=layers5, max_iters=100)


Finished running 1 fold(s) in 294 min
Finished running 2 fold(s) in 538 min
Finished running 3 fold(s) in 724 min
Accuracy: 0.7087517065518405
Precision: 0.6290800779178971
Recall: 0.30996201480321567
F-Beta: 0.4681343657413295


### Model Testing

In [0]:
# Rebuild the assembler here so this section can run without the tuning cells.
# handleInvalid="skip" drops rows with invalid values, so the assembled splits
# (and the metrics computed on them) can contain fewer rows than the raw splits.
assembler = VectorAssembler(inputCols=features, outputCol="features", handleInvalid = "skip")

# Keep only what the models and the saved predictions need: the feature vector,
# the row index, and the label.
assembler_train = assembler.transform(df_train).select(["features", "index", "DEP_DEL15"])
assembler_val = assembler.transform(df_val).select(["features", "index", "DEP_DEL15"])
assembler_test = assembler.transform(df_test).select(["features", "index", "DEP_DEL15"])

# Schema of the (index, positive-class probability) frames written for each model.
# NOTE(review): "index" is declared as a 32-bit integer - confirm the source
# index values fit that range.
schema = StructType([
    StructField("index", IntegerType(), True),
    StructField("probability", DoubleType(), True)
])

# Architectures evaluated in the tuning section (input width, hidden layers, 2 classes).
layers1 = [n_features, 128, 2]
layers2 = [n_features, 256, 2]
layers3 = [n_features, 128, 64, 2]
layers4 = [n_features, 256, 128, 2]
layers5 = [n_features, 256, 128, 64, 2]

#### 1 Hidden Layer NN

In [0]:
# 1 hidden layer (128 units): fit on 2015-2017, then score train / val / test.
classifier1 = MultilayerPerceptronClassifier(layers=layers1, featuresCol = "features", 
                                                    labelCol = "DEP_DEL15", maxIter = 50)

start = time.time()
model1 = classifier1.fit(assembler_train)
print(f"Time Elapsed {round((time.time() - start)/60)} minutes")
# A plain save() raises when the target path already exists, which would fail a
# re-run only after the long fit; overwrite keeps the cell re-runnable.
model1.write().overwrite().save(f"{team_blob_url}/TP/nn_model1")
preds_train1 = model1.transform(assembler_train)
preds_val1 = model1.transform(assembler_val)
preds_test1 = model1.transform(assembler_test)
# Keep only (index, probability of class 1) for the downstream blending step.
preds_proba_val1 = preds_val1.select(["index", "probability"]).rdd.map(lambda x: (x[0], float(x[1][1]))).toDF(schema)
preds_proba_test1 = preds_test1.select(["index", "probability"]).rdd.map(lambda x: (x[0], float(x[1][1]))).toDF(schema)

# Persist the probabilities alongside the model, as the other model cells do,
# so the saved predictions always match the saved model.
preds_proba_val1.write.mode("overwrite").parquet(f"{team_blob_url}/TP/nn_preds_val1")
preds_proba_test1.write.mode("overwrite").parquet(f"{team_blob_url}/TP/nn_preds_test1")


In [0]:
# Training-set metrics for model 1 (one hidden layer, 128 units).
metric_values = evaluate_multiclass_metrics(preds_train1, "DEP_DEL15")
acc, precision, recall, fbeta = metric_values
for label, value in zip(("Accuracy:", "Precision:", "Recall:", "F-beta:"), metric_values):
    print(label, value)

Accuracy: 0.6427266848123485
Precision: 0.6895575369986473
Recall: 0.1074928209188101
F-beta: 0.24583398528251477


In [0]:
# Test-set (2019) metrics for model 1 (one hidden layer, 128 units).
metric_values = evaluate_multiclass_metrics(preds_test1, "DEP_DEL15")
acc, precision, recall, fbeta = metric_values
for label, value in zip(("Accuracy:", "Precision:", "Recall:", "F-beta:"), metric_values):
    print(label, value)

Accuracy: 0.7935686403750989
Precision: 0.46656450971336433
Recall: 0.11842756138251434
F-beta: 0.23565206838897584


In [0]:
# 1 hidden layer (256 units): fit on 2015-2017, then score train / val / test.
classifier2 = MultilayerPerceptronClassifier(layers=layers2, featuresCol = "features", 
                                                    labelCol = "DEP_DEL15", maxIter = 50)
start = time.time()
model2 = classifier2.fit(assembler_train)
print(f"Time Elapsed {round((time.time() - start)/60)} minutes")
# A plain save() / parquet() raises when the target path already exists, which
# would fail a re-run only after the long fit; overwrite keeps the cell re-runnable.
model2.write().overwrite().save(f"{team_blob_url}/TP/nn_model2")
preds_train2 = model2.transform(assembler_train)
preds_val2 = model2.transform(assembler_val)
preds_test2 = model2.transform(assembler_test)
# Keep only (index, probability of class 1) for the downstream blending step.
preds_proba_val2 = preds_val2.select(["index", "probability"]).rdd.map(lambda x: (x[0], float(x[1][1]))).toDF(schema)
preds_proba_test2 = preds_test2.select(["index", "probability"]).rdd.map(lambda x: (x[0], float(x[1][1]))).toDF(schema)
preds_proba_val2.write.mode("overwrite").parquet(f"{team_blob_url}/TP/nn_preds_val2")
preds_proba_test2.write.mode("overwrite").parquet(f"{team_blob_url}/TP/nn_preds_test2")

Time Elapsed 83 minutes


In [0]:
# Training-set metrics for model 2 (one hidden layer, 256 units).
metric_values = evaluate_multiclass_metrics(preds_train2, "DEP_DEL15")
acc, precision, recall, fbeta = metric_values
for label, value in zip(("Accuracy:", "Precision:", "Recall:", "F-beta:"), metric_values):
    print(label, value)

Accuracy: 0.6623518556683936
Precision: 0.681380595223625
Recall: 0.20808573847369896
F-beta: 0.3875504076766845


In [0]:
# Test-set (2019) metrics for model 2 (one hidden layer, 256 units).
metric_values = evaluate_multiclass_metrics(preds_test2, "DEP_DEL15")
acc, precision, recall, fbeta = metric_values
for label, value in zip(("Accuracy:", "Precision:", "Recall:", "F-beta:"), metric_values):
    print(label, value)

Accuracy: 0.789980150447258
Precision: 0.4636709026222763
Recall: 0.22113493785515936
F-beta: 0.3395382083098366


#### 2 Hidden Layers NN 

In [0]:
# 2 hidden layers (128, 64): fit on 2015-2017, then score train / val / test.
classifier3 = MultilayerPerceptronClassifier(layers=layers3, featuresCol = "features", 
                                                    labelCol = "DEP_DEL15", maxIter = 75)

start = time.time()
model3 = classifier3.fit(assembler_train)
print(f"Time Elapsed {round((time.time() - start)/60)} minutes")
# A plain save() / parquet() raises when the target path already exists, which
# would fail a re-run only after the long fit; overwrite keeps the cell re-runnable.
model3.write().overwrite().save(f"{team_blob_url}/TP/nn_model3")
preds_train3 = model3.transform(assembler_train)
preds_val3 = model3.transform(assembler_val)
preds_test3 = model3.transform(assembler_test)
# Keep only (index, probability of class 1) for the downstream blending step.
preds_proba_val3 = preds_val3.select(["index", "probability"]).rdd.map(lambda x: (x[0], float(x[1][1]))).toDF(schema)
preds_proba_test3 = preds_test3.select(["index", "probability"]).rdd.map(lambda x: (x[0], float(x[1][1]))).toDF(schema)
preds_proba_val3.write.mode("overwrite").parquet(f"{team_blob_url}/TP/nn_preds_val3")
preds_proba_test3.write.mode("overwrite").parquet(f"{team_blob_url}/TP/nn_preds_test3")

Time Elapsed 105 minutes


In [0]:
# Training-set metrics for model 3 (two hidden layers: 128, 64).
metric_values = evaluate_multiclass_metrics(preds_train3, "DEP_DEL15")
acc, precision, recall, fbeta = metric_values
for label, value in zip(("Accuracy:", "Precision:", "Recall:", "F-beta:"), metric_values):
    print(label, value)

Accuracy: 0.6779182923160381
Precision: 0.6434408807556679
Recall: 0.34042008308581645
F-beta: 0.4962093173406567


In [0]:
# Test-set (2019) metrics for model 3 (two hidden layers: 128, 64).
metric_values = evaluate_multiclass_metrics(preds_test3, "DEP_DEL15")
acc, precision, recall, fbeta = metric_values
for label, value in zip(("Accuracy:", "Precision:", "Recall:", "F-beta:"), metric_values):
    print(label, value)

Accuracy: 0.7704206864498295
Precision: 0.4228116753234828
Recall: 0.35881716772398503
F-beta: 0.3990861988747188


In [0]:
# 2 hidden layers (256, 128): fit on 2015-2017, then score train / val / test.
classifier4 = MultilayerPerceptronClassifier(layers=layers4, featuresCol = "features", 
                                                    labelCol = "DEP_DEL15", maxIter = 75)
start = time.time()
model4 = classifier4.fit(assembler_train)
print(f"Time Elapsed {round((time.time() - start)/60)} minutes")
# A plain save() / parquet() raises when the target path already exists, which
# would fail a re-run only after the long fit; overwrite keeps the cell re-runnable.
model4.write().overwrite().save(f"{team_blob_url}/TP/nn_model4")
preds_train4 = model4.transform(assembler_train)
preds_val4 = model4.transform(assembler_val)
preds_test4 = model4.transform(assembler_test)
# Keep only (index, probability of class 1) for the downstream blending step.
preds_proba_val4 = preds_val4.select(["index", "probability"]).rdd.map(lambda x: (x[0], float(x[1][1]))).toDF(schema)
preds_proba_test4 = preds_test4.select(["index", "probability"]).rdd.map(lambda x: (x[0], float(x[1][1]))).toDF(schema)
preds_proba_val4.write.mode("overwrite").parquet(f"{team_blob_url}/TP/nn_preds_val4")
preds_proba_test4.write.mode("overwrite").parquet(f"{team_blob_url}/TP/nn_preds_test4")

Time Elapsed 193 minutes


In [0]:
# Training-set metrics for model 4 (two hidden layers: 256, 128).
metric_values = evaluate_multiclass_metrics(preds_train4, "DEP_DEL15")
acc, precision, recall, fbeta = metric_values
for label, value in zip(("Accuracy:", "Precision:", "Recall:", "F-beta:"), metric_values):
    print(label, value)

Accuracy: 0.6763387178156958
Precision: 0.6706020953544157
Recall: 0.2901287711621471
F-beta: 0.46662540544915637


In [0]:
# Test-set (2019) metrics for model 4 (two hidden layers: 256, 128).
metric_values = evaluate_multiclass_metrics(preds_test4, "DEP_DEL15")
acc, precision, recall, fbeta = metric_values
for label, value in zip(("Accuracy:", "Precision:", "Recall:", "F-beta:"), metric_values):
    print(label, value)

Accuracy: 0.7836324475195899
Precision: 0.4514172769864899
Recall: 0.3062735728235076
F-beta: 0.38983591739289447


#### 3 Hidden Layers NN

In [0]:
# 3 hidden layers (256, 128, 64): fit on 2015-2017, then score train / val / test.
classifier5 = MultilayerPerceptronClassifier(layers=layers5, featuresCol = "features", 
                                                    labelCol = "DEP_DEL15", maxIter = 100)
start = time.time()
model5 = classifier5.fit(assembler_train)
print(f"Time Elapsed {round((time.time() - start)/60)} minutes")
# A plain save() / parquet() raises when the target path already exists, which
# would fail a re-run only after the long fit; overwrite keeps the cell re-runnable.
model5.write().overwrite().save(f"{team_blob_url}/TP/nn_model5")
preds_train5 = model5.transform(assembler_train)
preds_val5 = model5.transform(assembler_val)
preds_test5 = model5.transform(assembler_test)
# Keep only (index, probability of class 1) for the downstream blending step.
preds_proba_val5 = preds_val5.select(["index", "probability"]).rdd.map(lambda x: (x[0], float(x[1][1]))).toDF(schema)
preds_proba_test5 = preds_test5.select(["index", "probability"]).rdd.map(lambda x: (x[0], float(x[1][1]))).toDF(schema)
preds_proba_val5.write.mode("overwrite").parquet(f"{team_blob_url}/TP/nn_preds_val5")
preds_proba_test5.write.mode("overwrite").parquet(f"{team_blob_url}/TP/nn_preds_test5")

Time Elapsed 358 minutes


In [0]:
# Training-set metrics for model 5 (three hidden layers: 256, 128, 64).
metric_values = evaluate_multiclass_metrics(preds_train5, "DEP_DEL15")
acc, precision, recall, fbeta = metric_values
for label, value in zip(("Accuracy:", "Precision:", "Recall:", "F-beta:"), metric_values):
    print(label, value)

Accuracy: 0.678029748495164
Precision: 0.6563812752285809
Recall: 0.3191464208154987
F-beta: 0.48540802342054157


In [0]:
# Test-set (2019) metrics for model 5 (three hidden layers: 256, 128, 64).
metric_values = evaluate_multiclass_metrics(preds_test5, "DEP_DEL15")
acc, precision, recall, fbeta = metric_values
for label, value in zip(("Accuracy:", "Precision:", "Recall:", "F-beta:"), metric_values):
    print(label, value)

Accuracy: 0.7764407500272689
Precision: 0.4347221155390449
Recall: 0.3374849059390979
F-beta: 0.3966294121301026


In [0]:
# List the contents of the team's TP/ folder in blob storage.
tp_listing = dbutils.fs.ls(f"{team_blob_url}/TP/")
display(tp_listing)

path,name,size,modificationTime
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_SUCCESS,_SUCCESS,0,1720561571000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_committed_1002670972388615845,_committed_1002670972388615845,625,1720560531000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_committed_5669257934384103852,_committed_5669257934384103852,221,1720561571000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_committed_6618439955609603938,_committed_6618439955609603938,419,1720561337000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_committed_9167039456723159873,_committed_9167039456723159873,318,1720559468000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_committed_vacuum825530481471543349,_committed_vacuum825530481471543349,95,1720561338000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_started_1002670972388615845,_started_1002670972388615845,0,1720560531000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_started_5669257934384103852,_started_5669257934384103852,0,1720561570000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/_started_6618439955609603938,_started_6618439955609603938,0,1720561337000
wasbs://final-project-summer24-team3@summer2024team3.blob.core.windows.net/TP/df_1y_cleaned_transformed/,df_1y_cleaned_transformed/,0,1722046865000
